In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split


In [8]:
def log_entropy_weight(matrix):
    if type(matrix) is not np.ndarray:
        matrix = matrix.toarray()
    normalized = matrix / (1 + np.sum(matrix, axis=0))
    nr_docs, _ = matrix.shape
    '''
        g_i = 1 + sum     p_ij * log(p_ij + 1)   
                 j=1,N  ------------------------
                               log(N)                              
    '''
    entropy = 1 + np.sum(np.multiply(normalized, np.log(normalized + 1)), axis=0)/np.log(nr_docs)
    '''
        logent_ij = gi * log(tf_ij + 1)
    '''
    log_ent = entropy * np.log(matrix + 1)
    return log_ent


In [9]:
langs = ["nl","da","de","it","es",]

model = LogisticRegression(penalty='l2',
                           dual=False,
                           max_iter=10000,
                           tol=0.0001,
                           solver='liblinear',
                           C=1,
                           fit_intercept=True,
                           intercept_scaling=1.0,
                           class_weight=None,
                           random_state=1)

cvc = CountVectorizer(max_features=2000,
                      ngram_range=(1, 5))

ans = pd.DataFrame()


def simpleModel(X_train, y_train, X_test):
    print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
    X = cvc.fit_transform(list(X_train) + list(X_test))
    X = log_entropy_weight(X)
    model.fit(X[:len(X_train)], y_train)
    res = model.predict(X[len(X_train):])
    return res

acc = []
f1 = []

In [10]:
all_train_data = pd.DataFrame()
all_test_data = pd.DataFrame()

for language in langs:
    all_train_data = pd.concat([all_train_data, pd.read_csv(f"corpus/train/{language}/{language}_pos.csv")])
    all_test_data = pd.concat([all_test_data, pd.read_csv(f"corpus/test/{language}/{language}_pos.csv")])

all_train_data["sencount"] = all_train_data.text.str.count("SENTSEP")
all_test_data["sencount"] = all_test_data.text.str.count("SENTSEP")

max_sen_count = max(all_train_data["sencount"].max(), all_test_data["sencount"].max())
for l, r in [(0, 10), (10, 1000)]:
    print(f"sentences of {l} <= len < {r}")
    train_data = all_train_data.loc[all_train_data.sencount.between(l, r, inclusive="left")].copy()
    test_data = all_test_data.loc[all_test_data.sencount.between(l, r, inclusive="left")].copy()

    res = simpleModel(train_data.text, train_data.label, test_data.text)
    test_data["label"] = res
    test_data = test_data.drop("text", axis=1)
    ans = pd.concat([ans, test_data])

    X_train, X_test, y_train, y_test = train_test_split(train_data.text, train_data.label, test_size=0.25, random_state=21, shuffle=True)
    res = simpleModel(X_train, y_train, X_test)
    acc.append(accuracy_score(res, y_test))
    f1.append(f1_score(res, y_test,  average='weighted'))
    print("acc:", acc[-1])
    print("f1:", f1[-1])

print(np.round(np.mean(acc), 2))
print(np.round(np.mean(f1), 2))
results = ans.copy()

sentences of 0 <= len < 10
Train size: 16261, Test size: 5511
Train size: 12195, Test size: 4066
acc: 0.5122970978848992
f1: 0.5242117554489256
sentences of 10 <= len < 1000
Train size: 25307, Test size: 8349
Train size: 18980, Test size: 6327
acc: 0.5621937727200885
f1: 0.5983211106430901
0.54
0.56


In [11]:
ans["index"] += 1
ans = ans.sort_values(by=["index"])
ans = ans.rename(columns={"index": "id"})
ans = ans.drop("sencount", axis=1)
print(ans.head())
ans.to_csv("idc_lang_pos.csv",index=False)

   id     label
0   1   England
0   2   England
1   3   England
0   4  Scotland
1   5  Scotland


In [12]:
# kfold = KFold(n_splits=5, shuffle=True, random_state=21)

# for language in langs:
#     print("working on:", language)
#     train = pd.read_csv(f"corpus/train/{language}/{language}_noent.csv")
#     acc = []
#     f1 = []
#     for train_index, test_index in kfold.split(train):
#         X_train, X_test = train.text[train_index], train.text[test_index]
#         y_train, y_test = train.label[train_index], train.label[test_index]

#         X = cvc.fit_transform(list(X_train) + list(X_test))
#         X = log_entropy_weight(X)

#         model.fit(X[:len(X_train)], y_train)
#         res = model.predict(X[len(X_train):])
#         acc.append(accuracy_score(res, y_test))
#         f1.append(f1_score(res, y_test, average='weighted'))
#     print("Acc:", acc)
#     print("F1: ", f1)
