In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score


In [2]:
def log_entropy_weight(matrix):
    if type(matrix) is not np.ndarray:
        matrix = matrix.toarray()
    normalized = matrix / (1 + np.sum(matrix, axis=0))
    nr_docs, _ = matrix.shape
    '''
        g_i = 1 + sum     p_ij * log(p_ij + 1)   
                 j=1,N  ------------------------
                               log(N)                              
    '''
    entropy = 1 + np.sum(np.multiply(normalized, np.log(normalized + 1)), axis=0)/np.log(nr_docs)
    '''
        logent_ij = gi * log(tf_ij + 1)
    '''
    log_ent = entropy * np.log(matrix + 1)
    return log_ent


In [3]:
langs = ["nl","da","de","it","es",]

model = LogisticRegression(penalty='l2',
                           dual=False,
                           max_iter=10000,
                           tol=0.0001,
                           solver='liblinear',
                           C=1,
                           fit_intercept=True,
                           intercept_scaling=1.0,
                           class_weight=None,
                           random_state=1)

cvc = CountVectorizer(max_features=2000,
                      strip_accents='unicode',
                      ngram_range=(1, 5))

ans = pd.DataFrame()


def simpleModel(X_train, y_train, X_test):
    print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
    X = cvc.fit_transform(list(X_train) + list(X_test))
    X = log_entropy_weight(X)
    model.fit(X[:len(X_train)], y_train)
    res = model.predict(X[len(X_train):])
    return res


In [4]:
for language in langs:
    print("working on:", language)
    all_train_data = pd.read_csv(f"../corpus/train/{language}/{language}_pos.csv")
    all_test_data = pd.read_csv(f"../corpus/test/{language}/{language}_pos.csv")

    all_train_data["sencount"] = all_train_data.text.str.count("SENTSEP")
    all_test_data["sencount"] = all_test_data.text.str.count("SENTSEP")
    max_sen_count = max(all_train_data["sencount"].max(), all_test_data["sencount"].max())
    for l, r in [(0, 1000)]:
        print(f"sentences of {l} <= len < {r}")
        train_data = all_train_data.loc[all_train_data.sencount.between(l, r, inclusive="left")].copy()
        test_data = all_test_data.loc[all_test_data.sencount.between(l, r, inclusive="left")].copy()

        res = simpleModel(train_data.text, train_data.label, test_data.text)
        
        test_data["label"] = res
        test_data = test_data.drop("text", axis=1)
        ans = pd.concat([ans, test_data])
results = ans.copy()


working on: nl
sentences of 0 <= len < 1000
Train size: 8313, Test size: 2772
working on: da
sentences of 0 <= len < 1000
Train size: 8314, Test size: 2772
working on: de
sentences of 0 <= len < 1000
Train size: 8313, Test size: 2772
working on: it
sentences of 0 <= len < 1000
Train size: 8314, Test size: 2772
working on: es
sentences of 0 <= len < 1000
Train size: 8314, Test size: 2772


In [5]:
ans["index"] += 1
ans = ans.sort_values(by=["index"])
ans = ans.rename(columns={"index": "id"})
ans = ans.drop("sencount", axis=1)
print(ans.head())
ans.to_csv("../submissions/Regression_log-entropy_pos.csv",index=False)

   id     label
0   1   England
0   2   England
1   3   England
0   4  Scotland
1   5  Scotland


In [7]:
kfold = KFold(n_splits=5, shuffle=True, random_state=21)

for language in langs:
    print("working on:", language)
    train = pd.read_csv(f"../corpus/train/{language}/{language}_pos.csv")
    acc = []
    f1 = []
    for train_index, test_index in kfold.split(train):
        X_train, X_test = train.text[train_index], train.text[test_index]
        y_train, y_test = list(train.label[train_index]), list(train.label[test_index])

        res = simpleModel(X_train, y_train, X_test)

        acc.append(accuracy_score(res, y_test))
        f1.append(f1_score(res, y_test, average="weighted"))

    print("Acc:", acc)
    print("F1: ", f1)


working on: nl
Train size: 6650, Test size: 1663
Train size: 6650, Test size: 1663
Train size: 6650, Test size: 1663
Train size: 6651, Test size: 1662
Train size: 6651, Test size: 1662
Acc: [0.49188214070956104, 0.49188214070956104, 0.4834636199639206, 0.5096269554753309, 0.5006016847172082]
F1:  [0.49766950430433, 0.49810022050823727, 0.48963373028580487, 0.5161640848417768, 0.5057445474199934]
working on: da
Train size: 6651, Test size: 1663
Train size: 6651, Test size: 1663
Train size: 6651, Test size: 1663
Train size: 6651, Test size: 1663
Train size: 6652, Test size: 1662
Acc: [0.49969933854479853, 0.523752254960914, 0.4954900781719784, 0.5063138905592303, 0.52647412755716]
F1:  [0.508251380928599, 0.5341123855336218, 0.5018842873692266, 0.5105248653177683, 0.5302379486557843]
working on: de
Train size: 6650, Test size: 1663
Train size: 6650, Test size: 1663
Train size: 6650, Test size: 1663
Train size: 6651, Test size: 1662
Train size: 6651, Test size: 1662
Acc: [0.49248346361996