In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pprint
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, make_scorer, precision_score, recall_score
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import advertools as adv
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score,  cross_val_predict
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
pp = pprint.PrettyPrinter(indent=4, sort_dicts=False)


In [29]:
def log_entropy_weight(matrix):
    if type(matrix) is not np.ndarray:
        matrix = matrix.toarray()
    normalized = matrix / (1 + np.sum(matrix, axis=0))
    nr_docs, _ = matrix.shape
    '''
        g_i = 1 + sum     p_ij * log(p_ij + 1)   
                 j=1,N  ------------------------
                               log(N)                              
    '''
    entropy = 1 + np.sum(np.multiply(normalized, np.log(normalized + 1)), axis=0)/np.log(nr_docs)
    '''
        logent_ij = gi * log(tf_ij + 1)
    '''
    log_ent = entropy * np.log(matrix + 1)
    return log_ent


In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC


langs = ["nl","da","de","it","es",]
stop_words = []
for key in ["danish", "german", "dutch", "italian", "spanish"]:
    stop_words += list(adv.stopwords[key])

model = LogisticRegression(penalty='l2',
                           dual=False,
                           max_iter=10000,
                           tol=0.0001,
                           solver='liblinear',
                           C=1,
                           fit_intercept=True,
                           intercept_scaling=1.0,
                           class_weight=None,
                           random_state=1)
# model = LinearSVC(max_iter=5000, random_state=21, C=1,  penalty="l1", dual=False, class_weight="balanced")

cvc = CountVectorizer(max_features=2000,
                      strip_accents='unicode',
                      ngram_range=(1, 5))
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1, 5), stop_words=stop_words)
ans = pd.DataFrame()


def simpleModel(X_train, y_train, X_test):
    print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")
    X = tfidf.fit_transform(list(X_train) + list(X_test))
    
    # X = cvc.fit_transform(list(X_train) + list(X_test))
    # X = log_entropy_weight(X)
    model.fit(X[:len(X_train)], y_train)
    res = model.predict(X[len(X_train):])
    return res


In [31]:
# for language in langs:
#     print("working on:", language)
#     all_train_data = pd.read_csv(f"../corpus/train/{language}/{language}_pos.csv")
#     all_test_data = pd.read_csv(f"../corpus/test/{language}/{language}_pos.csv")

#     all_train_data["sencount"] = all_train_data.text.str.count("SENTSEP")
#     all_test_data["sencount"] = all_test_data.text.str.count("SENTSEP")
#     max_sen_count = max(all_train_data["sencount"].max(), all_test_data["sencount"].max())
#     for l, r in [(0, 1000)]:
#         print(f"sentences of {l} <= len < {r}")
#         train_data = all_train_data.loc[all_train_data.sencount.between(l, r, inclusive="left")].copy()
#         test_data = all_test_data.loc[all_test_data.sencount.between(l, r, inclusive="left")].copy()

#         res = simpleModel(train_data.text, train_data.label, test_data.text)
        
#         test_data["label"] = res
#         test_data = test_data.drop("text", axis=1)
#         ans = pd.concat([ans, test_data])
# results = ans.copy()


In [32]:
# ans["index"] += 1
# ans = ans.sort_values(by=["index"])
# ans = ans.rename(columns={"index": "id"})
# ans = ans.drop("sencount", axis=1)
# print(ans.head())
# ans.to_csv("../submissions/beepboop.csv",index=False)

In [34]:
kfold = KFold(n_splits=5, shuffle=True, random_state=21)

# # LANGUAGE
# for language in langs:
#     print("working on:", language)
#     train = pd.read_csv(f"../corpus/train/{language}/{language}_pos.csv")
#     acc = []
#     f1 = []
#     for train_index, test_index in kfold.split(train):
#         X_train, X_test = train.text[train_index], train.text[test_index]
#         y_train, y_test = list(train.label[train_index]), list(train.label[test_index])

#         res = simpleModel(X_train, y_train, X_test)

#         acc.append(accuracy_score(res, y_test))
#         f1.append(f1_score(res, y_test, average="weighted"))

#     print("Acc:", acc)
#     print("F1: ", f1)


# # NO LANG



working on: nl
Train size: 6650, Test size: 1663
Train size: 6650, Test size: 1663
Train size: 6650, Test size: 1663
Train size: 6651, Test size: 1662
Train size: 6651, Test size: 1662
Acc: [0.5646422128683103, 0.5923030667468431, 0.571256764882742, 0.5860409145607701, 0.5896510228640193]
F1:  [0.661508384536065, 0.6940999123775468, 0.6610206028383783, 0.6785856462364872, 0.6794223317949042]
working on: da
Train size: 6651, Test size: 1663
Train size: 6651, Test size: 1663
Train size: 6651, Test size: 1663
Train size: 6651, Test size: 1663
Train size: 6652, Test size: 1662
Acc: [0.5670475045099218, 0.5917017438364401, 0.5598316295850871, 0.5796752856283824, 0.5998796630565584]
F1:  [0.6582690238292602, 0.6723787389940379, 0.6463430710358398, 0.6616516633241912, 0.6755742763309389]
working on: de
Train size: 6650, Test size: 1663
Train size: 6650, Test size: 1663
Train size: 6650, Test size: 1663
Train size: 6651, Test size: 1662
Train size: 6651, Test size: 1662
Acc: [0.574263379434756

In [None]:
# nolang svm tf-idf = ~0.70

# regresion log-entropy noent = ~0.6
# regresion log-entropy pos = ~0.5
# svm log-entropy pos = ~ failed to converge
# svm tf-idf pos = ~ failed to converge
# svm log-entropy noent = ~ failed to converge
# svm tf-idf noent = ~ 0.6
# regresion tf-idf noent =  ~ 0.61
# regresion tf-idf pos = ~0.59

# nolang regression log-entropy noent = ~ 0.58
# nolang regression log-entropy pos = ~ 0.56
# nolang svm log-entropy noent = ~
# nolang svm log-entropy pos = ~ failed to converge
# nolang svm tf-idf noent = ~
# nolang svm tf-idf pos = ~
