```bash
python -m spacy download fr_core_news_lg
python -m spacy download nl_core_news_lg
python -m spacy download da_core_news_lg
python -m spacy download de_core_news_lg
python -m spacy download it_core_news_lg
python -m spacy download es_core_news_lg
```

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score
import spacy

In [None]:
def preprocess(data):
    """Clean the ``text`` column of *data* and drop unusable rows.

    Steps, in order:

    1. Each run of whitespace, and each literal backslash-``n`` sequence,
       is replaced by one space.
    2. Leading/trailing spaces, hyphens, periods and commas are stripped.
    3. Texts that end up empty are turned into NaN.
    4. Rows containing a NaN in *any* column are dropped.

    Case is preserved (no lower-casing is applied).

    Args:
        data: DataFrame with a string ``text`` column.

    Returns:
        The same DataFrame object, modified in place.
    """
    # Work on the column and assign it back instead of calling
    # ``data.text.replace(..., inplace=True)``: the chained in-place form
    # operates on a temporary Series and does not update the frame when
    # pandas Copy-on-Write is active.
    text = data["text"].replace(r'\s+|\\n', ' ', regex=True)
    text = text.str.strip(" -.,")
    # No separate newline removal is needed: step 1 already turned every
    # whitespace character (newlines included) into a space.
    data["text"] = text.replace('', np.nan)
    data.dropna(inplace=True)
    return data

# eng = pd.read_csv("old/England.csv")
# ire = pd.read_csv("old/Ireland.csv")
# scot = pd.read_csv("old/Scotland.csv")
# train_data = pd.concat([eng, scot, ire])
# Load the raw training set, clean its text column, then renumber the rows
# (the previous index is kept as an "index" column by reset_index()).
train_data = preprocess(pd.read_csv("train_data.csv"))
train_data = train_data.reset_index()

In [None]:
def noent(nlp, row):
    """Return *row* with every named-entity token replaced by its entity label.

    Tokens within a sentence are joined by single spaces and sentences are
    joined by ``" SENTSEP "``.

    Args:
        nlp: Callable that turns a string into a parsed document exposing
            ``.sents`` (the spaCy pipeline in this notebook).
        row: Text to process.

    Returns:
        The rewritten text, or ``None`` if processing the row failed. The
        failure is reported on stdout and the row is skipped (best effort).
    """
    try:
        doc = nlp(row)
        # ent_type_ is the empty string for tokens outside any entity, so
        # ``or`` falls back to the surface text for those tokens.
        return " SENTSEP ".join(
            " ".join(t.ent_type_ or t.text for t in sent) for sent in doc.sents
        )
    except Exception as exc:
        # Only real errors are caught; KeyboardInterrupt/SystemExit still
        # propagate so a long run can be interrupted.
        print(f"noent failed ({exc!r}) for row: {row}")
        return None

def make_noent(df, nlp, path):
    """Mask entities in ``df.text`` and save the frame as ``<path>_noent.csv``.

    The ``text`` column of *df* is overwritten with the output of ``noent``;
    the CSV is written without the index.
    """
    def _mask_entities(text):
        return noent(nlp, text)

    df["text"] = df["text"].apply(_mask_entities)
    out_file = path + "_noent.csv"
    df.to_csv(out_file, index=False)


def pos(nlp, row):
    """Return *row* rewritten as a sequence of part-of-speech tags.

    Each token is replaced by its ``pos_`` tag; tags within a sentence are
    joined by single spaces and sentences are joined by ``" SENTSEP "``.

    Args:
        nlp: Callable that turns a string into a parsed document exposing
            ``.sents`` (the spaCy pipeline in this notebook).
        row: Text to process.

    Returns:
        The tag sequence, or ``None`` if processing the row failed. The
        failure is reported on stdout and the row is skipped (best effort).
    """
    try:
        doc = nlp(row)
        return " SENTSEP ".join(
            " ".join(t.pos_ for t in sent) for sent in doc.sents
        )
    except Exception as exc:
        # Only real errors are caught; KeyboardInterrupt/SystemExit still
        # propagate so a long run can be interrupted.
        print(f"pos failed ({exc!r}) for row: {row}")
        return None

def make_pos(df, nlp, path):
    """Replace ``df.text`` with POS-tag sequences and save ``<path>_pos.csv``.

    The ``text`` column of *df* is overwritten with the output of ``pos``;
    the CSV is written without the index.
    """
    def _tag_text(text):
        return pos(nlp, text)

    df["text"] = df["text"].apply(_tag_text)
    out_file = path + "_pos.csv"
    df.to_csv(out_file, index=False)


In [None]:
# Language code used for both the spaCy model name and the output file
# prefix; change it here to process another language.
lang = "fr"

# The matching model must be installed beforehand
# (python -m spacy download <lang>_core_news_lg).
nlp = spacy.load(f"{lang}_core_news_lg")

# Each step works on its own copy so train_data keeps the original text.
print(f"ready to work '{lang}'")
make_noent(train_data.copy(), nlp, lang)

print("work work")
make_pos(train_data.copy(), nlp, lang)


In [None]:
# Count features: at most 500 n-grams, with n ranging from 1 to 5.
cvc = CountVectorizer(
    ngram_range=(1, 5),
    max_features=500,
)

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
kfold = KFold(shuffle=True, n_splits=5, random_state=21)

# L2-regularised logistic regression fitted with the liblinear solver.
model = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    C=1,
    dual=False,
    # optimisation limits
    tol=0.0001,
    max_iter=10000,
    # intercept handling
    fit_intercept=True,
    intercept_scaling=1.0,
    # no class re-weighting; fixed seed
    class_weight=None,
    random_state=1,
)


def log_entropy_weight(matrix):
    """Apply the notebook's log-entropy weighting to a document-term matrix.

    With ``tf_ij`` the count of term *i* in document *j* and ``N`` the number
    of documents (rows)::

        p_ij      = tf_ij / (1 + sum_j tf_ij)
        g_i       = 1 + sum_j p_ij * log(p_ij + 1) / log(N)
        logent_ij = g_i * log(tf_ij + 1)

    Args:
        matrix: 2-D document-term counts (rows = documents, columns = terms).
            May be a NumPy array, any array-like (e.g. nested lists), or an
            object exposing ``toarray()`` such as a SciPy sparse matrix.

    Returns:
        A dense ``numpy.ndarray`` with the same shape as *matrix*.
    """
    # Densify sparse inputs; coerce everything else to a plain ndarray so
    # array-likes are accepted too.
    if hasattr(matrix, "toarray"):
        matrix = matrix.toarray()
    else:
        matrix = np.asarray(matrix)

    nr_docs, nr_terms = matrix.shape
    normalized = matrix / (1 + np.sum(matrix, axis=0))

    if nr_docs > 1:
        entropy = 1 + np.sum(normalized * np.log(normalized + 1), axis=0) / np.log(nr_docs)
    else:
        # log(N) is 0 for a single document (and undefined for none), which
        # would yield inf/NaN weights; use the neutral global weight g_i = 1.
        entropy = np.ones(nr_terms)

    return entropy * np.log(matrix + 1)




In [None]:
from sklearn.preprocessing import LabelEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler


# 5-fold cross-validation of the logistic-regression model on the
# entity-masked texts; per-fold accuracy and weighted F1 are collected.
train = pd.read_csv("fr_noent.csv")

acc = []
f1 = []
for train_index, test_index in kfold.split(train):
    # kfold.split yields positional indices, so select rows with .iloc.
    X_train, X_test = train.text.iloc[train_index], train.text.iloc[test_index]
    y_train, y_test = train.label.iloc[train_index], train.label.iloc[test_index]

    # Optional re-sampling of the training fold (disabled):
    # X_train = np.array(X_train).reshape(-1, 1)
    # us = RandomUnderSampler(random_state=21)
    # os = RandomOverSampler(random_state=21)
    # X_train, y_train = os.fit_resample(X_train, y_train)
    # X_train = X_train.flatten()

    # Fit the vocabulary on the training fold only, so the held-out fold
    # does not influence which n-grams are selected as features.
    X_train_vec = cvc.fit_transform(X_train)
    X_test_vec = cvc.transform(X_test)
    # Optional log-entropy weighting (disabled):
    # X_train_vec = log_entropy_weight(X_train_vec)
    # X_test_vec = log_entropy_weight(X_test_vec)

    model.fit(X_train_vec, y_train)
    res = model.predict(X_test_vec)
    # scikit-learn metrics take (y_true, y_pred); the order matters for the
    # weighted F1, whose class weights come from the true-label support.
    acc.append(accuracy_score(y_test, res))
    f1.append(f1_score(y_test, res, average='weighted'))

print("Acc:", acc)
print("F1: ", f1)