In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

In [12]:
def log_entropy_weight(matrix):
    """Apply a smoothed log-entropy weighting to a document-term count matrix.

    With rows as documents (j = 1..N) and columns as terms (i), the result is

        p_ij      = tf_ij / (1 + sum_j tf_ij)
        g_i       = 1 + sum_j( p_ij * log(p_ij + 1) ) / log(N)
        logent_ij = g_i * log(tf_ij + 1)

    Parameters
    ----------
    matrix : array-like or sparse matrix, 2-D (documents x terms)
        Term counts. Objects exposing ``toarray()`` (e.g. the sparse output
        of ``CountVectorizer``) are densified; anything else is passed
        through ``np.asarray`` so lists and ndarray subclasses work too.

    Returns
    -------
    numpy.ndarray
        Dense float array with the same shape as ``matrix``.

    Notes
    -----
    ``log(N)`` is zero for a single document, which would make ``g_i``
    infinite/NaN. In that degenerate case (and for an empty matrix) the
    global weight is taken to be 1, i.e. only the local ``log(tf + 1)``
    weighting is applied.
    """
    # Densify sparse inputs; coerce everything else to a plain ndarray.
    if hasattr(matrix, "toarray"):
        matrix = matrix.toarray()
    else:
        matrix = np.asarray(matrix)

    # Unpacking keeps the requirement that the input is 2-D.
    nr_docs, _ = matrix.shape

    # Local weight: log(tf_ij + 1).
    log_tf = np.log1p(matrix)

    # No meaningful entropy across fewer than two documents (log(1) == 0).
    if nr_docs <= 1:
        return log_tf

    # Per-term share of occurrences in each document, with +1 smoothing in
    # the denominator so terms that never occur do not divide by zero.
    normalized = matrix / (1 + np.sum(matrix, axis=0))

    # Global weight per term: g_i = 1 + sum_j p_ij * log(p_ij + 1) / log(N).
    entropy = 1 + np.sum(normalized * np.log1p(normalized), axis=0) / np.log(nr_docs)

    # logent_ij = g_i * log(tf_ij + 1); entropy broadcasts across rows.
    return entropy * log_tf


In [13]:
# Language codes to process; each one has its own train/test CSV under corpus/.
langs = ["nl","da","de","it","es",]

# A single estimator instance is reused: fit() is called again for every language.
model = LogisticRegression(penalty='l2',
                           dual=False,
                           max_iter=10000,
                           tol=0.0001,
                           solver='liblinear',
                           C=1,
                           fit_intercept=True,
                           intercept_scaling=1.0,
                           class_weight=None,
                           random_state=1)

# Word 1- to 5-gram counts, limited to the 500 most frequent features.
# The vectorizer is re-fitted for every language inside the loop.
cvc = CountVectorizer(max_features=500,
                      strip_accents='unicode',
                      analyzer='word',
                      lowercase=True,
                      token_pattern=r'\w{1,}',
                      ngram_range=(1, 5))

# Per-language prediction frames are collected here and concatenated once
# after the loop, instead of growing a DataFrame with pd.concat on every
# iteration (quadratic copying, and concat with an empty frame is deprecated).
predictions = []
for language in langs:
    print("working on:", language)
    train_data = pd.read_csv(f"corpus/train/{language}/{language}_pos.csv")
    test_data = pd.read_csv(f"corpus/test/{language}/{language}_pos.csv")
    n_train = len(train_data.text)

    # NOTE(review): the vocabulary and the log-entropy weights are computed on
    # train and test texts together; confirm this is intended.
    X = cvc.fit_transform(list(train_data.text) + list(test_data.text))

    print("ey i'm log entropying 'ere")
    X = log_entropy_weight(X)

    # Rows [0, n_train) are the training documents, the rest are test documents.
    model.fit(X[:n_train], train_data.label)
    res = model.predict(X[n_train:])

    # Keep only the non-text columns plus the predicted label.
    test_data["label"] = res
    test_data = test_data.drop("text", axis=1)
    predictions.append(test_data)

# Same result as the incremental concat: frames stacked in language order,
# each keeping its own row index.
ans = pd.concat(predictions) if predictions else pd.DataFrame()


working on: nl
ey i'm log entropying 'ere
working on: da
ey i'm log entropying 'ere
working on: de
ey i'm log entropying 'ere
working on: it
ey i'm log entropying 'ere
working on: es
ey i'm log entropying 'ere


In [14]:
# Keep an independent (deep) copy of the combined predictions under its own name.
results = ans.copy(deep=True)

In [15]:
# Build the submission table: shift "index" by one, order the rows by it,
# and expose the column under the name "id".
ans = (
    ans.assign(**{"index": ans["index"] + 1})
    .sort_values(by=["index"])
    .rename(columns={"index": "id"})
)
print(ans.head())

# Write the table without the DataFrame's row index.
ans.to_csv("submission_model_Regression_pos.csv", index=False)

   id    label
0   1  England
0   2  England
1   3  England
0   4  England
1   5  England


11086