In [9]:
import numpy as np
import pprint
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import advertools as adv
from sklearn.metrics import make_scorer, precision_score, recall_score
from sklearn.model_selection import KFold, cross_validate
pp = pprint.PrettyPrinter(indent=4, sort_dicts=False)


In [10]:
def preprocess(data):
    data.text = data.text.apply(lambda x: x.lower())
    data.text = data.text.replace(r'\s+|\\n', '', regex=True)
    return data

In [11]:
train_data = pd.read_csv("data/train_data.csv")
train_data = preprocess(train_data)
print(train_data.head())

test_data = pd.read_csv("data/test_data.csv")
test_data = preprocess(test_data)
print(test_data.head())


  language                                               text    label
0    dansk   dette er et fremragende initiativ, og jeg stø...  Ireland
1    dansk   hr. formand, jeg er sikker på, at alle her er...  Ireland
2    dansk   hr. formand, folk på den nordlige halvkugle t...  England
3    dansk   hr. formand, med forbehold af nogle få ændrin...  England
4    dansk   - hr. formand, jeg må protestere mod den lemf...  England
                                                text
0   hr. formand, selv om vi i høj grad sympatiser...
1   quiero dejar constancia de mi apoyo a este in...
2   . – el comercio ilegal de riñones humanos se ...
3   signor presidente, per introdurre una nota di...
4   jeg stemte for meddelelsen af decharge til fæ...


In [12]:
le = preprocessing.LabelEncoder()
mappings = []
for col in ["language", "label"]:
    train_data[col] = le.fit_transform(train_data[col])
    mappings.append(dict(zip(le.transform(le.classes_), le.classes_)))

print(mappings)

In [13]:
stop_words = []
for key in ["danish", "german", "dutch", "italian", "spanish"]:
    stop_words += list(adv.stopwords[key])

# count_vec = TfidfVectorizer(stop_words=stop_words)
count_vec = CountVectorizer(stop_words=stop_words)

train_bow = count_vec.fit_transform(train_data.text)
test_bow = count_vec.transform(test_data.text)



In [14]:
model = ComplementNB().fit(train_bow, train_data.label)
predicted_labels = model.predict(test_bow)
print(predicted_labels)

[0 1 1 ... 0 1 2]


In [15]:
final_data = {"id": test_data.index+1, "label": predicted_labels}

submission = pd.DataFrame(data=final_data).set_index("id")
submission = submission.label.apply(lambda x: mappings[1][x])
submission.to_csv("submissions/submission_NB.csv")
submission.head()

id
1     England
2     Ireland
3     Ireland
4     England
5    Scotland
Name: label, dtype: object

In [16]:
def compare_models(_X, _y, models, _cv=5):

    acc = []
    _scoring = {'accuracy': make_scorer(accuracy_score),
                'precision': make_scorer(precision_score, average='macro'),
                'recall': make_scorer(recall_score, average='macro'),
                'f1': make_scorer(f1_score, average='macro')
                }

    kfold = KFold(n_splits=10, shuffle=True, random_state=69420)
    for model in models:
        results = cross_validate(estimator=model,
                                 X=_X,
                                 y=_y,
                                 cv=kfold,
                                 scoring=_scoring,
                                 return_train_score=True)

        pp.pprint({"Model": type(model).__name__,
                   "Training Accuracy scores": results['train_accuracy'],
                   "Mean Training Accuracy": results['train_accuracy'].mean()*100,
                   "Training Precision scores": results['train_precision'],
                   "Mean Training Precision": results['train_precision'].mean(),
                   "Training Recall scores": results['train_recall'],
                   "Mean Training Recall": results['train_recall'].mean(),
                   "Training F1 scores": results['train_f1'],
                   "Mean Training F1 Score": results['train_f1'].mean(),
                   "Validation Accuracy scores": results['test_accuracy'],
                   "Mean Validation Accuracy": results['test_accuracy'].mean()*100,
                   "Validation Precision scores": results['test_precision'],
                   "Mean Validation Precision": results['test_precision'].mean(),
                   "Validation Recall scores": results['test_recall'],
                   "Mean Validation Recall": results['test_recall'].mean(),
                   "Validation F1 scores": results['test_f1'],
                   "Mean Validation F1 Score": results['test_f1'].mean()
                   })
        acc.append(results['test_f1'].mean()*100)
    return acc


models = [ComplementNB(), BernoulliNB(), MultinomialNB()]
print(compare_models(train_bow, train_data.label, models))


In [17]:
X = train_bow
y = train_data.label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

model = ComplementNB().fit(X_train, y_train)
y_pred = model.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('F1 score: ', f1_score(y_test, y_pred, average="macro"))


Accuracy: 0.6904017320182824
F1 score: 0.6256641475662816
