In [None]:
import numpy as np
import pprint
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, cross_validate, GridSearchCV
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, make_scorer, precision_score, recall_score
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
import advertools as adv
# Shared pretty-printer: 4-space indentation, dict keys kept in insertion order.
pp = pprint.PrettyPrinter(
    sort_dicts=False,
    indent=4,
)


In [None]:
# Load the train / test splits; the CSV "index" column becomes the row index.
train_data = pd.read_csv(
    "corpus/nolang_dif/train_data.csv",
    index_col="index",
)
test_data = pd.read_csv(
    "corpus/nolang_dif/test_data.csv",
    index_col="index",
)

# Plain-text preview of the first rows of each split.
for preview in (train_data.head(), test_data.head()):
    print(preview)


In [None]:
# Prepare the training frame. The cell is one chain starting from the current
# `train_data`, so it can be re-executed safely:
#   * errors="ignore" keeps the drop from raising KeyError once "language"
#     has already been removed by a previous run of this cell;
#   * .assign() returns a new frame instead of writing a column into a
#     filtered slice, which avoids pandas' SettingWithCopyWarning.
train_data = (
    train_data
    .drop(columns="language", errors="ignore")
    .loc[lambda df: df["text"].notna()]  # keep only rows that have text
    .assign(category_id=lambda df: df["label"].astype("category").cat.codes)
)

# One row per distinct label, ordered by its integer code.
category_id_df = (
    train_data[["label", "category_id"]]
    .drop_duplicates()
    .sort_values("category_id")
)
# Lookup tables in both directions: label -> code and code -> label.
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[["category_id", "label"]].values)

# Last expression of the cell so the preview is actually rendered.
train_data.head()


In [None]:
import matplotlib.pyplot as plt
# Bar chart of how many training texts each label has.
fig, ax = plt.subplots(figsize=(8, 6))
label_counts = train_data.groupby("label")["text"].count()
label_counts.plot.bar(ax=ax, ylim=0)
plt.show()

In [None]:
from collections import Counter
import matplotlib.pyplot as plt



# Paragraphs with this many characters or more are left out of the histogram.
MAX_CHARS = 7500

# Character count of every paragraph in each split. dropna() + .str.len()
# skips missing texts instead of raising TypeError from len(NaN); in the cells
# shown, test_data is never filtered for null "text".
train_sent_len = train_data.text.dropna().str.len().to_numpy()
test_sent_len = test_data.text.dropna().str.len().to_numpy()

# Set before the first hist() call so the figure it creates picks up the size.
plt.rcParams["figure.figsize"] = 10, 5
plt.hist(train_sent_len[train_sent_len < MAX_CHARS], alpha=0.5, label="train")
plt.hist(test_sent_len[test_sent_len < MAX_CHARS], alpha=0.5, label="test")
plt.legend(loc="upper right")
plt.title("Number of chars in paragraph")
plt.xlabel("Characters per paragraph")
plt.ylabel("Number of paragraphs")
plt.show()


In [None]:
import advertools as adv
from sklearn.feature_extraction.text import TfidfVectorizer

# Merge the advertools stop-word lists of every language listed below.
stop_words = [
    word
    for language in ("danish", "german", "dutch", "italian", "spanish")
    for word in adv.stopwords[language]
]

# TF-IDF over unigrams and bigrams with sub-linear term frequency and
# L2-normalised rows; min_df=5 is the minimum document frequency for a term.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm="l2",
    encoding="latin-1",
    ngram_range=(1, 2),
    stop_words=stop_words,
)

labels = train_data.category_id
features = tfidf.fit_transform(train_data.text)
features.shape


In [None]:
from sklearn.feature_selection import chi2
import numpy as np
N = 2  # how many top-ranked terms to print per n-gram size
vocabulary = np.array(tfidf.get_feature_names_out())

for dialect, category_id in sorted(category_to_id.items()):
    # Chi-squared score of every term for "this category vs. everything else".
    chi2_stats, _pvalues = chi2(features, labels == category_id)
    # argsort is ascending, so the highest-scoring terms end up last.
    ranked_terms = vocabulary[np.argsort(chi2_stats)]

    # Split the ranked terms by how many space-separated words they contain.
    unigrams, bigrams = [], []
    for term in ranked_terms:
        word_count = len(term.split(" "))
        if word_count == 1:
            unigrams.append(term)
        elif word_count == 2:
            bigrams.append(term)

    top_unigrams = "\n. ".join(unigrams[-N:])
    top_bigrams = "\n. ".join(bigrams[-N:])
    print("# '{}':".format(dialect))
    print("  . Most correlated unigrams:\n. {}".format(top_unigrams))
    print("  . Most correlated bigrams:\n. {}".format(top_bigrams))


In [None]:
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import cross_val_score
# Candidate classifiers, all scored on the same shuffled 5-fold split.
# Every estimator that accepts random_state is given one: LinearSVC's
# liblinear backend uses a random number generator while fitting, so the
# previously unseeded LinearSVC could score differently from run to run.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(max_iter=5000, random_state=0),
    ComplementNB(),
    LogisticRegression(random_state=0, max_iter=5000),
    LinearSVC(max_iter=5000, random_state=21, C=1.5, penalty="l1", dual=False, class_weight="balanced"),
]
kfold = KFold(n_splits=5, shuffle=True, random_state=69420)
# Macro-averaged F1: every class contributes equally to the score.
_scoring = make_scorer(f1_score, average="macro")

entries = []
for i, model in enumerate(models):
    # The list position is part of the name because LinearSVC appears twice.
    model_name = f"{model.__class__.__name__}_{i}"
    scores = cross_val_score(model, features, labels, scoring=_scoring, cv=kfold)
    entries.extend((model_name, fold_idx, score) for fold_idx, score in enumerate(scores))

# Long format: one row per (model, fold) pair.
cv_df = pd.DataFrame(entries, columns=["model_name", "fold_idx", "score"])

# Box plot of the per-fold scores with the individual folds drawn on top.
sns.boxplot(x="model_name", y="score", data=cv_df)
sns.stripplot(x="model_name", y="score", data=cv_df,
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()
print(cv_df.groupby("model_name").score.mean())


In [None]:
model = LinearSVC()

# Settings shared by every candidate. max_iter and random_state stay in the
# grid as single-value lists so they are still reported in clf.best_params_.
_common = {
    "max_iter": [6009],
    "C": [1, 5],
    "class_weight": ["balanced", None],
    "random_state": [21],
}

# Only combinations LinearSVC can actually fit are listed. A plain cartesian
# product of penalty x loss x dual also contains unsupported settings
# (l1 + hinge, l1 + dual=True, l2 + hinge + dual=False) whose fits fail, and
# with multi_class="crammer_singer" those three options are ignored, so
# varying them only repeats the same fit.
params = [
    {**_common, "multi_class": ["ovr"], "penalty": ["l2"], "loss": ["hinge"], "dual": [True]},
    {**_common, "multi_class": ["ovr"], "penalty": ["l2"], "loss": ["squared_hinge"], "dual": [True, False]},
    {**_common, "multi_class": ["ovr"], "penalty": ["l1"], "loss": ["squared_hinge"], "dual": [False]},
    {**_common, "multi_class": ["crammer_singer"]},
]

# Same scorer and folds as the model comparison, so scores are comparable.
clf = GridSearchCV(model, params, scoring=_scoring, cv=kfold, return_train_score=True, n_jobs=-1, verbose=2)
clf.fit(features, labels)

In [None]:
# Show the parameter combination that scored best in the grid search.
clf.best_params_