In [3]:
import sys
sys.path.insert(1, '../../libs')
from utils import get_data, temporal_train_test_split
from autorship import AuthorClassifier
from sklearn.preprocessing import MaxAbsScaler
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.tokenize import TweetTokenizer
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

In [4]:
# Read the authors file through the project helper `get_data`; the `process`
# function defined later in this notebook reads data["username"] from the result.
AUTHORS_CSV = "../../data/authors.csv"
data = get_data(AUTHORS_CSV)

In [5]:
# A single tokenizer instance is shared by every word-level vectorizer below.
tt = TweetTokenizer()

# Base learners for the stacked model. StackingClassifier takes a list of
# (name, estimator) pairs, so every entry must be its own 2-tuple. The original
# had a misplaced parenthesis that fused 'lr_l1' and 'rf' into one 4-tuple.
estimators = [
        ('svm', LinearSVC(random_state=42, max_iter=10000)),
        ('lr_l1', LogisticRegression(random_state=42, penalty="l1", solver="liblinear")),
        ('rf', RandomForestClassifier(random_state=42))]
stacking = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(random_state=42, penalty="l2", solver="liblinear"))

# (classifier, vectorizer) combinations to evaluate; each pair is passed to
# `process(clf, vectorizer)` further down in the notebook.
grid = [(MultinomialNB(), CountVectorizer(ngram_range=(1,3), analyzer="word", tokenizer=tt.tokenize)),
        (LogisticRegression(random_state=42, penalty="l1", solver="liblinear"), TfidfVectorizer(ngram_range=(1,5), analyzer="char")),
        (LogisticRegression(random_state=42, penalty="l2", solver="liblinear"), TfidfVectorizer(ngram_range=(1,1), analyzer="word", tokenizer=tt.tokenize)),
        (LinearSVC(random_state=42, max_iter=10000), TfidfVectorizer(ngram_range=(1,1), analyzer="word", tokenizer=tt.tokenize)),
        (SVC(random_state=42), TfidfVectorizer(ngram_range=(1,1), analyzer="word", tokenizer=tt.tokenize)),
        (DecisionTreeClassifier(random_state=42), CountVectorizer(ngram_range=(1,3), analyzer="word", tokenizer=tt.tokenize)),
        (RandomForestClassifier(random_state=42), CountVectorizer(ngram_range=(1,1), analyzer="word", tokenizer=tt.tokenize)),
        (AdaBoostClassifier(random_state=42), CountVectorizer(ngram_range=(1,5), analyzer="char")),
        (GradientBoostingClassifier(random_state=42), CountVectorizer(ngram_range=(1,3), analyzer="char")),
        (stacking, TfidfVectorizer(ngram_range=(1,1), analyzer="word", tokenizer=tt.tokenize))
        ]

In [6]:
from joblib import Parallel, delayed
def process(clf, vectorizer):
    """Evaluate one classifier/vectorizer combination on every pair of authors.

    For each unordered pair of distinct usernames found in the global ``data``,
    a train/test split is obtained from ``temporal_train_test_split``, an
    ``AuthorClassifier`` wrapping ``clf``, ``vectorizer`` and a new
    ``MaxAbsScaler`` is fitted on the training comments, and its predictions on
    the test comments are passed to ``AuthorClassifier.evaluate``.

    Args:
        clf: Classifier instance forwarded to ``AuthorClassifier``.
        vectorizer: Vectorizer instance forwarded to ``AuthorClassifier``.

    Returns:
        A ``(clf_str, metrics)`` tuple: the string form of ``clf`` and a
        DataFrame built from the per-pair evaluation results, with an extra
        ``"clf"`` column holding that same string on every row.
    """
    clf_str = str(clf)
    print(f"Running ===> {clf_str}")

    usernames = list(np.unique(data["username"]))
    scores = []

    # Walk the authors from last to first and pair each one with every author
    # that precedes it, so each unordered pair is visited exactly once.
    for position in range(len(usernames) - 1, 0, -1):
        author1 = usernames[position]
        for author2 in usernames[:position]:
            split = temporal_train_test_split(data, author1, author2)
            X_train, X_test, y_train, y_test = split

            author_clf = AuthorClassifier(clf=clf, vectorizer=vectorizer, scaler=MaxAbsScaler())
            author_clf.fit(X_train["comment"], y_train)
            y_pred = author_clf.predict(X_test["comment"])
            scores.append(author_clf.evaluate(y_test, y_pred))

    metrics = pd.DataFrame(scores)
    metrics["clf"] = [clf_str] * len(metrics)

    print(f"Finish <=== {clf_str}")
    return clf_str, metrics
        
# Schedule one `process` call per (classifier, vectorizer) pair in `grid` and
# run them with joblib using n_jobs=6; `results` holds the (clf_str, metrics)
# tuples those calls return.
jobs = (delayed(process)(model, vec) for model, vec in grid)
results = Parallel(n_jobs=6)(jobs)

Running ===> LogisticRegression(penalty='l1', random_state=42, solver='liblinear')
Running ===> MultinomialNB()
Running ===> LogisticRegression(random_state=42, solver='liblinear')
Running ===> LinearSVC(max_iter=10000, random_state=42)
Running ===> SVC(random_state=42)
Running ===> DecisionTreeClassifier(random_state=42)


KeyboardInterrupt: 

In [None]:
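# NOTE: in the cells shown, `metrics` exists only as a local inside `process`.
# Before re-enabling the plots below, build it from `results`, e.g.
#   metrics = pd.concat([m for _, m in results], ignore_index=True)
# import numpy as np, scipy.stats as st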
# import matplotlib.pyplot as plt
# import seaborn as sns

# lim_inf, lim_sup = st.t.interval(0.95, len(metrics["accuracy"])-1, loc=metrics["accuracy"].mean(), scale=st.sem(metrics["accuracy"]))
# sns.displot(metrics["accuracy"], kde=True)
# plt.axvline(lim_inf, color="red")
# plt.axvline(lim_sup, color="red")

In [None]:
# sns.barplot(y='accuracy', data=metrics, estimator=np.mean, ci=95, capsize=.2, color='lightblue')