In [1]:
import sys
sys.path.insert(1, '../../libs')
from utils import temporal_train_test_split
from autorship import AuthorClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("../../data/authors_word2vec.csv").drop(["Unnamed: 0", "comment"], axis=1)
data.head()

Unnamed: 0,username,created_utc,0,1,2,3,4,5,6,7,...,90,91,92,93,94,95,96,97,98,99
0,Manada_2,2022-03-09 14:17:46+00:00,-0.432714,0.032157,1.099305,0.864218,0.837875,-3.109633,0.559502,3.881276,...,2.906217,0.421129,1.411148,1.064761,1.959023,1.601933,0.523119,0.897019,-0.426631,0.10312
1,BluePirate89,2022-03-31 16:00:06+00:00,0.125898,-0.242185,0.054639,0.735328,0.204172,-0.287813,0.222079,0.502998,...,0.112329,-0.015934,0.37365,0.769785,0.104709,0.382138,-0.12353,0.54085,-0.093899,-0.29784
2,9-Cortes,2021-11-29 02:08:23+00:00,-1.789684,0.469343,0.480798,1.27506,1.201183,-4.118136,0.754351,5.680286,...,2.970081,1.143064,0.718834,0.835258,3.357937,2.351601,1.128088,-0.537648,-0.405631,-0.016753
3,Supermunch2000,2021-12-21 12:01:05+00:00,0.515221,-0.98914,0.209595,3.852181,0.815947,-1.457306,1.57092,3.043237,...,0.800782,-0.175123,1.634715,3.569113,1.248902,2.070493,-0.804131,2.869144,-0.717174,-1.667274
4,TheGza1,2021-08-10 12:49:03+00:00,-1.702236,0.606132,0.567707,1.606782,1.322883,-4.053791,0.866824,5.870177,...,3.181649,0.857197,0.913484,0.512868,3.121929,2.340846,1.32559,-0.597204,-0.44673,0.19876


In [6]:
estimators = [
        ('svm', LinearSVC(random_state=42, max_iter=10000)),
        ('lr_l1', LogisticRegression(random_state=42, penalty="l1", solver="liblinear"),
        ('rf'), RandomForestClassifier(random_state=42))]
stacking = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(random_state=42, penalty="l2", solver="liblinear"))

clfs = [MultinomialNB(),
        LogisticRegression(random_state=42, penalty="l1", solver="liblinear"),
        LogisticRegression(random_state=42, penalty="l2", solver="liblinear"),
        LinearSVC(random_state=42, max_iter=10000),
        SVC(random_state=42),
        DecisionTreeClassifier(random_state=42),
        RandomForestClassifier(random_state=42),
        AdaBoostClassifier(random_state=42),
        GradientBoostingClassifier(random_state=42),
        stacking
        ]

In [7]:
from joblib import Parallel, delayed
def process(clf):
    clf_str = clf.__str__()
    print(f"Running ===> {clf_str}")
    evaluation = list()
    usernames = list(np.unique(data["username"]))

    for i in range(len(usernames)):
        author1 = usernames.pop()

        for author2 in usernames:
            X_train, X_test, y_train, y_test = temporal_train_test_split(   
                data, author1, author2)

            author_clf = AuthorClassifier(clf=clf, scaler=MinMaxScaler(), embeddings=True)
            author_clf.fit(X_train, y_train)
            y_pred = author_clf.predict(X_test)
            evaluation.append(author_clf.evaluate(y_test, y_pred))
            metrics = pd.DataFrame(evaluation)[["f1_macro", "recall_macro", "precision_macro", "accuracy", "auc_score"]].mean()

    print(f"Finish <=== {clf_str}")
    return (clf_str, metrics)
        
results = Parallel(n_jobs=6)(delayed(process)(clf) for clf in clfs)

Running ===> LogisticRegression(random_state=42, solver='liblinear')
Running ===> DecisionTreeClassifier(random_state=42)
Running ===> LogisticRegression(penalty='l1', random_state=42, solver='liblinear')
Running ===> MultinomialNB()
Running ===> SVC(random_state=42)
Running ===> LinearSVC(max_iter=10000, random_state=42)
Finish <=== MultinomialNB()
Running ===> RandomForestClassifier(random_state=42)
Finish <=== LogisticRegression(random_state=42, solver='liblinear')
Running ===> AdaBoostClassifier(random_state=42)
Finish <=== DecisionTreeClassifier(random_state=42)
Running ===> GradientBoostingClassifier(random_state=42)
Finish <=== SVC(random_state=42)
Running ===> StackingClassifier(estimators=[('svm',
                                LinearSVC(max_iter=10000, random_state=42)),
                               ('lr_l1',
                                LogisticRegression(penalty='l1',
                                                   random_state=42,
                                 



Finish <=== LinearSVC(max_iter=10000, random_state=42)
Finish <=== LogisticRegression(penalty='l1', random_state=42, solver='liblinear')
Finish <=== RandomForestClassifier(random_state=42)
Finish <=== AdaBoostClassifier(random_state=42)




Finish <=== GradientBoostingClassifier(random_state=42)
Finish <=== StackingClassifier(estimators=[('svm',
                                LinearSVC(max_iter=10000, random_state=42)),
                               ('lr_l1',
                                LogisticRegression(penalty='l1',
                                                   random_state=42,
                                                   solver='liblinear'),
                                'rf',
                                RandomForestClassifier(random_state=42))],
                   final_estimator=LogisticRegression(random_state=42,
                                                      solver='liblinear'))


In [8]:
metrics_df = pd.DataFrame([results[i][1] for i in range(len(results))])
metrics_df["classifier"] = [results[i][0] for i in range(len(results))]
metrics_df

Unnamed: 0,f1_macro,recall_macro,precision_macro,accuracy,auc_score,classifier
0,0.630535,0.650665,0.69052,0.650862,0.710926,MultinomialNB()
1,0.750673,0.752473,0.759603,0.752688,0.825631,"LogisticRegression(penalty='l1', random_state=..."
2,0.745508,0.747753,0.756851,0.748021,0.822596,"LogisticRegression(random_state=42, solver='li..."
3,0.768187,0.76952,0.775856,0.76977,0.843301,"LinearSVC(max_iter=10000, random_state=42)"
4,0.741613,0.745708,0.759746,0.745841,0.836346,SVC(random_state=42)
5,0.696137,0.69635,0.6969,0.696498,0.69634,DecisionTreeClassifier(random_state=42)
6,0.773769,0.77432,0.775982,0.774233,0.84881,RandomForestClassifier(random_state=42)
7,0.755216,0.755476,0.75649,0.755632,0.826072,AdaBoostClassifier(random_state=42)
8,0.772878,0.773221,0.774362,0.773246,0.848967,GradientBoostingClassifier(random_state=42)
9,0.770671,0.771605,0.775839,0.771796,0.842546,"StackingClassifier(estimators=[('svm',\n ..."


In [9]:
metrics_df.to_csv("../../results/word2vec_results.csv")