In [1]:
import sys
sys.path.insert(1, '../../libs')
from utils import get_data, temporal_train_test_split


import sys
sys.path.insert(1, '../../libs')
from utils import temporal_train_test_split, evaluate_bert
from autorship import AuthorClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from joblib import Parallel, delayed
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv("../../data/authors_bert.csv").drop(["Unnamed: 0", "comment"], axis=1)
data.head()

Unnamed: 0,username,created_utc,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,...,emb_759,emb_760,emb_761,emb_762,emb_763,emb_764,emb_765,emb_766,emb_767,emb_768
0,Manada_2,2022-03-09 14:17:46+00:00,0.23065,0.0223,0.009343,-0.022771,0.290211,0.020917,-0.009539,0.097825,...,-0.020931,0.354142,-0.283546,-0.299946,0.193261,-0.084011,-0.091721,-0.417421,-0.220645,0.110566
1,BluePirate89,2022-03-31 16:00:06+00:00,0.049211,-0.13495,0.683201,-0.156496,0.245604,0.179675,0.007007,-0.312811,...,-0.106805,0.079122,-0.155585,-0.310752,0.332965,-0.026474,0.115311,-0.072641,0.151321,-0.403907
2,9-Cortes,2021-11-29 02:08:23+00:00,-0.033197,-0.06991,-0.02906,0.122222,0.745492,0.049575,0.162378,0.105202,...,0.009278,-0.165076,0.114492,0.008429,0.52152,-0.252321,0.15632,0.137112,0.095263,-0.285079
3,Supermunch2000,2021-12-21 12:01:05+00:00,-0.111692,0.001248,0.25867,-0.117452,0.261348,0.427504,-0.430138,-0.255422,...,-0.007786,0.048536,-0.39235,-0.171661,0.321356,-0.262561,0.397757,-0.058622,0.138947,-0.739329
4,TheGza1,2021-08-10 12:49:03+00:00,-0.218113,-0.152065,0.283837,-0.108286,0.430704,0.16281,0.000672,-0.046517,...,0.116052,-0.038508,-0.345945,-0.1364,0.242163,-0.418695,-0.129335,0.173007,-0.047265,-0.160961


In [3]:
data.groupby("username").count()["created_utc"]

username
9-Cortes            902
AgnaldoTeExplode    914
BeatoSalut          886
BluePirate89        980
CariocaSatanico     724
Manada_2            913
MaxVonHabsburg      945
MidnightRider00     883
Miteiro             940
Supermunch2000      950
TheGza1             895
logatwork           933
piperman_           858
um--no              893
xanax101010         937
Name: created_utc, dtype: int64

In [4]:
estimators = [
        ('svm', LinearSVC(random_state=42, max_iter=10000)),
        ('lr_l1', LogisticRegression(random_state=42, penalty="l1", solver="liblinear"),
        ('rf'), RandomForestClassifier(random_state=42))]
stacking = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(random_state=42, penalty="l2", solver="liblinear"))

clfs = [MultinomialNB(),
        LogisticRegression(random_state=42, penalty="l1", solver="liblinear"),
        LogisticRegression(random_state=42, penalty="l2", solver="liblinear"),
        LinearSVC(random_state=42, max_iter=10000),
        SVC(random_state=42),
        DecisionTreeClassifier(random_state=42),
        RandomForestClassifier(random_state=42),
        AdaBoostClassifier(random_state=42),
        GradientBoostingClassifier(random_state=42),
        stacking
        ]

In [7]:
def process(clf):
    clf_str = clf.__str__()
    print(f"Running ===> {clf_str}")
    evaluation = list()
    usernames = list(np.unique(data["username"]))

    for i in range(len(usernames)):
        author1 = usernames.pop()

        for author2 in usernames:
            X_train, X_test, y_train, y_test = temporal_train_test_split(   
                data, author1, author2)

            scaler = MinMaxScaler().fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

            author_clf = clf
            author_clf.fit(X_train, y_train)
            y_pred = author_clf.predict(X_test)
            try:
                predict_proba = author_clf.predict_proba(X_test)[:,1]
            except:
                predict_proba = author_clf.decision_function(X_test)
            evaluation.append(evaluate_bert(y_test, y_pred, predict_proba, author1, author2))
            metrics = pd.DataFrame(evaluation)[["f1_macro", "recall_macro", "precision_macro", "accuracy", "auc_score"]].mean()

    print(f"Finish <=== {clf_str}")
    return (clf_str, metrics)

results = Parallel(n_jobs=-1)(delayed(process)(clf) for clf in clfs)

Running ===> LinearSVC(max_iter=10000, random_state=42)
Running ===> DecisionTreeClassifier(random_state=42)
Running ===> RandomForestClassifier(random_state=42)
Running ===> LogisticRegression(random_state=42, solver='liblinear')
Running ===> SVC(random_state=42)
Running ===> AdaBoostClassifier(random_state=42)
Running ===> MultinomialNB()
Running ===> LogisticRegression(penalty='l1', random_state=42, solver='liblinear')
Finish <=== MultinomialNB()
Running ===> GradientBoostingClassifier(random_state=42)
Finish <=== LogisticRegression(random_state=42, solver='liblinear')
Running ===> StackingClassifier(estimators=[('svm',
                                LinearSVC(max_iter=10000, random_state=42)),
                               ('lr_l1',
                                LogisticRegression(penalty='l1',
                                                   random_state=42,
                                                   solver='liblinear'),
                                'rf',
        

In [8]:
metrics_df = pd.DataFrame([results[i][1] for i in range(len(results))])
metrics_df["classifier"] = [results[i][0] for i in range(len(results))]
metrics_df

Unnamed: 0,f1_macro,recall_macro,precision_macro,accuracy,auc_score,classifier
0,0.706826,0.716972,0.739695,0.714608,0.834214,MultinomialNB()
1,0.870744,0.87091,0.871315,0.871059,0.933232,"LogisticRegression(penalty='l1', random_state=..."
2,0.876614,0.876746,0.877249,0.876939,0.939154,"LogisticRegression(random_state=42, solver='li..."
3,0.862117,0.862388,0.862555,0.862384,0.925298,"LinearSVC(max_iter=10000, random_state=42)"
4,0.884814,0.884771,0.886411,0.885252,0.948604,SVC(random_state=42)
5,0.711033,0.711479,0.71181,0.71169,0.71144,DecisionTreeClassifier(random_state=42)
6,0.828343,0.828685,0.831932,0.829323,0.905814,RandomForestClassifier(random_state=42)
7,0.814322,0.814671,0.81479,0.814701,0.888587,AdaBoostClassifier(random_state=42)
8,0.844811,0.845,0.846225,0.845349,0.917809,GradientBoostingClassifier(random_state=42)
9,0.872861,0.87297,0.873428,0.873187,0.934193,"StackingClassifier(estimators=[('svm',\n ..."


In [9]:
metrics_df.to_csv("../../results/bert_results.csv")