In [21]:
import sys
sys.path.insert(1, '../../libs')
from utils import get_data, temporal_train_test_split
from autorship import AuthorClassifier
from sklearn.preprocessing import MaxAbsScaler
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.tokenize import TweetTokenizer
import numpy as np
import pandas as pd

In [6]:
# Read the per-comment author table and discard the two columns that are not
# used below: the saved index ("Unnamed: 0") and the raw "comment" text.
# NOTE(review): judging by the file name, the remaining numbered columns are
# presumably word2vec embedding dimensions — confirm against the export script.
data = pd.read_csv("../../data/authors_word2vec.csv")
data = data.drop(columns=["Unnamed: 0", "comment"])
data.head()

Unnamed: 0,username,created_utc,0,1,2,3,4,5,6,7,...,90,91,92,93,94,95,96,97,98,99
0,Manada_2,2022-03-09 14:17:46+00:00,-0.432714,0.032157,1.099305,0.864218,0.837875,-3.109633,0.559502,3.881276,...,2.906217,0.421129,1.411148,1.064761,1.959023,1.601933,0.523119,0.897019,-0.426631,0.10312
1,BluePirate89,2022-03-31 16:00:06+00:00,0.125898,-0.242185,0.054639,0.735328,0.204172,-0.287813,0.222079,0.502998,...,0.112329,-0.015934,0.37365,0.769785,0.104709,0.382138,-0.12353,0.54085,-0.093899,-0.29784
2,9-Cortes,2021-11-29 02:08:23+00:00,-1.789684,0.469343,0.480798,1.27506,1.201183,-4.118136,0.754351,5.680286,...,2.970081,1.143064,0.718834,0.835258,3.357937,2.351601,1.128088,-0.537648,-0.405631,-0.016753
3,Supermunch2000,2021-12-21 12:01:05+00:00,0.515221,-0.98914,0.209595,3.852181,0.815947,-1.457306,1.57092,3.043237,...,0.800782,-0.175123,1.634715,3.569113,1.248902,2.070493,-0.804131,2.869144,-0.717174,-1.667274
4,TheGza1,2021-08-10 12:49:03+00:00,-1.702236,0.606132,0.567707,1.606782,1.322883,-4.053791,0.866824,5.870177,...,3.181649,0.857197,0.913484,0.512868,3.121929,2.340846,1.32559,-0.597204,-0.44673,0.19876


In [22]:
# Base estimators to compare; both are seeded so reruns are repeatable.
clfs = [
    LogisticRegression(penalty="l1", solver="liblinear", random_state=42),
    LinearSVC(random_state=42),
]

In [None]:
from joblib import Parallel, delayed
def process(clf):
    """Evaluate one base classifier on every pair of authors in ``data``.

    For each unordered pair of distinct usernames found in the module-level
    ``data`` frame, the data is split with ``temporal_train_test_split``, a
    fresh ``AuthorClassifier`` wrapping ``clf`` is fitted on the training
    part, and its predictions on the test part are scored with
    ``AuthorClassifier.evaluate``.

    Parameters
    ----------
    clf : estimator
        Base classifier handed to ``AuthorClassifier(clf=..., embeddings=True)``.

    Returns
    -------
    tuple
        ``(clf_str, metrics)`` where ``clf_str`` is ``clf.__str__()`` and
        ``metrics`` is a ``pandas.Series`` holding the mean of ``f1_macro``,
        ``recall_macro``, ``precision_macro``, ``accuracy`` and ``auc_score``
        over all evaluated pairs. When fewer than two authors are present no
        pair can be evaluated and every entry is NaN.
    """
    metric_names = ["f1_macro", "recall_macro", "precision_macro", "accuracy", "auc_score"]
    clf_str = clf.__str__()
    evaluation = list()
    usernames = list(np.unique(data["username"]))

    # Pop the last remaining author and pair it with everyone still in the
    # list, so each unordered pair is visited exactly once.
    while usernames:
        author1 = usernames.pop()

        for author2 in usernames:
            X_train, X_test, y_train, y_test = temporal_train_test_split(
                data, author1, author2)

            # Keep the wrapper under its own name: rebinding `clf` here would
            # make the next pair wrap the previous AuthorClassifier instead of
            # the base estimator.
            # NOTE(review): the same `clf` instance is shared by all pairs;
            # this assumes fitting re-initialises it — confirm in AuthorClassifier.
            model = AuthorClassifier(clf=clf, embeddings=True)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            evaluation.append(model.evaluate(y_test, y_pred))

    # Aggregate once after all pairs are scored instead of on every iteration.
    if not evaluation:
        metrics = pd.Series(np.nan, index=metric_names)
    else:
        metrics = pd.DataFrame(evaluation)[metric_names].mean()

    return (clf_str, metrics)
        
# Run process() once per classifier through joblib, at most two jobs at a
# time; `results` collects whatever each process() call returns.
worker_pool = Parallel(n_jobs=2)
results = worker_pool(delayed(process)(estimator) for estimator in clfs)
print(results)

In [20]:
# `metrics` only exists inside process(), so it is undefined on a fresh kernel.
# Build the table from `results` instead: each entry is a
# (classifier repr, mean-metrics Series) pair, giving one row per classifier.
metrics = pd.DataFrame({clf_name: clf_metrics for clf_name, clf_metrics in results}).T
metrics.to_csv("../../results/word2vec_results.csv", index_label="classifier")