Use `authors_pos.csv` to build the same classifiers used in the word2vec experiments, with the same TF-IDF and count vectorizers. Since we now analyze only words (the POS tags), the vectorizers work at word level, including unigrams, bigrams, etc.

In [1]:
import sys
sys.path.insert(1, '../../libs')
from utils import get_data, temporal_train_test_split


import sys
sys.path.insert(1, '../../libs')
from utils import temporal_train_test_split
from autorship import AuthorClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import pandas as pd
# !python -m spacy download pt_core_news_lg

In [2]:
# Load the POS-tagged comments and keep only the columns used below.
# "Unnamed: 0" is presumably a row index saved by an earlier to_csv call -- TODO confirm.
data = (
    pd.read_csv("../../data/authors_pos.csv")
    .drop(columns=["Unnamed: 0", "comment"])
)
data.head()

Unnamed: 0,username,created_utc,pos
0,Manada_2,2022-03-09 14:17:46+00:00,PROPN ADP NOUN ADJ PUNCT
1,BluePirate89,2022-03-31 16:00:06+00:00,NOUN PROPN
2,9-Cortes,2021-11-29 02:08:23+00:00,NOUN ADJ PUNCT ADV DET ADJ NOUN PUNCT ADV DET ...
3,Supermunch2000,2021-12-21 12:01:05+00:00,PROPN NOUN X VERB ADJ PUNCT
4,TheGza1,2021-08-10 12:49:03+00:00,ADV VERB ADP NOUN PRON PRON VERB PUNCT DET NOU...


In [3]:
# Base learners of the stacked ensemble. StackingClassifier expects a list of
# (name, estimator) 2-tuples, so every learner must sit in its own pair: the
# random forest is a separate ('rf', ...) entry, not extra items inside the
# 'lr_l1' tuple.
estimators = [
    ('svm', LinearSVC(random_state=42, max_iter=10000)),
    ('lr_l1', LogisticRegression(random_state=42, penalty="l1", solver="liblinear")),
    ('rf', RandomForestClassifier(random_state=42)),
]

# Meta-learner: an L2-penalised logistic regression fitted on the outputs of
# the base learners above.
stacking = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(random_state=42, penalty="l2", solver="liblinear"),
)

# Every classifier evaluated in the benchmark, in the order they are run.
# All seedable models share random_state=42.
clfs = [
    MultinomialNB(),
    # Logistic regression with an L1 and then an L2 penalty.
    *[
        LogisticRegression(random_state=42, penalty=penalty, solver="liblinear")
        for penalty in ("l1", "l2")
    ],
    # Support vector machines: linear first, then SVC with its default settings.
    LinearSVC(random_state=42, max_iter=10000),
    SVC(random_state=42),
    # Single tree followed by tree-based ensembles.
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    # Stacked ensemble defined earlier in this cell.
    stacking,
]

In [4]:
# Cross both vectorizer types with both n-gram ranges (unigrams only, and
# 1- to 3-grams), always tokenising at word level. Resulting order:
# Count (1,1), Count (1,3), Tfidf (1,1), Tfidf (1,3).
vectorizers = [
    vectorizer_cls(ngram_range=ngram_range, analyzer="word")
    for vectorizer_cls in (CountVectorizer, TfidfVectorizer)
    for ngram_range in ((1, 1), (1, 3))
]

In [5]:
from joblib import Parallel, delayed
def process(clf, vectorizer):
    """Benchmark one classifier/vectorizer combination over every author pair.

    For each unordered pair of distinct usernames found in the module-level
    ``data`` frame, the pair is split with ``temporal_train_test_split``, an
    ``AuthorClassifier`` is fitted on the first column of the training split
    and its predictions on the first column of the test split are evaluated.
    The per-pair scores are averaged once all pairs have been processed.

    Parameters
    ----------
    clf : object
        Classifier handed to ``AuthorClassifier``. The same instance is
        passed for every author pair.
    vectorizer : object
        Vectorizer handed to ``AuthorClassifier``. The same instance is
        passed for every author pair.

    Returns
    -------
    tuple
        ``(str(clf), str(vectorizer), metrics)`` where ``metrics`` is the
        mean over all pairs of the columns f1_macro, recall_macro,
        precision_macro, accuracy and auc_score.

    Raises
    ------
    ValueError
        If no author pair could be evaluated (fewer than two distinct
        usernames in ``data``).
    """
    metric_columns = ["f1_macro", "recall_macro", "precision_macro", "accuracy", "auc_score"]

    clf_str = str(clf)
    vect_str = str(vectorizer)
    print(f"Running ===> {clf_str} | {vect_str}")

    # np.unique returns the usernames sorted, so the pair order is deterministic.
    usernames = list(np.unique(data["username"]))
    evaluation = []

    # Walk the authors from last to first and pair each one with every author
    # that precedes it. Each unordered pair is visited exactly once, with the
    # later username passed as author1.
    for idx in range(len(usernames) - 1, 0, -1):
        author1 = usernames[idx]
        for author2 in usernames[:idx]:
            X_train, X_test, y_train, y_test = temporal_train_test_split(
                data, author1, author2)

            author_clf = AuthorClassifier(clf=clf, vectorizer=vectorizer, scaler=MaxAbsScaler())
            author_clf.fit(X_train.iloc[:, 0], y_train)
            y_pred = author_clf.predict(X_test.iloc[:, 0])
            evaluation.append(author_clf.evaluate(y_test, y_pred))

    if not evaluation:
        raise ValueError(
            "process() needs at least two distinct usernames in `data` to form an author pair")

    # Aggregate once, after all pairs are done, instead of rebuilding the
    # frame on every inner iteration.
    metrics = pd.DataFrame(evaluation)[metric_columns].mean()

    print(f"Finish <=== {clf_str} | {vect_str}")
    return (clf_str, vect_str, metrics)
        
# Evaluate every (classifier, vectorizer) combination on 8 parallel workers;
# results keep the submission order: classifiers outer, vectorizers inner.
results = Parallel(n_jobs=8)(
    delayed(process)(clf, vectorizer)
    for clf in clfs
    for vectorizer in vectorizers
)

Running ===> MultinomialNB() | TfidfVectorizer()
Running ===> MultinomialNB() | TfidfVectorizer(ngram_range=(1, 3))
Running ===> MultinomialNB() | CountVectorizer(ngram_range=(1, 3))
Running ===> LogisticRegression(penalty='l1', random_state=42, solver='liblinear') | CountVectorizer(ngram_range=(1, 3))
Running ===> MultinomialNB() | CountVectorizer()
Running ===> LogisticRegression(penalty='l1', random_state=42, solver='liblinear') | CountVectorizer()
Running ===> LogisticRegression(penalty='l1', random_state=42, solver='liblinear') | TfidfVectorizer()
Running ===> LogisticRegression(penalty='l1', random_state=42, solver='liblinear') | TfidfVectorizer(ngram_range=(1, 3))
Finish <=== MultinomialNB() | CountVectorizer()
Running ===> LogisticRegression(random_state=42, solver='liblinear') | CountVectorizer()
Finish <=== LogisticRegression(penalty='l1', random_state=42, solver='liblinear') | CountVectorizer()
Running ===> LogisticRegression(random_state=42, solver='liblinear') | CountVecto

In [6]:
# Each entry of `results` is (classifier repr, vectorizer repr, mean metrics).
# Put the metrics in one row per run, then label each row with its
# vectorizer and classifier.
metrics_df = pd.DataFrame([run[2] for run in results])
metrics_df["vectorizer"] = [run[1] for run in results]
metrics_df["classifier"] = [run[0] for run in results]
metrics_df

Unnamed: 0,f1_macro,recall_macro,precision_macro,accuracy,auc_score,vectorizer,classifier
0,0.526834,0.58378,0.62699,0.59088,0.709742,CountVectorizer(),MultinomialNB()
1,0.694003,0.69713,0.708136,0.698818,0.787522,"CountVectorizer(ngram_range=(1, 3))",MultinomialNB()
2,0.645791,0.652819,0.66799,0.654232,0.730966,TfidfVectorizer(),MultinomialNB()
3,0.713111,0.715467,0.724098,0.716042,0.800243,"TfidfVectorizer(ngram_range=(1, 3))",MultinomialNB()
4,0.702575,0.70711,0.716005,0.706065,0.777946,CountVectorizer(),"LogisticRegression(penalty='l1', random_state=..."
5,0.736984,0.73966,0.745718,0.738659,0.813783,"CountVectorizer(ngram_range=(1, 3))","LogisticRegression(penalty='l1', random_state=..."
6,0.718652,0.719491,0.722492,0.71995,0.783921,TfidfVectorizer(),"LogisticRegression(penalty='l1', random_state=..."
7,0.769256,0.769686,0.770716,0.769584,0.844366,"TfidfVectorizer(ngram_range=(1, 3))","LogisticRegression(penalty='l1', random_state=..."
8,0.660734,0.668247,0.682267,0.66769,0.740828,CountVectorizer(),"LogisticRegression(random_state=42, solver='li..."
9,0.730487,0.732553,0.737098,0.73175,0.807453,"CountVectorizer(ngram_range=(1, 3))","LogisticRegression(random_state=42, solver='li..."


In [7]:
# Persist the aggregated scores. to_csv keeps its default index=True here, so
# the row index is written as an unnamed first column of the CSV.
metrics_df.to_csv("../../results/pos_results.csv")