In [1]:
import sys
sys.path.insert(1, '../../libs')
from utils import get_data, temporal_train_test_split
from autorship import AuthorClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np
import pandas as pd
import re

In [2]:
# Load the dataset stored in authors.csv through the project-local `get_data` helper.
# Later cells read data["username"]; presumably a pandas DataFrame — TODO confirm.
data = get_data("../../data/authors.csv")

In [3]:
# Feature-extraction grid: every (ngram_range, analyzer) setting is evaluated
# once with raw counts and once with TF-IDF weighting, so the two weighting
# schemes are compared on identical n-gram features.
#
# NOTE(review): the TF-IDF entries previously used analyzer="word" for the
# (1,5), (4,5) and (3,8) ranges while the matching CountVectorizer entries used
# analyzer="char". That looked like a copy-paste slip, so both families are now
# generated from one shared list. Confirm this matches the intended experiment.
_NGRAM_SETTINGS = [
    ((1, 1), "word"),
    ((1, 3), "word"),
    ((1, 5), "char"),
    ((4, 5), "char"),
    ((3, 8), "char"),
]

# Order is preserved: the five CountVectorizers first, then the five TfidfVectorizers.
vectorizers = [
    vectorizer_cls(ngram_range=ngram_range, analyzer=analyzer)
    for vectorizer_cls in (CountVectorizer, TfidfVectorizer)
    for ngram_range, analyzer in _NGRAM_SETTINGS
]

In [4]:
from joblib import Parallel, delayed
def process(vectorizer):
    """Evaluate one vectorizer on every unordered pair of authors.

    For each pair of distinct usernames found in the module-level ``data``,
    this builds a split with ``temporal_train_test_split``, fits an
    ``AuthorClassifier`` wrapping ``vectorizer`` and a fresh ``MultinomialNB``,
    predicts on the test part and stores the result of ``clf.evaluate``.

    Parameters
    ----------
    vectorizer
        Vectorizer instance passed to ``AuthorClassifier``; its ``str()`` form
        is used as the label of the run.

    Returns
    -------
    tuple
        ``(vectorizer_str, metrics)`` where ``metrics`` is a ``pandas.Series``
        holding the mean of f1_macro, recall_macro, precision_macro, accuracy
        and auc_score over all author pairs. When ``data`` has fewer than two
        distinct usernames there is nothing to evaluate and the Series is all
        NaN (previously this raised ``UnboundLocalError``).
    """
    metric_names = ["f1_macro", "recall_macro", "precision_macro", "accuracy", "auc_score"]
    evaluation = []
    usernames = list(np.unique(data["username"]))

    vectorizer_str = str(vectorizer)
    print(f"Running => {vectorizer_str}")

    # Pop one author at a time and pair it with every author still in the list,
    # so each unordered pair is visited exactly once (same order as before).
    while usernames:
        author1 = usernames.pop()

        for author2 in usernames:
            X_train, X_test, y_train, y_test = temporal_train_test_split(
                data, author1, author2)

            clf = AuthorClassifier(vectorizer, clf=MultinomialNB())
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            # presumably evaluate() returns a mapping keyed by metric name — TODO confirm
            evaluation.append(clf.evaluate(y_test, y_pred))

    if evaluation:
        # Aggregate once, after all pairs, instead of rebuilding the frame per pair.
        metrics = pd.DataFrame(evaluation)[metric_names].mean()
    else:
        # No author pair available: return a well-formed, empty-valued result.
        metrics = pd.Series(np.nan, index=metric_names)

    return (vectorizer_str, metrics)
    
# Evaluate the whole vectorizer grid with up to 5 joblib workers; each job
# returns the (vectorizer string, mean metrics) tuple produced by `process`.
jobs = (delayed(process)(candidate) for candidate in vectorizers)
results = Parallel(n_jobs=5)(jobs)
print(results)

Running => CountVectorizer()
Running => CountVectorizer(ngram_range=(1, 3))
Running => CountVectorizer(analyzer='char', ngram_range=(1, 5))
Running => CountVectorizer(analyzer='char', ngram_range=(3, 8))
Running => CountVectorizer(analyzer='char', ngram_range=(4, 5))
