# Multinomial Naive Bayes

In [1]:
import sys
sys.path.insert(1, '../../libs')
from utils import get_data, temporal_train_test_split
from autorship import AuthorClassifier
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.tokenize import TweetTokenizer
import numpy as np
import pandas as pd
import re

In [2]:
# Load the authorship dataset through the project helper `get_data`.
# `data` is read as a module-level global by the evaluation code in later
# cells, which indexes it by a "username" column
# (presumably a pandas DataFrame — TODO confirm against `utils.get_data`).
data = get_data("../../data/authors.csv")

In [3]:
tt = TweetTokenizer()

# Feature extractors to compare: the same five n-gram configurations are
# instantiated first with raw counts (CountVectorizer) and then with TF-IDF
# weighting (TfidfVectorizer), giving ten candidates in that order.
# Word-level configurations tokenize with NLTK's TweetTokenizer; char-level
# ones rely on the vectorizer's built-in character analyzer.
vectorizers = [
    vectorizer_cls(**params)
    for vectorizer_cls in (CountVectorizer, TfidfVectorizer)
    for params in (
        dict(ngram_range=(1, 1), analyzer="word", tokenizer=tt.tokenize),
        dict(ngram_range=(1, 3), analyzer="word", tokenizer=tt.tokenize),
        dict(ngram_range=(1, 5), analyzer="char"),
        dict(ngram_range=(4, 5), analyzer="char"),
        dict(ngram_range=(3, 8), analyzer="char"),
    )
]

In [4]:
from joblib import Parallel, delayed
def process(vectorizer):
    """Evaluate one vectorizer on every unordered pair of authors.

    For each pair of distinct usernames found in the module-level ``data``,
    the data is split with ``temporal_train_test_split``, an
    ``AuthorClassifier`` built from ``vectorizer``, ``MultinomialNB`` and
    ``MaxAbsScaler`` is fitted on the training part, and its predictions on
    the test part are scored with ``clf.evaluate``.

    Parameters
    ----------
    vectorizer
        Text vectorizer handed to ``AuthorClassifier`` (the notebook passes
        ``CountVectorizer`` / ``TfidfVectorizer`` instances). The same
        instance is reused for every pair.
        NOTE(review): presumably ``AuthorClassifier.fit`` refits it each
        time — confirm.

    Returns
    -------
    tuple
        ``(str(vectorizer), metrics)`` where ``metrics`` is a pandas Series
        holding the mean of f1_macro, recall_macro, precision_macro,
        accuracy and auc_score over all evaluated pairs. When ``data``
        contains fewer than two distinct usernames no pair can be evaluated
        and every entry of ``metrics`` is NaN.
    """
    metric_columns = ["f1_macro", "recall_macro", "precision_macro", "accuracy", "auc_score"]
    evaluation = []
    usernames = list(np.unique(data["username"]))

    vectorizer_str = str(vectorizer)
    print(f"Running => {vectorizer_str}")

    # Pop one author at a time and pair it with every author still in the
    # list, so each unordered pair is evaluated exactly once.
    while usernames:
        author1 = usernames.pop()

        for author2 in usernames:
            X_train, X_test, y_train, y_test = temporal_train_test_split(
                data, author1, author2)

            clf = AuthorClassifier(vectorizer, clf=MultinomialNB(), scaler=MaxAbsScaler())
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            # assumes evaluate() returns a record (e.g. dict) keyed by the
            # metric names in `metric_columns` — TODO confirm
            evaluation.append(clf.evaluate(y_test, y_pred))

    # Aggregate once, after all pairs are scored, instead of rebuilding the
    # DataFrame on every iteration.
    if evaluation:
        metrics = pd.DataFrame(evaluation)[metric_columns].mean()
    else:
        # No pair available: keep the return shape stable rather than
        # failing on an unbound `metrics`.
        metrics = pd.Series(np.nan, index=metric_columns)

    return (vectorizer_str, metrics)
    
# Score every vectorizer configuration in parallel on 6 workers; each task
# returns a (vectorizer description, mean metrics) tuple, collected in the
# same order as `vectorizers`.
results = Parallel(n_jobs=6)(map(delayed(process), vectorizers))
print(results)

Running => CountVectorizer(analyzer='char', ngram_range=(4, 5))
Running => CountVectorizer(analyzer='char', ngram_range=(3, 8))
Running => CountVectorizer(analyzer='char', ngram_range=(1, 5))
Running => CountVectorizer(tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x7fed7058ed00>>)
Running => CountVectorizer(ngram_range=(1, 3),
                tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x7fde886a4d00>>)
Running => TfidfVectorizer(tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x7f8a7e9f3d00>>)
Running => TfidfVectorizer(ngram_range=(1, 3),
                tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x7f8a6adcfee0>>)
Running => TfidfVectorizer(analyzer='char', ngram_range=(1, 5))
Running => TfidfVectorizer(analyzer='char', ngram_range=(4, 5))
Running => TfidfVectorizer(analyzer=

In [15]:
# One row per vectorizer: the averaged metric Series become the numeric
# columns and the vectorizer description is attached as a label column.
metrics_df = pd.DataFrame([scores for _, scores in results])
metrics_df["vectorizer"] = [label for label, _ in results]
# Persist the summary (the row index is written as the first CSV column).
metrics_df.to_csv("../../results/naive_bayes.csv")
metrics_df

Unnamed: 0,f1_macro,recall_macro,precision_macro,accuracy,auc_score,vectorizer
0,0.803071,0.805319,0.822644,0.80639,0.904933,CountVectorizer(tokenizer=<bound method TweetT...
1,0.816596,0.818869,0.837862,0.820152,0.921596,"CountVectorizer(ngram_range=(1, 3),\n ..."
2,0.784051,0.789419,0.817414,0.790479,0.890609,"CountVectorizer(analyzer='char', ngram_range=(..."
3,0.777455,0.782988,0.811125,0.784117,0.887508,"CountVectorizer(analyzer='char', ngram_range=(..."
4,0.786303,0.791392,0.820381,0.792875,0.884496,"CountVectorizer(analyzer='char', ngram_range=(..."
5,0.796719,0.799566,0.819279,0.800614,0.902281,TfidfVectorizer(tokenizer=<bound method TweetT...
6,0.80731,0.809757,0.82897,0.810959,0.914583,"TfidfVectorizer(ngram_range=(1, 3),\n ..."
7,0.766801,0.773479,0.804928,0.774559,0.882082,"TfidfVectorizer(analyzer='char', ngram_range=(..."
8,0.763678,0.77041,0.802375,0.771643,0.883225,"TfidfVectorizer(analyzer='char', ngram_range=(..."
9,0.772833,0.778535,0.808547,0.780003,0.871022,"TfidfVectorizer(analyzer='char', ngram_range=(..."


Select the row with the highest sum of `f1_macro`, `accuracy` and `auc_score`. Precision and recall are not added separately because the F1 score already combines them.

In [20]:
# Rank configurations by the sum of F1, accuracy and AUC and show the winner,
# followed by its full (untruncated) vectorizer description.
combined_score = metrics_df[['f1_macro', 'accuracy', 'auc_score']].sum(axis=1)
best_row = metrics_df.loc[combined_score.idxmax()]
print(best_row)
print("\nVectorizer:", best_row["vectorizer"])

f1_macro                                                    0.816596
recall_macro                                                0.818869
precision_macro                                             0.837862
accuracy                                                    0.820152
auc_score                                                   0.921596
vectorizer         CountVectorizer(ngram_range=(1, 3),\n         ...
Name: 1, dtype: object

Vectorizer: CountVectorizer(ngram_range=(1, 3),
                tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x7fde886a4d00>>)


In [21]:
# Reload the saved summary and discard the index column that was written
# alongside the metrics.
pd.read_csv("../../results/naive_bayes.csv").drop(columns=["Unnamed: 0"])

Unnamed: 0,f1_macro,recall_macro,precision_macro,accuracy,auc_score,vectorizer
0,0.803071,0.805319,0.822644,0.80639,0.904933,CountVectorizer(tokenizer=<bound method TweetT...
1,0.816596,0.818869,0.837862,0.820152,0.921596,"CountVectorizer(ngram_range=(1, 3),\n ..."
2,0.784051,0.789419,0.817414,0.790479,0.890609,"CountVectorizer(analyzer='char', ngram_range=(..."
3,0.777455,0.782988,0.811125,0.784117,0.887508,"CountVectorizer(analyzer='char', ngram_range=(..."
4,0.786303,0.791392,0.820381,0.792875,0.884496,"CountVectorizer(analyzer='char', ngram_range=(..."
5,0.796719,0.799566,0.819279,0.800614,0.902281,TfidfVectorizer(tokenizer=<bound method TweetT...
6,0.80731,0.809757,0.82897,0.810959,0.914583,"TfidfVectorizer(ngram_range=(1, 3),\n ..."
7,0.766801,0.773479,0.804928,0.774559,0.882082,"TfidfVectorizer(analyzer='char', ngram_range=(..."
8,0.763678,0.77041,0.802375,0.771643,0.883225,"TfidfVectorizer(analyzer='char', ngram_range=(..."
9,0.772833,0.778535,0.808547,0.780003,0.871022,"TfidfVectorizer(analyzer='char', ngram_range=(..."
