# Stacking

In [1]:
import sys
sys.path.insert(1, '../../libs')
from utils import get_data, temporal_train_test_split
from autorship import AuthorClassifier
from sklearn.preprocessing import MaxAbsScaler
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.tokenize import TweetTokenizer
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

In [2]:
data = get_data("../../data/authors.csv")

In [3]:
tt = TweetTokenizer()

vectorizers = [CountVectorizer(ngram_range=(1,1), analyzer="word", tokenizer=tt.tokenize), 
                CountVectorizer(ngram_range=(1,3), analyzer="word", tokenizer=tt.tokenize), 
                CountVectorizer(ngram_range=(1,5), analyzer="char"),
                CountVectorizer(ngram_range=(4,5), analyzer="char"),
                CountVectorizer(ngram_range=(3,8), analyzer="char"),
                TfidfVectorizer(ngram_range=(1,1), analyzer="word", tokenizer=tt.tokenize), 
                TfidfVectorizer(ngram_range=(1,3), analyzer="word", tokenizer=tt.tokenize), 
                TfidfVectorizer(ngram_range=(1,5), analyzer="char"),
                TfidfVectorizer(ngram_range=(4,5), analyzer="char"),
                TfidfVectorizer(ngram_range=(3,8), analyzer="char")]

In [4]:
from joblib import Parallel, delayed
def process(vectorizer):
    evaluation = list()
    usernames = list(np.unique(data["username"]))
    
    vectorizer_str = vectorizer.__str__()
    print(f"Running => {vectorizer_str}")

    for i in range(len(usernames)):
        author1 = usernames.pop()

        for author2 in usernames:
            X_train, X_test, y_train, y_test = temporal_train_test_split(   
                data, author1, author2)
            
            estimators = [
                ('svm', LinearSVC(random_state=42)),
                ('lr_l1', LogisticRegression(random_state=42, penalty="l1", solver="liblinear"),
                ('rf'), RandomForestClassifier(random_state=42))]
            stacking = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(random_state=42, penalty="l2", solver="liblinear"))

            clf = AuthorClassifier(vectorizer, clf=stacking)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            evaluation.append(clf.evaluate(y_test, y_pred))
            metrics = pd.DataFrame(evaluation)[["f1_macro", "recall_macro", "precision_macro", "accuracy", "auc_score"]].mean()
    
    return (vectorizer_str, metrics)
    
results = Parallel(n_jobs=7)(delayed(process)(vectorizer) for vectorizer in vectorizers)
print(results) 

Running => CountVectorizer(analyzer='char', ngram_range=(3, 8))
Running => CountVectorizer(analyzer='char', ngram_range=(1, 5))
Running => CountVectorizer(analyzer='char', ngram_range=(4, 5))
Running => CountVectorizer(tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x7fc70a2873a0>>)
Running => CountVectorizer(ngram_range=(1, 3),
                tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x7fdd4bb6f5e0>>)
Running => TfidfVectorizer(ngram_range=(1, 3),
                tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x7f7d6df5f3a0>>)
Running => TfidfVectorizer(tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x7f03948dc3a0>>)




Running => TfidfVectorizer(analyzer='char', ngram_range=(1, 5))
Running => TfidfVectorizer(analyzer='char', ngram_range=(4, 5))




Running => TfidfVectorizer(analyzer='char', ngram_range=(3, 8))




[('CountVectorizer(tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x7fc70a2873a0>>)', f1_macro           0.867703
recall_macro       0.868348
precision_macro    0.869751
accuracy           0.867844
auc_score          0.935050
dtype: float64), ('CountVectorizer(ngram_range=(1, 3),\n                tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x7fdd4bb6f5e0>>)', f1_macro           0.871662
recall_macro       0.872327
precision_macro    0.874092
accuracy           0.871830
auc_score          0.938159
dtype: float64), ("CountVectorizer(analyzer='char', ngram_range=(1, 5))", f1_macro           0.858274
recall_macro       0.858690
precision_macro    0.859233
accuracy           0.858360
auc_score          0.925204
dtype: float64), ("CountVectorizer(analyzer='char', ngram_range=(4, 5))", f1_macro           0.812863
recall_macro       0.813730
precision_macro    0.815374
accuracy           0.

In [8]:
metrics_df = pd.DataFrame([results[i][1] for i in range(len(results))])
metrics_df["vectorizer"] = [results[i][0] for i in range(len(results))]
metrics_df.to_csv("../../results/stacking.csv")
metrics_df

Unnamed: 0,f1_macro,recall_macro,precision_macro,accuracy,auc_score,vectorizer
0,0.867703,0.868348,0.869751,0.867844,0.93505,CountVectorizer(tokenizer=<bound method TweetT...
1,0.871662,0.872327,0.874092,0.87183,0.938159,"CountVectorizer(ngram_range=(1, 3),\n ..."
2,0.858274,0.85869,0.859233,0.85836,0.925204,"CountVectorizer(analyzer='char', ngram_range=(..."
3,0.812863,0.81373,0.815374,0.813122,0.893233,"CountVectorizer(analyzer='char', ngram_range=(..."
4,0.817945,0.818609,0.819606,0.818107,0.897576,"CountVectorizer(analyzer='char', ngram_range=(..."
5,0.88233,0.882515,0.883357,0.882461,0.947911,TfidfVectorizer(tokenizer=<bound method TweetT...
6,0.86978,0.870117,0.875426,0.870556,0.946929,"TfidfVectorizer(ngram_range=(1, 3),\n ..."
7,0.87541,0.875575,0.876049,0.875525,0.943361,"TfidfVectorizer(analyzer='char', ngram_range=(..."
8,0.84593,0.846172,0.846829,0.846071,0.923292,"TfidfVectorizer(analyzer='char', ngram_range=(..."
9,0.847596,0.847739,0.849513,0.847915,0.926102,"TfidfVectorizer(analyzer='char', ngram_range=(..."


Row with max sum of columns (remember that by the result of f1-score we use the precision and recall score)

In [9]:
print(metrics_df.loc[metrics_df[['f1_macro', 'accuracy', 'auc_score']].sum(1).idxmax()])
print("\nVectorizer:", metrics_df.loc[metrics_df[['f1_macro', 'accuracy', 'auc_score']].sum(1).idxmax()]["vectorizer"])

f1_macro                                                     0.88233
recall_macro                                                0.882515
precision_macro                                             0.883357
accuracy                                                    0.882461
auc_score                                                   0.947911
vectorizer         TfidfVectorizer(tokenizer=<bound method TweetT...
Name: 5, dtype: object

Vectorizer: TfidfVectorizer(tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x7f03948dc3a0>>)


In [10]:
pd.read_csv("../../results/stacking.csv").drop("Unnamed: 0", axis=1)

Unnamed: 0,f1_macro,recall_macro,precision_macro,accuracy,auc_score,vectorizer
0,0.867703,0.868348,0.869751,0.867844,0.93505,CountVectorizer(tokenizer=<bound method TweetT...
1,0.871662,0.872327,0.874092,0.87183,0.938159,"CountVectorizer(ngram_range=(1, 3),\n ..."
2,0.858274,0.85869,0.859233,0.85836,0.925204,"CountVectorizer(analyzer='char', ngram_range=(..."
3,0.812863,0.81373,0.815374,0.813122,0.893233,"CountVectorizer(analyzer='char', ngram_range=(..."
4,0.817945,0.818609,0.819606,0.818107,0.897576,"CountVectorizer(analyzer='char', ngram_range=(..."
5,0.88233,0.882515,0.883357,0.882461,0.947911,TfidfVectorizer(tokenizer=<bound method TweetT...
6,0.86978,0.870117,0.875426,0.870556,0.946929,"TfidfVectorizer(ngram_range=(1, 3),\n ..."
7,0.87541,0.875575,0.876049,0.875525,0.943361,"TfidfVectorizer(analyzer='char', ngram_range=(..."
8,0.84593,0.846172,0.846829,0.846071,0.923292,"TfidfVectorizer(analyzer='char', ngram_range=(..."
9,0.847596,0.847739,0.849513,0.847915,0.926102,"TfidfVectorizer(analyzer='char', ngram_range=(..."
