In [1]:
import sys
# Insert the relative '../../libs' directory into the module search path at
# position 1, before the remaining entries.
# presumably this is where `utils` and `autorship` live — TODO confirm
sys.path.insert(1, '../../libs')
from utils import get_data, temporal_train_test_split
from autorship import AuthorClassifier
# NOTE(review): of everything imported in this cell, only `pd` is referenced by
# the cells visible in this notebook; the other names may be leftovers.
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.tokenize import TweetTokenizer
import numpy as np
import pandas as pd

In [2]:
# Accumulator for the best-scoring row of each classifier; later cells append to it.
results = []

In [3]:
# Load the L1 logistic-regression scores and discard the unnamed column
# that the CSV carries alongside the metrics.
l1_scores_path = "../../results/logistic_regression_l1.csv"
result = pd.read_csv(l1_scores_path).drop(columns=["Unnamed: 0"])

In [4]:
# Build a lookup from the vectorizer description stored in the CSV to a
# human-readable (Portuguese) label.
#
# The pattern removes everything from " object at ..." onwards, so that the
# remaining text can be used as a stable dictionary key.
# presumably that suffix is a memory address inside an object repr — TODO confirm
#
# `regex=True` is passed explicitly: without it, newer pandas versions treat the
# pattern as a literal string and the keys would keep the suffix, so they would
# no longer match the cleaned values looked up later. The raw string avoids the
# invalid "\s" escape sequence in a normal string literal.
keys = result["vectorizer"].str.replace(r"\s+object at .+\s*", "", regex=True)

# Labels are paired with keys purely by position (zip stops at the shorter one).
# NOTE(review): this assumes the first rows of the CSV list the vectorizers in
# exactly this order — verify against the file that produced the results.
items = [
    "Contagem de palavras",
    "Contagem de palavras em sequência (1,3)",
    "Contagem de caracteres em sequência (1,5)",
    "Contagem de caracteres em sequência (4,5)",
    "Contagem de caracteres em sequência (3,8)",
    "Tf-idf de palavras",
    "Tf-idf de palavras em sequência (1,3)",
    "Tf-idf de caracteres em sequência (1,5)",
    "Tf-idf de caracteres em sequência (4,5)",
    "Tf-idf de caracteres em sequência (3,8)",
]

dict_vectorizers = dict(zip(keys, items))

  keys = result.vectorizer.str.replace("\s+object at .+\s*", "")


In [5]:
# Pick the configuration with the highest macro-F1 for this classifier.
# idxmax() returns an index *label*, so the row is selected with the label-based
# .loc instead of the positional .iloc (the two only coincide on a default index).
# The row is stored under its own name so `result` stays a DataFrame and this
# cell (and the one that reads `result.vectorizer`) can be re-run safely.
best_row = result.loc[result["f1_macro"].idxmax()].copy()
best_row["clf"] = "Regressão Logística (L1)"
results.append(best_row)

In [6]:
# Load the L2 logistic-regression scores and keep only the best macro-F1 row.
scores = pd.read_csv("../../results/logistic_regression_l2.csv").drop(columns="Unnamed: 0")
# idxmax() yields an index label, hence .loc (label-based) rather than .iloc.
# .copy() makes the row independent of the frame before it is modified.
result = scores.loc[scores["f1_macro"].idxmax()].copy()
result["clf"] = "Regressão Logística (L2)"
results.append(result)

In [7]:
# Load the Naive Bayes scores and keep only the best macro-F1 row.
scores = pd.read_csv("../../results/naive_bayes.csv").drop(columns="Unnamed: 0")
# idxmax() yields an index label, hence .loc (label-based) rather than .iloc.
# .copy() makes the row independent of the frame before it is modified.
result = scores.loc[scores["f1_macro"].idxmax()].copy()
result["clf"] = "Naive Bayes Multinomial"
results.append(result)

In [8]:
# Load the linear-SVM scores and keep only the best macro-F1 row.
scores = pd.read_csv("../../results/linear_svc.csv").drop(columns="Unnamed: 0")
# idxmax() yields an index label, hence .loc (label-based) rather than .iloc.
# .copy() makes the row independent of the frame before it is modified.
result = scores.loc[scores["f1_macro"].idxmax()].copy()
result["clf"] = "SVM (kernel linear)"
results.append(result)

In [9]:
# Load the RBF-SVM scores and keep only the best macro-F1 row.
scores = pd.read_csv("../../results/rbf_svm.csv").drop(columns="Unnamed: 0")
# idxmax() yields an index label, hence .loc (label-based) rather than .iloc.
# .copy() makes the row independent of the frame before it is modified.
result = scores.loc[scores["f1_macro"].idxmax()].copy()
result["clf"] = "SVM (kernel rbf)"
results.append(result)

In [10]:
# Load the random-forest scores and keep only the best macro-F1 row.
scores = pd.read_csv("../../results/random_forest.csv").drop(columns="Unnamed: 0")
# idxmax() yields an index label, hence .loc (label-based) rather than .iloc.
# .copy() makes the row independent of the frame before it is modified.
result = scores.loc[scores["f1_macro"].idxmax()].copy()
result["clf"] = "Random Forest"
results.append(result)

In [11]:
# Load the decision-tree scores and keep only the best macro-F1 row.
scores = pd.read_csv("../../results/decision_tree.csv").drop(columns="Unnamed: 0")
# idxmax() yields an index label, hence .loc (label-based) rather than .iloc.
# .copy() makes the row independent of the frame before it is modified.
result = scores.loc[scores["f1_macro"].idxmax()].copy()
result["clf"] = "Árvore de Decisão"
results.append(result)

In [12]:
# Load the AdaBoost scores and keep only the best macro-F1 row.
scores = pd.read_csv("../../results/ada_boost.csv").drop(columns="Unnamed: 0")
# idxmax() yields an index label, hence .loc (label-based) rather than .iloc.
# .copy() makes the row independent of the frame before it is modified.
result = scores.loc[scores["f1_macro"].idxmax()].copy()
result["clf"] = "AdaBoost"
results.append(result)

In [13]:
# Load the gradient-boosting scores and keep only the best macro-F1 row.
# NOTE(review): in the summary table rendered further down, every metric of this
# row is identical to the decision-tree row — verify that gradient_boosting.csv
# really holds gradient-boosting results and was not written from the wrong run.
scores = pd.read_csv("../../results/gradient_boosting.csv").drop(columns="Unnamed: 0")
# idxmax() yields an index label, hence .loc (label-based) rather than .iloc.
# .copy() makes the row independent of the frame before it is modified.
result = scores.loc[scores["f1_macro"].idxmax()].copy()
result["clf"] = "Gradient boosting"
results.append(result)

In [14]:
# Load the stacking-ensemble scores and keep only the best macro-F1 row.
scores = pd.read_csv("../../results/stacking.csv").drop(columns="Unnamed: 0")
# idxmax() yields an index label, hence .loc (label-based) rather than .iloc.
# .copy() makes the row independent of the frame before it is modified.
result = scores.loc[scores["f1_macro"].idxmax()].copy()
result["clf"] = "Stacking"
results.append(result)

In [15]:
# Stack the per-classifier best rows into one table. Each row still carries the
# index label it had in its source CSV, so the index is discarded and renumbered.
results_df = pd.DataFrame(results).reset_index(drop=True)

# Normalise the vectorizer description the same way the lookup keys were built
# (strip the " object at ..." suffix), then translate it to its display label.
# The pattern is a raw string so "\s" is not an invalid string escape.
# Descriptions without an entry in dict_vectorizers become NaN after .map().
results_df["vectorizer"] = (
    results_df["vectorizer"]
    .str.replace(r"\s+object at .+\s*", "", regex=True)
    .map(dict_vectorizers)
)
results_df

Unnamed: 0,f1_macro,recall_macro,precision_macro,accuracy,auc_score,vectorizer,clf
0,0.861566,0.861734,0.862389,0.861691,0.929556,"Tf-idf de caracteres em sequência (1,5)",Regressão Logística (L1)
1,0.872112,0.872323,0.873116,0.872254,0.941017,Tf-idf de palavras,Regressão Logística (L2)
2,0.816596,0.818869,0.837862,0.820152,0.921596,"Contagem de palavras em sequência (1,3)",Naive Bayes Multinomial
3,0.866946,0.867229,0.86781,0.867064,0.936061,Tf-idf de palavras,SVM (kernel linear)
4,0.769654,0.774462,0.801371,0.776549,0.878803,Tf-idf de palavras,SVM (kernel rbf)
5,0.850644,0.851396,0.853803,0.850959,0.918879,Contagem de palavras,Random Forest
6,0.826782,0.8274,0.82904,0.8271,0.82739,"Contagem de palavras em sequência (1,3)",Árvore de Decisão
7,0.851744,0.85226,0.853313,0.851932,0.922067,"Contagem de caracteres em sequência (1,5)",AdaBoost
8,0.826782,0.8274,0.82904,0.8271,0.82739,"Contagem de palavras em sequência (1,3)",Gradient boosting
9,0.88233,0.882515,0.883357,0.882461,0.947911,Tf-idf de palavras,Stacking


In [16]:
# Round every metric column to four decimal places for presentation.
metric_columns = ["f1_macro", "recall_macro", "precision_macro", "accuracy", "auc_score"]
results_df[metric_columns] = results_df[metric_columns].round(decimals=4)

In [17]:
# Replace the column labels with their Portuguese display headers.
# The assignment is positional: the list must follow the frame's current
# column order (F1, recall, precision, accuracy, AUC, vectorizer, classifier).
display_headers = [
    "F1-score",
    "Revocação",
    "Precisão",
    "Acurácia",
    "AUC",
    "Processamento do texto ótimo",
    "Classificador",
]
results_df.columns = display_headers

In [22]:
# Put the descriptive columns first and the metrics after them.
# The order is spelled out instead of reversing the current columns: a reversal
# flips the table back and forth each time the cell is re-run, whereas an
# explicit selection always yields the same layout.
results_df = results_df[
    [
        "Classificador",
        "Processamento do texto ótimo",
        "AUC",
        "Acurácia",
        "Precisão",
        "Revocação",
        "F1-score",
    ]
]
results_df

Unnamed: 0,Classificador,Processamento do texto ótimo,AUC,Acurácia,Precisão,Revocação,F1-score
0,Regressão Logística (L1),"Tf-idf de caracteres em sequência (1,5)",0.9296,0.8617,0.8624,0.8617,0.8616
1,Regressão Logística (L2),Tf-idf de palavras,0.941,0.8723,0.8731,0.8723,0.8721
2,Naive Bayes Multinomial,"Contagem de palavras em sequência (1,3)",0.9216,0.8202,0.8379,0.8189,0.8166
3,SVM (kernel linear),Tf-idf de palavras,0.9361,0.8671,0.8678,0.8672,0.8669
4,SVM (kernel rbf),Tf-idf de palavras,0.8788,0.7765,0.8014,0.7745,0.7697
5,Random Forest,Contagem de palavras,0.9189,0.851,0.8538,0.8514,0.8506
6,Árvore de Decisão,"Contagem de palavras em sequência (1,3)",0.8274,0.8271,0.829,0.8274,0.8268
7,AdaBoost,"Contagem de caracteres em sequência (1,5)",0.9221,0.8519,0.8533,0.8523,0.8517
8,Gradient boosting,"Contagem de palavras em sequência (1,3)",0.8274,0.8271,0.829,0.8274,0.8268
9,Stacking,Tf-idf de palavras,0.9479,0.8825,0.8834,0.8825,0.8823


In [24]:
# Emit the summary table as LaTeX for the report.
# float_format pins the four-decimal rendering (e.g. 0.9410) so the generated
# table does not depend on the pandas version's default float formatting.
print(results_df.to_latex(index=False, float_format="%.4f"))

\begin{tabular}{llrrrrr}
\toprule
           Classificador &              Processamento do texto ótimo &    AUC &  Acurácia &  Precisão &  Revocação &  F1-score \\
\midrule
Regressão Logística (L1) &   Tf-idf de caracteres em sequência (1,5) & 0.9296 &    0.8617 &    0.8624 &     0.8617 &    0.8616 \\
Regressão Logística (L2) &                        Tf-idf de palavras & 0.9410 &    0.8723 &    0.8731 &     0.8723 &    0.8721 \\
 Naive Bayes Multinomial &   Contagem de palavras em sequência (1,3) & 0.9216 &    0.8202 &    0.8379 &     0.8189 &    0.8166 \\
     SVM (kernel linear) &                        Tf-idf de palavras & 0.9361 &    0.8671 &    0.8678 &     0.8672 &    0.8669 \\
        SVM (kernel rbf) &                        Tf-idf de palavras & 0.8788 &    0.7765 &    0.8014 &     0.7745 &    0.7697 \\
           Random Forest &                      Contagem de palavras & 0.9189 &    0.8510 &    0.8538 &     0.8514 &    0.8506 \\
       Árvore de Decisão &   Contagem de palavr

  print(results_df.to_latex(index=False))


In [86]:
# Summary table for the POS-based experiments: best macro-F1 row per classifier.
pos_scores = pd.read_csv("../../results/pos_results.csv").drop(columns="Unnamed: 0")

# Select, for each classifier, the row holding that classifier's own maximum.
# Filtering with isin() over the set of per-group maxima would also keep rows of
# one classifier whose score merely equals another classifier's best score.
# sort_index() restores the original file order of the selected rows.
best_row_labels = pos_scores.groupby("classifier")["f1_macro"].idxmax()
result = pos_scores.loc[best_row_labels].sort_index().copy()
results_df = result

# Round every metric column to four decimal places for presentation.
metric_columns = ["f1_macro", "recall_macro", "precision_macro", "accuracy", "auc_score"]
results_df[metric_columns] = results_df[metric_columns].round(decimals=4)

# Positional relabelling: the list must follow the CSV's column order.
results_df.columns = ["F1-score", 
                    "Revocação", 
                    "Precisão", 
                    "Acurácia", 
                    "AUC", 
                    "Processamento do texto ótimo",
                    "Classificador"]

# Show the descriptive columns first, metrics after (reverse of the CSV order).
results_df = results_df[results_df.columns[::-1]]
results_df

Unnamed: 0,Classificador,Processamento do texto ótimo,AUC,Acurácia,Precisão,Revocação,F1-score
3,MultinomialNB(),"TfidfVectorizer(ngram_range=(1, 3))",0.8002,0.716,0.7241,0.7155,0.7131
7,"LogisticRegression(penalty='l1', random_state=...","TfidfVectorizer(ngram_range=(1, 3))",0.8444,0.7696,0.7707,0.7697,0.7693
11,"LogisticRegression(random_state=42, solver='li...","TfidfVectorizer(ngram_range=(1, 3))",0.8366,0.7595,0.7608,0.7595,0.7591
15,"LinearSVC(max_iter=10000, random_state=42)","TfidfVectorizer(ngram_range=(1, 3))",0.7962,0.7314,0.7323,0.7313,0.7309
18,SVC(random_state=42),TfidfVectorizer(),0.8113,0.7432,0.7452,0.7431,0.7424
21,DecisionTreeClassifier(random_state=42),"CountVectorizer(ngram_range=(1, 3))",0.7141,0.7112,0.7118,0.7111,0.7108
27,RandomForestClassifier(random_state=42),"TfidfVectorizer(ngram_range=(1, 3))",0.8385,0.761,0.7654,0.7613,0.7599
29,AdaBoostClassifier(random_state=42),"CountVectorizer(ngram_range=(1, 3))",0.8307,0.7588,0.7609,0.7592,0.7583
33,GradientBoostingClassifier(random_state=42),"CountVectorizer(ngram_range=(1, 3))",0.8498,0.7729,0.7759,0.7732,0.7723
39,"StackingClassifier(estimators=[('svm',\n ...","TfidfVectorizer(ngram_range=(1, 3))",0.8399,0.767,0.768,0.767,0.7667


In [87]:
# Emit the POS results table as LaTeX for the report.
# escape=True is explicit because the cell values contain underscores
# (e.g. random_state) that must be written as "\_" to be valid LaTeX, and the
# default for this argument differs between pandas versions.
# float_format pins the four-decimal rendering regardless of pandas defaults.
# NOTE(review): the long classifier descriptions appear cut off with "..." in
# the emitted table — consider mapping them to short display names first.
print(results_df.to_latex(index=False, escape=True, float_format="%.4f"))

\begin{tabular}{llrrrrr}
\toprule
                                     Classificador &        Processamento do texto ótimo &    AUC &  Acurácia &  Precisão &  Revocação &  F1-score \\
\midrule
                                   MultinomialNB() & TfidfVectorizer(ngram\_range=(1, 3)) & 0.8002 &    0.7160 &    0.7241 &     0.7155 &    0.7131 \\
LogisticRegression(penalty='l1', random\_state=4... & TfidfVectorizer(ngram\_range=(1, 3)) & 0.8444 &    0.7696 &    0.7707 &     0.7697 &    0.7693 \\
LogisticRegression(random\_state=42, solver='lib... & TfidfVectorizer(ngram\_range=(1, 3)) & 0.8366 &    0.7595 &    0.7608 &     0.7595 &    0.7591 \\
        LinearSVC(max\_iter=10000, random\_state=42) & TfidfVectorizer(ngram\_range=(1, 3)) & 0.7962 &    0.7314 &    0.7323 &     0.7313 &    0.7309 \\
                              SVC(random\_state=42) &                   TfidfVectorizer() & 0.8113 &    0.7432 &    0.7452 &     0.7431 &    0.7424 \\
           DecisionTreeClassifier(random\_state=

  print(results_df.to_latex(index=False))


In [90]:
# Summary table for the word2vec experiments: best macro-F1 row per classifier.
w2v_scores = pd.read_csv("../../results/word2vec_results.csv").drop(columns="Unnamed: 0")

# Select, for each classifier, the row holding that classifier's own maximum.
# Filtering with isin() over the set of per-group maxima would also keep rows of
# one classifier whose score merely equals another classifier's best score.
# sort_index() restores the original file order of the selected rows.
best_row_labels = w2v_scores.groupby("classifier")["f1_macro"].idxmax()
result = w2v_scores.loc[best_row_labels].sort_index().copy()
results_df = result

# Round every metric column to four decimal places for presentation.
metric_columns = ["f1_macro", "recall_macro", "precision_macro", "accuracy", "auc_score"]
results_df[metric_columns] = results_df[metric_columns].round(decimals=4)

# Positional relabelling: the list must follow the CSV's column order.
results_df.columns = ["F1-score", 
                    "Revocação", 
                    "Precisão", 
                    "Acurácia", 
                    "AUC", 
                    "Classificador"]

# Show the classifier column first, metrics after (reverse of the CSV order).
results_df = results_df[results_df.columns[::-1]]
results_df

Unnamed: 0,Classificador,AUC,Acurácia,Precisão,Revocação,F1-score
0,MultinomialNB(),0.7109,0.6509,0.6905,0.6507,0.6305
1,"LogisticRegression(penalty='l1', random_state=...",0.8256,0.7527,0.7596,0.7525,0.7507
2,"LogisticRegression(random_state=42, solver='li...",0.8226,0.748,0.7569,0.7478,0.7455
3,"LinearSVC(max_iter=10000, random_state=42)",0.8433,0.7698,0.7759,0.7695,0.7682
4,SVC(random_state=42),0.8363,0.7458,0.7597,0.7457,0.7416
5,DecisionTreeClassifier(random_state=42),0.6963,0.6965,0.6969,0.6964,0.6961
6,RandomForestClassifier(random_state=42),0.8488,0.7742,0.776,0.7743,0.7738
7,AdaBoostClassifier(random_state=42),0.8261,0.7556,0.7565,0.7555,0.7552
8,GradientBoostingClassifier(random_state=42),0.849,0.7732,0.7744,0.7732,0.7729
9,"StackingClassifier(estimators=[('svm',\n ...",0.8425,0.7718,0.7758,0.7716,0.7707


In [91]:
# Emit the word2vec results table as LaTeX for the report.
# escape=True is explicit because the cell values contain underscores
# (e.g. random_state) that must be written as "\_" to be valid LaTeX, and the
# default for this argument differs between pandas versions.
# float_format pins the four-decimal rendering regardless of pandas defaults.
print(results_df.to_latex(index=False, escape=True, float_format="%.4f"))

\begin{tabular}{lrrrrr}
\toprule
                                     Classificador &    AUC &  Acurácia &  Precisão &  Revocação &  F1-score \\
\midrule
                                   MultinomialNB() & 0.7109 &    0.6509 &    0.6905 &     0.6507 &    0.6305 \\
LogisticRegression(penalty='l1', random\_state=4... & 0.8256 &    0.7527 &    0.7596 &     0.7525 &    0.7507 \\
LogisticRegression(random\_state=42, solver='lib... & 0.8226 &    0.7480 &    0.7569 &     0.7478 &    0.7455 \\
        LinearSVC(max\_iter=10000, random\_state=42) & 0.8433 &    0.7698 &    0.7759 &     0.7695 &    0.7682 \\
                              SVC(random\_state=42) & 0.8363 &    0.7458 &    0.7597 &     0.7457 &    0.7416 \\
           DecisionTreeClassifier(random\_state=42) & 0.6963 &    0.6965 &    0.6969 &     0.6964 &    0.6961 \\
           RandomForestClassifier(random\_state=42) & 0.8488 &    0.7742 &    0.7760 &     0.7743 &    0.7738 \\
               AdaBoostClassifier(random\_state=42) & 0

  print(results_df.to_latex(index=False))
