In [1]:
# import libraries
import os, sys
import string
import numpy as np, pandas as pd
try:
    import nltk
except:
    !conda install --yes --prefix {sys.prefix} nltk
    import nltk

import matplotlib.pyplot as plt

try:
    import seaborn as sns
except:
    !conda install --yes --prefix {sys.prefix} seaborn
    import seaborn as sns

# import modules
from utils.preprocessing import get_dataframe, label_encoding, feature_engineering, normalize_features

# magics
%matplotlib inline

In [2]:
#----------------------- CONFIG -----------------------#
# visualization: when True, the violin/swarm plot of avg_word_len per author is drawn.
visualization = False
# norm: when True, normalize_features is applied to both the train and the test frame.
norm = False
#------------------------------------------------------#

In [3]:
# from sklearn.tree import DecisionTreeClassifier

In [4]:
# Optional manual feature selection: when this list is non-empty it replaces
# the automatically derived feature columns (Xcols) used to fit the model and
# to predict. Leave it empty to use every engineered feature.
# NOTE(review): "comma_per_word" and "point_per_word" do not match any column
# shown in the outputs of this notebook (the punctuation columns are named
# ",", ".", ...) — confirm the names before uncommenting them.
forced = [
#     "comma_per_word",
#     "point_per_word",
#     "ellipsis_per_word",
    ]

### 1) File Exploration

In [5]:
# list the contents of the current working directory
os.listdir()

['.git',
 '.gitignore',
 '.ipynb_checkpoints',
 'BOA_project_description.pdf',
 'corpora',
 'corpora_preprocessing.ipynb',
 'LICENSE',
 'README.md',
 'requirements.txt',
 'utils']

In [6]:
# One sub-folder of corpora/train per author.
# os.listdir returns entries in arbitrary order and also returns plain files
# and hidden folders (e.g. .ipynb_checkpoints), so keep only visible
# directories and sort them to make the author order reproducible.
train_dir = "corpora/train"
authors = sorted(
    entry
    for entry in os.listdir(train_dir)
    if not entry.startswith(".") and os.path.isdir(os.path.join(train_dir, entry))
)

In [7]:
# Print the relative path of every .txt excerpt found in each author folder.
for author in authors:
    for excerpt in os.listdir(f"corpora/train/{author}"):
        if not excerpt.endswith(".txt"):
            continue
        print(f"corpora/train/{author}/{excerpt}")

corpora/train/AlmadaNegreiros/pg22615.txt
corpora/train/AlmadaNegreiros/pg22730.txt
corpora/train/AlmadaNegreiros/pg22801.txt
corpora/train/AlmadaNegreiros/pg22802.txt
corpora/train/AlmadaNegreiros/pg22969.txt
corpora/train/AlmadaNegreiros/pg23133.txt
corpora/train/AlmadaNegreiros/pg23620.txt
corpora/train/AlmadaNegreiros/pg23879.txt
corpora/train/AlmadaNegreiros/pg23961.txt
corpora/train/CamiloCasteloBranco/24691-0.txt
corpora/train/CamiloCasteloBranco/34756-0.txt
corpora/train/CamiloCasteloBranco/pg16425.txt
corpora/train/CamiloCasteloBranco/pg17927.txt
corpora/train/CamiloCasteloBranco/pg19375.txt
corpora/train/CamiloCasteloBranco/pg21406.txt
corpora/train/CamiloCasteloBranco/pg23203.txt
corpora/train/CamiloCasteloBranco/pg23345.txt
corpora/train/CamiloCasteloBranco/pg23346.txt
corpora/train/CamiloCasteloBranco/pg24339.txt
corpora/train/CamiloCasteloBranco/pg25844.txt
corpora/train/CamiloCasteloBranco/pg26017.txt
corpora/train/CamiloCasteloBranco/pg26103.txt
corpora/train/CamiloCast

___
### 2) Train

In [8]:
# Build the training frame from the author folders under corpora/train/.
# The result has a "text" and an "author" column (see the preview below).
# The keyword options are forwarded to utils.preprocessing.get_dataframe;
# presumably each file becomes one row with its lines joined by a single
# space — TODO confirm against get_dataframe.
train = get_dataframe(path_to_train="corpora/train/", author_list=authors,
                      preserve_blank_lines=False, join_every_line=True,
                      separator=" ")

# preview the first two rows
train.head(2)

Unnamed: 0,text,author
0,Title: A Scena do Odio Author: José de Almada ...,AlmadaNegreiros
1,Title: O Jardim da Pierrette Author: José de A...,AlmadaNegreiros


In [9]:
# add new features
# feature_engineering (utils.preprocessing) returns the frame with extra
# numeric columns computed from the "text" column; the preview below shows them.
# NOTE(review): `train` is rebound to the result, so re-running this cell feeds
# the already-engineered frame back in — confirm feature_engineering tolerates that.
train = feature_engineering(train, "text")

train.head(2)

Unnamed: 0,text,author,ellipsis_per_word,avg_word_len,punct_per_word,:,",",.,-,!,?,;
0,Title: A Scena do Odio Author: José de Almada ...,AlmadaNegreiros,0.003939,5.082161,0.266179,0.003939,0.066404,0.016882,0.077659,0.099606,0.001688,0.0
1,Title: O Jardim da Pierrette Author: José de A...,AlmadaNegreiros,0.0,5.009434,0.132075,0.015723,0.022013,0.059748,0.031447,0.003145,0.0,0.0


In [10]:
# Optional figure (enabled by the `visualization` flag): distribution of the
# average word length per author, with every individual text overlaid in red.
if visualization:
    fig, ax = plt.subplots(figsize=(14, 8))
    sns.violinplot(x="author", y="avg_word_len", data=train, palette="GnBu_d", ax=ax)
    sns.swarmplot(x="author", y="avg_word_len", data=train, color="r", alpha=1, ax=ax)
    ax.set_ylabel("\"avg_word_len\"", fontsize=14)
    ax.set_xlabel("author", fontsize=14)
    ax.set_title("Average word characters per author", fontsize=18)
    plt.show()

In [11]:
# Collect the numeric columns whose observed range is not exactly [0, 1];
# these are the candidates for normalisation.
# Columns are picked by dtype rather than by inspecting the first cell
# (`train[col][0]`), which is a label lookup that fails on an empty frame or an
# index without label 0 and only recognises float64/int64. min()/max() are
# read directly instead of computing describe() twice per column.
lst_normalize_features = [
    col
    for col in train.select_dtypes(include="number").columns
    if train[col].max() != 1 or train[col].min() != 0
]

lst_normalize_features

['ellipsis_per_word',
 'avg_word_len',
 'punct_per_word',
 ':',
 ',',
 '.',
 '-',
 '!',
 '?',
 ';']

In [12]:
# Per-author mean of every numeric feature.
# numeric_only=True excludes the free-text column explicitly: pandas >= 2.0 no
# longer drops non-numeric columns silently and raises TypeError without it.
train.groupby(["author"]).mean(numeric_only=True)

Unnamed: 0_level_0,ellipsis_per_word,avg_word_len,punct_per_word,:,",",.,-,!,?,;
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AlmadaNegreiros,0.005191,4.882636,0.163368,0.004889,0.041741,0.051775,0.03874,0.022652,0.002303,0.001268
CamiloCasteloBranco,0.008852,5.034852,0.251378,0.004408,0.091458,0.078101,0.058093,0.007303,0.00548,0.006535
EcaDeQueiros,0.008596,5.089281,0.245337,0.006869,0.120008,0.060798,0.037662,0.011511,0.00381,0.00468
JoseRodriguesSantos,0.003556,4.98023,0.19304,0.000977,0.071044,0.075043,0.023359,0.005536,0.015201,0.00188
JoseSaramago,0.001568,4.758429,0.18935,0.00161,0.131467,0.030636,0.022544,0.001501,0.001473,0.00012
LuisaMarquesSilva,0.001181,4.657484,0.174198,0.009503,0.071131,0.072681,0.009004,0.004903,0.005796,0.001179


In [13]:
# Optional step, controlled by the `norm` config flag: pass the columns
# collected in lst_normalize_features to normalize_features (utils.preprocessing)
# and keep the returned frame as the new `train`.
if norm:
    train = normalize_features(train, lst_normalize_features)

In [14]:
# Per-author feature means again, after the optional normalisation step
# (identical to the earlier table while `norm` is False).
# numeric_only=True excludes the free-text column explicitly: pandas >= 2.0 no
# longer drops non-numeric columns silently and raises TypeError without it.
train.groupby(["author"]).mean(numeric_only=True)

Unnamed: 0_level_0,ellipsis_per_word,avg_word_len,punct_per_word,:,",",.,-,!,?,;
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AlmadaNegreiros,0.005191,4.882636,0.163368,0.004889,0.041741,0.051775,0.03874,0.022652,0.002303,0.001268
CamiloCasteloBranco,0.008852,5.034852,0.251378,0.004408,0.091458,0.078101,0.058093,0.007303,0.00548,0.006535
EcaDeQueiros,0.008596,5.089281,0.245337,0.006869,0.120008,0.060798,0.037662,0.011511,0.00381,0.00468
JoseRodriguesSantos,0.003556,4.98023,0.19304,0.000977,0.071044,0.075043,0.023359,0.005536,0.015201,0.00188
JoseSaramago,0.001568,4.758429,0.18935,0.00161,0.131467,0.030636,0.022544,0.001501,0.001473,0.00012
LuisaMarquesSilva,0.001181,4.657484,0.174198,0.009503,0.071131,0.072681,0.009004,0.004903,0.005796,0.001179


In [15]:
# NOTE(review): `halt` is not defined in this notebook, so this cell raises
# NameError (see its output) and stops a "Restart & Run All" at this point.
# Remove the cell once the stop is no longer wanted, so the notebook runs
# top to bottom on a fresh kernel.
halt

NameError: name 'halt' is not defined

In [16]:
# encode the target column ("author") so it can be fed to the model;
# `le` is kept to decode the predictions back to author names later on
train, le = label_encoding(train, "author")

In [17]:
# train = clean_text(train, "text")
# train.head()

In [18]:
# import string
# from collections import Counter

# count = lambda l1,l2: sum([1 for x in l1 if x in l2])

# selected_punct = "!,.-:;?"
# # train.text.apply(lambda s: {k:v for k, v in Counter(s).items() if k in string.punctuation}).apply(pd.Series).fillna(0).astype("int")
# train.text.apply(lambda s: {k:v for k, v in Counter(s).items() if k in selected_punct}).apply(pd.Series).fillna(0).astype("int").divide(np.random.randint(1, 9999, 63), axis=0)

# # train.text.apply(lambda s: count(s, selected_punct))

In [19]:
# # TODO cols:  
# train["punctuation"]

In [20]:
# reorder columns: raw text first, sorted feature columns next, target last
Xcols = sorted(set(train.columns) - {"author", "text"})

cols = ["text", *Xcols, "author"]

train = train[cols]
train.head()

Unnamed: 0,text,!,",",-,.,:,;,?,avg_word_len,ellipsis_per_word,punct_per_word,author
0,Title: A Scena do Odio Author: José de Almada ...,0.099606,0.066404,0.077659,0.016882,0.003939,0.0,0.001688,5.082161,0.003939,0.266179,0
1,Title: O Jardim da Pierrette Author: José de A...,0.003145,0.022013,0.031447,0.059748,0.015723,0.0,0.0,5.009434,0.0,0.132075,0
2,Title: A Invenção do Dia Claro Author: José de...,0.029977,0.043723,0.035939,0.059954,0.005962,0.002981,0.002319,4.501491,0.001656,0.180855,0
3,Title: Litoral A Amadeo de Souza Cardozo Autho...,0.006107,0.001527,0.038168,0.001527,0.007634,0.0,0.001527,5.006107,0.0,0.056489,0
4,EXPOSIÇÃO +amadeo de souza cardoso+ LIGA NAVAL...,0.003454,0.050086,0.032815,0.056995,0.001727,0.003454,0.003454,4.905009,0.0,0.151986,0


In [21]:
# A non-empty `forced` list overrides the automatically derived feature columns.
if forced:
    Xcols = forced

# feature matrix and encoded target used to fit the model
X, y = train[Xcols], train["author"]

___
### 3) Test

In [22]:
# Load the test excerpts with the same helper and options used for the training
# set; the two sub-folders of corpora/test/ are passed in place of author names.
test = get_dataframe("corpora/test/", author_list=["1000Palavras", "500Palavras"],
                     preserve_blank_lines=False, join_every_line=True,
                     separator=" ")

# The "author" column therefore holds the folder name: reduce it to the integer
# 1000 or 500 (presumably the excerpt's word count) ...
test["author"] = test["author"].str.replace("1000Palavras", "1000").str.replace("500Palavras", "500").astype(int)
# ... and rename that column to "word_count".
# NOTE(review): this assignment assumes the frame has exactly two columns
# (text first, folder second) — confirm against get_dataframe.
test.columns = [list(test.columns)[0], "word_count"]
test.head()

Unnamed: 0,text,word_count
0,"Depois, pouco a pouco, a tranquilidade regress...",1000
1,Justamente como se eu tivesse tido a ideia de ...,1000
2,"Quase um mês depois, a época de exames aproxim...",1000
3,"Agora, porém, era sem fervor, arrastadamente, ...",1000
4,"O cahos de cima a descer, a descer com a morta...",1000


In [23]:
# add new features, then discard the helper "word_count" column,
# which is not one of the model inputs
test = feature_engineering(test, "text").drop("word_count", axis=1)
test[Xcols].head()

Unnamed: 0,!,",",-,.,:,;,?,avg_word_len,ellipsis_per_word,punct_per_word
0,0.0,0.169028,0.025304,0.017206,0.0,0.0,0.0,4.834008,0.0,0.211538
1,0.002134,0.028815,0.03095,0.036286,0.001067,0.001067,0.006403,4.820704,0.0,0.106724
2,0.001908,0.04771,0.025763,0.085878,0.003817,0.001908,0.009542,4.732824,0.014313,0.176527
3,0.012085,0.144008,0.035247,0.040282,0.005035,0.009063,0.003021,5.308157,0.005035,0.248741
4,0.0,0.114201,0.023833,0.038729,0.003972,0.006951,0.000993,5.426018,0.0,0.188679


In [24]:
# Optional step, controlled by the `norm` config flag: apply normalize_features
# to the test frame using the column list that was derived from the training frame.
# NOTE(review): the test frame is normalised in a separate call from the
# training frame; if normalize_features derives its scaling from the frame it
# receives, train and test end up on different scales — confirm in utils.preprocessing.
if norm:
    test = normalize_features(test, lst_normalize_features)

___

### 4) Model predictions

In [25]:
import numpy as np  # NOTE(review): numpy is already imported in the first cell
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier with its default settings on the
# training features X and the encoded author labels y.
clf = GaussianNB()
clf.fit(X, y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [26]:
# Predict the encoded author of every test row, then decode the result with
# `le` (returned by label_encoding) before storing it in "author_pred".
encoded_pred = clf.predict(test[Xcols])
test["author_pred"] = le.inverse_transform(encoded_pred)

In [27]:
# display the test frame, which now includes the "author_pred" column
test

Unnamed: 0,text,ellipsis_per_word,avg_word_len,punct_per_word,",",-,.,?,!,;,:,author_pred
0,"Depois, pouco a pouco, a tranquilidade regress...",0.0,4.834008,0.211538,0.169028,0.025304,0.017206,0.0,0.0,0.0,0.0,JoseSaramago
1,Justamente como se eu tivesse tido a ideia de ...,0.0,4.820704,0.106724,0.028815,0.03095,0.036286,0.006403,0.002134,0.001067,0.001067,AlmadaNegreiros
2,"Quase um mês depois, a época de exames aproxim...",0.014313,4.732824,0.176527,0.04771,0.025763,0.085878,0.009542,0.001908,0.001908,0.003817,AlmadaNegreiros
3,"Agora, porém, era sem fervor, arrastadamente, ...",0.005035,5.308157,0.248741,0.144008,0.035247,0.040282,0.003021,0.012085,0.009063,0.005035,EcaDeQueiros
4,"O cahos de cima a descer, a descer com a morta...",0.0,5.426018,0.188679,0.114201,0.023833,0.038729,0.000993,0.0,0.006951,0.003972,CamiloCasteloBranco
5,"""O Senhor ensina pela pena o que o homem não s...",0.002035,4.80061,0.157681,0.066124,0.017294,0.059003,0.010173,0.001017,0.002035,0.002035,LuisaMarquesSilva
6,"Depois, pouco a pouco, a tranquilidade regress...",0.0,4.721774,0.189516,0.159274,0.014113,0.016129,0.0,0.0,0.0,0.0,JoseSaramago
7,Justamente como se eu tivesse tido a ideia de ...,0.0,4.904762,0.111801,0.041408,0.024845,0.028986,0.012422,0.00207,0.00207,0.0,LuisaMarquesSilva
8,"Quase um mês depois, a época de exames aproxim...",0.013807,4.783037,0.179487,0.04142,0.037475,0.078895,0.011834,0.003945,0.001972,0.003945,AlmadaNegreiros
9,"Agora, porém, era sem fervor, arrastadamente, ...",0.00409,5.364008,0.231084,0.139059,0.034765,0.02863,0.0,0.00409,0.018405,0.006135,EcaDeQueiros


___
### 5) Hard evaluation

The model is evaluated directly on the test set, which is not ideal, because the training set is too small to allow a train/validation split.

In [28]:
# hand-made labels: the expected author of each test row, in row order
labels = ["JoseSaramago", "AlmadaNegreiros", "LuisaMarquesSilva",
          "EcaDeQueiros", "CamiloCasteloBranco", "JoseRodriguesSantos",
          "JoseSaramago", "LuisaMarquesSilva", "LuisaMarquesSilva",
          "EcaDeQueiros", "CamiloCasteloBranco", "JoseRodriguesSantos"]

In [29]:
# Compare every prediction with its hand-made label, row by row.
# Fail fast when the label list and the test frame are out of step: a shorter
# list would raise IndexError halfway through the loop, and a longer one would
# be silently ignored, producing a misleading accuracy.
assert len(labels) == len(test), (
    f"expected {len(test)} labels, got {len(labels)}"
)

# score: number of correct predictions; certos: row positions of the correct ones
score, certos = 0, []

for index, predicted in enumerate(test["author_pred"]):
    if predicted == labels[index]:
        print(f"#{index}:\tcerto ({predicted})")
        score += 1
        certos.append(index)
    else:
        print(f"#{index}:\terrado")

# accuracy as a percentage, rounded to two decimals
accuracy = round(score/len(test)*100, 2)
print(f"\naccuracy = {accuracy} %")

#0:	certo (JoseSaramago)
#1:	certo (AlmadaNegreiros)
#2:	errado
#3:	certo (EcaDeQueiros)
#4:	certo (CamiloCasteloBranco)
#5:	errado
#6:	certo (JoseSaramago)
#7:	certo (LuisaMarquesSilva)
#8:	errado
#9:	certo (EcaDeQueiros)
#10:	certo (CamiloCasteloBranco)
#11:	errado

accuracy = 66.67 %


In [30]:
# One-shot summary of the run: forced feature list, rows predicted correctly, accuracy.
certos_str = ", ".join(map(str, certos))
print(f"\tforced: \t{forced}\n\tcertos: \t{certos_str}\n\taccuracy:\t{accuracy} %")

	forced: 	[]
	certos: 	0, 1, 3, 4, 6, 7, 9, 10
	accuracy:	66.67 %


In [31]:
# names of the columns between the first ("text") and the last ("author_pred")
# column of the test frame, i.e. the engineered features
print(list(test.columns[1:-1]))

['ellipsis_per_word', 'avg_word_len', 'punct_per_word', ',', '-', '.', '?', '!', ';', ':']


___
### **[TODO] features:**
- <s>número de caracteres por linha</s>
- <s>número de palavras por linha</s>
- <s>número de vírgulas por frase</s>
- <s>vírgulas utilizadas por palavra</s>
- <s>número de ellipsis por palavra</s>
- <s>número de caracteres especiais (pontuação) por palavra</s>
- <s>número médio de caracteres por palavra </s>