In [1]:
# import libraries
import os, sys
import string
import pandas as pd
try:
    import nltk
except:
    !conda install --yes --prefix {sys.prefix} nltk
    import nltk

# import modules
from utils.preprocessing import get_dataframe, label_encoding

In [27]:
# from sklearn.tree import DecisionTreeClassifier

In [2]:
forced = ["comma_per_word"]

### 1) File Exploration

In [3]:
# ver conteúdos da pasta
os.listdir()

['.git',
 '.gitignore',
 '.ipynb_checkpoints',
 'BOA_project_description.pdf',
 'corpora',
 'corpora_preprocessing.ipynb',
 'LICENSE',
 'README.md',
 'requirements.txt',
 'utils']

In [4]:
authors = os.listdir("corpora/train")
authors

['AlmadaNegreiros',
 'CamiloCasteloBranco',
 'EcaDeQueiros',
 'JoseRodriguesSantos',
 'JoseSaramago',
 'LuisaMarquesSilva']

In [5]:
for author in authors:
    for excerto in [x for x in os.listdir(f"corpora/train/{author}") if x.endswith(".txt")]:
        print(f"corpora/train/{author}/{excerto}")

corpora/train/AlmadaNegreiros/pg22615.txt
corpora/train/AlmadaNegreiros/pg22730.txt
corpora/train/AlmadaNegreiros/pg22801.txt
corpora/train/AlmadaNegreiros/pg22802.txt
corpora/train/AlmadaNegreiros/pg22969.txt
corpora/train/AlmadaNegreiros/pg23133.txt
corpora/train/AlmadaNegreiros/pg23620.txt
corpora/train/AlmadaNegreiros/pg23879.txt
corpora/train/AlmadaNegreiros/pg23961.txt
corpora/train/CamiloCasteloBranco/24691-0.txt
corpora/train/CamiloCasteloBranco/34756-0.txt
corpora/train/CamiloCasteloBranco/pg16425.txt
corpora/train/CamiloCasteloBranco/pg17927.txt
corpora/train/CamiloCasteloBranco/pg19375.txt
corpora/train/CamiloCasteloBranco/pg21406.txt
corpora/train/CamiloCasteloBranco/pg23203.txt
corpora/train/CamiloCasteloBranco/pg23345.txt
corpora/train/CamiloCasteloBranco/pg23346.txt
corpora/train/CamiloCasteloBranco/pg24339.txt
corpora/train/CamiloCasteloBranco/pg25844.txt
corpora/train/CamiloCasteloBranco/pg26017.txt
corpora/train/CamiloCasteloBranco/pg26103.txt
corpora/train/CamiloCast

___
### 2) Train

In [6]:
train = get_dataframe(path_to_train="corpora/train/", author_list=authors,
                      preserve_blank_lines=False, join_every_line=True,
                      separator=" ")

In [7]:
# add new features
train['comma_count'] = train["text"].str.count(",")
train["word_count"] = train['text'].str.split().str.len()
train['char_count'] = train["text"].str.len()
train["comma_per_word"] = train['comma_count']/train["word_count"]

train.head()

Unnamed: 0,text,author,comma_count,word_count,char_count,comma_per_word
0,Title: A Scena do Odio Author: José de Almada ...,AlmadaNegreiros,118,1777,10807,0.066404
1,Title: O Jardim da Pierrette Author: José de A...,AlmadaNegreiros,7,318,1910,0.022013
2,Title: A Invenção do Dia Claro Author: José de...,AlmadaNegreiros,264,6038,33745,0.043723
3,Title: Litoral A Amadeo de Souza Cardozo Autho...,AlmadaNegreiros,1,655,4166,0.001527
4,EXPOSIÇÃO +amadeo de souza cardoso+ LIGA NAVAL...,AlmadaNegreiros,29,579,3418,0.050086


In [8]:
# codificação da coluna y para fornecer ao futuro modelo
train, le = label_encoding(train, "author")

In [9]:
def remove_punctuation(text):
    """
        Greedy removal of all the punctuation from a list of text lines.
        However, it also removes the email and website punctuation, making
        them hard to recognise.
        Returns a list of text lines without punctuation.
    """
    no_punct_text = []
    for line in text:
        no_punct = "".join([char for char in line if char not in string.punctuation])
        no_punct_text.append(no_punct)
    return "".join(no_punct_text)


# cada vez que uma nova função for criada, introduzi-la em "clean_text()"
def clean_text(df, text_col=str):
    """
        Compiles all the preprocessing functions inside a single function.
    """
    df[text_col] = df[text_col].apply(remove_punctuation)
    return df

In [10]:
train = clean_text(train, "text")
train.head()

Unnamed: 0,text,author,comma_count,word_count,char_count,comma_per_word
0,Title A Scena do Odio Author José de Almada Ne...,0,118,1777,10807,0.066404
1,Title O Jardim da Pierrette Author José de Alm...,0,7,318,1910,0.022013
2,Title A Invenção do Dia Claro Author José de A...,0,264,6038,33745,0.043723
3,Title Litoral A Amadeo de Souza Cardozo Author...,0,1,655,4166,0.001527
4,EXPOSIÇÃO amadeo de souza cardoso LIGA NAVAL D...,0,29,579,3418,0.050086


In [11]:
# # TODO cols:  
# train["punctuation"] = 0
# train["reticências"] = 0

In [12]:
# reorder columns
Xcols = list(set(train.columns) - set(["author"]) - set(["text"]))
Xcols.sort()

cols = ["text"]
cols.extend(Xcols)
cols.append("author")

train = train[cols]
train.head()

Unnamed: 0,text,char_count,comma_count,comma_per_word,word_count,author
0,Title A Scena do Odio Author José de Almada Ne...,10807,118,0.066404,1777,0
1,Title O Jardim da Pierrette Author José de Alm...,1910,7,0.022013,318,0
2,Title A Invenção do Dia Claro Author José de A...,33745,264,0.043723,6038,0
3,Title Litoral A Amadeo de Souza Cardozo Author...,4166,1,0.001527,655,0
4,EXPOSIÇÃO amadeo de souza cardoso LIGA NAVAL D...,3418,29,0.050086,579,0


In [13]:
if len(forced) > 0:
    Xcols = forced

In [14]:
X = train[Xcols]
y = train["author"]

___
### 3) Test

In [16]:
test = get_dataframe("corpora/test/", author_list=["1000Palavras", "500Palavras"],
                     preserve_blank_lines=False, join_every_line=True,
                     separator=" ")

test["author"] = test["author"].str.replace("1000Palavras", "1000").str.replace("500Palavras", "500").astype(int)
test.columns = [list(test.columns)[0], "word_count"]
test

Unnamed: 0,text,word_count
0,"Depois, pouco a pouco, a tranquilidade regress...",1000
1,Justamente como se eu tivesse tido a ideia de ...,1000
2,"Quase um mês depois, a época de exames aproxim...",1000
3,"Agora, porém, era sem fervor, arrastadamente, ...",1000
4,"O cahos de cima a descer, a descer com a morta...",1000
5,"""O Senhor ensina pela pena o que o homem não s...",1000
6,"Depois, pouco a pouco, a tranquilidade regress...",500
7,Justamente como se eu tivesse tido a ideia de ...,500
8,"Quase um mês depois, a época de exames aproxim...",500
9,"Agora, porém, era sem fervor, arrastadamente, ...",500


In [20]:
# add new features
test['comma_count'] = test["text"].str.count(",")
test["word_count"] = test['text'].str.split().str.len()
test['char_count'] = test["text"].str.len()
test["comma_per_word"] = test['comma_count'] / test["word_count"]

test[Xcols].head()

Unnamed: 0,comma_per_word
0,0.169028
1,0.028815
2,0.04771
3,0.144008
4,0.114201


___

### 4) Model predictions

In [21]:
import numpy as np
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
clf.fit(X, y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [22]:
test["author_pred"] = clf.predict(test[Xcols])
test["author_pred"] = le.inverse_transform(test["author_pred"])

In [26]:
test.head()

Unnamed: 0,text,word_count,comma_count,char_count,comma_per_word,author_pred
0,"Depois, pouco a pouco, a tranquilidade regress...",988,167,5763,0.169028,JoseSaramago
1,Justamente como se eu tivesse tido a ideia de ...,937,27,5453,0.028815,AlmadaNegreiros
2,"Quase um mês depois, a época de exames aproxim...",1048,50,6008,0.04771,AlmadaNegreiros
3,"Agora, porém, era sem fervor, arrastadamente, ...",993,143,6263,0.144008,JoseSaramago
4,"O cahos de cima a descer, a descer com a morta...",1007,115,6470,0.114201,EcaDeQueiros


___
### 5) Evaluate

In [24]:
# labels feitas à mão
# faltam 3 excertos
labels = ["JoseSaramago", "AlmadaNegreiros", "???", "EcaDeQueiros", "CamiloCasteloBranco",
          "JoseRodriguesSantos", "JoseSaramago", "???", "???", "EcaDeQueiros", "CamiloCasteloBranco",
          "JoseRodriguesSantos"]

In [25]:
score = 0

for index, i in enumerate(test["author_pred"]):
    if i == labels[index]:
        print(f"#{index}:\tcerto")
        score += 1
    else:
        print(f"#{index}:\terrado")

accuracy = round(score/len(test)*100, 2)
print(f"\naccuracy = {accuracy} %")

#0:	certo
#1:	certo
#2:	errado
#3:	errado
#4:	errado
#5:	certo
#6:	certo
#7:	errado
#8:	errado
#9:	errado
#10:	errado
#11:	errado

accuracy = 33.33 %


___
### **[TODO] features:**
- <s>número de caracteres por linha</s>
- <s>número de palavras por linha</s>
- <s>número de vírgulas por frase</s>
- número de caracteres especiais (pontuação) por linha
- número de reticências por linha
