# Text Authorship Identification

In [1]:
import nltk
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
# Fetch the NLTK corpora used further down: 'wordnet' for WordNetLemmatizer
# and 'stopwords' for the stop-word filter inside text_process.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/ronn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ronn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Carregando dados de treino e teste

In [3]:
# Locations of the labelled training CSV and the unlabelled test CSV,
# relative to the directory the notebook is run from.
TRAIN_PATH = './TrainingSet/text-authorship-training.csv'
TEST_PATH = './TestSet/text-authorship-test.csv'

In [4]:
# Load the labelled training set and preview its first rows
# (id, text and author columns).
train_data = pd.read_csv(TRAIN_PATH)
train_data.head()

Unnamed: 0,id,text,author
0,id15709,The occupation is often full of interest and h...,EAP
1,id18229,"Tall, thin, and strait, her face still handsom...",MWS
2,id10092,"What men attempt to embody in the word ""though...",EAP
3,id05976,They will scarcely be so weak as not to 'reali...,EAP
4,id19298,"There dwelt in that ghastly and flexible face,...",HPL


In [5]:
# Load the test set, which has id and text but no author column,
# and preview its first rows.
test_data = pd.read_csv(TEST_PATH)
test_data.head()

Unnamed: 0,id,text
0,id26305,"This process, however, afforded me no means of..."
1,id17569,It never once occurred to me that the fumbling...
2,id11008,"In his left hand was a gold snuff box, from wh..."
3,id27763,How lovely is spring As we looked from Windsor...
4,id12958,"Finding nothing else, not even gold, the Super..."


### Função de pré-processamento
Remove a pontuação, lematiza cada palavra como verbo com o WordNetLemmatizer e descarta as stopwords em inglês.

In [6]:
lemmatiser = WordNetLemmatizer()

# Built once at definition time. The original looked the stop-word list up
# again for every single word and scanned it linearly; a frozenset gives
# constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def text_process(text):
    """Tokenise one document for the bag-of-words vectoriser.

    Steps, in order:
      1. delete every character listed in ``string.punctuation``;
      2. split the result on whitespace;
      3. lemmatise each token as a verb (``pos="v"``) with ``lemmatiser``;
      4. drop lemmas whose lower-cased form is an English stop word.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The surviving lemmas in their original order. Casing is left as
        produced by the lemmatiser; only the stop-word comparison is
        case-insensitive.
    """
    # Split once up front; the original re-split the whole string twice per
    # loop iteration, which made the function quadratic in document length.
    tokens = text.translate(_PUNCTUATION_TABLE).split()
    lemmas = (lemmatiser.lemmatize(token, pos="v") for token in tokens)
    return [lemma for lemma in lemmas
            if lemma.lower() not in _ENGLISH_STOPWORDS]

### Separando os dados rotulados em treino (80%) e teste de validação (20%)

In [7]:
# Features are the raw sentences; targets are the author labels encoded as
# integers by the LabelEncoder (kept around to decode predictions later).
labelencoder = LabelEncoder()
X = train_data['text']
y = labelencoder.fit_transform(train_data['author'])

# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234
)

### Transformando em bag-of-words
Pode levar alguns minutos

In [8]:
# Learn the vocabulary from the training split only, tokenising each
# document with text_process rather than the vectoriser's built-in analyser.
bow_transformer = CountVectorizer(analyzer=text_process).fit(X_train)

In [9]:
# Encode the training split as bag-of-words counts over the fitted vocabulary.
text_bow_train = bow_transformer.transform(X_train)

In [10]:
# Encode the held-out split with the same fitted vocabulary.
text_bow_test = bow_transformer.transform(X_test)

### Treinando o modelo

In [11]:
# Multinomial Naive Bayes over the bag-of-words counts; fit() returns the
# estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(text_bow_train, y_train)

In [12]:
# Accuracy on the same data the model was fitted on.
model.score(text_bow_train, y_train)

0.9140818625610214

In [13]:
# Accuracy on the held-out split, which the model did not see during fitting.
model.score(text_bow_test, y_test)

0.8260738960648843

In [14]:
# Per-class precision/recall/F1 on the held-out split. Rows are labelled with
# the integer codes produced by labelencoder, not the author names.
predictions = model.predict(text_bow_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.83      0.82      0.82      1341
           1       0.84      0.81      0.83       909
           2       0.81      0.85      0.83      1079

    accuracy                           0.83      3329
   macro avg       0.83      0.83      0.83      3329
weighted avg       0.83      0.83      0.83      3329



### Gerando as predições para o conjunto de teste e salvando no CSV

In [15]:
# Vectorise only the 'text' column, in a single batch. The original passed
# each whole row to transform(), so the id string was vectorised and
# classified too, and the text's prediction had to be picked by position.
submission_bow = bow_transformer.transform(test_data['text'])

# Decode the integer class codes with the encoder that was fitted on the
# training labels, instead of a hand-written 0/1/2 -> author mapping.
predictions = labelencoder.inverse_transform(
    model.predict(submission_bow)
).tolist()

# One predicted author per test row, by construction.
test_data['author'] = predictions

In [16]:
# Write the test rows together with their predicted 'author' column to
# result.csv (without the DataFrame index), then preview the first rows.
test_data.to_csv('result.csv', index=False)
test_data.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,EAP
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",EAP
