In [1]:
import nltk
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/ronn/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ronn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
TRAIN_PATH = './TrainingSet/text-authorship-training.csv'
TEST_PATH = './TestSet/text-authorship-test.csv'

In [7]:
train_data = pd.read_csv(TRAIN_PATH)
train_data.head()

Unnamed: 0,id,text,author
0,id15709,The occupation is often full of interest and h...,EAP
1,id18229,"Tall, thin, and strait, her face still handsom...",MWS
2,id10092,"What men attempt to embody in the word ""though...",EAP
3,id05976,They will scarcely be so weak as not to 'reali...,EAP
4,id19298,"There dwelt in that ghastly and flexible face,...",HPL


In [8]:
test_data = pd.read_csv(TEST_PATH)
test_data.head()

Unnamed: 0,id,text
0,id26305,"This process, however, afforded me no means of..."
1,id17569,It never once occurred to me that the fumbling...
2,id11008,"In his left hand was a gold snuff box, from wh..."
3,id27763,How lovely is spring As we looked from Windsor...
4,id12958,"Finding nothing else, not even gold, the Super..."


In [9]:
lemmatiser = WordNetLemmatizer()

# Defining a module for Text Processing
def text_process(tex):
    # 1. Removal of Punctuation Marks 
    nopunct=[char for char in tex if char not in string.punctuation]
    nopunct=''.join(nopunct)
    # 2. Lemmatisation 
    a=''
    i=0
    for i in range(len(nopunct.split())):
        b=lemmatiser.lemmatize(nopunct.split()[i], pos="v")
        a=a+b+' '
    # 3. Removal of Stopwords
    return [word for word in a.split() if word.lower() not 
            in stopwords.words('english')]

In [10]:
labelencoder = LabelEncoder()

y = train_data['author']
y = labelencoder.fit_transform(y)

X = train_data['text']

# 80-20 splitting the dataset (80%->Training and 20%->Validation)
X_train, X_test, y_train, y_test = train_test_split(X, y
                                  ,test_size=0.2, random_state=1234)

In [11]:
# defining the bag-of-words transformer on the text-processed corpus # i.e., text_process() declared in II is executed...
bow_transformer=CountVectorizer(analyzer=text_process).fit(X_train)

In [12]:
# transforming into Bag-of-Words and hence textual data to numeric..
text_bow_train=bow_transformer.transform(X_train)#ONLY TRAINING DATA

In [13]:
# transforming into Bag-of-Words and hence textual data to numeric..
text_bow_test=bow_transformer.transform(X_test)#TEST DATA
X_test.head()

1363     "Such were the events that preyed on the heart...
11246    But now the time is come when I may quit life,...
13227    The Italian believes it the voice of a Russian...
3126     The hills rose scarlet and gold to the north o...
1842     Nature, our mother, and our friend, had turned...
Name: text, dtype: object

### Treinando o modelo

In [14]:
# instantiating the model with Multinomial Naive Bayes..
model = MultinomialNB()
# training the model...
model = model.fit(text_bow_train, y_train)

In [15]:
model.score(text_bow_train, y_train)

0.9140818625610214

In [16]:
model.score(text_bow_test, y_test)

0.8260738960648843

In [21]:
# getting the predictions of the Validation Set...
predictions = model.predict(text_bow_test)
# getting the Precision, Recall, F1-Score
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.83      0.82      0.82      1341
           1       0.84      0.81      0.83       909
           2       0.81      0.85      0.83      1079

    accuracy                           0.83      3329
   macro avg       0.83      0.83      0.83      3329
weighted avg       0.83      0.83      0.83      3329



### Testando o modelo e salvando no CSV

In [22]:
for index, row in test_data.iterrows():
    print(row['text'])
    text_bow = bow_transformer.transform(row['text'])
    print(text_bow)
    break
#     pm = process_message(row['message'])
#     spam = sc_tf_idf.classify(pm)
#     if spam:
#         sms_test.loc[index, 'label'] = 'spam'
#     else:
#         sms_test.loc[index, 'label'] = 'ham'

ValueError: Iterable over raw text documents expected, string object received.