# Importing libraries

In [36]:
import pandas as pd
import numpy as np

import spacy
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import accuracy_score, confusion_matrix

import joblib

# importing the dataset

In [25]:
dataset = pd.read_csv('reviews.txt',sep = '\t', names =['Sentiment','Comments'])
dataset.head()

Unnamed: 0,Sentiment,Comments
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [27]:
X = dataset['Comments']
y = dataset['Sentiment']

In [28]:
# Splitting the dataset before we fit tfidf on X to learn vocabulary
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=105)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5534,)
(1384,)
(5534,)
(1384,)


In [None]:
# loading the english model for spacy 
nlp = spacy.load("en_core_web_lg")

In [29]:
# cleaning the comment column before tokenizing
X_train = X_train.apply(lambda x: re.sub("[^a-zA-Z.,!?']+"," ", x))
X_test = X_test.apply(lambda x: re.sub("[^a-zA-Z.,!?']+"," ", x))

In [30]:
def tokenize(document):
    doc = nlp(document)
    
    return [token.lemma_.strip() for token in doc if (token.is_stop != True) and (token.is_punct != True)]

In [31]:
# Instantiate vectorizer object
tfidf = TfidfVectorizer(ngram_range=(1,2),
#                         max_df=.97,
#                         min_df=3,
                        lowercase= True,
                        tokenizer=tokenize)

# learn training data vocabulary, then use it to create a document-term matrix
X_train_dtm = tfidf.fit_transform(X_train)
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = tfidf.transform(X_test)

In [32]:
# examine the vocabulary and document-term matrix together
dtm = pd.DataFrame(X_train_dtm.todense(), columns=tfidf.get_feature_names())
dtm.head()

Unnamed: 0,Unnamed: 1,awful,brokeback,couple,da,die,favorite,g,gasp,hate,...,young,young adult,yuck,yuck hate,yuh,yuh nasy,zach,zach loud,zen,zen da
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Selecting a model 

In [34]:
clf = naive_bayes.MultinomialNB()

clf.fit(X_train_dtm, y_train)
y_train_pred = clf.predict(X_train_dtm)

In [35]:
# checking the score on the training set
score = accuracy_score(y_train, y_train_pred)
score

0.9967473798337549

In [37]:
confusion_matrix(y_train, y_train_pred)

array([[2370,   18],
       [   0, 3146]])

Checking our model on the test dataset 

In [38]:
y_test_pred = clf.predict(X_test_dtm)

score = accuracy_score(y_test, y_test_pred)
score

0.9855491329479769

In [39]:
confusion_matrix(y_test, y_test_pred)

array([[573,  14],
       [  6, 791]])

# Saving the model 

In [40]:
filename = 'predictor.joblib'
joblib.dump(clf, filename, compress=True)

['predictor.joblib']