In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [29]:
# Load the tab-separated reviews file; later cells read its 'Review' and 'Liked' columns.
df = pd.read_csv('reviews.tsv', sep='\t')

In [30]:
# Preview the first rows to sanity-check how the file was parsed.
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [31]:
# Count rows per 'Liked' value to check the class balance before modelling.
df['Liked'].value_counts()

Liked
1    500
0    500
Name: count, dtype: int64

In [32]:
import nltk
import re

In [33]:
# Make sure NLTK's stopword corpus is available locally before it is read below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mellio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [34]:
from nltk.corpus import stopwords

In [35]:
# Take one raw review to walk through the cleaning steps by hand.
df['Review'][0]

'Wow... Loved this place.'

In [36]:
# Replace every character that is not an ASCII letter with a space, so
# punctuation and digits disappear while word boundaries are preserved.
test = re.sub('[^a-zA-Z]', ' ', df['Review'][0])
test

'Wow    Loved this place '

In [37]:
# Lower-case the text so tokens compare equal regardless of original casing.
test = test.lower()

In [1]:
# Split on whitespace into a list of tokens; runs of spaces yield no empty tokens.
test = test.split()
test

['wow', 'loved', 'this', 'place']

In [39]:
# English stopword list from NLTK; used to filter tokens below and inside text_parse.
sw = stopwords.words('english')

In [40]:
# Keep only the tokens that do not appear in the stopword list.
test = [token for token in test if token not in sw]
test

['wow', 'loved', 'place']

In [41]:
from nltk.stem.porter import PorterStemmer
# Shared Porter stemmer instance; text_parse reads this notebook-level name.
ps = PorterStemmer()

In [42]:
# Reduce each remaining token to its Porter stem.
test = [ps.stem(word) for word in test]
test

['wow', 'love', 'place']

In [43]:
# Re-join the stemmed tokens into a single space-separated string.
test = " ".join(test)
test

'wow love place'

In [65]:
def text_parse(df, label, stemmer=None, stop_words=None):
    """Clean every text entry of one DataFrame column into a stemmed string.

    Each entry has its non-letter characters replaced by spaces, is
    lower-cased and split on whitespace; stopwords are then dropped and the
    remaining tokens are stemmed and re-joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    label : str
        Name of the column in ``df`` to clean.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer applied to each kept token. Defaults to the notebook-level
        ``ps``.
    stop_words : iterable of str, optional
        Lower-case words to discard before stemming. Defaults to the
        notebook-level ``sw``.

    Returns
    -------
    list of str
        One cleaned string per row, in the frame's row order. An empty frame
        yields an empty list.

    Raises
    ------
    TypeError
        If an entry is not a string (entries are passed straight to ``re``).
    """
    stemmer = ps if stemmer is None else stemmer
    # A set makes the per-token membership test constant-time.
    stop_words = frozenset(sw if stop_words is None else stop_words)
    non_letters = re.compile('[^a-zA-Z]')

    corpus = []
    # Iterate the column's values directly (positional order). Looking rows up
    # with df[label][i] is label-based and breaks, or silently reorders rows,
    # whenever the index is not the default 0..n-1 range (e.g. after filtering).
    for text in df[label]:
        tokens = non_letters.sub(' ', text).lower().split()
        kept = [stemmer.stem(token) for token in tokens if token not in stop_words]
        corpus.append(" ".join(kept))
    return corpus

In [68]:
# Apply the same cleaning steps to every review and peek at the first few results.
corpus = text_parse(df, "Review")
corpus[:5]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

In [69]:
# text_parse appends one string per row, so this should equal len(df).
len(corpus)

1000

## Bag of words


In [47]:
from sklearn.feature_extraction.text import CountVectorizer

In [48]:
# Bag-of-words vectorizer; max_features caps the vocabulary at the 1500 most frequent terms.
cv = CountVectorizer(max_features=1500)

In [49]:
# Learn the vocabulary from the corpus and build the document-term count
# matrix, converted from sparse to a dense array (one row per review).
x = cv.fit_transform(corpus).toarray()
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [50]:
from sklearn.model_selection import train_test_split


In [54]:
# Select the target by column name rather than by position, so y is always the
# 'Liked' label no matter how many columns the loaded file contains.
y = df['Liked'].values

In [52]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)


In [58]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes model, trained below on the bag-of-words counts.
classifier = GaussianNB()

In [59]:
# Fit on the training split, then predict labels for the held-out split.
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

In [60]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [62]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Overall fraction of held-out reviews classified correctly.
print(accuracy_score(y_test, y_pred))

# Per-class precision, recall and F1 on the held-out split.
print(classification_report(y_test, y_pred))

[[55 42]
 [12 91]]
0.73
              precision    recall  f1-score   support

           0       0.82      0.57      0.67        97
           1       0.68      0.88      0.77       103

    accuracy                           0.73       200
   macro avg       0.75      0.73      0.72       200
weighted avg       0.75      0.73      0.72       200



In [73]:
import joblib

# Persist the fitted vectorizer alongside the model: new text has to be encoded
# with the same vocabulary (`cv`) before `classifier.predict` can be applied, so
# the classifier pickle on its own cannot be used for inference.
joblib.dump(cv, 'review_vectorizer.pkl')
# Keep the classifier dump last so the cell's displayed result is unchanged.
joblib.dump(classifier, 'review.pkl')

['review.pkl']