In [48]:
#NLP

#Importing the libraries 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [49]:
# Load the tab-separated review file into a DataFrame.
# quoting=3 is the value of csv.QUOTE_NONE: quote characters inside a review
# are treated as ordinary text instead of field delimiters.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

In [50]:
# Show the loaded reviews together with their Liked labels
print(dataset)

                                                Review  Liked
0                             Wow... Loved this place.      1
1                                   Crust is not good.      0
2            Not tasty and the texture was just nasty.      0
3    Stopped by during the late May bank holiday of...      1
4    The selection on the menu was great and so wer...      1
..                                                 ...    ...
995  I think food should have flavor and texture an...      0
996                           Appetite instantly gone.      0
997  Overall I was not impressed and would not go b...      0
998  The whole experience was underwhelming, and I ...      0
999  Then, as if I hadn't wasted enough of my life ...      0

[1000 rows x 2 columns]


In [51]:
# Cleaning the reviews
# Get the libraries and the stopwords
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Build the stop-word set and the stemmer ONCE, before the loop.
# A set gives fast membership tests, but only if it is not rebuilt for every
# word; constructing it inside the comprehension repeats the same work for
# each word of each review.
english_stopwords = set(stopwords.words('english'))
# ps is the stemmer object; it reduces each word to its stem
ps = PorterStemmer()

corpus = []
# Iterate over the column values directly so the loop does not depend on the
# DataFrame having a default 0..n-1 index.
for raw_review in dataset['Review']:
    # Replace every character that is not in a-zA-Z with a space
    review = re.sub('[^a-zA-Z]', ' ', raw_review)

    # Lower-case the text and split it into a list of words
    review = review.lower().split()

    # Keep only the words that are NOT stop words, and stem each kept word
    review = [ps.stem(word) for word in review if word not in english_stopwords]

    # Join the list of stems back into a single string
    review = ' '.join(review)
    corpus.append(review)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\n_kon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [52]:
# Uncomment the next line to print the corpus and inspect the cleaned reviews
# print (corpus)

In [53]:
# Create the bag-of-words model: a table with one row per review and one
# column per retained word. Most entries are zero, which is what makes it a
# sparse matrix.
from sklearn.feature_extraction.text import CountVectorizer

# The labels are the last column of the dataset
y = dataset.iloc[:, -1]

# max_features keeps only the 1500 most frequent words and discards the rest
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()

# Show the labels, the feature matrix and its shape, one after the other
print(y, X, X.shape, sep='\n')

# Two techniques for dealing with a large sparse matrix:
#   - dimensionality reduction
#   - limiting the vocabulary with max_features (used above)
 

0      1
1      0
2      0
3      1
4      1
      ..
995    0
996    0
997    0
998    0
999    0
Name: Liked, Length: 1000, dtype: int64
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(1000, 1500)


In [54]:
# No feature scaling is necessary here.
# Naive Bayes is used as the classifier; first hold out 20% of the reviews as
# a test set. random_state=0 makes the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [55]:
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so the classifier can be created and
# trained on the training split in one expression
classifier = GaussianNB().fit(X_train, y_train)

# Predict the labels of the held-out test reviews
y_pred = classifier.predict(X_test)


In [56]:
# Build the confusion matrix: rows are the true labels, columns are the
# predicted labels
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)


In [57]:
# Show the confusion matrix computed from y_test and y_pred
print(cm)

[[55 42]
 [12 91]]


In [58]:
# Accuracy = correct predictions (the diagonal of the confusion matrix)
# divided by the total number of predictions (the sum of all its entries)
Acc = cm.trace() / cm.sum()
print(Acc)

0.73


In [59]:
# Do the challenge later