In [1]:
# Natural Language Processing

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
!curl https://raw.githubusercontent.com/muke888/UdemyMachineLearning/master/Part7-1.Natural%20Language%20Processing%20Algorithms/Restaurant_Reviews.tsv -o Restaurant_Reviews.tsv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 61332  100 61332    0     0   356k      0 --:--:-- --:--:-- --:--:--  356k


In [2]:
# Importing the dataset
dataset = pd.read_csv('Restaurant_Reviews.tsv',delimiter = '\t', quoting = 3)
dataset.info()
dataset.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
Review    1000 non-null object
Liked     1000 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.7+ KB


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


Refresher for regular expressions:



```
'^[a-zA-Z]':  match all strings that start with a letter
'[^a-zA-Z]':  match all strings that contain a non-letter
```



In [3]:
# Cleaning the texts

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
ps = PorterStemmer()
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
corpus[:10]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch']

In [5]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
#create a feature matrix out of the most 1500 frequent words:
cv = CountVectorizer(max_features = 1500) 
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values
X[:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [6]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
X_train[:5]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [7]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [8]:
# Predicting the Test set results
# Looking at first 5 testing data, we can see we predicted the first 3 incorrectly as positive reviews, and last 2 correctly as negative review

y_pred = classifier.predict(X_test)
print(y_pred[:5])
print(y_test[:5])
print(cv.inverse_transform(X_test[:5]))

[1 1 1 0 0]
[0 0 0 0 0]
[array(['aw', 'food', 'present'], dtype='<U17'), array(['food', 'servic', 'worst'], dtype='<U17'), array(['dine', 'never', 'place'], dtype='<U17'), array(['disgrac', 'guess', 'mayb', 'night', 'went'], dtype='<U17'), array(['avoid', 'lover', 'mean', 'place', 'sushi'], dtype='<U17')]


In [9]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[55, 42],
       [12, 91]])

In [10]:
# True Negative, False Positive, False Negative, True Positive
# Not bad results for training on only 800 reviews

tn, fp, fn, tp = cm.ravel()
print(tn, fp, fn, tp)
print("Accuracy of:", (tn+tp)/(tn+tp+fp+fn))

55 42 12 91
Accuracy of: 0.73


Accuracy = (TP + TN) / (TP + TN + FP + FN)

Precision = TP / (TP + FP)

Recall = TP / (TP + FN)

F1 Score = 2 * Precision * Recall / (Precision + Recall)

In [11]:
#Examples
#Example of PorterStemmer

from nltk.stem import PorterStemmer
 
words = ["game","gamed","gaming","games"]
ps = PorterStemmer()
 
for word in words:
    print(ps.stem(word))

game
game
game
game


In [12]:
#Example of tokenization

import nltk
#nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
 
data = "That quick black cat who jumped over the black lazy dog."
words = word_tokenize(data)
print(words)

['That', 'quick', 'black', 'cat', 'who', 'jumped', 'over', 'the', 'black', 'lazy', 'dog', '.']


In [13]:
#Example of using Stopwords

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
 
data = "That quick black cat who jumped over the black lazy dog."
stopWords = set(stopwords.words('english'))
words = word_tokenize(data)
wordsFiltered = []
 
for w in words:
    if w not in stopWords:
        wordsFiltered.append(w)
 
print(wordsFiltered)
print(stopWords)

['That', 'quick', 'black', 'cat', 'jumped', 'black', 'lazy', 'dog', '.']
{'doing', 'further', 'and', 'no', 'am', "wouldn't", 'below', "you'd", 'only', "aren't", 'hers', 'are', "needn't", "won't", 'wouldn', 'if', 'ma', 'yours', 'our', 'out', 'can', 'had', 'the', 'why', 'ourselves', 'at', 'were', 'she', 'was', 'own', 'shan', "you're", 'didn', 'as', "hadn't", 'mustn', 'themselves', "wasn't", 'couldn', 'needn', 'between', 'whom', 'off', 'isn', 'hadn', 'or', 'who', 'you', 'there', 'against', 'during', 'on', 'yourselves', 'these', 'be', 'has', 'about', 'they', 'here', 'not', "hasn't", 'i', 'it', 'but', 'few', "weren't", 'her', 'now', 'being', "shouldn't", 'himself', 'we', 'yourself', 'when', "couldn't", 'don', "mustn't", 'its', 'which', "that'll", 'again', 'does', 'very', 'mightn', 'by', 'until', 'into', "you'll", 'under', 'to', 'have', "haven't", "don't", 'this', 'should', 'ours', 'with', 'because', 'doesn', 'of', "isn't", 'herself', 'd', 'their', 'theirs', "it's", 'any', 'more', 'just', 'w

In [14]:
#Example of Text Vectorization

from sklearn.feature_extraction.text import CountVectorizer
# list of text documents
text = ["That quick black cat who jumped over the black lazy dog."]
# create the transform
vectorizer = CountVectorizer(max_features = 5)
# tokenize and build vocab
vectorizer.fit(text)
# summarize
print(vectorizer.vocabulary_)
# encode document
vector = vectorizer.transform(text)
# summarize encoded vector
print(vector.shape)
print(type(vector))
print(vector.toarray())

{'black': 0, 'cat': 1, 'jumped': 3, 'lazy': 4, 'dog': 2}
(1, 5)
<class 'scipy.sparse.csr.csr_matrix'>
[[2 1 1 1 1]]
