## Restaurant review sentiment predictor

In [1]:
#importing the main libraries
import numpy as np
import pandas as pd

In [2]:
# Load the tab-separated reviews file into a DataFrame.
dataset = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')

In [3]:
# Preview the first rows to check that the file was parsed as expected
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
# (number of rows, number of columns)
dataset.shape

(1000, 2)

In [5]:
#importing libraries for NLP
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/isayapin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Replace every character that is not an ASCII letter (punctuation, digits, ...) with a space,
# demonstrated on the first review only
review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][0])
review

'Wow    Loved this place '

In [7]:
# Lowercase the text so that e.g. 'Loved' and 'loved' end up as the same word
review = review.lower()
review

'wow    loved this place '

In [8]:
# Split the sentence into a list of words; split() with no argument also swallows
# the runs of spaces left behind by the substitution above
review = review.split()
review

['wow', 'loved', 'this', 'place']

In [9]:
# Drop English stopwords, then reduce each remaining word to its Porter stem.
ps = PorterStemmer()
# Build the stopword set once: constructing it inside the comprehension would
# re-read and re-hash the whole stopword list for every single word.
english_stopwords = set(stopwords.words('english'))
review = [ps.stem(word) for word in review if word not in english_stopwords]
review

['wow', 'love', 'place']

In [10]:
# Join the stemmed words back into a single space-separated string
review = ' '.join(review)
review

'wow love place'

In [11]:
# Apply the same cleaning steps to every review and collect the results in `corpus`.
# The stemmer and the stopword set do not change between reviews, so they are
# created once here instead of once per review / once per word.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))
# NOTE(review): the stopword filter also removes 'not' ('Crust is not good.' becomes
# 'crust good'), which discards negation; consider keeping it for sentiment work.
corpus = []
# Iterate over the column values directly rather than label-indexing with range(),
# so the loop does not depend on the DataFrame having a default 0..n-1 index.
for text in dataset['Review']:
    # Non-letters -> spaces, lowercase, split on whitespace.
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Remove stopwords, stem what is left, and re-join into one string.
    stems = [ps.stem(word) for word in words if word not in english_stopwords]
    corpus.append(' '.join(stems))

In [12]:
# Look at the first ten cleaned reviews
corpus[:10]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch']

In [13]:
# Turn the cleaned reviews into a bag-of-words count matrix and pull out the labels.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)  # vocabulary capped at the 1500 most frequent terms
bag_of_words = cv.fit_transform(corpus)
X = bag_of_words.toarray()  # convert the sparse count matrix to a dense array
y = dataset['Liked'].to_numpy()  # second column: 1 = liked, 0 = not liked

In [14]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [15]:
# Fit a Gaussian naive Bayes classifier on the training split.
# NOTE(review): GaussianNB treats every feature as a continuous Gaussian variable; since the
# features come from CountVectorizer (word counts), MultinomialNB is worth comparing.
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [16]:
# Predict a label for every row of the held-out test features
y_pred = classifier.predict(X_test)

In [17]:
# Making the Confusion Matrix
# scikit-learn's layout: rows are the true labels, columns are the predicted labels
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [18]:
# Show the matrix; the diagonal counts the correctly classified test reviews
cm

array([[55, 42],
       [12, 91]])

## Some techniques for NLP + web scraping

In [19]:
#importing libraries for web scraping
import bs4 as bs
import urllib.request
import nltk
import re
from nltk import sent_tokenize, word_tokenize

In [20]:
# Download the plain-text edition of the book from Project Gutenberg.
from urllib import request
url = 'https://www.gutenberg.org/files/98/98-0.txt'
# The context manager closes the HTTP connection even if read()/decode() raises;
# the original left the response open.
with request.urlopen(url) as response:
    raw = response.read().decode('utf8')
# Show only the beginning: echoing the whole book bloats the saved notebook.
raw[:500]



In [21]:
# Keep only the excerpt that starts at the first occurrence of the start marker and
# stops just before the end marker.
start_marker = "Book the First--Recalled to Life"
end_marker = "IV. Congratulatory"

# str.find returns -1 when the marker is missing, and slicing with -1 would silently
# produce the wrong text, so fail loudly instead.
start_index = raw.find(start_marker)
if start_index == -1:
    raise ValueError(
        f"Start marker {start_marker!r} not found in the downloaded text."
    )

# Search for the end marker only from the start marker onwards so the slice can never
# be empty because of an earlier, unrelated match.
end_index = raw.find(end_marker, start_index)
if end_index == -1:
    raise ValueError(
        f"End marker {end_marker!r} not found after the start marker; "
        "if this cell was already run, `raw` has been trimmed - re-run the download cell first."
    )

raw = raw[start_index:end_index]

In [22]:
# Normalise punctuation in a single pass: '?', ';' and '!' all become '.', and the
# curly double quotes are removed (presumably to give the sentence tokenizer uniform
# sentence terminators).
punctuation_map = str.maketrans({
    '?': '.',
    ';': '.',
    '!': '.',
    '”': None,
    '“': None,
})
raw = raw.translate(punctuation_map)

In [23]:
# Split the text into sentences.
# sent_tokenize relies on NLTK's Punkt tokenizer data; fetch it explicitly (the same way
# 'stopwords' is fetched earlier in this notebook) so the cell also runs on a machine
# that has never downloaded it.
# NOTE(review): recent NLTK releases look for the 'punkt_tab' resource instead - confirm
# against the installed NLTK version.
nltk.download('punkt')
sentences = sent_tokenize(raw)
# Preview the first few sentences only instead of echoing the whole list.
sentences[:5]

["Book the First--Recalled to Life\r\n\r\n     Chapter I      The Period\r\n     Chapter II     The Mail\r\n     Chapter III    The Night Shadows\r\n     Chapter IV     The Preparation\r\n     Chapter V      The Wine-shop\r\n     Chapter VI     The Shoemaker\r\n\r\n\r\n     Book the Second--the Golden Thread\r\n\r\n     Chapter I      Five Years Later\r\n     Chapter II     A Sight\r\n     Chapter III    A Disappointment\r\n     Chapter IV     Congratulatory\r\n     Chapter V      The Jackal\r\n     Chapter VI     Hundreds of People\r\n     Chapter VII    Monseigneur in Town\r\n     Chapter VIII   Monseigneur in the Country\r\n     Chapter IX     The Gorgon's Head\r\n     Chapter X      Two Promises\r\n     Chapter XI     A Companion Picture\r\n     Chapter XII    The Fellow of Delicacy\r\n     Chapter XIII   The Fellow of no Delicacy\r\n     Chapter XIV    The Honest Tradesman\r\n     Chapter XV     Knitting\r\n     Chapter XVI    Still Knitting\r\n     Chapter XVII   One Night\r\n   

In [24]:
# Replace the '\r\n' line breaks inside each sentence with a single space,
# updating the existing list in place.
sentences[:] = [sentence.replace('\r\n', ' ') for sentence in sentences]

In [25]:
# Check the result on the first few sentences only; echoing the full list floods the output
sentences[:5]

["Book the First--Recalled to Life       Chapter I      The Period      Chapter II     The Mail      Chapter III    The Night Shadows      Chapter IV     The Preparation      Chapter V      The Wine-shop      Chapter VI     The Shoemaker        Book the Second--the Golden Thread       Chapter I      Five Years Later      Chapter II     A Sight      Chapter III    A Disappointment      Chapter IV     Congratulatory      Chapter V      The Jackal      Chapter VI     Hundreds of People      Chapter VII    Monseigneur in Town      Chapter VIII   Monseigneur in the Country      Chapter IX     The Gorgon's Head      Chapter X      Two Promises      Chapter XI     A Companion Picture      Chapter XII    The Fellow of Delicacy      Chapter XIII   The Fellow of no Delicacy      Chapter XIV    The Honest Tradesman      Chapter XV     Knitting      Chapter XVI    Still Knitting      Chapter XVII   One Night      Chapter XVIII  Nine Days      Chapter XIX    An Opinion      Chapter XX     A Plea   

In [26]:
# Keep the sentences that have 8 to 11 tokens (strictly more than 7, fewer than 12).
# word_tokenize counts punctuation marks as tokens as well.
short_sentences = [
    sentence
    for sentence in sentences
    if 7 < len(word_tokenize(sentence)) < 12
]

In [27]:
# Look at the first fifteen short sentences
short_sentences[:15]

['now, stringing up long rows of miscellaneous criminals.',
 'and the team had capitulated and returned to their duty.',
 "What o'clock do you make it, Joe.",
 'Ten minutes, good, past eleven.',
 "In the king's name, all of you.",
 'they remained in the road below him.',
 'I want a passenger, if it is.',
 'Gentleman of the name of Lorry answer straight.',
 'asked the passenger, then, with mildly quavering speech.',
 "He's hoarser than suits me, is Jerry.)",
 'A despatch sent after you from over yonder.',
 'said Jerry, more hoarsely than before.',
 "So now let's look at you.",
 "You must know Tellson's Bank in London.",
 'I am going to Paris on business.']

In [28]:
# Save the selected sentences to a text file, one sentence per line.
# NOTE(review): this is an absolute, user-specific path; a path relative to the notebook
# (as used for the input .tsv) would make the cell runnable on other machines.
# 'with' guarantees the file is closed even if a write fails, and the explicit UTF-8
# encoding keeps the output independent of the platform's default encoding.
with open('/home/isayapin/Documents/Bootcamp_6/Natural_Language_Processing/words.txt', 'w', encoding='utf-8') as f:
    f.writelines(sentence + '\n' for sentence in short_sentences)

## Now use these sentences in any NLP/ML model!