### Restaurant Reviews Sentiment Analysis / Natural Language Processing project

In [11]:
import numpy as np
import pandas as pd
import re
import nltk


In [18]:
# Fetch the NLTK stop-word corpus that the preprocessing step reads later on.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TRANSFORMER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [19]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
import csv
import os

# Path to the tab-separated reviews file. The default is the original local
# location; set the RESTAURANT_REVIEWS_TSV environment variable to load the
# file from somewhere else without editing this cell.
DATA_PATH = os.environ.get(
    "RESTAURANT_REVIEWS_TSV",
    "C:\\Users\\TRANSFORMER\\Desktop\\Data Science Projects\\Restaurants review  NLP\\datasets_6660_9643_Restaurant_Reviews.tsv",
)

# csv.QUOTE_NONE is the named form of the literal 3 used previously: quote
# characters inside the review text are read as ordinary characters.
df = pd.read_csv(DATA_PATH, sep='\t', quoting=csv.QUOTE_NONE)

In [9]:
# Preview the first five rows of the loaded reviews.
df.head(n=5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [10]:
# Number of reviews (rows) in the dataset.
df.shape[0]

1000

### Data Preprocessing

In [20]:
# Build the cleaned corpus: one lower-cased, stop-word-free, stemmed string
# per review, in the same order as the rows of `df`.

# Create the stemmer and the stop-word set once; neither changes between
# reviews, and a set gives constant-time membership tests inside the loop.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

corp = []
for review in df['Review']:
    # Replace every non-letter character with a space, then normalise the
    # case and split on whitespace to obtain the word tokens.
    words = re.sub('[^a-zA-Z]', ' ', review).lower().split()
    # Drop stop words and reduce each remaining word to its stem.
    stems = [ps.stem(word) for word in words if word not in stop_words]
    corp.append(' '.join(stems))

### Vectorization

Converting the textual data into numerical data

In [35]:
## Creating a bag of words model using CountVectorizer

# Limit the vocabulary to at most 1500 terms.
cv = CountVectorizer(max_features = 1500)
# Fit the vocabulary on the cleaned corpus and convert the resulting
# document-term counts to a dense array.
X = cv.fit_transform(corp).toarray()
# Select the target by column name rather than by position, so the label
# stays correct even if the column order of `df` changes.
y = df['Liked']

### Implementing the ML Algorithms for review classification 

In [39]:
# Hold out 30% of the samples as a test set; the fixed random_state makes the
# split repeatable from run to run.
from sklearn.model_selection import train_test_split

(
    X_train,
    X_test,
    y_train,
    y_test,
) = train_test_split(X, y, test_size=0.3, random_state=42)

### Implementing MultiNomial Naive-Bayes Algorithm

In [40]:
# Multinomial Naive Bayes: fit on the training split, then predict the labels
# of the test split.
from sklearn.naive_bayes import MultinomialNB

# fit() returns the fitted estimator, so construction and training can be chained.
classifier = MultinomialNB(alpha=0.1).fit(X_train, y_train)

y_pred = classifier.predict(X_test)

In [41]:
# Evaluate the current predictions: confusion matrix first, then accuracy,
# precision and recall on the test split.
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Apply each scoring function to the same (true, predicted) pair.
acc, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)

print("\n")
# Accuracy is reported as a percentage; precision and recall as fractions.
print("Accuracy is ", round(acc * 100, 2), "%")
print("Precision is ", round(precision, 2))
print("Recall is ", round(recall, 2))

Confusion Matrix:
 [[119  33]
 [ 34 114]]


Accuracy is  77.67 %
Precision is  0.78
Recall is  0.77


### Bernoulli Naive Bayes

In [45]:
# Bernoulli Naive Bayes: fit on the training split, then predict the labels
# of the test split.
from sklearn.naive_bayes import BernoulliNB

# fit() returns the fitted estimator, so construction and training can be chained.
classifier = BernoulliNB(alpha=0.8).fit(X_train, y_train)

y_pred = classifier.predict(X_test)

In [46]:
# Evaluate the current predictions: confusion matrix first, then accuracy,
# precision and recall on the test split.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Apply each scoring function to the same (true, predicted) pair.
acc, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)

print("\n")
# Accuracy is reported as a percentage; precision and recall as fractions.
print("Accuracy is ", round(acc * 100, 2), "%")
print("Precision is ", round(precision, 2))
print("Recall is ", round(recall, 2))

Confusion Matrix:
 [[115  37]
 [ 32 116]]


Accuracy is  77.0 %
Precision is  0.76
Recall is  0.78


### Logistic Regression

In [48]:
# Logistic Regression: fit on the training split, then predict the labels of
# the test split.
from sklearn.linear_model import LogisticRegression

# fit() returns the fitted estimator, so construction and training can be chained.
classifier = LogisticRegression(C=1.5).fit(X_train, y_train)

y_pred = classifier.predict(X_test)



In [49]:
# Evaluate the current predictions: confusion matrix first, then accuracy,
# precision and recall on the test split.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Apply each scoring function to the same (true, predicted) pair.
acc, precision, recall = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)

print("\n")
# Accuracy is reported as a percentage; precision and recall as fractions.
print("Accuracy is ", round(acc * 100, 2), "%")
print("Precision is ", round(precision, 2))
print("Recall is ", round(recall, 2))

Confusion Matrix:
 [[125  27]
 [ 43 105]]


Accuracy is  76.67 %
Precision is  0.8
Recall is  0.71


### Conclusion

From the above results, Multinomial Naive Bayes performs slightly better than Bernoulli Naive Bayes and Logistic Regression, with 77.67% accuracy. In other words, the model built to predict the sentiment of a restaurant review classifies 77.67% of the test-set reviews correctly.