# Natural Language Processing

In [9]:
## Import all the libraries
# Standard library
import re

# Third-party
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from matplotlib.colors import ListedColormap
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Fetch the NLTK stop-word lists (no-op when they are already up to date)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/omkarmutreja/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the tab-separated restaurant reviews into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; anyone else
# running the notebook has to point it at their own copy of the file.
reviews_path = '/Users/omkarmutreja/Downloads/Natural_Language_Processing/Restaurant_Reviews.tsv'
df = pd.read_csv(
    reviews_path,
    sep='\t',
    quoting=3,  # 3 == csv.QUOTE_NONE: treat quote characters in reviews as plain text
)
print(df.shape)
df.head()

(1000, 2)


Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
# Cleaning the texts
# Build `corpus`: one cleaned string per review, made of the lower-cased,
# stemmed, non-stop-word tokens of that review joined by single spaces.

# The stemmer and the stop-word set do not change between reviews, so they
# are created once here instead of once per review / once per word.
ps = PorterStemmer()  # Stemming so that we do not have repetitive words in our corpus
stop_words = set(stopwords.words('english'))

# NOTE(review): stop-word removal also drops "not", so "Crust is not good."
# is reduced to "crust good" and the negation is lost. Behaviour is kept
# as-is here; consider keeping negation words if sentiment accuracy matters.

corpus = []
# Iterate over every review in the frame rather than a hard-coded
# range(0, 1000), so the cell keeps working if the dataset size changes.
for raw_review in df['Review']:
    # Replace digits, punctuation and any other non-letter character by a space
    letters_only = re.sub('[^a-zA-Z]', ' ', raw_review)
    # Lower-case everything and split on whitespace into individual words
    words = letters_only.lower().split()
    # Drop stop words, then reduce each remaining word to its stem
    stems = [ps.stem(word) for word in words if word not in stop_words]
    corpus.append(' '.join(stems))
corpus[:5]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

In [4]:
# Creating the bag of words model
# max_features=1500 caps the vocabulary at the 1500 most frequent words.
cv = CountVectorizer(max_features=1500)
word_counts = cv.fit_transform(corpus)  # learn the vocabulary and count words per review
X = word_counts.toarray()               # dense feature matrix, one row per review
# Target: the second column of the frame (the `Liked` label shown in df.head())
y = df.iloc[:, 1].values

In [5]:
# Split the data into training and test sets
# 20% of the rows are held out for testing; random_state pins the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)
print(X_train.shape, X_test.shape, sep='\n')

(800, 1500)
(200, 1500)


In [10]:
# Fitting the Logistic Regression model
# Train on the training split, then score the predictions on the held-out test split.
classifier = LogisticRegression(random_state=0)
y_pred = classifier.fit(X_train, y_train).predict(X_test)  # fit() returns the estimator itself
print(confusion_matrix(y_test, y_pred))
print("Accuracy of the Logistic Regression model :", accuracy_score(y_test, y_pred))

[[76 21]
 [37 66]]
Accuracy of the Logistic Regression model : 0.71


In [6]:
# Fitting the SVM model
# Linear-kernel support vector classifier, evaluated on the held-out test split.
svm = SVC(random_state=0, kernel='linear')
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

svm_confusion = confusion_matrix(y_test, y_pred)
svm_accuracy = accuracy_score(y_test, y_pred)
print(svm_confusion)
print("Accuracy of the SVM model :", svm_accuracy)

[[74 23]
 [33 70]]
Accuracy of the SVM model : 0.72


In [7]:
# Fitting the Naive Bayes model
# Gaussian Naive Bayes on the dense word-count features, evaluated on the test split.
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)
y_pred = naive_bayes.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print("Accuracy of the Naive Bayes model :", accuracy_score(y_true=y_test, y_pred=y_pred))

[[55 42]
 [12 91]]
Accuracy of the Naive Bayes model : 0.73


In [8]:
# Fitting the Random Forest model
# 1000 entropy-split trees; random_state pins the forest so results are reproducible.
random_forest = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    random_state=0,
)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

forest_confusion = confusion_matrix(y_test, y_pred)
print(forest_confusion)
print("Accuracy of the Random Forest model :", accuracy_score(y_test, y_pred))

[[88  9]
 [47 56]]
Accuracy of the Random Forest model : 0.72


## Naive Bayes achieves the highest test accuracy (0.73) of the four models, although only marginally, since the others score 0.71–0.72. The accuracy would likely have been better if we had more observations in our training set.