In [2]:
import numpy as np   
import pandas as pd  
  
# Import dataset 
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t') 
import re  
  
# Natural Language Tool Kit 
import nltk

dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
nltk.download('stopwords') 
  
# to remove stopword 
from nltk.corpus import stopwords 
  
# for Stemming propose  
from nltk.stem.porter import PorterStemmer 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:

# Initialize empty array 
# to append clean text  
corpus = []  
  
# 1000 (reviews) rows to clean 
for i in range(0, 1000):  
      
    # column : "Review", row ith 
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])  
      
    # convert all cases to lower cases 
    review = review.lower()  
      
    # split to array(default delimiter is " ") 
    review = review.split()  
      
    # creating PorterStemmer object to 
    # take main stem of each word 
    ps = PorterStemmer()  
      
    # loop for stemming each word 
    # in string array at ith row     
    review = [ps.stem(word) for word in review 
                if not word in set(stopwords.words('english'))]  
                  
    # rejoin all string array elements 
    # to create back into a string 
    review = ' '.join(review)   
      
    # append each string to create 
    # array of clean text  
    corpus.append(review)  
    

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()  

y = dataset.iloc[:, 1].values 

from sklearn.model_selection import train_test_split 
  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

from sklearn.ensemble import RandomForestClassifier 
  
model = RandomForestClassifier(n_estimators=100,max_depth=5)
                              
model.fit(X_train, y_train)  

y_pred = model.predict(X_test) 
  
from sklearn.metrics import accuracy_score as ac
print("Accuracy of Random Forest "+ str(ac(y_test, y_pred)))

from sklearn.svm import SVC
  
model = SVC(kernel='rbf')
                              
model.fit(X_train, y_train)  

y_pred = model.predict(X_test) 
  
from sklearn.metrics import accuracy_score as ac
print("Accuracy of Support Vector Machine " +str(ac(y_test, y_pred)))



from sklearn.linear_model import LogisticRegression 
  
model = LogisticRegression(penalty='l1')
                              
model.fit(X_train, y_train)  

y_pred = model.predict(X_test) 
  
from sklearn.metrics import accuracy_score as ac
print("Accuracy of Logistic Regression " +str(ac(y_test, y_pred)))
    
    
    

Accuracy of Random Forest 0.688




Accuracy of Support Vector Machine 0.46
Accuracy of Logistic Regression 0.756


