Colab Link: https://colab.research.google.com/drive/1oe1Hfd81xUTT0Fj3c5NB2D9wOSRJTZgd

In [27]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Bag Of Words

### Data Prep

#### Read

In [28]:
# Load the tab-separated reviews. quoting=3 (csv.QUOTE_NONE) makes the parser
# treat double quotes as ordinary characters instead of field delimiters.
data = pd.read_csv(
    "./data/NLPData1.tsv",
    sep='\t',
    quoting=3,
)
data.head(2)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0


#### Clean Data

In [29]:
import re
import nltk
# Fetch the NLTK stop-word lists (reported as already up-to-date when present locally).
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer   # reduces inflected word forms to a common stem

[nltk_data] Downloading package stopwords to /Users/pauls/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [30]:
# Turn every raw review into a space-separated string of stemmed keywords.
corpus = []
ps = PorterStemmer()

# Keep 'not' so that negations ("not good") survive stop-word removal.
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
# Build the lookup set once: it never changes between reviews, and rebuilding
# it for every single word (as before) is wasted work.
stopword_set = set(all_stopwords)

# Iterate over the column itself rather than a hard-coded range(0, 1000), so
# the cell handles a data set of any length without truncation or KeyError.
for raw_review in data['Review']:
    # Replace every non-letter with a space, lowercase, then split into words.
    words = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    # Drop stop words and stem what is left, then join back into one string.
    review = ' '.join(ps.stem(word) for word in words if word not in stopword_set)
    corpus.append(review)

In [15]:
# Inspect the cleaned, stemmed reviews; this prints the entire corpus list.
print(corpus)

['wow love place', 'crust not good', 'not tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch', 'servic prompt', 'would not go back', 'cashier care ever say still end wayyy overpr', 'tri cape cod ravoli chicken cranberri mmmm', 'disgust pretti sure human hair', 'shock sign indic cash', 'highli recommend', 'waitress littl slow servic', 'place not worth time let alon vega', 'not like', 'burritto blah', 'food amaz', 'servic also cute', 'could care less interior beauti', 'perform', 'right red velvet cake ohhh stuff good', 'never brought salad ask', 'hole wall great mexican street taco friendli staff', 'took hour get food tabl restaur food luke warm sever run around like total overwhelm', 'worst salmon sashimi', 'also combo like burger fri beer decent deal', 'like final blow', 'found place accid could not

### Build Model

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# max_features is set below the total number of distinct words so that the
# sparsely occurring words are excluded from the vocabulary.
# The total number of words can be found by checking the width of X after the
# fit_transform step when no cap is applied.
c_cv = CountVectorizer(max_features=1500)
word_counts = c_cv.fit_transform(corpus)
X = word_counts.toarray()       # dense document-term matrix
y = data.iloc[:, -1].values     # last column holds the labels

### Train Test Split

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Train Model and Predict

In [31]:
# Logistic Regression: fit on the training split, then predict the held-out rows.
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be chained.
c_lr = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred_lr = c_lr.predict(X_test)

In [32]:
# SVM (RBF kernel): fit on the training split, then predict the held-out rows.
from sklearn.svm import SVC

# fit() returns the estimator itself, so construction and training can be chained.
c_svc = SVC(kernel="rbf", random_state=0).fit(X_train, y_train)
y_pred_svc = c_svc.predict(X_test)

In [34]:
# Naive Bayes (Gaussian): fit on the training split, then predict the held-out rows.
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training can be chained.
c_gnb = GaussianNB().fit(X_train, y_train)
y_pred_gnb = c_gnb.predict(X_test)

In [35]:
# Decision Tree: fit on the training split, then predict the held-out rows.
from sklearn.tree import DecisionTreeClassifier

# fit() returns the estimator itself, so construction and training can be chained.
c_dt = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
y_pred_dt = c_dt.predict(X_test)

In [36]:
# Random Forest (10 trees): fit on the training split, then predict the held-out rows.
from sklearn.ensemble import RandomForestClassifier

# fit() returns the estimator itself, so construction and training can be chained.
c_rf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X_train, y_train)
y_pred_rf = c_rf.predict(X_test)

### Check Predictions

In [37]:
# Side-by-side table of the true labels and each model's predictions.
# column_stack turns each 1-D label vector into one column of a 2-D array.
label_columns = np.column_stack((
    y_test,
    y_pred_lr,
    y_pred_svc,
    y_pred_gnb,
    y_pred_dt,
    y_pred_rf,
))
comp_df = pd.DataFrame(
    label_columns,
    columns=['test', 'pred_lr', 'pred_svc', 'pred_gnb', 'pred_dt', 'pred_rf'],
)
comp_df

Unnamed: 0,test,pred_lr,pred_svc,pred_gnb,pred_dt,pred_rf
0,0,0,0,1,0,0
1,0,0,0,1,0,0
2,0,0,0,1,1,1
3,0,0,0,0,0,0
4,0,0,0,0,1,0
...,...,...,...,...,...,...
195,0,0,0,0,0,0
196,1,1,1,1,1,1
197,1,1,1,1,1,1
198,0,0,0,1,0,0


### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix

In [38]:
# Logistic Regression: rows of the matrix are true labels, columns are predictions.
cm_lr = confusion_matrix(y_test, y_pred_lr)
correct_lr = cm_lr[0, 0] + cm_lr[1, 1]      # main diagonal: predictions matching the label
incorrect_lr = cm_lr[0, 1] + cm_lr[1, 0]    # off-diagonal: mismatches
print("Confusion Matrix for Logistic Regression: ", cm_lr)
print("Number of correct predictions using Logistic Regression: ", correct_lr)
print("Number of incorrect predictions using Logistic Regression: ", incorrect_lr)

Confusion Matrix for Logistic Regression:  [[80 17]
 [28 75]]
Number of correct predictions using Logistic Regression:  155
Number of incorrect predictions using Logistic Regression:  45


In [39]:
# SVC: rows of the matrix are true labels, columns are predictions.
cm_svc = confusion_matrix(y_test, y_pred_svc)
correct_svc = cm_svc[0, 0] + cm_svc[1, 1]      # main diagonal: predictions matching the label
incorrect_svc = cm_svc[0, 1] + cm_svc[1, 0]    # off-diagonal: mismatches
print("Confusion Matrix for SVC: ", cm_svc)
print("Number of correct predictions using SVC: ", correct_svc)
print("Number of incorrect predictions using SVC: ", incorrect_svc)

Confusion Matrix for SVC:  [[89  8]
 [36 67]]
Number of correct predictions using SVC:  156
Number of incorrect predictions using SVC:  44


In [40]:
# Naive Bayes: rows of the matrix are true labels, columns are predictions.
cm_gnb = confusion_matrix(y_test, y_pred_gnb)
correct_gnb = cm_gnb[0, 0] + cm_gnb[1, 1]      # main diagonal: predictions matching the label
incorrect_gnb = cm_gnb[0, 1] + cm_gnb[1, 0]    # off-diagonal: mismatches
print("Confusion Matrix for Naive Bayes: ", cm_gnb)
print("Number of correct predictions using Naive Bayes: ", correct_gnb)
print("Number of incorrect predictions using Naive Bayes: ", incorrect_gnb)

Confusion Matrix for Naive Bayes:  [[55 42]
 [12 91]]
Number of correct predictions using Naive Bayes:  146
Number of incorrect predictions using Naive Bayes:  54


In [41]:
# Decision Tree: rows of the matrix are true labels, columns are predictions.
cm_dt = confusion_matrix(y_test, y_pred_dt)
correct_dt = cm_dt[0, 0] + cm_dt[1, 1]      # main diagonal: predictions matching the label
incorrect_dt = cm_dt[0, 1] + cm_dt[1, 0]    # off-diagonal: mismatches
print("Confusion Matrix for Decision Tree: ", cm_dt)
print("Number of correct predictions using Decision Tree: ", correct_dt)
print("Number of incorrect predictions using Decision Tree: ", incorrect_dt)

Confusion Matrix for Decision Tree:  [[77 20]
 [33 70]]
Number of correct predictions using Decision Tree:  147
Number of incorrect predictions using Decision Tree:  53


In [42]:
# Random Forest: rows of the matrix are true labels, columns are predictions.
cm_rf = confusion_matrix(y_test, y_pred_rf)
correct_rf = cm_rf[0, 0] + cm_rf[1, 1]      # main diagonal: predictions matching the label
incorrect_rf = cm_rf[0, 1] + cm_rf[1, 0]    # off-diagonal: mismatches
print("Confusion Matrix for Random Forest: ", cm_rf)
print("Number of correct predictions using Random Forest: ", correct_rf)
print("Number of incorrect predictions using Random Forest: ", incorrect_rf)

Confusion Matrix for Random Forest:  [[84 13]
 [44 59]]
Number of correct predictions using Random Forest:  143
Number of incorrect predictions using Random Forest:  57


### Evaluate

In [45]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [43]:
# Accuracy of each model on the held-out split, reported in training order.
for label, predictions in (
    ("Accuracy for Logistic Regression: ", y_pred_lr),
    ("Accuracy for SVC: ", y_pred_svc),
    ("Accuracy for Naive Bayes: ", y_pred_gnb),
    ("Accuracy for Decision Tree: ", y_pred_dt),
    ("Accuracy for Random Forest: ", y_pred_rf),
):
    print(label, accuracy_score(y_test, predictions))

Accuracy for Logistic Regression:  0.775
Accuracy for SVC:  0.78
Accuracy for Naive Bayes:  0.73
Accuracy for Decision Tree:  0.735
Accuracy for Random Forest:  0.715


In [46]:
# Recall of each model on the held-out split, reported in training order.
for label, predictions in (
    ("Recall for Logistic Regression: ", y_pred_lr),
    ("Recall for SVC: ", y_pred_svc),
    ("Recall for Naive Bayes: ", y_pred_gnb),
    ("Recall for Decision Tree: ", y_pred_dt),
    ("Recall for Random Forest: ", y_pred_rf),
):
    print(label, recall_score(y_test, predictions))

Recall for Logistic Regression:  0.7281553398058253
Recall for SVC:  0.6504854368932039
Recall for Naive Bayes:  0.883495145631068
Recall for Decision Tree:  0.6796116504854369
Recall for Random Forest:  0.5728155339805825


In [47]:
# Precision of each model on the held-out split, reported in training order.
for label, predictions in (
    ("Precision for Logistic Regression: ", y_pred_lr),
    ("Precision for SVC: ", y_pred_svc),
    ("Precision for Naive Bayes: ", y_pred_gnb),
    ("Precision for Decision Tree: ", y_pred_dt),
    ("Precision for Random Forest: ", y_pred_rf),
):
    print(label, precision_score(y_test, predictions))

Precision for Logistic Regression:  0.8152173913043478
Precision for SVC:  0.8933333333333333
Precision for Naive Bayes:  0.6842105263157895
Precision for Decision Tree:  0.7777777777777778
Precision for Random Forest:  0.8194444444444444


In [48]:
# F1 score of each model on the held-out split, reported in training order.
for label, predictions in (
    ("F1 Score for Logistic Regression: ", y_pred_lr),
    ("F1 Score for SVC: ", y_pred_svc),
    ("F1 Score for Naive Bayes: ", y_pred_gnb),
    ("F1 Score for Decision Tree: ", y_pred_dt),
    ("F1 Score for Random Forest: ", y_pred_rf),
):
    print(label, f1_score(y_test, predictions))

F1 Score for Logistic Regression:  0.7692307692307693
F1 Score for SVC:  0.7528089887640449
F1 Score for Naive Bayes:  0.7711864406779662
F1 Score for Decision Tree:  0.7253886010362695
F1 Score for Random Forest:  0.6742857142857143
