# Sentiment Classification 

In [1]:
from os import getcwd, chdir
import re
import numpy as np
import pickle as pk
import pandas as pd

from nltk.metrics import ConfusionMatrix
from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.classify import accuracy
from nltk.tokenize import word_tokenize as wt

from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SelectKBest
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import chi2
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import accuracy_score

# Change your path here
fpath = getcwd()
print(fpath)

/Users/pierlim/PycharmProjects/sent_mining_CA


In [2]:
df = pd.read_csv('./data/df_reviews_train.csv')
df.head()

Unnamed: 0,rating,review,sentiment
0,5,In my younger days when lunch choices consiste...,1
1,3,"After going through yelp and tripadvisor, I wa...",-1
2,5,Ordered Sichuan Prawns and Singapore Rice Nood...,1
3,3,Wong Kei is one of the many options you'll fin...,-1
4,1,One of the worst experience in a restaurant in...,-1


## Use Standard Dataset to combine with scrapped data

In [3]:
ffile1 = open("./data/train.csv","r", encoding = "ISO-8859-1")
df_standard = pd.read_csv(ffile1, encoding = "utf-8")
df_standard.drop(['restaurant_id', 'date', 'review_id', 'stars'], inplace=True, axis=1)
df_standard.rename(columns={'text': 'review', 'Sentiment': 'sentiment'}, inplace=True)
df_standard.head()

Unnamed: 0,review,sentiment
0,Very disappointed in the customer service. We ...,negative
1,I really wasn't thrilled with our meal here. T...,negative
2,STAY AWAY...\n\nWe've been 3 times over the pa...,negative
3,The food is good and the portions are large. ...,negative
4,I feel bad about giving this place such a meh ...,negative


In [4]:
mask = df_standard.sentiment == 'negative'
column_name = 'sentiment'
df_standard.loc[mask, column_name] = -1
mask = df_standard.sentiment == 'positive'
column_name = 'sentiment'
df_standard.loc[mask, column_name] = 1

df_standard.head()

Unnamed: 0,review,sentiment
0,Very disappointed in the customer service. We ...,-1
1,I really wasn't thrilled with our meal here. T...,-1
2,STAY AWAY...\n\nWe've been 3 times over the pa...,-1
3,The food is good and the portions are large. ...,-1
4,I feel bad about giving this place such a meh ...,-1


## Split Data into Train and Validation Set

In [5]:
# Train-Test Split + Stratify
df.drop(['rating'], inplace=True, axis=1)
df.head()

Unnamed: 0,review,sentiment
0,In my younger days when lunch choices consiste...,1
1,"After going through yelp and tripadvisor, I wa...",-1
2,Ordered Sichuan Prawns and Singapore Rice Nood...,1
3,Wong Kei is one of the many options you'll fin...,-1
4,One of the worst experience in a restaurant in...,-1


In [6]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df,stratify=df['sentiment'], test_size=0.3) 

In [7]:
# df_train and df_standard are now the same format, we can concat and use it as a whole 
df_train_all = pd.concat([df_train, df_standard], axis=0)


## Naive Bayes Classifier

In [8]:
# Convert to dictionary format because apparently NLTK requires this
train_pos = [[row["review"], 1] for idx, row in df_train.iterrows() if row["sentiment"]==1]
train_neg = [[row["review"], -1] for idx, row in df_train.iterrows() if row["sentiment"]==-1]

test_pos = [[row["review"], 1] for idx, row in df_test.iterrows() if row["sentiment"]==1]
test_neg = [[row["review"], -1] for idx, row in df_test.iterrows() if row["sentiment"]==-1]

def word_feats(words):
    return dict([(word, True) for word in words])

In [9]:
# need this part for max ent portion
testset = test_pos + test_neg
test_nolab = [t[0] for t in testset]
test_lab = [t[1] for t in testset]
test_tokenized = [[wt(x), c] for x,c in testset]
test_featureset = [(word_feats(d), c) for (d,c) in test_tokenized] 
test_nolab_tok = [t[0] for t in test_featureset]  # need to transform to predict

In [10]:
trainset = train_pos + train_neg
train_tokenized = [[wt(x), c] for x,c in trainset] # may need to introduce some pre-processing at this stage for better results

def word_feats(words):
    return dict([(word, True) for word in words])
train_featureset = [(word_feats(d), c) for (d,c) in train_tokenized] 

Prepare a combined training dataset of both scraped reviews and lecturer's standard set 

In [11]:
train_all_pos = [[row["review"], 1] for idx, row in df_train_all.iterrows() if row["sentiment"]==1]
train_all_neg = [[row["review"], -1] for idx, row in df_train_all.iterrows() if row["sentiment"]==-1]
trainset_all = train_all_pos + train_all_neg
train_all_tokenized = [[wt(x), c] for x,c in trainset_all] # may need to introduce some pre-processing at this stage for better results
train_all_featureset = [(word_feats(d), c) for (d,c) in train_all_tokenized] 

In [12]:
## Naive Bayes Rule using nltk
classifier_nb = NaiveBayesClassifier.train(train_featureset)
#print("Accuracy :" +str(accuracy(classifier_nb, test_featureset)))
classifier_nb.show_most_informative_features(10)

## Preparing the data first 
train_nolab = [t[0] for t in trainset]
train_lab = [t[1] for t in trainset]

# Preparing test set in same format
testset = test_pos + test_neg
test_nolab = [t[0] for t in testset]
test_lab = [t[1] for t in testset]

# Create your tf-idf function
vectorizer = TfidfVectorizer(max_df=0.7, min_df=3, use_idf=True) # modified this
train_vectors = vectorizer.fit_transform(train_nolab)
test_vectors = vectorizer.transform(test_nolab)

## train Naive Bayes Rule using sklearn
clf = MultinomialNB().fit(train_vectors, train_lab)

predNB = clf.predict(train_vectors)
pred = list(predNB)
cm1=pd.crosstab( pd.Series(train_lab), pd.Series(pred), rownames= ['actuals'], colnames=['pred'],margins=True)
cm1


Most Informative Features
                 refused = True               -1 : 1      =     24.0 : 1.0
                horrible = True               -1 : 1      =     20.1 : 1.0
                     Meh = True               -1 : 1      =     19.4 : 1.0
                     ok. = True               -1 : 1      =     18.5 : 1.0
               tasteless = True               -1 : 1      =     17.1 : 1.0
               overrated = True               -1 : 1      =     17.1 : 1.0
               happening = True               -1 : 1      =     17.1 : 1.0
                lukewarm = True               -1 : 1      =     17.1 : 1.0
                 Average = True               -1 : 1      =     17.1 : 1.0
                   worst = True               -1 : 1      =     16.7 : 1.0


pred,-1,1,All
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,50,858,908
1,0,3113,3113
All,50,3971,4021


Unnamed: 0,review,sentiment
5517,Had Lunch here yesterday afternoon with friend...,1
2369,Very good and expensive Indian restaurant. \nP...,1
1464,"Hands down the best roast duck in London, it's...",1
297,If you're looking for the closest thing to aut...,1
2766,I have become a fairly regular customer at the...,1
949,One of my favourite spots to eat in London. I'...,1
4569,Came for breakfast . Bacon and egg naan. Yum....,1
4561,"Excellent! Everything: food, service, and ambi...",1
454,Slow service with many mistakes - during a slo...,-1
566,So the whole entrance experience is pretty ama...,1


In [13]:
print (classification_report(pred,  train_lab))
print (accuracy_score(pred, train_lab))

             precision    recall  f1-score   support

         -1       0.05      1.00      0.09        44
          1       1.00      0.78      0.88      3977

avg / total       0.99      0.79      0.87      4021

0.7851280775926387


May need some preprocessing. See '4' as an informative feature. Makes sense? 

Now test on the test set

In [14]:
predNB = clf.predict(test_vectors)
pred = list(predNB)
cm1=pd.crosstab( pd.Series(test_lab), pd.Series(pred), rownames= ['actuals'], colnames=['pred'],margins=True)
cm1


pred,-1,1,All
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,4,385,389
1,0,1335,1335
All,4,1720,1724


In [15]:
print (classification_report(pred,  test_lab))
print (accuracy_score(pred, test_lab))

             precision    recall  f1-score   support

         -1       0.01      1.00      0.02         4
          1       1.00      0.78      0.87      1720

avg / total       1.00      0.78      0.87      1724

0.7766821345707656


## Repeat Naive Bayes Classifier Using Combined Training Set

In [19]:
## Naive Bayes Rule using nltk
classifier_nb = NaiveBayesClassifier.train(train_all_featureset)
#print("Accuracy :" +str(accuracy(classifier_nb, test_featureset)))
classifier_nb.show_most_informative_features(10)

## Preparing the data first 
trainall_nolab = [t[0] for t in trainset_all]
trainall_lab = [t[1] for t in trainset_all]

# Create your tf-idf function
vectorizer_all = TfidfVectorizer(max_df=0.7, min_df=3, use_idf=True) # modified this
trainall_vectors = vectorizer_all.fit_transform(trainall_nolab)
testall_vectors = vectorizer_all.transform(test_nolab)
pk.dump(vectorizer_all, open("./models/vectorise.pk","wb"))

## train Naive Bayes Rule using sklearn
clf = MultinomialNB().fit(trainall_vectors, trainall_lab)
pk.dump(clf, open("./models/classifier_naivebayes.pk","wb"))
predNB = clf.predict(trainall_vectors)
pred = list(predNB)
cm1=pd.crosstab( pd.Series(trainall_lab), pd.Series(pred), rownames= ['actuals'], colnames=['pred'],margins=True)
cm1


Most Informative Features
                Terrible = True               -1 : 1      =     65.0 : 1.0
                      Wo = True               -1 : 1      =     58.1 : 1.0
                Horrible = True               -1 : 1      =     48.2 : 1.0
               Brazilian = True               -1 : 1      =     45.5 : 1.0
                     Meh = True               -1 : 1      =     41.1 : 1.0
              flavorless = True               -1 : 1      =     39.0 : 1.0
            unacceptable = True               -1 : 1      =     36.5 : 1.0
                  rudely = True               -1 : 1      =     35.8 : 1.0
                   WORST = True               -1 : 1      =     31.6 : 1.0
              microwaved = True               -1 : 1      =     30.9 : 1.0


pred,-1,1,All
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,10295,1624,11919
1,1088,11343,12431
All,11383,12967,24350


In [20]:
print (classification_report(pred,  trainall_lab))
print (accuracy_score(pred, trainall_lab))

             precision    recall  f1-score   support

         -1       0.86      0.90      0.88     11383
          1       0.91      0.87      0.89     12967

avg / total       0.89      0.89      0.89     24350

0.8886242299794661


Now use full dataset-trained classifier on test set 

In [21]:
predNB = clf.predict(testall_vectors)
pred = list(predNB)
cm1=pd.crosstab( pd.Series(test_lab), pd.Series(pred), rownames= ['actuals'], colnames=['pred'],margins=True)
cm1


pred,-1,1,All
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,185,204,389
1,65,1270,1335
All,250,1474,1724


In [22]:
print (classification_report(pred,  test_lab))
print (accuracy_score(pred, test_lab))

             precision    recall  f1-score   support

         -1       0.48      0.74      0.58       250
          1       0.95      0.86      0.90      1474

avg / total       0.88      0.84      0.86      1724

0.8439675174013921


## SVM Classifier

In [28]:
from sklearn.svm import SVC

# SVM Classifier from sklearn
def train_svm(X, y):
    """
    Create and train the Support Vector Machine.
    """
    svm = SVC(C=10000.0, gamma='auto', kernel='rbf')
    svm.fit(X, y)
    return svm

classifier_svm = train_svm(train_vectors, train_lab)  # training the SVM model
predSVM = classifier_svm.predict(train_vectors) 
cm1=pd.crosstab( pd.Series(train_lab), pd.Series(predSVM), rownames= ['actuals'], colnames=['pred'],margins=True)
cm1


pred,-1,1,All
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,864,44,908
1,14,3099,3113
All,878,3143,4021


In [29]:
print (classification_report(predSVM,  train_lab))
print (accuracy_score(predSVM, train_lab))

             precision    recall  f1-score   support

         -1       0.95      0.98      0.97       878
          1       1.00      0.99      0.99      3143

avg / total       0.99      0.99      0.99      4021

0.9855757274309873


In [30]:
train_vectors.shape


(4021, 6223)

In [31]:
test_vectors.shape

(1724, 6223)

In [32]:
predSVM = classifier_svm.predict(test_vectors) 
pred_svm = list(predSVM)

In [33]:
cm6=pd.crosstab( pd.Series(test_lab), pd.Series(pred_svm), rownames= ['actuals'], colnames=['pred'],margins=True)
cm6


pred,-1,1,All
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,232,157,389
1,91,1244,1335
All,323,1401,1724


In [34]:
print (classification_report(pred_svm,  test_lab))
print (accuracy_score(pred_svm, test_lab))

             precision    recall  f1-score   support

         -1       0.60      0.72      0.65       323
          1       0.93      0.89      0.91      1401

avg / total       0.87      0.86      0.86      1724

0.8561484918793504


## Repeat SVM Classifier using Combined Training Dataset

In [None]:
classifier_svm = train_svm(trainall_vectors, trainall_lab)  # training the SVM model
pk.dump(classifier_svm, open("./models/classifier_svm.pk","wb"))
predSVM = classifier_svm.predict(trainall_vectors) 
cm1=pd.crosstab( pd.Series(trainall_lab), pd.Series(predSVM), rownames= ['actuals'], colnames=['pred'],margins=True)
cm1




In [None]:
print (classification_report(predSVM,  trainall_lab))
print (accuracy_score(predSVM, trainall_lab))

In [None]:
predSVM = classifier_svm.predict(testall_vectors) 
pred_svm = list(predSVM)
cm6=pd.crosstab( pd.Series(test_lab), pd.Series(pred_svm), rownames= ['actuals'], colnames=['pred'],margins=True)
cm6

In [None]:
print (classification_report(pred_svm,  test_lab))
print (accuracy_score(pred_svm, test_lab))

## Max Ent Classifier

In [93]:
classifier_me = MaxentClassifier.train(train_featureset, algorithm="IIS", max_iter=5)

  ==> Training (5 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.875
             2          -0.27948        0.875
             3          -0.23122        0.875
             4          -0.19607        0.917
         Final          -0.16931        0.958


In [94]:
pred_me = []
test_nolab_tok = [t[0] for t in test_featureset]  # need to transform to predict
for t in test_nolab_tok:
    pred_me.append(classifier_me.classify(t))

cm5=pd.crosstab( pd.Series(test_lab), pd.Series(pred_me), rownames= ['actuals'], colnames=['pred'],margins=True)
print (cm5)

pred      1  All
actuals         
-1        1    1
1        10   10
All      11   11


In [95]:
print (classification_report(pred_me,  test_lab))
print (accuracy_score(pred_me, test_lab))

             precision    recall  f1-score   support

         -1       0.00      0.00      0.00         0
          1       1.00      0.91      0.95        11

avg / total       1.00      0.91      0.95        11

0.9090909090909091


  'recall', 'true', average, warn_for)


## Repeat Max Ent Classifier with Full Dataset

In [136]:
classifier_me = MaxentClassifier.train(train_all_featureset, algorithm="IIS", max_iter=5)
pred_me = []
test_nolab_tok = [t[0] for t in test_featureset]  # need to transform to predict
for t in test_nolab_tok:
    pred_me.append(classifier_me.classify(t))

cm5=pd.crosstab( pd.Series(test_lab), pd.Series(pred_me), rownames= ['actuals'], colnames=['pred'],margins=True)
print (cm5)


  ==> Training (5 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.459
             2          -0.68658        0.472
             3          -0.71457        0.469
             4          -0.73350        0.468
         Final          -0.74345        0.468
pred      1  All
actuals         
-1        1    1
1        10   10
All      11   11


In [137]:
print (classification_report(pred_me,  test_lab))
print (accuracy_score(pred_me, test_lab))

             precision    recall  f1-score   support

         -1       0.00      0.00      0.00         0
          1       1.00      0.91      0.95        11

avg / total       1.00      0.91      0.95        11

0.9090909090909091


  'recall', 'true', average, warn_for)


## Use K Best Features 

In [227]:
# TODO update this to run on full dataset 

ch21 = SelectKBest(chi2, k=500) # TODO modify k according to number of features you want 
# Transform your training and testing datasets accordingly
train_Kbest = ch21.fit_transform(train_vectors, train_lab)
test_Kbest = ch21.transform(test_vectors)

## K Best SVM

In [228]:
# Train your SVM with the k best selected features
sv = train_svm(train_Kbest, train_lab)
predSVM= sv.predict(test_Kbest)
pred = list(predSVM)
cm8 = confusion_matrix(pred, test_lab)
print (cm8)
print (accuracy_score(pred, test_lab))
print (classification_report(pred,  test_lab))

[[ 255   80]
 [ 134 1255]]
0.8758700696055685
             precision    recall  f1-score   support

         -1       0.66      0.76      0.70       335
          1       0.94      0.90      0.92      1389

avg / total       0.88      0.88      0.88      1724



## K Best Naive Bayes

In [229]:
clf = MultinomialNB().fit(train_Kbest, train_lab)
predNB = clf.predict(test_Kbest)
pred = list(predNB)
cm9 = confusion_matrix(pred, test_lab)
print (cm9)
print (accuracy_score(pred, test_lab))
print (classification_report(pred,  test_lab))

# View the selected features
selected_features = list(np.array(vectorizer.get_feature_names())[ch21.get_support()])
print (selected_features)

[[  12    0]
 [ 377 1335]]
0.781322505800464
             precision    recall  f1-score   support

         -1       0.03      1.00      0.06        12
          1       1.00      0.78      0.88      1712

avg / total       0.99      0.78      0.87      1724

['12', '17', '20', '99', 'about', 'absolutely', 'abysmal', 'acknowledged', 'acted', 'actual', 'adding', 'affordable', 'after', 'aimed', 'alot', 'alright', 'always', 'amazing', 'annoyed', 'anticipation', 'any', 'anything', 'apologetic', 'apology', 'appalling', 'appeal', 'argue', 'arguing', 'arrived', 'arrogant', 'arugula', 'ask', 'asked', 'asking', 'ass', 'at', 'atmosphere', 'attentive', 'attitude', 'average', 'awesome', 'awful', 'awkwardly', 'bad', 'badly', 'bags', 'barely', 'basic', 'bathroom', 'beautifully', 'below', 'berry', 'best', 'better', 'biggest', 'bill', 'biryani', 'black', 'bland', 'bleh', 'blowingly', 'boring', 'bother', 'brag', 'branches', 'breakfast', 'broth', 'buck', 'building', 'burnt', 'but', 'bye', 'canned', 'can

## K Best SVM with Combined Training Data

In [230]:
# TODO update this to run on full dataset 

ch21 = SelectKBest(chi2, k=50) # TODO modify k according to number of features you want 
# Transform your training and testing datasets accordingly
train_Kbest = ch21.fit_transform(trainall_vectors, trainall_lab)
test_Kbest = ch21.transform(testall_vectors)

In [231]:
# Train your SVM with the k best selected features
sv = train_svm(train_Kbest, trainall_lab)
predSVM= sv.predict(test_Kbest)
pred = list(predSVM)
cm8 = confusion_matrix(pred, test_lab)
print (cm8)
print (accuracy_score(pred, test_lab))
print (classification_report(pred,  test_lab))

[[ 277  153]
 [ 112 1182]]
0.8462877030162413
             precision    recall  f1-score   support

         -1       0.71      0.64      0.68       430
          1       0.89      0.91      0.90      1294

avg / total       0.84      0.85      0.84      1724



## K Best Naive Bayes with Combined Training Data

In [232]:
clf = MultinomialNB().fit(train_Kbest, trainall_lab)
predNB = clf.predict(test_Kbest)
pred = list(predNB)
cm9 = confusion_matrix(pred, test_lab)
print (cm9)
print (accuracy_score(pred, test_lab))
print (classification_report(pred,  test_lab))

# View the selected features
selected_features = list(np.array(vectorizer_all.get_feature_names())[ch21.get_support()])
print (selected_features)

[[ 221  104]
 [ 168 1231]]
0.8422273781902552
             precision    recall  f1-score   support

         -1       0.57      0.68      0.62       325
          1       0.92      0.88      0.90      1399

avg / total       0.86      0.84      0.85      1724

['always', 'amazing', 'asked', 'awesome', 'awful', 'bad', 'best', 'bland', 'burger', 'cold', 'definitely', 'delicious', 'didn', 'disgusting', 'dry', 'excellent', 'fantastic', 'favorite', 'friendly', 'great', 'he', 'her', 'highly', 'horrible', 'indian', 'london', 'love', 'loved', 'manager', 'mediocre', 'minutes', 'naan', 'no', 'not', 'nothing', 'ok', 'okay', 'overpriced', 'perfect', 'poor', 'rude', 'she', 'terrible', 'thai', 'told', 'waitress', 'was', 'wasn', 'wonderful', 'worst']
