# Classification on Raw Data

In [3]:
from os import getcwd, chdir
import re
import numpy as np
import pickle as pk
import pandas as pd

from nltk.metrics import ConfusionMatrix
from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.classify import accuracy
from nltk.tokenize import word_tokenize as wt

from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SelectKBest
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import chi2
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import accuracy_score

# Change your path here
fpath = getcwd()
print(fpath)

# Just Scraped Data
df_train= pd.read_csv('./data/df_reviews_train.csv')
print(df_train.shape)
df_train.head()

/Users/pierlim/PycharmProjects/sent_mining_CA
(5745, 3)


Unnamed: 0,rating,review,sentiment
0,5,In my younger days when lunch choices consiste...,1
1,3,"After going through yelp and tripadvisor, I wa...",-1
2,5,Ordered Sichuan Prawns and Singapore Rice Nood...,1
3,3,Wong Kei is one of the many options you'll fin...,-1
4,1,One of the worst experience in a restaurant in...,-1


In [4]:
ffile1 = open("./data/train.csv","r", encoding = "ISO-8859-1")
df_standard = pd.read_csv(ffile1, encoding = "utf-8")
df_standard.drop(['restaurant_id', 'date', 'review_id', 'stars'], inplace=True, axis=1)
df_standard.rename(columns={'text': 'review', 'Sentiment': 'sentiment'}, inplace=True)
mask = df_standard.sentiment == 'negative'
column_name = 'sentiment'
df_standard.loc[mask, column_name] = -1
mask = df_standard.sentiment == 'positive'
column_name = 'sentiment'
df_standard.loc[mask, column_name] = 1

df_standard.head()

Unnamed: 0,review,sentiment
0,Very disappointed in the customer service. We ...,-1
1,I really wasn't thrilled with our meal here. T...,-1
2,STAY AWAY...\n\nWe've been 3 times over the pa...,-1
3,The food is good and the portions are large. ...,-1
4,I feel bad about giving this place such a meh ...,-1


In [6]:
# df_train and df_standard are now the same format, we can concat and use it as a whole 
df_train_all = pd.concat([df_train, df_standard], axis=0)
df_train_all.head()

Unnamed: 0,rating,review,sentiment
0,5.0,In my younger days when lunch choices consiste...,1
1,3.0,"After going through yelp and tripadvisor, I wa...",-1
2,5.0,Ordered Sichuan Prawns and Singapore Rice Nood...,1
3,3.0,Wong Kei is one of the many options you'll fin...,-1
4,1.0,One of the worst experience in a restaurant in...,-1


## Naive Bayes Classifier

In [7]:
# Convert to dictionary format because apparently NLTK requires this
train_pos = [[row["review"], 1] for idx, row in df_train.iterrows() if row["sentiment"]==1]
train_neg = [[row["review"], -1] for idx, row in df_train.iterrows() if row["sentiment"]==-1]

def word_feats(words):
    return dict([(word, True) for word in words])

# need this part for max ent portion
trainset = train_pos + train_neg
train_tokenized = [[wt(x), c] for x,c in trainset] # may need to introduce some pre-processing at this stage for better results
train_featureset = [(word_feats(d), c) for (d,c) in train_tokenized] 

train_all_pos = [[row["review"], 1] for idx, row in df_train_all.iterrows() if row["sentiment"]==1]
train_all_neg = [[row["review"], -1] for idx, row in df_train_all.iterrows() if row["sentiment"]==-1]
trainset_all = train_all_pos + train_all_neg
# train_all_tokenized = [[wt(x), c] for x,c in trainset_all] # may need to introduce some pre-processing at this stage for better results
# train_all_featureset = [(word_feats(d), c) for (d,c) in train_all_tokenized] 

In [8]:
#wt(trainset_all[1][0])
train_all_tokenized = []
for x,c in trainset_all:
    train_all_tokenized.append([wt(str(x)), c])
    
#train_all_tokenized = [[wt(x), c] for x,c in trainset_all] # may need to introduce some pre-processing at this stage for better results
train_all_featureset = [(word_feats(d), c) for (d,c) in train_all_tokenized] 

## Naive Bayes, Just Scraped Data

In [10]:
## Naive Bayes Rule using nltk
classifier_nb = NaiveBayesClassifier.train(train_featureset)
#print("Accuracy :" +str(accuracy(classifier_nb, test_featureset)))
classifier_nb.show_most_informative_features(10)

## Preparing the data first 
train_nolab = [t[0] for t in trainset]
train_lab = [t[1] for t in trainset]

# Create your tf-idf function
vectorizer = TfidfVectorizer(max_df=0.7, min_df=3, use_idf=True) # modified this
train_vectors = vectorizer.fit_transform(train_nolab)
pk.dump(vectorizer, open("./models/vectorise_scraped.pk","wb"))

## train Naive Bayes Rule using sklearn
clf = MultinomialNB().fit(train_vectors, train_lab)
pk.dump(clf, open("./models/classifier_scraped_naivebayes.pk","wb"))
predNB = clf.predict(train_vectors)
pred = list(predNB)
cm1=pd.crosstab( pd.Series(train_lab), pd.Series(pred), rownames= ['actuals'], colnames=['pred'],margins=True)
cm1


Most Informative Features
              flavorless = True               -1 : 1      =     28.6 : 1.0
                     Meh = True               -1 : 1      =     24.0 : 1.0
                 Average = True               -1 : 1      =     21.7 : 1.0
                Horrible = True               -1 : 1      =     19.4 : 1.0
                  poorly = True               -1 : 1      =     19.4 : 1.0
                   Avoid = True               -1 : 1      =     17.1 : 1.0
                 letdown = True               -1 : 1      =     17.1 : 1.0
            unremarkable = True               -1 : 1      =     17.1 : 1.0
               tasteless = True               -1 : 1      =     17.1 : 1.0
               overrated = True               -1 : 1      =     17.1 : 1.0


pred,-1,1,All
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,87,1210,1297
1,1,4447,4448
All,88,5657,5745


In [11]:
print (classification_report(pred,  train_lab))
print (accuracy_score(pred, train_lab))

             precision    recall  f1-score   support

         -1       0.07      0.99      0.13        88
          1       1.00      0.79      0.88      5657

avg / total       0.99      0.79      0.87      5745

0.7892080069625762


## Repeat Naive Bayes Classifier Using Combined Training Set

In [13]:

trainall_nolab = [str(t[0]) for t in trainset_all]
trainall_lab = [t[1] for t in trainset_all]
trainall_nolab[0]

"In my younger days when lunch choices consisted of Soho's wide range of eateries as opposed to today's regimented packed lunches, there was one spot that was a consistent favourite. Wong Kei. This evening after a long day at work followed by a lengthy meeting and beer, I decided to revisit my favourite old haunt. It's as good as ever. I ordered my standard:\n- Ho fun with beef and scrambled egg\nIt is a dish that sums up my love for the restaurant; hearty, authentic, homestyle cooking that will fill you up on a budget with the finest of ingredients - simplicity. There is nothing fancy about a plate of wet noodles with a generous helping of tender beef and eggs. Its taste is mildly salty with a slight spring onion sweetness and there isn't much else to it. What you get however is a sense of warmth and comfort with every bite that few other places can offer at its price point. Wong Kei has long been infamous for its abrasive staff and unassuming environment (both things I have no proble

In [14]:
## Naive Bayes Rule using nltk
classifier_nb = NaiveBayesClassifier.train(train_all_featureset)
#print("Accuracy :" +str(accuracy(classifier_nb, test_featureset)))
classifier_nb.show_most_informative_features(10)



# Create your tf-idf function
vectorizer_all = TfidfVectorizer(max_df=0.7, min_df=3, use_idf=True) # modified this
trainall_vectors = vectorizer_all.fit_transform(trainall_nolab)
pk.dump(vectorizer_all, open("./models/vectorise_combined.pk","wb"))

## train Naive Bayes Rule using sklearn
clf = MultinomialNB().fit(trainall_vectors, trainall_lab)
pk.dump(clf, open("./models/classifier_combined_naivebayes.pk","wb"))
predNB = clf.predict(trainall_vectors)
pred = list(predNB)
cm1=pd.crosstab( pd.Series(trainall_lab), pd.Series(pred), rownames= ['actuals'], colnames=['pred'],margins=True)
cm1


Most Informative Features
                Terrible = True               -1 : 1      =     72.0 : 1.0
                      Wo = True               -1 : 1      =     63.0 : 1.0
               Brazilian = True               -1 : 1      =     48.8 : 1.0
              flavorless = True               -1 : 1      =     42.8 : 1.0
                  rudely = True               -1 : 1      =     39.9 : 1.0
            unacceptable = True               -1 : 1      =     39.9 : 1.0
                Horrible = True               -1 : 1      =     36.9 : 1.0
                   WORST = True               -1 : 1      =     33.9 : 1.0
              microwaved = True               -1 : 1      =     33.2 : 1.0
                     Meh = True               -1 : 1      =     32.4 : 1.0


pred,-1,1,All
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,10384,1924,12308
1,1057,12709,13766
All,11441,14633,26074


In [15]:
print (classification_report(pred,  trainall_lab))
print (accuracy_score(pred, trainall_lab))

             precision    recall  f1-score   support

         -1       0.84      0.91      0.87     11441
          1       0.92      0.87      0.90     14633

avg / total       0.89      0.89      0.89     26074

0.8856715502032676


## SVM Classifier

In [16]:
from sklearn.svm import SVC

# SVM Classifier from sklearn
def train_svm(X, y):
    """
    Create and train the Support Vector Machine.
    """
    svm = SVC(C=10000.0, gamma='auto', kernel='rbf')
    svm.fit(X, y)
    return svm

classifier_svm = train_svm(train_vectors, train_lab)  # training the SVM model
pk.dump(classifier_svm, open("./models/classifier_scraped_svm.pk","wb"))
predSVM = classifier_svm.predict(train_vectors) 
cm1=pd.crosstab( pd.Series(train_lab), pd.Series(predSVM), rownames= ['actuals'], colnames=['pred'],margins=True)
cm1


pred,-1,1,All
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,1192,105,1297
1,35,4413,4448
All,1227,4518,5745


In [17]:
print (classification_report(predSVM,  train_lab))
print (accuracy_score(predSVM, train_lab))

             precision    recall  f1-score   support

         -1       0.92      0.97      0.94      1227
          1       0.99      0.98      0.98      4518

avg / total       0.98      0.98      0.98      5745

0.9756309834638817


## Repeat SVM Classifier using Combined Training Dataset

In [18]:
classifier_svm = train_svm(trainall_vectors, trainall_lab)  # training the SVM model
pk.dump(classifier_svm, open("./models/classifier_combined_svm.pk","wb"))
predSVM = classifier_svm.predict(trainall_vectors) 
cm1=pd.crosstab( pd.Series(trainall_lab), pd.Series(predSVM), rownames= ['actuals'], colnames=['pred'],margins=True)
cm1




pred,-1,1,All
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,11371,937,12308
1,635,13131,13766
All,12006,14068,26074


In [19]:
print (classification_report(predSVM,  trainall_lab))
print (accuracy_score(predSVM, trainall_lab))

             precision    recall  f1-score   support

         -1       0.92      0.95      0.94     12006
          1       0.95      0.93      0.94     14068

avg / total       0.94      0.94      0.94     26074

0.9397100559944772
