# Classification on Pre-processed Data

In [203]:
from os import getcwd, chdir
import re
import numpy as np
import pickle as pk
import pandas as pd

from nltk.metrics import ConfusionMatrix
from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.classify import accuracy
from nltk.tokenize import word_tokenize as wt

from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SelectKBest
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import chi2
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import accuracy_score

# Change your path here
fpath = getcwd()
print(fpath)

def classification_report_csv(report, csv_name):
    report_data = []
    lines = report.split('\n')
    for line in lines[2:-3]:
        row = {}
        row_data = line.split(' ') 
        row_data = list(filter(None, row_data))
        row['class'] = row_data[0]
        row['precision'] = float(row_data[1])
        row['recall'] = float(row_data[2])
        row['f1_score'] = float(row_data[3])
        row['support'] = float(row_data[4])
        report_data.append(row)
    dataframe = pd.DataFrame.from_dict(report_data)
    dataframe.to_csv(csv_name, index = False)
    
# Just Scraped Data
df_train= pd.read_csv('./data/df_preprocessing_scraped.csv')
print(df_train.shape)
df_train.head()

/Users/pierlim/PycharmProjects/sent_mining_CA
(2594, 3)


Unnamed: 0,rating,review,sentiment
0,3,take nice bite cheese swig wine trust divine m...,-1
1,5,london long stand indian restaurant hold title...,1
2,5,best ramen london far place amaze want try ori...,1
3,5,dishoom please come nyc love restaurant husban...,1
4,4,wander soho drizzly cold summer day really aug...,1


In [138]:
# Combined Data
df_train_all = pd.read_csv('./data/df_preprocessing_combined.csv')
print(df_train_all.shape)
df_train_all.head()

(26074, 3)


Unnamed: 0,rating,review,sentiment
0,5,young day lunch choice consist sohos wide rang...,1
1,3,go yelp tripadvisor want try chinese restauran...,-1
2,5,order sichuan prawn singapore rice noodle amaz...,1
3,3,wong kei many option youll find heart london c...,-1
4,1,bad experience restaurant life food service fr...,-1


## Naive Bayes Classifier

In [139]:
# Convert to dictionary format because apparently NLTK requires this
train_pos = [[row["review"], 1] for idx, row in df_train.iterrows() if row["sentiment"]==1]
train_neg = [[row["review"], -1] for idx, row in df_train.iterrows() if row["sentiment"]==-1]

def word_feats(words):
    return dict([(word, True) for word in words])

# need this part for max ent portion
trainset = train_pos + train_neg
train_tokenized = [[wt(x), c] for x,c in trainset] # may need to introduce some pre-processing at this stage for better results
train_featureset = [(word_feats(d), c) for (d,c) in train_tokenized] 

train_all_pos = [[row["review"], 1] for idx, row in df_train_all.iterrows() if row["sentiment"]==1]
train_all_neg = [[row["review"], -1] for idx, row in df_train_all.iterrows() if row["sentiment"]==-1]
trainset_all = train_all_pos + train_all_neg
# train_all_tokenized = [[wt(x), c] for x,c in trainset_all] # may need to introduce some pre-processing at this stage for better results
# train_all_featureset = [(word_feats(d), c) for (d,c) in train_all_tokenized] 

In [140]:
#wt(trainset_all[1][0])
train_all_tokenized = []
for x,c in trainset_all:
    train_all_tokenized.append([wt(str(x)), c])
    
#train_all_tokenized = [[wt(x), c] for x,c in trainset_all] # may need to introduce some pre-processing at this stage for better results
train_all_featureset = [(word_feats(d), c) for (d,c) in train_all_tokenized] 

## Naive Bayes, Just Scraped Data, Top 10 Most Informative Words

In [185]:
## Naive Bayes Rule using nltk
classifier_nb = NaiveBayesClassifier.train(train_featureset)
#print("Accuracy :" +str(accuracy(classifier_nb, test_featureset)))
classifier_nb.show_most_informative_features(10)

## Preparing the data first 
train_nolab = [t[0] for t in trainset]
train_lab = [t[1] for t in trainset]

# Create your tf-idf function
vectorizer = TfidfVectorizer(max_df=0.7, min_df=3, use_idf=True) # modified this
train_vectors = vectorizer.fit_transform(train_nolab)
pk.dump(vectorizer, open("./models/vectorise_scraped.pk","wb"))

## train Naive Bayes Rule using sklearn
clf = MultinomialNB().fit(train_vectors, train_lab)
pk.dump(clf, open("./models/classifier_scraped_naivebayes.pk","wb"))
predNB = clf.predict(train_vectors)
pred = list(predNB)
cm1=pd.crosstab( pd.Series(train_lab), pd.Series(pred), rownames= ['actuals'], colnames=['pred'],margins=True)
cm1


Most Informative Features
                mediocre = True               -1 : 1      =     20.2 : 1.0
           disappointing = True               -1 : 1      =     14.7 : 1.0
           underwhelming = True               -1 : 1      =     13.7 : 1.0
                shouldnt = True               -1 : 1      =     13.0 : 1.0
                terrible = True               -1 : 1      =     12.8 : 1.0
                   awful = True               -1 : 1      =     12.2 : 1.0
                   worst = True               -1 : 1      =     11.7 : 1.0
               tasteless = True               -1 : 1      =     11.7 : 1.0
                 improve = True               -1 : 1      =     10.3 : 1.0
                   count = True               -1 : 1      =     10.3 : 1.0


pred,-1,1,All
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,1156,141,1297
1,163,1134,1297
All,1319,1275,2594


In [186]:
print (classification_report(pred,  train_lab))
print (accuracy_score(pred, train_lab))

             precision    recall  f1-score   support

         -1       0.89      0.88      0.88      1319
          1       0.87      0.89      0.88      1275

avg / total       0.88      0.88      0.88      2594

0.8828064764841943


In [187]:
pd.DataFrame(cm1).to_csv('naive_bayes_scraped.csv')
report = classification_report(pred, train_lab)
classification_report_csv(report, 'naive_bayes_scraped_report.csv')

In [188]:
nb_result = pd.Series(pred)

## Naive Bayes Classifier Using Combined Training Set, Top 10 Most Informative Words

In [189]:

trainall_nolab = [str(t[0]) for t in trainset_all]
trainall_lab = [t[1] for t in trainset_all]


'young day lunch choice consist sohos wide range eatery oppose today regiment pack lunch spot consistent favourite wong kei even long day work follow lengthy meeting beer decide revisit favourite old haunt good ever order standard ho fun beef scramble egg dish sum love restaurant hearty authentic homestyle cook fill budget fine ingredient simplicity nothing fancy plate wet noodle generous help tender beef egg taste mildly salty slight spring onion sweetness isnt much else get however sense warmth comfort bite place offer price point wong kei long infamous abrasive staff unassuming environment thing problem personally dish provide care way feed way encapsulate chinese way express love word many people appreciate open notice thing place forever love place hope stay forever little pro tip consume eat make use plentiful chilli oil crucial dish rarest perk today standard free tea wong kei please dont change'

In [190]:
## Naive Bayes Rule using nltk
classifier_nb = NaiveBayesClassifier.train(train_all_featureset)
#print("Accuracy :" +str(accuracy(classifier_nb, test_featureset)))
classifier_nb.show_most_informative_features(10)



# Create your tf-idf function
vectorizer_all = TfidfVectorizer(max_df=0.7, min_df=3, use_idf=True) # modified this
trainall_vectors = vectorizer_all.fit_transform(trainall_nolab)
pk.dump(vectorizer_all, open("./models/vectorise_combined.pk","wb"))

## train Naive Bayes Rule using sklearn
clf = MultinomialNB().fit(trainall_vectors, trainall_lab)
pk.dump(clf, open("./models/classifier_combined_naivebayes.pk","wb"))
predNB = clf.predict(trainall_vectors)
pred = list(predNB)
cm1=pd.crosstab( pd.Series(trainall_lab), pd.Series(pred), rownames= ['actuals'], colnames=['pred'],margins=True)
cm1


Most Informative Features
               brazilian = True               -1 : 1      =     56.0 : 1.0
              flavorless = True               -1 : 1      =     52.4 : 1.0
            unacceptable = True               -1 : 1      =     44.4 : 1.0
                  rudely = True               -1 : 1      =     40.8 : 1.0
                     ugh = True               -1 : 1      =     36.5 : 1.0
                 roomali = True                1 : -1     =     31.1 : 1.0
                downhill = True               -1 : 1      =     28.0 : 1.0
                    bleh = True               -1 : 1      =     27.1 : 1.0
                 disgust = True               -1 : 1      =     25.5 : 1.0
                   worst = True               -1 : 1      =     24.3 : 1.0


pred,-1,1,All
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,10662,1852,12514
1,1175,12385,13560
All,11837,14237,26074


In [191]:
print (classification_report(pred,  trainall_lab))
print (accuracy_score(pred, trainall_lab))

nb_result_all = pd.Series(pred)

             precision    recall  f1-score   support

         -1       0.85      0.90      0.88     11837
          1       0.91      0.87      0.89     14237

avg / total       0.89      0.88      0.88     26074

0.8839073406458541


In [192]:
#df_train_all = pd.concat([df_train_all, pd.Series(pred)], axis=1)

In [193]:
pd.DataFrame(cm1).to_csv('naive_bayes_combined.csv')
report = classification_report(pred, trainall_lab)
classification_report_csv(report, 'naive_bayes_combined_report.csv')

## SVM Classifier

In [194]:
from sklearn.svm import SVC

# SVM Classifier from sklearn
def train_svm(X, y):
    """
    Create and train the Support Vector Machine.
    """
    svm = SVC(C=10000.0, gamma='auto', kernel='rbf')
    svm.fit(X, y)
    return svm

classifier_svm = train_svm(train_vectors, train_lab)  # training the SVM model
pk.dump(classifier_svm, open("./models/classifier_scraped_svm.pk","wb"))
predSVM = classifier_svm.predict(train_vectors) 
cm1=pd.crosstab( pd.Series(train_lab), pd.Series(predSVM), rownames= ['actuals'], colnames=['pred'],margins=True)
cm1


pred,-1,1,All
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,1291,6,1297
1,11,1286,1297
All,1302,1292,2594


In [195]:
print (classification_report(predSVM,  train_lab))
print (accuracy_score(predSVM, train_lab))

             precision    recall  f1-score   support

         -1       1.00      0.99      0.99      1302
          1       0.99      1.00      0.99      1292

avg / total       0.99      0.99      0.99      2594

0.9934464148033925


In [204]:
pd.DataFrame(cm1).to_csv('svm_scraped.csv')
report = classification_report(predSVM, train_lab)
classification_report_csv(report, 'svm_scraped_report.csv')

In [205]:
svm_result = pd.Series(predSVM)

## Repeat SVM Classifier using Combined Training Dataset

In [206]:
classifier_svm = train_svm(trainall_vectors, trainall_lab)  # training the SVM model
pk.dump(classifier_svm, open("./models/classifier_combined_svm.pk","wb"))
predSVM = classifier_svm.predict(trainall_vectors) 
cm1=pd.crosstab( pd.Series(trainall_lab), pd.Series(predSVM), rownames= ['actuals'], colnames=['pred'],margins=True)
cm1




pred,-1,1,All
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,11515,999,12514
1,705,12855,13560
All,12220,13854,26074


In [207]:
print (classification_report(predSVM,  trainall_lab))
print (accuracy_score(predSVM, trainall_lab))
svm_result_all = pd.Series(predSVM)

             precision    recall  f1-score   support

         -1       0.92      0.94      0.93     12220
          1       0.95      0.93      0.94     13854

avg / total       0.93      0.93      0.93     26074

0.9346475416123341


In [208]:
pd.DataFrame(cm1).to_csv('svm_scraped_combined.csv')
report = classification_report(predSVM, trainall_lab)
classification_report_csv(report, 'svm_scraped_combined_report.csv')

In [198]:
df_result = pd.concat([pd.Series(train_lab), nb_result, svm_result], keys=["actual", "nb", "svm"], axis=1)

In [183]:
df_result_all = pd.concat([pd.Series(trainall_lab), nb_result_all, svm_result_all], keys=["actual", "nb_combined", "svm_combined"], axis=1)
df_result_all.head()

Unnamed: 0,actual,nb_combined,svm_combined
0,1,1,1
1,1,1,1
2,1,-1,-1
3,1,1,1
4,1,1,1


In [184]:
df_result_all.to_csv('combined_result.csv')

In [201]:
df_result.to_csv('result.csv')

## Conclusion

After pre-processing, we saw some improvement in the results. 
We decided to use these models to test on the test data. 

Note that the scraped data has much less data, and it was able to get a good accuracy. 
Doing predictions for a larger set of data, the combined data gave a lower accuracy.
We will run both these models against the test data as we feel that there is a possibility that the combined data model is more generalized. 