# Classification on Pre-processed Data

In [1]:
from os import getcwd, chdir
import re
import numpy as np
import pickle as pk
import pandas as pd

from nltk.metrics import ConfusionMatrix
from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.classify import accuracy
from nltk.tokenize import word_tokenize as wt

from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SelectKBest
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import chi2
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import accuracy_score

# Change your path here
fpath = getcwd()
print(fpath)

def classification_report_csv(report, csv_name):
    report_data = []
    lines = report.split('\n')
    for line in lines[2:-3]:
        row = {}
        row_data = line.split(' ') 
        row_data = list(filter(None, row_data))
        row['class'] = row_data[0]
        row['precision'] = float(row_data[1])
        row['recall'] = float(row_data[2])
        row['f1_score'] = float(row_data[3])
        row['support'] = float(row_data[4])
        report_data.append(row)
    dataframe = pd.DataFrame.from_dict(report_data)
    dataframe.to_csv(csv_name, index = False)
    
# Just Scraped Data
df_train= pd.read_csv('./data/df_preprocessing_with_negation.csv')
print(df_train.shape)
df_train.head()

/Users/pierlim/PycharmProjects/sent_mining_CA
(26074, 4)


Unnamed: 0.1,Unnamed: 0,rating,review,sentiment
0,0,5,young day lunch choice consist soho wide range...,1
1,1,3,go yelp tripadvisor i want try chinese restaur...,-1
2,2,5,order sichuan prawn singapore rice noodle be a...,1
3,3,3,wong kei be many option find heart london chin...,-1
4,4,1,bad experience restaurant life food service ha...,-1


In [2]:
# Combined Data
# df_train_all = pd.read_csv('./data/df_preprocessing_combined.csv')
# print(df_train_all.shape)
# df_train_all.head()

## Naive Bayes Classifier

In [5]:
# Convert to dictionary format because apparently NLTK requires this
train_pos = [[row["review"], 1] for idx, row in df_train.iterrows() if row["sentiment"]==1]
train_neg = [[row["review"], -1] for idx, row in df_train.iterrows() if row["sentiment"]==-1]

def word_feats(words):
    return dict([(word, True) for word in words])

# need this part for max ent portion
trainset = train_pos + train_neg
train_tokenized = [[wt(str(x)), c] for x,c in trainset] # may need to introduce some pre-processing at this stage for better results
train_featureset = [(word_feats(d), c) for (d,c) in train_tokenized] 

# train_all_pos = [[row["review"], 1] for idx, row in df_train_all.iterrows() if row["sentiment"]==1]
# train_all_neg = [[row["review"], -1] for idx, row in df_train_all.iterrows() if row["sentiment"]==-1]
# trainset_all = train_all_pos + train_all_neg
# train_all_tokenized = [[wt(x), c] for x,c in trainset_all] # may need to introduce some pre-processing at this stage for better results
# train_all_featureset = [(word_feats(d), c) for (d,c) in train_all_tokenized] 

In [6]:
# #wt(trainset_all[1][0])
# train_all_tokenized = []
# for x,c in trainset_all:
#     train_all_tokenized.append([wt(str(x)), c])
    
# #train_all_tokenized = [[wt(x), c] for x,c in trainset_all] # may need to introduce some pre-processing at this stage for better results
# train_all_featureset = [(word_feats(d), c) for (d,c) in train_all_tokenized] 

## Naive Bayes, Just Scraped Data, Top 10 Most Informative Words

In [8]:
## Naive Bayes Rule using nltk
classifier_nb = NaiveBayesClassifier.train(train_featureset)
#print("Accuracy :" +str(accuracy(classifier_nb, test_featureset)))
classifier_nb.show_most_informative_features(10)

## Preparing the data first 
train_nolab = [str(t[0]) for t in trainset]
train_lab = [t[1] for t in trainset]

# Create your tf-idf function
vectorizer = TfidfVectorizer(max_df=0.7, min_df=3, use_idf=True) # modified this
train_vectors = vectorizer.fit_transform(train_nolab)
pk.dump(vectorizer, open("./models/vectorise_negation.pk","wb"))

## train Naive Bayes Rule using sklearn
clf = MultinomialNB().fit(train_vectors, train_lab)
pk.dump(clf, open("./models/classifier_negation_naivebayes.pk","wb"))
predNB = clf.predict(train_vectors)
pred = list(predNB)
cm1=pd.crosstab( pd.Series(train_lab), pd.Series(pred), rownames= ['actuals'], colnames=['pred'],margins=True)
cm1


Most Informative Features
              flavorless = True               -1 : 1      =     64.9 : 1.0
               brazilian = True               -1 : 1      =     50.9 : 1.0
            unacceptable = True               -1 : 1      =     43.0 : 1.0
           neg_returning = True               -1 : 1      =     39.9 : 1.0
                  rudely = True               -1 : 1      =     37.9 : 1.0
               poisoning = True               -1 : 1      =     30.7 : 1.0
                 roomali = True                1 : -1     =     30.5 : 1.0
                downhill = True               -1 : 1      =     28.6 : 1.0
                    bleh = True               -1 : 1      =     28.5 : 1.0
                 disgust = True               -1 : 1      =     27.9 : 1.0


pred,-1,1,All
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,10797,1717,12514
1,1174,12386,13560
All,11971,14103,26074


In [9]:
print (classification_report(pred,  train_lab))
print (accuracy_score(pred, train_lab))

             precision    recall  f1-score   support

         -1       0.86      0.90      0.88     11971
          1       0.91      0.88      0.90     14103

avg / total       0.89      0.89      0.89     26074

0.8891232645547289


In [10]:
#pd.DataFrame(cm1).to_csv('naive_bayes_scraped.csv')
#report = classification_report(pred, train_lab)
#classification_report_csv(report, 'naive_bayes_scraped_report.csv')

In [11]:
nb_result = pd.Series(pred)

## SVM Classifier

In [17]:
from sklearn.svm import SVC

# SVM Classifier from sklearn
def train_svm(X, y):
    """
    Create and train the Support Vector Machine.
    """
    svm = SVC(C=10000.0, gamma='auto', kernel='rbf')
    svm.fit(X, y)
    return svm

classifier_svm = train_svm(train_vectors, train_lab)  # training the SVM model
pk.dump(classifier_svm, open("./models/classifier_negation_svm.pk","wb"))
predSVM = classifier_svm.predict(train_vectors) 
cm1=pd.crosstab( pd.Series(train_lab), pd.Series(predSVM), rownames= ['actuals'], colnames=['pred'],margins=True)
cm1


pred,-1,1,All
actuals,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,11621,893,12514
1,585,12975,13560
All,12206,13868,26074


In [18]:
print (classification_report(predSVM,  train_lab))
print (accuracy_score(predSVM, train_lab))

             precision    recall  f1-score   support

         -1       0.93      0.95      0.94     12206
          1       0.96      0.94      0.95     13868

avg / total       0.94      0.94      0.94     26074

0.9433151798726701


In [19]:
pd.DataFrame(cm1).to_csv('svm_scraped.csv')
report = classification_report(predSVM, train_lab)
classification_report_csv(report, 'svm_scraped_report.csv')

In [20]:
svm_result = pd.Series(predSVM)

In [198]:
df_result = pd.concat([pd.Series(train_lab), nb_result, svm_result], keys=["actual", "nb", "svm"], axis=1)

In [183]:
df_result_all = pd.concat([pd.Series(trainall_lab), nb_result_all, svm_result_all], keys=["actual", "nb_combined", "svm_combined"], axis=1)
df_result_all.head()

Unnamed: 0,actual,nb_combined,svm_combined
0,1,1,1
1,1,1,1
2,1,-1,-1
3,1,1,1
4,1,1,1


In [184]:
df_result_all.to_csv('combined_result.csv')

In [201]:
df_result.to_csv('result.csv')

## Conclusion

After pre-processing, we saw some improvement in the results. We decided to use these models to test on the test data. 