In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

import re
import json 
import nltk
import spacy
import string
import unicodedata
from bs4 import BeautifulSoup
from textblob import TextBlob 
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm

# read data file
data = pd.read_csv('all_kindle_review.csv')
data = data[['reviewText', 'rating']]
data.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4


In [2]:
data["rating"] = data["rating"].apply(lambda x: 1 if x < 3  else 0)
data.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",0
1,Great short read. I didn't want to put it dow...,0
2,I'll start by saying this is the first of four...,0
3,Aggie is Angela Lansbury who carries pocketboo...,0
4,I did not expect this type of book to be in li...,0


In [3]:
# 1. lowering case 
data['reviewText'] = data['reviewText'].str.lower()
data.head()

Unnamed: 0,reviewText,rating
0,"jace rankin may be short, but he's nothing to ...",0
1,great short read. i didn't want to put it dow...,0
2,i'll start by saying this is the first of four...,0
3,aggie is angela lansbury who carries pocketboo...,0
4,i did not expect this type of book to be in li...,0


In [4]:
# 2. remove punctuation
data['reviewText'] = data['reviewText'].apply(lambda x: re.sub('[^a-z A-Z 0-9-]+', '', x))
data.head()

Unnamed: 0,reviewText,rating
0,jace rankin may be short but hes nothing to me...,0
1,great short read i didnt want to put it down ...,0
2,ill start by saying this is the first of four ...,0
3,aggie is angela lansbury who carries pocketboo...,0
4,i did not expect this type of book to be in li...,0


In [5]:
# 3. remove stopwords

from spacy.lang.en.stop_words import STOP_WORDS
data['reviewText'] = data['reviewText'].apply(lambda x: " ".join([i for i in x.split() if i not in STOP_WORDS]) )

data.head()

Unnamed: 0,reviewText,rating
0,jace rankin short hes mess man hauled saloon u...,0
1,great short read didnt want read sitting sex s...,0
2,ill start saying books wasnt expecting 34concl...,0
3,aggie angela lansbury carries pocketbooks inst...,0
4,expect type book library pleased find price right,0


In [6]:
# 4. remove url and tags
data['reviewText'] = data['reviewText'].apply(
    lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '' , str(x))
)
                                              
data.head()

Unnamed: 0,reviewText,rating
0,jace rankin short hes mess man hauled saloon u...,0
1,great short read didnt want read sitting sex s...,0
2,ill start saying books wasnt expecting 34concl...,0
3,aggie angela lansbury carries pocketbooks inst...,0
4,expect type book library pleased find price right,0


In [7]:
# 5. remove html tags
data['reviewText'] = data['reviewText'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())

data.head()

Unnamed: 0,reviewText,rating
0,jace rankin short hes mess man hauled saloon u...,0
1,great short read didnt want read sitting sex s...,0
2,ill start saying books wasnt expecting 34concl...,0
3,aggie angela lansbury carries pocketbooks inst...,0
4,expect type book library pleased find price right,0


In [8]:
# 6. remove emails 
data['reviewText'] = data['reviewText'].apply(lambda x: re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+\b)', '', x))

In [9]:
# 7. remove extra spaces
data['reviewText'] = data['reviewText'].apply(lambda x: " ".join(x.split()))
data.head()

Unnamed: 0,reviewText,rating
0,jace rankin short hes mess man hauled saloon u...,0
1,great short read didnt want read sitting sex s...,0
2,ill start saying books wasnt expecting 34concl...,0
3,aggie angela lansbury carries pocketbooks inst...,0
4,expect type book library pleased find price right,0


In [10]:
# 8. lemmatizer 
nltk.download('wordnet')
nltk.download('omw-1.4')

%time
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

data["reviewText"] = data["reviewText"].apply(lambda text: lemmatize_words(text))
data.head()

[nltk_data] Downloading package wordnet to /Users/xozhang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/xozhang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 2.86 µs


Unnamed: 0,reviewText,rating
0,jace rankin short he mess man hauled saloon un...,0
1,great short read didnt want read sitting sex s...,0
2,ill start saying book wasnt expecting 34conclu...,0
3,aggie angela lansbury carry pocketbook instead...,0
4,expect type book library pleased find price right,0


In [11]:
# data split 
from sklearn.model_selection import train_test_split 
xtrain, xtest, ytrain, ytest = train_test_split(data['reviewText'], data['rating'], test_size = 0.3)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

xtrain_bow = vectorizer.fit_transform(xtrain).toarray()
xtest_bow = vectorizer.transform(xtest).toarray()

In [13]:
# build svm model
clf_linear = svm.SVC(kernel='linear', tol=1e-5)

clf_linear.fit(xtrain_bow, ytrain)

In [14]:
train_predicts = clf_linear.predict(xtrain_bow)


In [15]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score 

def metrics(prediction, actual): 
    print('Confusion_matrix \n', confusion_matrix(actual, prediction))
    print('\nAccuracy:', accuracy_score(actual, prediction))
    print('\nclassification_report\n')
    print(classification_report(actual, prediction))


In [16]:
metrics(train_predicts, ytrain)

Confusion_matrix 
 [[5606   19]
 [  53 2722]]

Accuracy: 0.9914285714285714

classification_report

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5625
           1       0.99      0.98      0.99      2775

    accuracy                           0.99      8400
   macro avg       0.99      0.99      0.99      8400
weighted avg       0.99      0.99      0.99      8400



In [17]:
predicts = clf_linear.predict(xtest_bow)

In [18]:
metrics(predicts, ytest)

Confusion_matrix 
 [[2043  332]
 [ 390  835]]

Accuracy: 0.7994444444444444

classification_report

              precision    recall  f1-score   support

           0       0.84      0.86      0.85      2375
           1       0.72      0.68      0.70      1225

    accuracy                           0.80      3600
   macro avg       0.78      0.77      0.77      3600
weighted avg       0.80      0.80      0.80      3600



In [19]:
# build svm model
clf_rbf = svm.SVC(kernel='rbf', tol=1e-5)

clf_rbf.fit(xtrain_bow, ytrain)

In [20]:
train_predicts_rbf = clf_rbf.predict(xtrain_bow)
metrics(train_predicts_rbf, ytrain)

Confusion_matrix 
 [[5476  149]
 [ 538 2237]]

Accuracy: 0.9182142857142858

classification_report

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      5625
           1       0.94      0.81      0.87      2775

    accuracy                           0.92      8400
   macro avg       0.92      0.89      0.90      8400
weighted avg       0.92      0.92      0.92      8400



In [21]:
predicts_rbf = clf_rbf.predict(xtest_bow)
metrics(predicts_rbf, ytest)

Confusion_matrix 
 [[2220  155]
 [ 532  693]]

Accuracy: 0.8091666666666667

classification_report

              precision    recall  f1-score   support

           0       0.81      0.93      0.87      2375
           1       0.82      0.57      0.67      1225

    accuracy                           0.81      3600
   macro avg       0.81      0.75      0.77      3600
weighted avg       0.81      0.81      0.80      3600



In [22]:
import pickle

file_svm_linear = "svm_linear.pickle"
pickle.dump(clf_linear, open(file_svm_linear, "wb"))

file_svm_rbf = "svm_rbf.pickle"
pickle.dump(clf_rbf, open(file_svm_rbf, "wb"))