### Relevant Data Extraction

In [157]:
import pandas as pd
df = pd.read_json("Kindle_Store_5.json", lines=True)

In [158]:
#Negative reviews 1 or 2 stars - Positive reviews 5 stars

df = df.drop(['asin', 'helpful', 'reviewTime', 'reviewerID', 'reviewerName', 'unixReviewTime'], axis=1)
#df = df[df.overall != 2]
df = df[df.overall != 3]
df = df[df.overall != 4]
df['positive'] = df['overall'] == 5
df = df.drop('overall', axis=1)
df.head()

Unnamed: 0,reviewText,summary,positive
0,I enjoy vintage books and movies so I enjoyed ...,Nice vintage story,True
3,I'd never read any of the Amy Brewster mysteri...,I really liked it.,True
8,Darth Maul working under cloak of darkness com...,Darth Maul,True
10,I think I have this one in both book and audio...,Audio and book,True
14,I am not for sure on how much of a difference ...,Possibly Important,True


In [164]:
import pickle
reviewList = []
max_l = 57000
p = n = 0
for s, r, t in zip(list(df['summary']), list(df['reviewText']), list(df['positive'])):
    if s!="" and r!="":#no empty reviews
        if t:
            if p<max_l:
                reviewList.append((s,r,t))
                p+=1
        else:
            if n<max_l:
                reviewList.append((s,r,t))
                n+=1
#summary, review, if-positive
print("Positive Count:{} Negative Count: {}".format(p,n))

Positive Count:57000 Negative Count: 57000


In [160]:
with open('kindlereviews.pkl', 'wb') as f:
    pickle.dump(reviewList, f)

### Pre-Processing

In [1]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk import word_tokenize, pos_tag
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\osman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\osman\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\osman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\osman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
class PreProcess:
    
    def __init__(self, stem='lemma'):
        self.pos_keep = set("JJ, JJR, JJS, MD, NN, NNS, NNP, NNPS, PDT, POS, PRP, PRP$, RB, RBR, RBS, RP, UH,\
VB, VBD, VBG, VBN, VBP, VBZ, WDT, WP, WRB, WP$".split(", "))
        self.tokenizer = TreebankWordTokenizer()
        self.stopWords = set(stopwords.words('english'))

        self.lemma = stem == 'lemma'
        self.wordnet_lemmatizer = WordNetLemmatizer()

        self.porter_stemmer = PorterStemmer()


    def process(self, review):
        review = review.lower()#lower-case
        review = pos_tag(self.tokenizer.tokenize(review))#tokenize and tag
        #print(review)
        new_review = []
        for w, p in review:#remove stopwords and other pos_tags
            if w not in self.stopWords and p in self.pos_keep:
                new_w = self.wordnet_lemmatizer.lemmatize(w.strip("."))
                new_w = self.porter_stemmer.stem(new_w)
                new_review.append(new_w)

        #remove stopwords and keep certain POS_tags
        return new_review

In [3]:
import pickle
with open('kindlereviews.pkl', 'rb') as f:
    reviews = pickle.load(f)
    
pp=PreProcess()
    
print(reviews[0][1])
print(" ".join(pp.process(reviews[0][1])))

I enjoy vintage books and movies so I enjoyed reading this book.  The plot was unusual.  Don't think killing someone in self-defense but leaving the scene and the body without notifying the police or hitting someone in the jaw to knock them out would wash today.Still it was a good read for me.
enjoy vintag book movi enjoy read book plot unusu n't kill someon self-defens leav scene bodi notifi polic hit someon jaw would good read


In [5]:
import threading#takes a while
def process_worker(i,n):
    for idx in range(n*i, n*(i+1)):
        s = pp.process(reviews[idx][0])
        r = pp.process(reviews[idx][1])
        t = reviews[idx][2]
        
        reviews[idx] = ("DONE", s,r,t)
        
t=[]
num_w = 4
n = len(reviews)//num_w
for i in range(num_w):
    t.append(threading.Thread(target=process_worker, args=(i,n)))
    
for i in range(num_w):
    t[i].start()
    
for i in range(num_w):
    t[i].join()

print("Done")

Done


In [8]:
reviews_processed = []

for idx, r in enumerate(reviews):
    if r[0]=="DONE":
        reviews_processed.append((r[1],r[2],r[3]))

In [9]:
with open('kindlereviews_processed.pkl', 'wb') as f:
    pickle.dump(reviews_processed, f)

### Vocabulary and Frequency

In [41]:
import pickle
with open('kindlereviews_processed.pkl', 'rb') as f:
    reviews_processed = pickle.load(f)
full_reviews = [x[1] for x in reviews_processed]
predictions = [x[2] for x in reviews_processed]

In [42]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(full_reviews, predictions, test_size=0.2)

all_words = []
for r in X_train:
    all_words.extend(r)
    
from collections import Counter
word_counts = Counter(all_words)
vocab_list = list(word_counts.most_common(10000))

In [43]:
#Count Vectorizer function
vocabulary = {}
idx = 0
for x in vocab_list:
    vocabulary[x[0]] = idx
    idx+=1
            
#filter
for idx in range(len(X_train)):
    X_train[idx] = " ".join( list(filter(lambda x: x in vocabulary, X_train[idx])))

for idx in range(len(X_test)):
    X_test[idx] = " ".join( list(filter(lambda x: x in vocabulary, X_test[idx])))

In [57]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [63]:
from sklearn.metrics import accuracy_score

def get_accuracy(clf):
    preds = clf.predict(X_test_tfidf)
    return accuracy_score(preds, y_test) * 100

### Naive-Bayes

In [64]:
from sklearn.naive_bayes import MultinomialNB
nb_clf = MultinomialNB().fit(X_train_tfidf, y_train)

print ("Naive-Bayes classifier accuracy is {:.2f}%".format(get_accuracy(nb_clf)))

Naive-Bayes classifier accuracy is 89.72%


### Random Forest

In [65]:
from sklearn.ensemble import RandomForestClassifier
forest_classifier = RandomForestClassifier(min_samples_leaf=20).fit(X_train_tfidf, y_train)

print ("Random Forest classifier accuracy is {:.2f}%".format(get_accuracy(forest_classifier)))

Random Forest classifier accuracy is 85.06%


### SVM

In [None]:
from sklearn import svm
svm_classifier = svm.SVC(gamma=0.001,kernel='rbf').fit(X_train_tfidf, y_train)#RBF kernel

print ("SVM accuracy is {:.2f}%".format(get_accuracy(svm_classifier)))