In [9]:
#The purpose of this notebook is to use Machine Learning (M.L.) in order to classify
#amazon book reviews into three different sentiments: negative, neutral and positive.

In [10]:
#This is the outlined procedure:

# 1. Collect each set of books and merge all the books together.
# 2. Get rid of all the columns except "Review Text" and "Review Score".
# 3. Clean up "Review Text" column (text only) without modifying order/number of rows.
# 4. Check to see if there is a 1:1 relationship between len("Review Text") and len("Review Score").
# 5. Convert "Review Score" to multiple classes: "Negative", "Neutral" and "Positive" classes.
# 6. Call function to build maxtrix; will have in the end: X, y and vectorizer.
# 7. Now with X and y, can use any M.L. classifier.

In [11]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
from six.moves import range

from pandas import Series, DataFrame
from sklearn.feature_extraction.text import CountVectorizer
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [12]:
#Read in all book reviews

books = ["Andy-Weir-The-Martian.csv", "Donna-Tartt-The-Goldfinch.csv", 
         "EL-James-Fifty-Shades-of-Grey.csv", "Fillian_Flynn-Gone_Girl.csv", 
         "John-Green-The-Fault-in-our-Stars.csv", "Laura-Hillenbrand-Unbroken.csv", 
         "Paula_Hawkins-The-Girl-On-The-Train.csv", "Suzanne-Collins-The-Hunger-Games.csv"]

In [13]:
#From each book that is read in create a pandas data frame for each one

all_reviews = []
def all_books(books):
    
    
    i = 0  
    while i < len(books):
        more_elements = pd.read_csv(books[i], sep='\t', 
                  names = ["Review Score", "Tail of Review URL", "Review Title", "Review Text"])  
        all_reviews.append(more_elements) 
        i += 1 
        
    return(all_reviews)
   
all_books(books)

[       Review Score                                 Tail of Review URL                                       Review Title                                        Review Text
 0               4.0  /gp/customer-reviews/RKMO449VT48H3?ASIN=149159...                                 4.7573214851 Stars  <span class="a-size-base review-text">I'm a ha...
 1               3.0  /gp/customer-reviews/R3RDNZNCOMRN0K?ASIN=14915...                           Who needs nail clippers?  <span class="a-size-base review-text">"I'm str...
 2               4.0  /gp/customer-reviews/R1TC15NPCF9GMW?ASIN=14915...                                  Abandoned on Mars  <span class="a-size-base review-text">A futuri...
 3               5.0  /gp/customer-reviews/RT3R8XN5KZZGW?ASIN=149159...                                    Excellent Story  <span class="a-size-base review-text">Follow t...
 4               5.0  /gp/customer-reviews/R32NNLGY7QGRVJ?ASIN=14915...                       Inventive, humorous, tedious  <span 

In [14]:
#Combining all data frames to work off of one big data frame

#Concat function used here
comb_DFs = pd.concat([all_reviews[0], all_reviews[1], all_reviews[2], all_reviews[3], all_reviews[4], all_reviews[5], all_reviews[6], all_reviews[7]], ignore_index=True)

In [15]:
#To verify that combining data frames worked: Checking that number of reviews in new big data frame matches sum of reviews from all books

len(comb_DFs)

243269

In [16]:
#Took a look at the first entries of the big data frame

comb_DFs.head(10)

Unnamed: 0,Review Score,Tail of Review URL,Review Title,Review Text
0,4.0,/gp/customer-reviews/RKMO449VT48H3?ASIN=149159...,4.7573214851 Stars,"<span class=""a-size-base review-text"">I'm a ha..."
1,3.0,/gp/customer-reviews/R3RDNZNCOMRN0K?ASIN=14915...,Who needs nail clippers?,"<span class=""a-size-base review-text"">""I'm str..."
2,4.0,/gp/customer-reviews/R1TC15NPCF9GMW?ASIN=14915...,Abandoned on Mars,"<span class=""a-size-base review-text"">A futuri..."
3,5.0,/gp/customer-reviews/RT3R8XN5KZZGW?ASIN=149159...,Excellent Story,"<span class=""a-size-base review-text"">Follow t..."
4,5.0,/gp/customer-reviews/R32NNLGY7QGRVJ?ASIN=14915...,"Inventive, humorous, tedious","<span class=""a-size-base review-text"">This is ..."
5,5.0,/gp/customer-reviews/R14NNZV8RFYM5K?ASIN=14915...,Cool science and tech...but no life on Mars,"<span class=""a-size-base review-text"">The Mart..."
6,3.0,/gp/customer-reviews/R7IJIAHW6TK62?ASIN=149159...,Hard Sci-Fi For The Win!,"<span class=""a-size-base review-text"">I can't ..."
7,3.0,/gp/customer-reviews/R62IPW4T33YZ4?ASIN=149159...,Wonderful,"<span class=""a-size-base review-text"">A fascin..."
8,5.0,/gp/customer-reviews/R3GFO6M9HJB5KZ?ASIN=14915...,"Science is great, writing is fair.","<span class=""a-size-base review-text"">Sorry......"
9,5.0,/gp/customer-reviews/R1RZMRWYGW49DM?ASIN=14915...,Best physics class ever!,"<span class=""a-size-base review-text"">I just f..."


In [17]:
#Separating out the reviews column from the big data frame.
#Also verifying that it has the same amount of rows as in the big data frame

reviews = comb_DFs['Review Text']
len(reviews)

243269

In [18]:
##Separating out the scores column from the big data frame.
#Also verifying that it has the same amount of rows as in the big data frame


scores = comb_DFs['Review Score']
len(scores)

243269

In [19]:
#Taking a look at what a raw review looks like, specifically at index #: 243268

reviews[243268]

'<span class="a-size-base review-text">If I could rate this book a zero I would... The book was to all over the place.. Ive herd many good things about this book aand the others...Woorst book I hae EVER reead  Dont read it!!!!!!!!</span>'

In [20]:
#Cleaning up the reviews so that what is remaining is pure text

import re
import string

r_all = [] #not sure if ok with leaving all the re-made reviews as a list 
            #vs. as the original pandas.series obj
def cleaned_reviews(reviews):
    for r in reviews:
        r = r.lstrip('<span class="a-size-base review-text">')
        r = r.rstrip('</span>')
        
        r.replace('<br/><br/>', ' ')
        r.replace('<br/>', ' ')
        r.replace('\\','') 
           
        table = string.maketrans("","") 
        r = r.translate(table, string.punctuation)
        r_all.append(r) #not sure about appending r to a list since want a series 

    return r_all
    
cleaned_reviews(reviews)

['Im a hardscience science fiction fan and would rather read hard scfi than almost anything I love stories and movies about Mars and Im a fan of survival castaway and managainstthe elements stories I loved Robinson Crusoe so it should not surprise you that I loved the movie Robinson Crusoe on Mars I realize its not Academy Award material but to me its everything I want it to be as was this book The MartianbrbrThe main character Watney presumed dead is accidentally left by his crew mates when an intense Martian dust storm forces them to abort their mission What follows for part of the book is a logbook style narrative that describes in great technical detail Watneys efforts to extend his life until the next scheduled mission arrives in 4 years After reading just the first 20 of the book my Kindle has no page numbers one cant help but be impressed by the authors depth of knowledge in this regard In fact the entire book is an astronauts primer on extraterrestrial and deep space survival a

In [21]:
#Note from output: # UNICODE to ASCII replacement not needed since Unicode is just the way Python internally represents strings.

In [22]:
#Check for cleaned_reviews function, specifically review at idx #: 243268. 
#Verifying that text is clean. 

r_all[243268]

'If I could rate this book a zero I would The book was to all over the place Ive herd many good things about this book aand the othersWoorst book I hae EVER reead  Dont read it'

In [23]:
#Checking to see if number of reviews is same as number of reviews in big data frame.

len(r_all)

243269

In [24]:
#Vectorizing reviews and scores (along with changing scores into three different sentiment classes) 

#Thank you to Harvard CS109 course and my mentor, AJ, for help with this portion.

def buildMatrixAndVector(r_all, scores, vectorizer=None):
    
    # initialize vectorizer if none is provided
    if vectorizer is None:
        vectorizer = CountVectorizer()
    
    # create X using the vectorizer on the reviews
    X = vectorizer.fit_transform(r_all)
    
    # check out the type and shape of X
    print("... in buildMatrixAndVector: ")
    print("... type of X as returned by vectorizer.fit_transform(reviews): " + str(type(X)))
    print("... shape of X as returned by vectorizer.fit_transform(reviews): " + str(X.shape))
    
    # transform X to compressed Sparse Column format (CSC)
    X = X.tocsc()
    
    # check out the type and shape of X after the transformation
    print("... in buildMatrixAndVector: ")
    print("... type of X as transformed by tocsc(): " + str(type(X)))
    print("... shape of X as as transformed by tocsc(): " + str(X.shape))
    
    # now get y from labels
    # in this case this is a multi-class problem, so we 
    # transform {1, 2} to 1; {3} to 2; and {4, 5} to 3
    # for sentiment classification - {1,2} = negative, {3} = neutral, {4,5} = positive
    y = scores.apply(lambda x: 1 if x in range(1,3) 
                          else 3 if x in range(4,6) 
                          else 2)
    
    # check out the type and shape of y
    print("... in buildMatrixAndVector: ")
    print("... type of y: " + str(type(y)))
    print("... length of y: " + str(y.shape))
    
    # return what we have built, including the vectorizer object
    return X, y, vectorizer

In [25]:
#Calling function using all of the cleaned up reviews and its paired scores

X, y, vectorizer = buildMatrixAndVector(r_all, scores)

... in buildMatrixAndVector: 
... type of X as returned by vectorizer.fit_transform(reviews): <class 'scipy.sparse.csr.csr_matrix'>
... shape of X as returned by vectorizer.fit_transform(reviews): (243269, 153408)
... in buildMatrixAndVector: 
... type of X as transformed by tocsc(): <class 'scipy.sparse.csc.csc_matrix'>
... shape of X as as transformed by tocsc(): (243269, 153408)
... in buildMatrixAndVector: 
... type of y: <class 'pandas.core.series.Series'>
... length of y: (243269,)


In [26]:
#Using M.L. Classifier to predict reviews' sentiments

from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
clf = MultinomialNB().fit(xtrain, ytrain)

training_accuracy = clf.score(xtrain, ytrain)
test_accuracy = clf.score(xtest, ytest)

print "Accuracy on training data: %0.2f" % (training_accuracy)
print "Accuracy on test data:     %0.2f" % (test_accuracy)



Accuracy on training data: 0.75
Accuracy on test data:     0.72


In [27]:
#Training and testing accuracies are pretty close. 
#This means that model does not overfit on training data.
#The model is able to generalize enough in order to 
#predict well with new test data.

#Recommendations for improvement:

# 1. Create an Unsupervised Learning model (without y-values) for model to train on.
# This would replicate reality better. 
# Usually a model is given test data that is completely new, 
# not from the original data set at all.

#2. Perform sentiment analysis by transforming text even more
# (clear stop words, set thresholds for acceptable words in corpus 
# based on number of appearances in corpus) 
#and using spaCy and Keras' neural networks to perform sentiment analysis on the reviews.