In [44]:
import ast
import datetime as dt
import gzip
import itertools
import string
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as pl

from sklearn import svm
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

%matplotlib inline

In [4]:
# Functions for Loading Data

def parse_gz(path):
    """Yield one parsed record per non-blank line of a gzip-compressed file.

    Each line is expected to hold a single Python literal (for the review
    dump, one dict per line). JSON objects that only use strings, numbers,
    lists and nested objects are valid Python literals too; JSON's
    true/false/null are not and are rejected.

    Parameters
    ----------
    path : str or path-like
        Location of the gzip file to read.

    Yields
    ------
    object
        The value of each line's literal.

    Raises
    ------
    ValueError, SyntaxError
        If a non-blank line is not a valid Python literal.
    """
    # The context manager closes the handle once the generator is exhausted,
    # closed early, or aborted by a parse error (it used to be leaked).
    with gzip.open(path, 'rb') as handle:
        for raw_line in handle:
            # Decode per line instead of opening in text mode so records are
            # still split on b'\n' only.
            line = raw_line.decode('utf-8').strip()
            if not line:
                # Tolerate blank lines, e.g. a trailing newline at EOF.
                continue
            # SECURITY: this used to be eval(), which runs arbitrary code
            # embedded in the data file. literal_eval only accepts literals
            # (dict, list, tuple, set, str, bytes, numbers, True/False/None).
            yield ast.literal_eval(line)

def convert_to_DF(path):
    """Load every record yielded by ``parse_gz(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the file; that position
    becomes the row index of the returned frame.
    """
    indexed_records = dict(enumerate(parse_gz(path)))
    return pd.DataFrame.from_dict(indexed_records, orient='index')


In [7]:
# Load the gzipped Sports and Outdoors review file into a DataFrame
# (convert_to_DF produces one row per record parsed from the file).
sports_products = convert_to_DF('../data/reviews_Sports_and_Outdoors_5.json.gz')

In [9]:
# Row count with thousands separators.
print(f'Dataset size: {len(sports_products):,}')

Dataset size: 296,337


In [11]:
sports_products.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]",This came in on time and I am veru happy with ...,5.0,Woks very good,1390694400,"01 26, 2014"
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]",I had a factory Glock tool that I was using fo...,5.0,Works as well as the factory tool,1328140800,"02 2, 2012"
2,AB2W04NI4OEAD,1881509818,J. Fernald,"[2, 2]",If you don't have a 3/32 punch or would like t...,4.0,"It's a punch, that's all.",1330387200,"02 28, 2012"
3,A148SVSWKTJKU6,1881509818,"Jusitn A. Watts ""Maverick9614""","[0, 0]",This works no better than any 3/32 punch you w...,4.0,It's a punch with a Glock logo.,1328400000,"02 5, 2012"
4,AAAWJ6LW9WMOO,1881509818,Material Man,"[0, 0]",I purchased this thinking maybe I need a speci...,4.0,"Ok,tool does what a regular punch does.",1366675200,"04 23, 2013"


In [12]:
sports_products.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 296337 entries, 0 to 296336
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   reviewerID      296337 non-null  object 
 1   asin            296337 non-null  object 
 2   reviewerName    294935 non-null  object 
 3   helpful         296337 non-null  object 
 4   reviewText      296337 non-null  object 
 5   overall         296337 non-null  float64
 6   summary         296337 non-null  object 
 7   unixReviewTime  296337 non-null  int64  
 8   reviewTime      296337 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 32.6+ MB


In [14]:
# Parse the raw 'MM D, YYYY' review-date strings into pandas datetimes (overwrites the column).

sports_products["reviewTime"] = pd.to_datetime(sports_products["reviewTime"])

In [18]:
# Count reviews and distinct products, using 'asin' as the product key.
# nunique() counts distinct values directly; the previous
# np.count_nonzero(unique()) only counted *truthy* unique values.

print("Total product ASINs : {}".format( sports_products['asin'].count() ))
print("Total unique product ASINs : {}".format( sports_products['asin'].nunique() ))

Total product ASINs : 296337
Total unique product ASINs : 18357


In [29]:
# Number of reviews per product, indexed by ASIN.
products = sports_products.groupby('asin')['overall'].count()

# Most-reviewed products first.
sorted_products = products.sort_values(ascending=False)

# Top 10 Most Reviewed Products

print("Top 10 Reviewed Products:\n")
print(sorted_products[:10], end='\n\n')

# Most reviewed product is B001HBHNHE, with 1,042 reviews.

Top 10 Reviewed Products:

asin
B001HBHNHE    1042
B001T7QJ9O     763
B000S5ODN2     647
B0010O748Q     513
B0000C50K3     427
B002ZYRV2E     401
B002OKWHVO     398
B000GCRWCG     393
B001HBHNHY     372
B0035L35A8     359
Name: overall, dtype: int64



In [32]:
# Bottom 10 least-reviewed products: the last ten entries of the
# descending sort in sorted_products.

print("Bottom 10 Reviewed Products:\n")
print(sorted_products[-10:], end='\n\n')

# The least reviewed products each have 5 reviews, the minimum in this dataset.

Bottom 10 Reviewed Products:

asin
B00BGI7R38    5
B003WHHXAS    5
B000TTR0JG    5
B003WHA1N4    5
B000TRKTUK    5
B000TTHVYA    5
B003WGUEEG    5
B000TTM3OI    5
B003WDTI78    5
B003Z6HUZE    5
Name: overall, dtype: int64



In [33]:
# Preprocessing for Sentiment Analysis

In [41]:
# View all reviews

sports_products['reviewText']

0         This came in on time and I am veru happy with ...
1         I had a factory Glock tool that I was using fo...
2         If you don't have a 3/32 punch or would like t...
3         This works no better than any 3/32 punch you w...
4         I purchased this thinking maybe I need a speci...
                                ...                        
296332    This is a water bottle done right. It is a ver...
296333    If you're looking for an insulated water bottl...
296334    This Hydracentials Sporty 25 OZ, double insula...
296335    As usual I received this item free in exchange...
296336    Hydracentials insulated 25 oz water bottle.Thi...
Name: reviewText, Length: 296337, dtype: object

In [40]:
# Let us take a look at one sample review (the row labelled 11)

sports_products['reviewText'][11]

"It's cheaply made but does what it is supposed to do. Wish it was USB rechargeable. I don't think it will survive a monsoon but light rain it can handle."

In [53]:
# Load the English stop-word list from the NLTK corpus

stopword_list = stopwords.words('english')

print(f"Example of the stop words : {stopword_list[:10]}")

Example of the stop words : ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [54]:
# Functions for text preprocessing

# Tokenize function

def tokenize(text):
    """Split text into lemmatized tokens with punctuation characters removed.

    The text is tokenized with ``word_tokenize``, every character listed in
    ``string.punctuation`` is removed from each token, tokens left empty by
    that removal are discarded, and the rest are passed to ``lemmatize``.

    Parameters
    ----------
    text : str
        Raw text to tokenize.

    Returns
    -------
    list
        Whatever ``lemmatize`` returns for the cleaned tokens.
    """
    punctuation = set(string.punctuation)
    cleaned = []
    for token in word_tokenize(text):
        stripped = "".join(char for char in token if char not in punctuation)
        # A token consisting only of punctuation (".", ",", "!") collapses to
        # "" here; skip it instead of emitting an empty token.
        if stripped:
            cleaned.append(stripped)
    return lemmatize(cleaned)

# Lemmatization function

def lemmatize(tokens):
    """Return a list with each token passed through WordNetLemmatizer.lemmatize.

    A new WordNetLemmatizer is created on every call; input order is kept.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    return list(map(to_lemma, tokens))