In [10]:
# Methods to unpack json file and import as pandas data frame
import json
import pandas as pd
import gzip

# import iplot 
from plotly import __version__
%matplotlib inline
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

# Put plotly and cufflinks into offline notebook mode before any plotting
# cell runs; the `.iplot(...)` calls on pandas objects appear further down.
# NOTE(review): `connected=True` presumably makes the notebook load plotly.js
# from the network rather than embedding it — confirm against the plotly docs.
init_notebook_mode(connected=True)
cf.go_offline()

def parse(path):
  """Lazily yield one decoded JSON value per line of a gzip-compressed file.

  Args:
    path: Location of a gzip file that holds one JSON document per line.

  Yields:
    The result of ``json.loads`` for every non-blank line, in file order.

  Raises:
    OSError: If the file cannot be opened or read as gzip data.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager guarantees the handle is closed once the generator is
  # exhausted or closed; previously the file was opened and never released.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # Skip blank lines (e.g. a stray empty line at the end of the file)
      # instead of letting json.loads fail on them.
      if not l.strip():
        continue
      yield json.loads(l)

def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a pandas DataFrame.

  Each record is keyed by its 0-based position in the stream; those keys
  become the row labels and the records' own keys become the columns.

  Args:
    path: Passed straight through to ``parse``.

  Returns:
    A ``pandas.DataFrame`` built from the numbered records.
  """
  # enumerate() supplies the running row number that labels each record.
  numbered_records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(numbered_records, orient='index')


# Review archive to analyse; the relative path is resolved against the
# kernel's current working directory.
reviews_path = 'Software_5.json.gz'
df = getDF(reviews_path)

In [11]:
# Show which fields each review record carries.
df.columns.tolist()

['overall',
 'verified',
 'reviewTime',
 'reviewerID',
 'asin',
 'style',
 'reviewerName',
 'reviewText',
 'summary',
 'unixReviewTime',
 'vote',
 'image']

In [12]:
# Look at the rating next to the two free-text fields.
preview_columns = ['overall', 'reviewText', 'summary']
df.loc[:, preview_columns]

Unnamed: 0,overall,reviewText,summary
0,4.0,I've been using Dreamweaver (and it's predeces...,A solid overview of Dreamweaver CS5
1,4.0,"The demo is done with the PC version, with ref...",A good value
2,5.0,If you've been wanting to learn how to create ...,This is excellent software for those who want ...
3,5.0,I've been creating websites with Dreamweaver f...,A Fantastic Overview of Dream Weaver and Web D...
4,5.0,I decided (after trying a number of other prod...,Excellent Tutorials!
...,...,...,...
12800,4.0,When I ordered this it was listed as Photo Edi...,File Management Software with Basic Editing Ca...
12801,3.0,This software has SO much going on. Theres a ...,"Might not be for the ""novice"""
12802,4.0,I have used both more complex and less complex...,"Great, Inexpensive Software for Those Who Have..."
12803,3.0,Pinnacle Studio 20 Ultimate is a perfectly ser...,Gets the job done ... but not as easy as it sh...


In [13]:
# How many reviews fall into each rating value.

# Plot options are gathered first, then handed to iplot in a single call.
rating_hist_options = dict(
    kind='hist',
    title='Review Rating Distribution',
    xTitle='Rating',
    yTitle='count',
    linecolor='black',
)
df['overall'].iplot(**rating_hist_options)

In [14]:
# Review length, measured as the number of whitespace-separated tokens.
review_tokens = df['reviewText'].str.split()
df['review_len'] = review_tokens.str.len()

# Histogram of those lengths, drawn with 100 bins.
length_hist_options = dict(
    kind='hist',
    bins=100,
    title='Review Text Length Distribution',
    xTitle='review length',
    yTitle='count',
    linecolor='black',
)
df['review_len'].iplot(**length_hist_options)

In [64]:
# Extracting predictor and target variables
# Missing reviews are replaced with an empty string *before* the string cast:
# casting NaN directly would turn it into the literal token 'nan' and feed a
# bogus word to the vectoriser. Keeping an object-dtype array of Python
# strings also avoids the fixed-width Unicode array that astype('U') builds,
# which pads every review to the length of the longest one.
X = df['reviewText'].fillna('').astype(str).values
y = df['overall']

In [65]:
# Train test split
from sklearn.model_selection import train_test_split

# Hold out 33% of the samples for evaluation; the fixed random_state keeps
# the split repeatable between runs.
split_options = {'test_size': 0.33, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [66]:
# Create pipeline for baseline Multinomial Naive Bayes model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [67]:
from sklearn.pipeline import Pipeline
# Baseline pipeline: token counts capped at 5000 features, tf-idf
# re-weighting, then a multinomial Naive Bayes classifier.
baseline_steps = [
    ('vect', CountVectorizer(max_features=5000)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(steps=baseline_steps)

In [68]:
# Fit every pipeline step on the training split; the fitted pipeline is the
# cell's displayed value.
text_clf.fit(X=X_train, y=y_train)

Pipeline(steps=[('vect', CountVectorizer(max_features=5000)),
                ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

In [69]:
import numpy as np

# Predict ratings for the held-out reviews, then report the share of exact
# matches (i.e. accuracy) as the cell's value.
predicted = text_clf.predict(X_test)
is_correct = predicted == y_test
np.mean(is_correct)

0.5007098911500236

In [None]:
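# Achieves about 50% accuracy on the 5-class rating prediction task, which is
# better than uniform random guessing (1/5 = 20%).
# NOTE: if the ratings are imbalanced, always predicting the most common
# rating would be a stricter baseline than uniform guessing.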