In [3]:
# Methods to unpack json file and import as pandas data frame
import json
import pandas as pd
import gzip

# import iplot 

from plotly import __version__
%matplotlib inline
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

init_notebook_mode(connected=True)
cf.go_offline()

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')


df = getDF('Software_5.json.gz')

In [4]:
list(df.columns)

['overall',
 'verified',
 'reviewTime',
 'reviewerID',
 'asin',
 'style',
 'reviewerName',
 'reviewText',
 'summary',
 'unixReviewTime',
 'vote',
 'image']

In [5]:
df[['overall', 'reviewText', 'summary']]

Unnamed: 0,overall,reviewText,summary
0,4.0,I've been using Dreamweaver (and it's predeces...,A solid overview of Dreamweaver CS5
1,4.0,"The demo is done with the PC version, with ref...",A good value
2,5.0,If you've been wanting to learn how to create ...,This is excellent software for those who want ...
3,5.0,I've been creating websites with Dreamweaver f...,A Fantastic Overview of Dream Weaver and Web D...
4,5.0,I decided (after trying a number of other prod...,Excellent Tutorials!
...,...,...,...
12800,4.0,When I ordered this it was listed as Photo Edi...,File Management Software with Basic Editing Ca...
12801,3.0,This software has SO much going on. Theres a ...,"Might not be for the ""novice"""
12802,4.0,I have used both more complex and less complex...,"Great, Inexpensive Software for Those Who Have..."
12803,3.0,Pinnacle Studio 20 Ultimate is a perfectly ser...,Gets the job done ... but not as easy as it sh...


In [6]:
# number of ratings in each category

# display
df['overall'].iplot(
    kind='hist',
    xTitle='Rating',
    linecolor='black',
    yTitle='count',
    title='Review Rating Distribution'
)

In [7]:
# length of each review
df['review_len'] = df['reviewText'].str.split().str.len()

# display
df['review_len'].iplot(
    kind='hist',
    bins=100,
    xTitle='review length',
    linecolor='black',
    yTitle='count',
    title='Review Word Length Distribution')

In [8]:
# Extracting predictor and target variables
X, y = df['reviewText'].values.astype('U'),df['overall']

In [9]:
# Train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [10]:
# Create pipeline for baseline Multinomial Naive Bayes model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [11]:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer(max_features = 5000)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [12]:
text_clf.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer(max_features=5000)),
                ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

In [13]:
import numpy as np
from sklearn.metrics import classification_report

predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)

0.5009465215333648

In [14]:
# Achieves 50% accuracy over 5 class multiclass prediction. Better than random. Random would be 1/5 = %20

In [15]:
# precision is the ability of the classifier not to label as positive a sample that is negative
# recall is the ability of the classifier to find all the positive samples.

print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

         1.0       0.87      0.22      0.35       477
         2.0       0.00      0.00      0.00       238
         3.0       0.64      0.01      0.03       544
         4.0       0.41      0.17      0.24      1041
         5.0       0.50      0.95      0.65      1926

    accuracy                           0.50      4226
   macro avg       0.48      0.27      0.25      4226
weighted avg       0.51      0.50      0.40      4226




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [16]:
count_vect = CountVectorizer(stop_words='english', ngram_range=(1, 2), min_df=2, max_df=0.80)
X_train_count_vectorized = count_vect.fit_transform(X_train)
X_test_count_vectorized = count_vect.transform(X_test)

In [17]:
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=2, max_df=0.80)
X_train_tfidf_vectorized = tfidf_vect.fit_transform(X_train)
X_test_tfidf_vectorized = tfidf_vect.transform(X_test)

In [18]:
lg_clf = LogisticRegression()
lg_clf.fit(X_train_count_vectorized, y_train)

y_preds = lg_clf.predict(X_test_count_vectorized)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [19]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

         1.0       0.65      0.62      0.64       477
         2.0       0.50      0.25      0.33       238
         3.0       0.45      0.34      0.39       544
         4.0       0.51      0.42      0.46      1041
         5.0       0.68      0.83      0.75      1926

    accuracy                           0.61      4226
   macro avg       0.56      0.49      0.51      4226
weighted avg       0.59      0.61      0.60      4226



In [20]:
# Managed to achieve about 61% accuracy using CountVectorizer and LogisticRegression

# I tried using TFIDF, SVC, and RandomForestClassifier, but it return slight less accuracy. Around 56-57%

In [89]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
import pprint


In [90]:
from sklearn.linear_model import Ridge
SGD_Classifier = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier())])


In [91]:
SGD_Classifier.fit(X_train, y_train)
predicted = SGD_Classifier.predict(X_test)
np.mean(predicted == y_test)

0.6211547562707052

In [92]:
# Ran SDGClassifier for another baseline model and achieved 62% accuracy

In [93]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),  
    'vect__ngram_range': ((1, 1), (1, 2)),  
    'clf__max_iter': (20,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),    
}

In [97]:
if __name__ == "__main__":
    gridSearch = GridSearchCV(SGD_Classifier, parameters, n_jobs=-1, verbose=1)
    gridSearch.fit(X_train, y_train)
    print()
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  1.5min finished



Best score: 0.640
Best parameters:
	clf__alpha: 1e-05
	clf__max_iter: 20
	clf__penalty: 'l2'
	vect__max_df: 0.5
	vect__ngram_range: (1, 2)



Maximum number of iteration reached before convergence. Consider increasing max_iter to improve the fit.



In [None]:
# tuning of hyperparameters results in a best score of 64% for the SGD model 