In [1]:
# Data Prep
# import modules and load data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
from scipy import stats

conn = sqlite3.connect('c:/users/nick/onedrive/documents/springboard/sentiment_analysis/gr_sentiment_analysis/data/books.db')
review_stats = pd.read_sql_query('SELECT * FROM review_stats',con=conn)

# Clean up the data a bit. Remove reviews with a score of 0 since they won't be part of the prediction model 

review_stats = review_stats[review_stats['rating'] != 0]

# Review ID is not needed for this analysis so should be dropped
if review_stats.columns.contains('review_id'):
    review_stats.drop('review_id', axis=1, inplace=True)

# remove any reviews with a total AFINN or Bing count of 0, since this means there are no matching
# words in either lexicon and these reviews cannot be used in this analysis
review_stats = review_stats[(review_stats.total_afinn_count != 0) & (review_stats.total_bing_count != 0)
                           & (review_stats.total_mpqa_count != 0) & (review_stats.total_inq_count != 0)]
# this leaves 877941 total reviews

# remove outliers where the Z-score of the pos or neg word counts is < 3
review_stats = review_stats[(
    np.abs(stats.zscore(review_stats[['pos_afinn_count', 'neg_afinn_count', 
                                      'pos_bing_count', 'neg_bing_count',
                                      'pos_mpqa_count', 'neg_mpqa_count',
                                      'pos_inq_count', 'neg_inq_count']])) < 3
).all(axis=1)]

# with all outliers removed (having a count +/- 3 SDs), 889807 rows remain. 

# Rating is our target variable, the other columns are features. 
y = review_stats['rating'].values
X = review_stats.drop('rating', axis=1).values

# Create a training and testing set 
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)


In [14]:
# Setup MLP
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn.model_selection import cross_val_score


from sklearn.model_selection import RandomizedSearchCV

mlp = MLPClassifier()

param_mlp = {'hidden_layer_sizes': [(10,), (20,), (30,), (40,)], 
             'tol': stats.uniform(1e-06, 1e-02),
             'alpha':stats.uniform(1e-08, 1e-03),
             'learning_rate': ['constant', 'invscaling', 'adaptive'],
             'activation': ['identity', 'logistic', 'relu', 'tanh']}

mlp_cv = RandomizedSearchCV(mlp, param_mlp, cv=5, n_iter=20, n_jobs=2)



In [15]:
mlp_cv.fit(X_train, y_train)

mlp_y_pred = mlp_cv.predict(X_test)

#stats 
from sklearn.metrics import classification_report
print(mlp_cv.best_params_)
print(mlp_cv.best_score_)
print(classification_report(y_test, mlp_y_pred))


{'activation': 'logistic', 'alpha': 0.00036196373407209188, 'hidden_layer_sizes': (20,), 'learning_rate': 'invscaling', 'tol': 0.0086727624161187962}
0.378095314031
             precision    recall  f1-score   support

          1       0.26      0.06      0.09      6602
          2       0.22      0.00      0.00     15786
          3       0.32      0.07      0.11     39794
          4       0.36      0.79      0.49     61421
          5       0.49      0.28      0.36     51986

avg / total       0.37      0.38      0.31    175589



In [12]:
mlp = MLPClassifier(activation='logistic', alpha=0.00020081762471937386, 
                    hidden_layer_sizes=(30,), learning_rate='invscaling', 
                   tol=0.0049744820659256839)

mlp.fit(X_train, y_train)
mlp.score(X_test, y_test)
# even with outliers removed, the score is still only .3757

0.37608847934665612

In [13]:
from sklearn.metrics import classification_report

print(classification_report(y_test, mlp.predict(X_test)))
# predictions are a tiny bit better, but not much. Either the data is useless, or I have more cleaning up to do.

             precision    recall  f1-score   support

          1       0.29      0.03      0.06      6602
          2       0.23      0.00      0.01     15786
          3       0.30      0.08      0.12     39794
          4       0.36      0.83      0.50     61421
          5       0.50      0.22      0.31     51986

avg / total       0.37      0.38      0.30    175589



[array([[ -1.49204969e-02,  -6.40278202e-03,   2.02881316e-01, ...,
          -6.03004458e-03,  -7.34108763e-03,   1.71180125e-01],
        [ -1.33374025e-02,  -1.08452281e-03,   7.24443430e-01, ...,
          -1.06280127e-03,  -1.48196982e-03,   3.66553295e-01],
        [ -1.22322492e-02,  -5.41514205e-03,  -9.78358305e-01, ...,
          -5.49542345e-03,  -5.04696322e-03,  -5.05497179e-01],
        ..., 
        [ -9.72713001e-02,  -1.59188056e-03,  -9.65943588e-02, ...,
          -1.33203330e-03,  -1.31133822e-03,   3.35858165e-01],
        [ -7.71518028e-02,  -3.34035712e-03,   4.65486792e-01, ...,
          -2.47391249e-02,  -4.07393090e-03,   1.01571890e+00],
        [  8.75217402e-03,   3.82379315e-05,   5.29473726e-02, ...,
           2.72683832e-05,   5.30448600e-05,  -2.56307366e-02]]),
 array([[ -1.38558368e-01,   7.75011659e-02,   1.20926063e-01,
          -8.69040309e-03,  -2.35262739e-02],
        [  4.76183486e-03,   6.76780815e-03,   4.45046049e-03,
          -6.3615579

[array([[ -1.49204969e-02,  -6.40278202e-03,   2.02881316e-01, ...,
          -6.03004458e-03,  -7.34108763e-03,   1.71180125e-01],
        [ -1.33374025e-02,  -1.08452281e-03,   7.24443430e-01, ...,
          -1.06280127e-03,  -1.48196982e-03,   3.66553295e-01],
        [ -1.22322492e-02,  -5.41514205e-03,  -9.78358305e-01, ...,
          -5.49542345e-03,  -5.04696322e-03,  -5.05497179e-01],
        ..., 
        [ -9.72713001e-02,  -1.59188056e-03,  -9.65943588e-02, ...,
          -1.33203330e-03,  -1.31133822e-03,   3.35858165e-01],
        [ -7.71518028e-02,  -3.34035712e-03,   4.65486792e-01, ...,
          -2.47391249e-02,  -4.07393090e-03,   1.01571890e+00],
        [  8.75217402e-03,   3.82379315e-05,   5.29473726e-02, ...,
           2.72683832e-05,   5.30448600e-05,  -2.56307366e-02]]),
 array([[ -1.38558368e-01,   7.75011659e-02,   1.20926063e-01,
          -8.69040309e-03,  -2.35262739e-02],
        [  4.76183486e-03,   6.76780815e-03,   4.45046049e-03,
          -6.3615579

In [None]:
# Check the correlation between the columns between the various opinion lexicons.