In [5]:
# Data Prep

# import modules and load data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import os
from scipy import stats

# Locate the SQLite database relative to the current working directory and open it.
my_dir = os.path.realpath(os.curdir)
db_file = os.path.join(
    my_dir,
    '../../gr_sentiment_analysis/data/books.db',
)
conn = sqlite3.connect(db_file)

# Pull the whole review_stats table into a DataFrame.
review_stats = pd.read_sql_query(sql='SELECT * FROM review_stats', con=conn)

# Clean up the data a bit. Remove reviews with a rating of 0 since they won't be part of the prediction model.
review_stats = review_stats[review_stats['rating'] != 0]

# Review ID is not needed for this analysis so should be dropped.
# Membership is tested with `in` because `Index.contains` was deprecated in pandas 0.25 and
# removed in 1.0. The drop is a reassignment rather than `inplace=True` because `review_stats`
# is already a filtered selection, and mutating it in place can raise SettingWithCopyWarning.
if 'review_id' in review_stats.columns:
    review_stats = review_stats.drop('review_id', axis=1)

# Remove any reviews with a total AFINN or Bing count of 0, since this means there are no matching
# words in either lexicon and these reviews cannot be used in this analysis.
review_stats = review_stats[(review_stats.total_afinn_count != 0) & (review_stats.total_bing_count != 0)]
# this leaves 918438 total reviews

# Remove outliers: keep only the rows whose pos/neg word counts are ALL within 3 standard
# deviations of the column mean (|Z-score| < 3 for each of the four count columns).
count_cols = ['pos_afinn_count', 'neg_afinn_count', 'pos_bing_count', 'neg_bing_count']
within_3_sd = (np.abs(stats.zscore(review_stats[count_cols])) < 3).all(axis=1)
review_stats = review_stats[within_3_sd]

# with all outliers removed (having a count +/- 3 SDs), 889807 rows remain.

from sklearn.model_selection import train_test_split

# The rating column is the prediction target; every remaining column is a feature.
X = review_stats.drop('rating', axis=1).values
y = review_stats['rating'].values

# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [3]:
# Let's try a random search using decision trees and see if we get a better result

from sklearn.preprocessing import StandardScaler

from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# Parameter space for the randomized search. No scaler or pipeline is used here: the search
# is run directly on a bare DecisionTreeClassifier.
# scipy's randint(low, high) excludes `high`, so max_features is drawn from 5..24 and
# min_samples_leaf from 1..19.
param_tree = {'max_depth': [5, 10, 15, 20, None],
              'max_features': randint(5, 25),
              'min_samples_leaf': randint(1, 20),
              'criterion': ['gini', 'entropy']}

# Seed both the search (which parameter candidates get sampled) and the estimator (which
# feature subset is considered at each split when max_features < n_features) so that
# re-running the notebook reproduces the same best_params_ / best_score_. The seed matches
# the random_state=1 used for the train/test split.
tree_cv = RandomizedSearchCV(DecisionTreeClassifier(random_state=1), param_tree,
                             cv=5, n_iter=20, n_jobs=2, random_state=1)

In [None]:
from sklearn.metrics import classification_report

# Run the randomized search on the training split, then score the refit best model
# on the held-out test split.
tree_cv.fit(X_train, y_train)
tree_y_pred = tree_cv.predict(X_test)

# Report the winning parameters, their mean cross-validated score, and per-class metrics.
for summary in (tree_cv.best_params_,
                tree_cv.best_score_,
                classification_report(y_test, tree_y_pred)):
    print(summary)

In [7]:
from sklearn.metrics import classification_report

# Fit a single shallow tree with hand-picked hyperparameters and evaluate it on the test split.
tree = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    max_features=24,
    min_samples_leaf=4,
)
tree_pred = tree.fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, tree_pred))
# Terrible. Precision and recall are both zero for ratings 1, 2 and 3 (the tree never predicts
# them). Results on the 4s are bad and on the 5s only middling.

             precision    recall  f1-score   support

          1       0.00      0.00      0.00      6664
          2       0.00      0.00      0.00     15953
          3       0.00      0.00      0.00     40160
          4       0.35      0.84      0.50     62340
          5       0.45      0.25      0.32     52845

avg / total       0.26      0.37      0.27    177962



  'precision', 'predicted', average, warn_for)
