In [1]:
# Data Prep

# import modules and load data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import os
from scipy import stats

# Build the path to the SQLite database relative to the directory the
# notebook is running in, then open a connection to it.
my_dir = os.path.realpath('.')
db_file = os.path.join(
    my_dir,
    '../../gr_sentiment_analysis/data/books.db',
)
conn = sqlite3.connect(db_file)

# Read the whole review_stats table into a DataFrame.
review_stats = pd.read_sql_query('SELECT * FROM review_stats', conn)

# Clean up the data a bit. Reviews with a rating of 0 are removed since they
# won't be part of the prediction model.
review_stats = review_stats[review_stats['rating'] != 0]

# Review ID is not needed for this analysis, so drop it when it is present.
# The membership test uses the `in` operator because `Index.contains` was
# deprecated in pandas 0.25 and removed in pandas 1.0. The result is reassigned
# instead of dropping with inplace=True: `review_stats` is a filtered selection
# at this point, and mutating it in place can trigger SettingWithCopyWarning.
# The guard also keeps this cell safe to re-run.
if 'review_id' in review_stats.columns:
    review_stats = review_stats.drop('review_id', axis=1)

    

# Keep only reviews whose total AFINN count AND total Bing count are both
# non-zero. A total of 0 means the review has no matching words in that
# lexicon, so it cannot be used in this analysis.
has_afinn_words = review_stats.total_afinn_count != 0
has_bing_words = review_stats.total_bing_count != 0
review_stats = review_stats[has_afinn_words & has_bing_words]
# this leaves 918438 total reviews

# Remove outliers: a row is kept only when the absolute Z-score of every
# positive/negative word count is below 3, i.e. rows where any of these counts
# has |Z| >= 3 are dropped.
count_columns = ['pos_afinn_count', 'neg_afinn_count', 'pos_bing_count', 'neg_bing_count']
count_zscores = stats.zscore(review_stats[count_columns])
within_three_sd = (np.abs(count_zscores) < 3).all(axis=1)
review_stats = review_stats[within_three_sd]

# with all outliers removed (having a count +/- 3 SDs), 889807 rows remain. 

# The rating column is the target; every remaining column is a feature.
feature_frame = review_stats.drop('rating', axis=1)
X = feature_frame.values
y = review_stats['rating'].values

# Hold out 20% of the rows as a test set. The fixed random_state makes the
# split reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [2]:
# Baseline: 5-fold cross-validation of a Gaussian Naive Bayes classifier on the
# full feature matrix X and target y (not just the training split).

from scipy.stats import randint  # not used in this cell
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

# One score per fold is returned; for a classifier the default score is mean accuracy.
nb_cv = cross_val_score(estimator=GaussianNB(), X=X, y=y, cv=5, n_jobs=2)
print(nb_cv)


[ 0.27819109  0.28076061  0.2828822   0.27487076  0.27645694]


In [3]:
# Fit Gaussian Naive Bayes on the training split, then print per-class
# precision, recall and F1 for its predictions on the held-out test split.
from sklearn.metrics import classification_report

nb = GaussianNB().fit(X_train, y_train)  # fit() returns the fitted estimator itself
y_pred = nb.predict(X_test)
print(classification_report(y_test, y_pred))


             precision    recall  f1-score   support

          1       0.09      0.39      0.15      6664
          2       0.12      0.23      0.16     15953
          3       0.26      0.21      0.23     40160
          4       0.37      0.48      0.42     62340
          5       0.54      0.09      0.16     52845

avg / total       0.36      0.28      0.27    177962



In [None]:
from sklearn.feature_selection import SelectKBest