In [1]:
# Data Prep

# import modules and load data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import os
from scipy import stats

# Locate the SQLite database relative to the notebook's working directory.
my_dir = os.path.realpath('.')
db_file = os.path.join(my_dir, '../../gr_sentiment_analysis/data/books.db')

# sqlite3.connect() silently creates a new, empty database when the path does
# not exist, which would only surface later as a confusing "no such table"
# error. Fail early with the offending path instead.
if not os.path.isfile(db_file):
    raise IOError('SQLite database not found: ' + db_file)

# Load the whole review_stats table into a DataFrame, then release the
# database handle: everything below works on the in-memory frame only.
# `conn` stays defined (closed); reconnect with sqlite3.connect(db_file) if
# another query is needed.
conn = sqlite3.connect(db_file)
try:
    review_stats = pd.read_sql_query('SELECT * FROM review_stats', con=conn)
finally:
    conn.close()

# Clean up the data a bit. Remove reviews with a rating of 0 since they won't
# be part of the prediction model.
review_stats = review_stats[review_stats['rating'] != 0]

# Review ID is not needed for this analysis, so drop it when present.
# The `in` operator is used for the membership test because Index.contains()
# was deprecated and later removed from pandas. The drop is done by
# reassignment rather than inplace=True so the filtered frame above is not
# mutated in place.
if 'review_id' in review_stats.columns:
    review_stats = review_stats.drop('review_id', axis=1)

# Keep only reviews whose total word count is non-zero in ALL four lexicon
# columns (AFINN, Bing, MPQA, INQ). A total of 0 means the review matched no
# words in that lexicon, so it cannot be used in this analysis.
lexicon_totals = ['total_afinn_count', 'total_bing_count',
                  'total_mpqa_count', 'total_inq_count']
review_stats = review_stats[(review_stats[lexicon_totals] != 0).all(axis=1)]
# Original author's note: this leaves 918438 total reviews.

# Remove outliers: keep only the rows where EVERY positive/negative word
# count has an absolute Z-score below 3, i.e. lies within 3 standard
# deviations of its column mean. Rows with any |z| >= 3 are discarded.
count_columns = ['pos_afinn_count', 'neg_afinn_count',
                 'pos_bing_count', 'neg_bing_count',
                 'pos_mpqa_count', 'neg_mpqa_count',
                 'pos_inq_count', 'neg_inq_count']
z_scores = np.abs(stats.zscore(review_stats[count_columns]))
review_stats = review_stats[(z_scores < 3).all(axis=1)]

# Original author's note: with all outliers removed (having a count beyond
# +/- 3 SDs), 889807 rows remain.
# NOTE(review): the classification report recorded in this notebook shows a
# test support of 17392 rows (20% split), which does not match these counts;
# re-check them against the current database.

from sklearn.model_selection import train_test_split

# 'rating' is the prediction target; every remaining column of review_stats
# is used as a feature.
target_column = 'rating'
feature_frame = review_stats.drop(target_column, axis=1)

X = feature_frame.values
y = review_stats[target_column].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=1)


In [2]:
# Setup KNN
from scipy import stats
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

# Hyper-parameter distributions sampled by the randomized search.
# stats.randint(low, high) draws integers from [low, high), so n_neighbors
# ranges over 5..19 and leaf_size over 20..39.
# NOTE(review): per the scikit-learn docs, leaf_size affects tree
# construction/query speed and memory rather than the predictions themselves,
# and n_iter=2 evaluates only two candidates -- consider searching
# n_neighbors alone with a larger n_iter. Left unchanged here to preserve the
# existing experiment.
param_knn = {'n_neighbors': stats.randint(5, 20),
             'leaf_size': stats.randint(20, 40)}

# random_state pins which candidates are sampled, so best_params_ and
# best_score_ are reproducible across kernel restarts (same seed as the
# train/test split). 5-fold cross-validation, two candidates, two workers.
knn_cv = RandomizedSearchCV(knn, param_knn, cv=5, n_iter=2, n_jobs=2,
                            random_state=1)


In [4]:
from sklearn.metrics import classification_report

# Run the randomized hyper-parameter search on the training split, then use
# the refitted search object to predict ratings for the held-out test split.
# fit() returns the search object itself, so the two calls can be chained.
knn_y_pred = knn_cv.fit(X_train, y_train).predict(X_test)

# Report the winning hyper-parameters, their mean cross-validated score, and
# the per-class precision/recall/F1 on the test split.
print(knn_cv.best_params_)
print(knn_cv.best_score_)
test_report = classification_report(y_test, knn_y_pred)
print(test_report)

{'leaf_size': 35, 'n_neighbors': 13}
0.320808417663
             precision    recall  f1-score   support

          1       0.15      0.04      0.06       689
          2       0.13      0.05      0.07      1542
          3       0.23      0.24      0.24      3850
          4       0.37      0.50      0.42      6183
          5       0.35      0.28      0.31      5128

avg / total       0.30      0.32      0.30     17392

