In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV 
from joblib import parallel_backend

In [2]:
dataframe = pd.read_csv("../Preprocessing/preprocessdffixed.csv")
dataframe = dataframe = dataframe.drop(['Unnamed: 0' ], axis=1)
dataframe

Unnamed: 0,User Rating,Review,Price,Year,Genre,Occurences,Total Stars,000,10,100,...,write,year,york,young,zelda,zhi,zinczenko,zone,zoo,zusak
0,0.8750,0.197178,0.067308,0.7,1.0,0.000000,0.193075,0.0,0.914439,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.8125,0.022949,0.201923,0.2,0.0,0.000000,0.021992,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.8750,0.215730,0.134615,0.9,1.0,0.000000,0.211241,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.8750,0.243577,0.048077,0.8,0.0,0.000000,0.238507,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.9375,0.086875,0.105769,1.0,1.0,0.000000,0.086891,1.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
342,0.7500,0.122124,0.134615,0.3,0.0,0.000000,0.114481,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
343,0.5625,0.014407,0.096154,0.1,1.0,0.000000,0.012571,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
344,0.9375,0.245866,0.076923,0.4,0.0,0.444444,0.245879,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
345,1.0000,0.106783,0.067308,1.0,0.0,0.000000,0.109032,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Separates total stars into highly rated (1) and not highly rated (0)
dataframe.loc[dataframe['Total Stars']>0.05, 'Total Stars'] = 1
dataframe.loc[dataframe['Total Stars']<=0.05, 'Total Stars'] = 0
X_total_stars = dataframe.drop('Total Stars', axis=1).values
y_total_stars = dataframe['Total Stars']

In [4]:
# Separates rating values into highly rated (1) and not highly rated (0)
dataframe = pd.read_csv("../Preprocessing/preprocessdffixed.csv")
dataframe = dataframe = dataframe.drop(['Unnamed: 0' ], axis=1)
dataframe.loc[dataframe['User Rating']>0.8, 'User Rating'] = 1
dataframe.loc[dataframe['User Rating']<=0.8, 'User Rating'] = 0
X_reviews = dataframe.drop('User Rating', axis=1).values
y_reviews = dataframe['User Rating']
dataframe

Unnamed: 0,User Rating,Review,Price,Year,Genre,Occurences,Total Stars,000,10,100,...,write,year,york,young,zelda,zhi,zinczenko,zone,zoo,zusak
0,1.0,0.197178,0.067308,0.7,1.0,0.000000,0.193075,0.0,0.914439,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.022949,0.201923,0.2,0.0,0.000000,0.021992,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.215730,0.134615,0.9,1.0,0.000000,0.211241,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.243577,0.048077,0.8,0.0,0.000000,0.238507,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.086875,0.105769,1.0,1.0,0.000000,0.086891,1.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
342,0.0,0.122124,0.134615,0.3,0.0,0.000000,0.114481,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
343,0.0,0.014407,0.096154,0.1,1.0,0.000000,0.012571,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
344,1.0,0.245866,0.076923,0.4,0.0,0.444444,0.245879,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
345,1.0,0.106783,0.067308,1.0,0.0,0.000000,0.109032,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
X_train_stars, X_test_stars, y_train_stars, y_test_stars = train_test_split(X_total_stars, y_total_stars)

In [6]:
X_train_reviews, X_test_reviews, y_train_reviews, y_test_reviews = train_test_split(X_reviews, y_reviews)

In [7]:
kn = KNeighborsClassifier()
param_grid = {
    'n_neighbors': range(1, 200, 1)
}
knn = GridSearchCV(kn, param_grid, cv = 5, n_jobs = -1)

In [11]:
knn.fit(X_train_stars, y_train_stars)
y_train_pred_stars = knn.predict(X_train_stars)
y_test_pred_stars = knn.predict(X_test_stars)
print('Stars Scores')
print('\n-Train')
print('--Accuracy')
print(knn.score(X_train_stars, y_train_stars))
print('--Precision')
print(precision_score(y_train_stars, y_train_pred_stars))
print('--Recall')
print(recall_score(y_train_stars, y_train_pred_stars))
print('--F1 Score')
print(f1_score(y_train_stars, y_train_pred_stars))
print('\n-Test')
print('--Accuracy')
print(knn.score(X_test_stars, y_test_stars))
print('--Precision')
print(precision_score(y_test_stars, y_test_pred_stars))
print('--Recall')
print(recall_score(y_test_stars, y_test_pred_stars))
print('--F1 Score')
print(f1_score(y_test_stars, y_test_pred_stars))
print(knn.best_params_)

Stars Scores

-Train
--Accuracy
0.8538461538461538
--Precision
0.8579881656804734
--Recall
0.9119496855345912
--F1 Score
0.8841463414634145

-Test
--Accuracy
0.8390804597701149
--Precision
0.9206349206349206
--Recall
0.8656716417910447
--F1 Score
0.8923076923076922
{'n_neighbors': 3}


In [13]:
knn.fit(X_train_reviews, y_train_reviews)
y_train_pred_reviews = knn.predict(X_train_reviews)
y_test_pred_reviews = knn.predict(X_test_reviews)
print('User Rating')
print('\n-Train')
print('--Accuracy')
print(knn.score(X_train_reviews, y_train_reviews))
print('--Precision')
print(precision_score(y_train_reviews, y_train_pred_reviews))
print('--Recall')
print(recall_score(y_train_reviews, y_train_pred_reviews))
print('--F1 Score')
print(f1_score(y_train_reviews, y_train_pred_reviews))
print('\n-Test')
print('--Accuracy')
print(knn.score(X_test_reviews, y_test_reviews))
print('--Precision')
print(precision_score(y_test_reviews, y_test_pred_reviews))
print('--Recall')
print(recall_score(y_test_reviews, y_test_pred_reviews))
print('--F1 Score')
print(f1_score(y_test_reviews, y_test_pred_reviews))
print(knn.best_params_)

User Rating

-Train
--Accuracy
0.6615384615384615
--Precision
0.6808510638297872
--Recall
0.9248554913294798
--F1 Score
0.784313725490196

-Test
--Accuracy
0.7241379310344828
--Precision
0.7763157894736842
--Recall
0.8939393939393939
--F1 Score
0.8309859154929577
{'n_neighbors': 61}
