# Creating a baseline for recomendations

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.neighbors import NearestNeighbors

import setuptools.dist

#importing the required pyspark library
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

## Imports and setup.

In [6]:
boardgames_df = pd.read_csv('data/modern_games.csv')
users_df = pd.read_csv('data/users_encoded.csv')

## K-NN Baseline

For a starter collaborative system, use KNN to group users and suggest new games by distance.

In [7]:
user_item_matrix = pd.pivot_table(users_df, values='Rating', index=['Username'], columns=['BGGId'])

user_item_matrix

BGGId,1,3,4,5,7,8,9,11,12,13,...,341256,341284,341358,341530,341935,342010,342207,342942,343905,345584
Username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,,,,,,,,6.0,,,...,,,,,,,,,,
11,,,,7.0,,,,,,8.0,...,,,,,,,,,,
12,,,,,,,,,,,...,,,,,,,,,,
15,,10.0,,,,,,,7.0,5.0,...,,,,,,,,,,
18,,,,,,,,6.9,6.8,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272115,,,,,,,,,,7.0,...,,,,,,,,,,
272121,,,,,,,,,,,...,,,,,,,,,,
272126,,,,,,,,,,,...,,,,,,,,,,
272137,,,,,,,,,,,...,,,,,,,,,,


In [8]:
user_item_matrix = user_item_matrix.fillna(0)
user_item_matrix

BGGId,1,3,4,5,7,8,9,11,12,13,...,341256,341284,341358,341530,341935,342010,342207,342942,343905,345584
Username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.9,6.8,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272115,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
272121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
272126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
272137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Fit the KNN model
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(user_item_matrix)

In [10]:
target_user_index = 2

distances, indices = knn.kneighbors(user_item_matrix.iloc[target_user_index,:].values.reshape(1,-1), n_neighbors = 6)

In [11]:
print("Choosen user is: ",user_item_matrix.index[target_user_index])

Choosen user is:  12


In [12]:
game = []
distance = []

for i in range(0, len(distances.flatten())):
    if i != 0:
        game.append(user_item_matrix.index[indices.flatten()[i]])
        distance.append(distances.flatten()[i])

m_series = pd.Series(game,name='game')
d_series = pd.Series(distance,name='distance')
recommended = pd.concat([m_series, d_series], axis=1)
recommended = recommended.sort_values('distance',ascending=False)

print('Recommendations for {0}:\n'.format(user_item_matrix.index[target_user_index]))
for i in range(0, recommended.shape[0]):
    print(f'{recommended["game"].iloc[i]}, with distance of {recommended["distance"].iloc[i]}')

Recommendations for 12:

107991, with distance of 0.6564780731712117
229400, with distance of 0.6521453086833352
23578, with distance of 0.6470699885793922
131164, with distance of 0.6449374063106197
141988, with distance of 0.643976085735148


# Using PySpark!

In [None]:
#Setup Spark Session
spark = SparkSession.builder.appName('Recommender').getOrCreate()
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/06 21:11:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark has it's own dataframes, so we need to do some conversion.

In [25]:
df_spark = spark.createDataFrame(users_df)

In [28]:
# show the PySpark DataFrame
df_spark.head()

24/11/06 21:19:11 WARN TaskSetManager: Stage 1 contains a task of very large size (2675 KiB). The maximum recommended task size is 1000 KiB.
24/11/06 21:19:15 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 1 (TID 1): Attempting to kill Python Worker
                                                                                

Row(Unnamed: 0=20, BGGId=213788, Rating=7.5, Username=62528)

In [34]:
# Dividing the data using random split into train_data and test_data
train_data, test_data = df_spark.randomSplit(weights=[0.7,0.3], seed=42)

In [36]:
# Build the recommendation model using ALS on the training data 
als = ALS(maxIter=5, 
          regParam=0.01, 
          userCol="Username", 
          itemCol="BGGId", 
          ratingCol="Rating") 
  
  
#Fitting the model on the train_data 
model = als.fit(test_data)


24/11/06 21:24:13 WARN TaskSetManager: Stage 36 contains a task of very large size (2675 KiB). The maximum recommended task size is 1000 KiB.
24/11/06 21:24:14 WARN TaskSetManager: Stage 37 contains a task of very large size (2675 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [37]:

# Evaluate the model by computing the RMSE on the test data 
predictions = model.transform(test_data) 
  
#Displaying predictions calculated by the model 
predictions.show()


24/11/06 21:24:27 WARN TaskSetManager: Stage 70 contains a task of very large size (2675 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

+----------+------+------+--------+----------+
|Unnamed: 0| BGGId|Rating|Username|prediction|
+----------+------+------+--------+----------+
|       317|126021|   7.0|   72610|  6.466961|
|       534|213648|   7.0|   94078|  7.786377|
|       683|193417|   6.0|  119159| 6.3276415|
|       214|  1252|   4.0|  117107| 3.8332164|
|       217|  1252|   4.0|  238483| 4.3477383|
|       422|158971|   6.0|   69508| 6.7827473|
|       810|165748|   2.0|   58987|   2.30791|
|       128|193500|   3.0|   85742|  2.982291|
|       762|165748|   8.0|   33194| 7.8584957|
|       544|213648|   6.0|  269656|  6.730475|
|       605|114784|   5.0|  156927| 5.1894073|
|       670|193584|   4.0|   44686|  4.321164|
|       828|158816|   8.0|  269043|  8.685877|
|       393|158971|   6.8|  135584| 6.8418007|
|       173|  1252|   6.0|   23316|  5.174064|
|       554|213648|   6.0|   74420|  6.548898|
|       114|193500|   4.0|  170725| 4.0270133|
|       875|113301|   5.0|   95898| 5.9611335|
|        48|2

In [39]:

#Printing and calculating RMSE 
evaluator = RegressionEvaluator(metricName="rmse", labelCol="Rating",predictionCol="prediction") 
rmse = evaluator.evaluate(predictions) 
print("Root-mean-square error = " + str(rmse))


24/11/06 21:24:52 WARN TaskSetManager: Stage 105 contains a task of very large size (2675 KiB). The maximum recommended task size is 1000 KiB.
[Stage 139:>                                                        (0 + 3) / 3]

Root-mean-square error = 0.8199386482709955


                                                                                

In [43]:

#Filtering user with user id "5461" with book id on which it has given the reviews 
user1 = test_data.filter(test_data['Username']==target_user_index).select(['BGGId','Username']) 
  
#Displaying user1 data 
user1.show()


24/11/06 21:27:04 WARN TaskSetManager: Stage 159 contains a task of very large size (2675 KiB). The maximum recommended task size is 1000 KiB.
24/11/06 21:27:05 WARN TaskSetManager: Stage 160 contains a task of very large size (2723 KiB). The maximum recommended task size is 1000 KiB.
24/11/06 21:27:06 WARN TaskSetManager: Stage 161 contains a task of very large size (2707 KiB). The maximum recommended task size is 1000 KiB.
[Stage 161:>                                                        (0 + 3) / 3]

+-----+--------+
|BGGId|Username|
+-----+--------+
+-----+--------+



                                                                                

In [44]:

#Traning and evaluating for user1 with our model trained with the help of training data  
recommendations = model.transform(user1) 
  
#Displaying the predictions of books for user1 
recommendations.orderBy('prediction',ascending=False).show()


24/11/06 21:27:17 WARN TaskSetManager: Stage 162 contains a task of very large size (2675 KiB). The maximum recommended task size is 1000 KiB.
[Stage 176:>               (0 + 8) / 10][Stage 177:>               (0 + 0) / 10]

+-----+--------+----------+
|BGGId|Username|prediction|
+-----+--------+----------+
+-----+--------+----------+







In [None]:
spark.stop()