In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.neighbors import NearestNeighbors

import setuptools.dist

#importing the required pyspark library
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

## Imports and setup.

In [26]:
boardgames_df = pd.read_csv('data/modern_games.csv')
users_df = pd.read_csv('data/users_encoded.csv')

## K-NN Baseline

For a starter collaborative system, use KNN to group users and suggest new games by distance.

In [4]:
user_item_matrix = pd.pivot_table(users_df, values='Rating', index=['Username'], columns=['BGGId'])

user_item_matrix

BGGId,1,3,4,5,7,8,9,11,12,13,...,341256,341284,341358,341530,341935,342010,342207,342942,343905,345584
Username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,,,,,,,,6.0,,,...,,,,,,,,,,
16,,,,7.0,,,,,,8.0,...,,,,,,,,,,
17,,,,,,,,,,,...,,,,,,,,,,
20,,10.0,,,,,,,7.0,5.0,...,,,,,,,,,,
24,,,,,,,,6.9,6.8,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411282,,,,,,,,,,7.0,...,,,,,,,,,,
411291,,,,,,,,,,,...,,,,,,,,,,
411298,,,,,,,,,,,...,,,,,,,,,,
411310,,,,,,,,,,,...,,,,,,,,,,


In [5]:
user_item_matrix = user_item_matrix.fillna(0)
user_item_matrix

BGGId,1,3,4,5,7,8,9,11,12,13,...,341256,341284,341358,341530,341935,342010,342207,342942,343905,345584
Username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.9,6.8,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
411291,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
411298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
411310,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Fit the KNN model
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(user_item_matrix)

In [7]:
target_user_index = 2

distances, indices = knn.kneighbors(user_item_matrix.iloc[target_user_index,:].values.reshape(1,-1), n_neighbors = 6)

In [8]:
print("Choosen user is: ",user_item_matrix.index[target_user_index])

Choosen user is:  17


In [9]:
game = []
distance = []

for i in range(0, len(distances.flatten())):
    if i != 0:
        game.append(user_item_matrix.index[indices.flatten()[i]])
        distance.append(distances.flatten()[i])

m_series = pd.Series(game,name='game')
d_series = pd.Series(distance,name='distance')
recommended = pd.concat([m_series, d_series], axis=1)
recommended = recommended.sort_values('distance',ascending=False)

print('Recommendations for {0}:\n'.format(user_item_matrix.index[target_user_index]))
for i in range(0, recommended.shape[0]):
    print(f'{recommended["game"].iloc[i]}, with distance of {recommended["distance"].iloc[i]}')

Recommendations for 17:

165342, with distance of 0.6564780731712118
347704, with distance of 0.6521453086833352
36220, with distance of 0.6470699885793922
200891, with distance of 0.6449374063106197
217180, with distance of 0.643976085735148


# Using PySpark!

In [25]:
#Setup Spark Session
spark = SparkSession.builder.appName('Recommender').getOrCreate()
spark

Spark has it's own dataframes, and also HATES how big the data is, so we need to do some conversion.

In [27]:
df_spark = spark.createDataFrame(users_df)

In [28]:
# Importing necessary libraries
from pyspark.sql.functions import isnan, when, count, col
from pyspark.sql.types import IntegerType

# replace NaNs with 0.
df_spark = df_spark.withColumn("BGGId", when(isnan(col("BGGId")), 0).otherwise(col("BGGId")))


# Cast BGGId column to IntegerType to ensure it's within the integer range
df_spark = df_spark.withColumn("BGGId", df_spark["BGGId"].cast(IntegerType()))

#Check if there are still NaNs in BGGId column
print(df_spark.select([count(when(isnan(c), 1).otherwise(0)).alias(c) for c in df_spark.columns]).show())

# Redivide the data into train and test sets
train_data, test_data = df_spark.randomSplit(weights=[0.7,0.3], seed=42)

# Build the recommendation model using ALS on the training data
als = ALS(maxIter=5,
          regParam=0.01,
          userCol="Username",
          itemCol="BGGId",
          ratingCol="Rating")

# fit the ALS model
model = als.fit(train_data)

# Re-evaluate the model
predictions = model.transform(test_data)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test_data)

#Displaying predictions calculated by the model
predictions.show()


+----------+------+------+--------+
|Unnamed: 0| BGGId|Rating|Username|
+----------+------+------+--------+
|    865021|865021|865021|  865021|
+----------+------+------+--------+

None
+----------+------+------+--------+----------+
|Unnamed: 0| BGGId|Rating|Username|prediction|
+----------+------+------+--------+----------+
|   9623395|233867|   9.0|  124011| 6.5113645|
|   9623219|171623|   9.0|   79343|  8.296216|
|       683|193417|   6.0|  182387|  6.277005|
|   9622175|123260|   9.0|   16754| 7.2008495|
|       670|193584|   4.0|   68541|  4.840332|
|       422|158971|   6.0|  106445|  6.307823|
|       554|213648|   6.0|  114091|  7.554543|
|       534|213648|   7.0|  144156| 7.8757024|
|   9622806|  2653|   8.5|  152197|  6.806265|
|   9622852|176494|   8.5|   56423|  6.302535|
|   9622937|176494|   8.5|  111313|  8.180525|
|   9622885|176494|   8.5|  165397|  8.270066|
|       173|  1252|   6.0|   35825|  9.814671|
|       214|  1252|   4.0|  179249|  8.079249|
|       762|165

In [33]:
# prompt: print the predictions for one username using the pyspark predictions above

# Assuming 'model' and 'df_spark' are defined from the previous code

user_id = 810  # Replace with the desired username

# Create a DataFrame with the target user and all items.
user_df = df_spark.filter(df_spark.Username == user_id)

# Generate predictions for the target user and all games
user_predictions = model.transform(user_df)

# Display the predictions for the target user.
user_predictions.show()

# To print the predictions for the target user in a more readable format:
user_predictions_pd = user_predictions.toPandas()

print(f"Predictions for user: {user_id}")
for _, row in user_predictions_pd.iterrows():
    print(f"BGGId: {row.BGGId}, Prediction: {row.prediction}")


+----------+-----+------+--------+----------+
|Unnamed: 0|BGGId|Rating|Username|prediction|
+----------+-----+------+--------+----------+
+----------+-----+------+--------+----------+

Predictions for user: 810


In [34]:
spark.stop()