# Creating a baseline for recomendations

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.neighbors import NearestNeighbors

## Using PySpark!

In [None]:
#importing the required pyspark library
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

## Imports and setup.

In [11]:
boardgames_df = pd.read_csv('data/modern_games.csv')
users_df = pd.read_csv('data/users_encoded.csv')

In [None]:
#Setup Spark Session
spark = SparkSession.builder.appName('Recommender').getOrCreate()
spark

24/10/31 10:25:54 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [None]:
spark.stop()

## K-NN Baseline

In [None]:
user_item_matrix = pd.pivot_table(users_df, values='Rating', index=['Username'], columns=['BGGId'])

user_item_matrix

BGGId,1,3,5,11,12,13,15,18,42,45,...,337929,338476,339031,340790,341048,341284,341358,341935,342010,342207
Username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7.0,,,,6.0,,,,,,,...,,,,,,,,,,
11.0,,,7.0,,,8.0,,6.0,8.0,,...,,,,,,,,,,
12.0,,,,,,,,6.0,,,...,,,,,,,,,,
15.0,,,,,7.0,5.0,,,5.0,,...,,,,,,,,,,
18.0,,,,6.9,6.8,,,,6.8,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272115.0,,,,,,7.0,,,,,...,,,,,,,,,,
272121.0,,,,,,,,,,,...,,,,,,,,,,
272126.0,,,,,,,,,,,...,,,,,,,,,,
272137.0,,,,,,,,,,,...,,,,,,,,,,


In [None]:
user_item_matrix = user_item_matrix.fillna(0)
user_item_matrix

BGGId,1,3,5,11,12,13,15,18,42,45,...,337929,338476,339031,340790,341048,341284,341358,341935,342010,342207
Username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11.0,0.0,0.0,7.0,0.0,0.0,8.0,0.0,6.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15.0,0.0,0.0,0.0,0.0,7.0,5.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18.0,0.0,0.0,0.0,6.9,6.8,0.0,0.0,0.0,6.8,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272115.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
272121.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
272126.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
272137.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Dividing the data using random split into train_data and test_data
train_data, test_data = train_test_split(users_df)

In [None]:
# Fit the KNN model
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(user_item_matrix)

In [None]:
print("Choosen user is: ",user_item_matrix.index[2])

Choosen user is:  12.0


In [None]:
target_user_index = 2


distances, indices = knn.kneighbors(user_item_matrix.iloc[target_user_index,:].values.reshape(1,-1), n_neighbors = 6)

In [None]:
game = []
distance = []

for i in range(0, len(distances.flatten())):
    if i != 0:
        game.append(user_item_matrix.index[indices.flatten()[i]])
        distance.append(distances.flatten()[i])

m_series = pd.Series(game,name='game')
d_series = pd.Series(distance,name='distance')
recommended = pd.concat([m_series, d_series], axis=1)
recommended = recommended.sort_values('distance',ascending=False)

print('Recommendations for {0}:\n'.format(user_item_matrix.index[target_user_index]))
for i in range(0, recommended.shape[0]):
    print(f'{recommended["game"].iloc[i]}, with distance of {recommended["distance"].iloc[i]}')

Recommendations for 12.0:

271955.0, with distance of 0.5608233777393511
51513.0, with distance of 0.5569028565494567
85046.0, with distance of 0.5414659379265347
30149.0, with distance of 0.5397810880138878
143582.0, with distance of 0.5190442641987776




```
# This is formatted as code
```

# Surprise

In [12]:
!pip install surprise



In [45]:
from surprise import SVD, accuracy, NMF
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise.model_selection.split import train_test_split
from collections import defaultdict
from surprise import AlgoBase

In [22]:
class baseline_algorithm(AlgoBase):
    '''
    Baseline Algorithm for the Recomendation System : Predicts that every user will rate a given game as the mean rating of every game
    '''
    def __init__(self):

        # call base method before doing anything.
        AlgoBase.__init__(self)

    def fit(self, trainset):

        # Here again: call base method before doing anything.
        AlgoBase.fit(self, trainset)

        # Compute the average rating. We might as well use the
        # trainset.global_mean attribute ;)
        self.the_mean = np.mean([r for (_, _, r) in self.trainset.all_ratings()])

        return self

    def estimate(self, u, i):

        sum_means = self.trainset.global_mean
        div = 1

        if self.trainset.knows_user(u):
            sum_means += np.mean([r for (_, r) in self.trainset.ur[u]])
            div += 1
        if self.trainset.knows_item(i):
            sum_means += np.mean([r for (_, r) in self.trainset.ir[i]])
            div += 1

        return sum_means / div

In [28]:
class baseline_algorithm(AlgoBase):
    '''
    Baseline Algorithm for the Recommendation System: Predicts that every user will rate a given game as 3.
    '''
    def __init__(self):
        # call base method before doing anything.
        AlgoBase.__init__(self)

    def fit(self, trainset):
        # Here again: call base method before doing anything.
        AlgoBase.fit(self, trainset)
        return self  # No need to calculate anything for this baseline

    def estimate(self, u, i):
        return 3  # Always return 3 as the predicted rating

In [35]:
# preprocessing the data
reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(users_df[['Username','BGGId','Rating']], reader)
train_base, test_base = train_test_split(data, test_size=.2, random_state=42)

# baseline model
base = baseline_algorithm()
base.fit(train_base)
base_predictions = base.test(test_base)

# evaluate the rmse result of the prediction and ground thuth
accuracy.rmse(base_predictions)

RMSE: 4.1116


4.111622313834785

In [36]:
base_predictions

[Prediction(uid=97573.0, iid=167791, r_ui=9.0, est=3, details={'was_impossible': False}),
 Prediction(uid=203206.0, iid=41114, r_ui=7.0, est=3, details={'was_impossible': False}),
 Prediction(uid=152228.0, iid=93260, r_ui=6.0, est=3, details={'was_impossible': False}),
 Prediction(uid=77817.0, iid=157354, r_ui=7.0, est=3, details={'was_impossible': False}),
 Prediction(uid=192566.0, iid=126163, r_ui=8.0, est=3, details={'was_impossible': False}),
 Prediction(uid=36057.0, iid=266192, r_ui=9.0, est=3, details={'was_impossible': False}),
 Prediction(uid=193934.0, iid=243964, r_ui=7.0, est=3, details={'was_impossible': False}),
 Prediction(uid=206994.0, iid=463, r_ui=7.0, est=3, details={'was_impossible': False}),
 Prediction(uid=161454.0, iid=2397, r_ui=6.0, est=3, details={'was_impossible': False}),
 Prediction(uid=84724.0, iid=122522, r_ui=5.0, est=3, details={'was_impossible': False}),
 Prediction(uid=238964.0, iid=125618, r_ui=7.0, est=3, details={'was_impossible': False}),
 Predictio

In [46]:
train_SVD, test_SVD = train_test_split(data, test_size=.2, random_state=42)

algo = NMF()
algo.fit(train_SVD)
predictions = algo.test(test_SVD)

# evaluate the rmse result of the prediction and ground thuth
accuracy.rmse(predictions)

RMSE: 2.3396


2.339615754875859

In [56]:
import itertools  # Import itertools for islice

# ... (your existing code) ...

# Get the first 5 ratings using itertools.islice
first_five_ratings = list(itertools.islice(train_SVD.all_ratings(), 25))

print(first_five_ratings)

[(0, 0, 7.0), (0, 251, 4.0), (0, 16, 7.0), (0, 428, 7.0), (0, 66, 4.0), (0, 2, 7.0), (0, 18, 7.0), (0, 207, 6.0), (0, 132, 7.0), (0, 2691, 6.0), (0, 2735, 7.0), (0, 578, 5.0), (0, 108, 6.0), (0, 989, 5.0), (0, 3069, 5.0), (0, 1655, 5.0), (0, 3, 6.0), (0, 216, 5.0), (0, 92, 8.0), (0, 114, 6.0), (0, 17, 8.0), (0, 144, 7.0), (1, 1, 8.0), (1, 84, 8.0), (1, 144, 6.0)]


In [47]:
predictions

[Prediction(uid=97573.0, iid=167791, r_ui=9.0, est=5, details={'was_impossible': False}),
 Prediction(uid=203206.0, iid=41114, r_ui=7.0, est=5, details={'was_impossible': False}),
 Prediction(uid=152228.0, iid=93260, r_ui=6.0, est=4.693678700029443, details={'was_impossible': False}),
 Prediction(uid=77817.0, iid=157354, r_ui=7.0, est=5, details={'was_impossible': False}),
 Prediction(uid=192566.0, iid=126163, r_ui=8.0, est=5, details={'was_impossible': False}),
 Prediction(uid=36057.0, iid=266192, r_ui=9.0, est=5, details={'was_impossible': False}),
 Prediction(uid=193934.0, iid=243964, r_ui=7.0, est=5, details={'was_impossible': False}),
 Prediction(uid=206994.0, iid=463, r_ui=7.0, est=5, details={'was_impossible': False}),
 Prediction(uid=161454.0, iid=2397, r_ui=6.0, est=4.454636947301136, details={'was_impossible': False}),
 Prediction(uid=84724.0, iid=122522, r_ui=5.0, est=5, details={'was_impossible': False}),
 Prediction(uid=238964.0, iid=125618, r_ui=7.0, est=5, details={'was_