In [98]:
import pyspark
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [100]:
# load data
data = pd.read_csv('ratings.csv') 
matrix = data
matrix

Unnamed: 0,game_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4
...,...,...,...
981543,10000,48386,5
981544,10000,49007,4
981545,10000,49383,5
981546,10000,50124,5


In [102]:
# create spark session
spark = pyspark.sql.SparkSession.builder.getOrCreate()
spark

In [103]:
# create spark dataframe with header and infer schema
# read our data from ratings.csv
# header -> Record For Column Names 
# inferSchema -> automatically infers column types based on the data
df = spark.read.csv('ratings.csv', header=True, inferSchema=True)
df.show()

+-------+-------+------+
|game_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
|      1|   2077|     4|
|      1|   2487|     4|
|      1|   2900|     5|
|      1|   3662|     4|
|      1|   3922|     5|
|      1|   5379|     5|
|      1|   5461|     3|
|      1|   5885|     5|
|      1|   6630|     5|
|      1|   7563|     3|
|      1|   9246|     1|
|      1|  10140|     4|
|      1|  10146|     5|
|      1|  10246|     4|
|      1|  10335|     4|
+-------+-------+------+
only showing top 20 rows



In [104]:
# # cast all the columns to int
# df = df.select(df.game_id.cast('int'), df.user_id.cast('int'), df.rating.cast('int'))
# df

In [105]:
# build test and train data
# 80% training & 20% testing
# Calling the function with the same seed will always generate the same results
(train, test) = df.randomSplit([0.8, 0.2], seed = 21)

In [106]:
# build the recommendation model using ALS on the training data
# maxIter -> maximum number of iterations to run
# regParam -> regularization parameter in ALS
als = ALS(maxIter=5, regParam=0.01, implicitPrefs=True, userCol='user_id', itemCol='game_id', ratingCol='rating')

In [107]:
# fit the model to the training data
model = als.fit(train)

In [108]:
# evaluate the model by computing the RMSE on the test data
# evaluate the model by computing the RMSE on the test data
# algorithm which can transform one DataFrame into another DataFrame
predictions = model.transform(test)
predictions.show()

evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

+-------+-------+------+----------+
|game_id|user_id|rating|prediction|
+-------+-------+------+----------+
|      2|   3922|     5|  4.151307|
|      2|   6630|     5| 4.4779553|
|      2|  11408|     5| 2.5717492|
|      1|   5461|     3| 4.3839583|
|      2|   5461|     4| 4.2908974|
|      1|  42404|     5|  5.257044|
|      1|  47800|     5|  5.572372|
|      1|  21487|     4| 4.2775555|
|      1|  33716|     5|  4.349917|
|      1|  51166|     4| 5.2130127|
|      1|  45493|     5|  5.046213|
|      1|  39423|     3| 4.1388044|
|      1|  50342|     3| 3.7535079|
|      1|  47746|     5| 5.0195885|
|      1|  46977|     4|  3.430758|
|      1|  10944|     5| 4.2032537|
|      2|  10944|     5| 3.5444605|
|      2|  11945|     5|  4.751916|
|      1|  45269|     4| 4.0640707|
|      2|   5436|     5| 4.7578287|
+-------+-------+------+----------+
only showing top 20 rows



In [111]:
# get input user id
user_id = int(input('Enter user id: '))

Enter user id: 10


In [115]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(5)
userRecs.show()



+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|     31|[{9412, 5.578423}...|
|     34|[{9412, 3.3297641...|
|     53|[{8946, 4.994873}...|
|     65|[{3181, 6.6603622...|
|     78|[{7639, 3.1406672...|
|     85|[{8555, 6.6500525...|
|    108|[{6449, 4.4756393...|
|    133|[{7803, 11.09249}...|
|    137|[{8555, 9.96393},...|
|    148|[{4410, 9.5292225...|
|    155|[{7334, 5.533655}...|
|    193|[{6160, 17.450933...|
|    211|[{6771, 10.063102...|
|    243|[{9182, 8.83362},...|
|    251|[{9182, 7.3292828...|
|    255|[{9412, 5.6479406...|
|    296|[{7639, 7.5977874...|
|    321|[{9344, 7.11565},...|
|    322|[{4410, 6.1085033...|
|    362|[{8181, 7.704907}...|
+-------+--------------------+
only showing top 20 rows



                                                                                

In [117]:
# show userRecs with user_id with prediction score
# userRecs.filter(userRecs['user_id']==user_id).show()
recom_list = userRecs.filter(userRecs['user_id']==user_id).select('recommendations').collect()

                                                                                

In [118]:
# print name game with game_id
games = pd.read_csv('games.csv')
for i in range(len(recom_list[0][0])):
    game_data = games.loc[games['game_id'] == recom_list[0][0][i][0]]
    print(i+1 , ':', 'game:' ,game_data['name'].values[0], 'score:', recom_list[0][0][i][1])

1 : game: X-Squad score: 16.240812301635742
2 : game: Bust-A-Move DS score: 14.142559051513672
3 : game: EA Sports UFC score: 13.57870101928711
4 : game: SpellForce: The Order of Dawn score: 12.950912475585938
5 : game: 8-Bit Armies score: 12.894984245300293


In [68]:
# make a single prediction
single_user = test.filter(test['user_id']==id_input).select(['game_id', 'user_id'])
single_user.show()

+-------+-------+
|game_id|user_id|
+-------+-------+
|   4893|      1|
+-------+-------+



In [32]:
# recommend top 5 games for the user
recommendations = model.transform(single_user)
print('recommendations')
recommendations.orderBy('prediction', ascending=False).show(n=5)

recommendations
+-------+-------+----------+
|game_id|user_id|prediction|
+-------+-------+----------+
|   8072|      5| 2.4990234|
|   6703|      5| 1.1887991|
+-------+-------+----------+



In [19]:
# list of game_id recommended to user 
game_id = recommendations.orderBy('prediction', ascending=False).select('game_id').collect()
recomendation_score = recommendations.orderBy('prediction', ascending=False).select('prediction').collect()

In [20]:
games = pd.read_csv('games.csv')
games

Unnamed: 0,game_id,name,release_date,summary,meta_score
0,1,The Legend of Zelda: Ocarina of Time,23-Nov-98,"As a young boy, Link is tricked by Ganondorf, ...",99
1,2,Tony Hawk's Pro Skater 2,20-Sep-00,As most major publishers' development efforts ...,98
2,3,Grand Theft Auto IV,29-Apr-08,[Metacritic's 2008 PS3 Game of the Year; Also ...,98
3,4,SoulCalibur,8-Sep-99,"This is a tale of souls and swords, transcendi...",98
4,5,Super Mario Galaxy,12-Nov-07,[Metacritic's 2007 Wii Game of the Year] The u...,97
...,...,...,...,...,...
9995,9996,Buku Sudoku,28-May-08,[Xbox Live Arcade] Experience the excitement o...,60
9996,9997,Guilty Gear Dust Strikers,25-Apr-06,"The no holds barred, sci-fi fighting franchise...",60
9997,9998,The Shoot,19-Oct-10,The Shoot is a first-person oriented shooter g...,60
9998,9999,Spectra (2015),10-Jul-15,Spectra is a twitch racing game. Blast your ni...,60


In [21]:
# print name game with game_id
for i in range(len(game_id)):
    game_data = games.loc[games['game_id'] == game_id[i][0]]
    print(i+1 , ':', 'game:' ,game_data['name'].values[0], '  score:', recomendation_score[i][0])