In [1]:
import pandas as pd
import numpy as np
from src.make_preference_matrix import make_preference_df
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator



In [2]:
df=make_preference_df()

In [3]:
df

Unnamed: 0,_id,game_id,score,user_id,game_number,user_number,num_username_reviews,num_game_reviews
0,5c09771d933dea68aaeebb68,Grand Theft Auto V,8,J4MESOX4D,1026,25541,15,627
1,5c09771d933dea68aaeebb69,Grand Theft Auto V,7,CardinalStorm,1026,3259,4,627
2,5c09771d933dea68aaeebb6a,Grand Theft Auto V,10,DareJedi,1026,12477,1,627
3,5c09771d933dea68aaeebb6b,Grand Theft Auto V,4,MaxBenevolent,1026,30190,1,627
4,5c09771d933dea68aaeebb6c,Grand Theft Auto V,3,BVGNOfficial,1026,51259,2,627
5,5c09771d933dea68aaeebb6d,Grand Theft Auto V,2,stedee73,1026,2801,1,627
6,5c09771d933dea68aaeebb6e,Grand Theft Auto V,7,NZAnimeManga,1026,49962,18,627
7,5c09771d933dea68aaeebb6f,Grand Theft Auto V,5,evolution98,1026,42936,2,627
8,5c09771d933dea68aaeebb70,Grand Theft Auto V,7,SotirisKmsGR,1026,9517,1,627
9,5c09771d933dea68aaeebb71,Grand Theft Auto V,7,AbradolfLinkler,1026,54319,1,627


In [4]:
filtered_df = df[df.num_username_reviews>=17]

In [5]:
filtered_df

Unnamed: 0,_id,game_id,score,user_id,game_number,user_number,num_username_reviews,num_game_reviews
6,5c09771d933dea68aaeebb6e,Grand Theft Auto V,7,NZAnimeManga,1026,49962,18,627
13,5c09771d933dea68aaeebb75,Grand Theft Auto V,8,Red-Opposom-098,1026,34793,17,627
18,5c09771d933dea68aaeebb7a,Grand Theft Auto V,10,DSouls_guy,1026,2224,19,627
24,5c09771d933dea68aaeebb80,Grand Theft Auto V,0,rodericrinehart,1026,16237,74,627
29,5c09771d933dea68aaeebb85,Grand Theft Auto V,10,MaxPayneIsGod,1026,1989,63,627
35,5c09771d933dea68aaeebb8b,Grand Theft Auto V,8,Ghoster,1026,4784,61,627
51,5c09771d933dea68aaeebb9b,Grand Theft Auto V,10,joaopontesvaz,1026,50016,72,627
52,5c09771d933dea68aaeebb9c,Grand Theft Auto V,8,badgerryan19,1026,44720,102,627
66,5c09771d933dea68aaeebbaa,Grand Theft Auto V,5,ignore78,1026,7565,25,627
67,5c09771d933dea68aaeebbab,Grand Theft Auto V,8,blopho,1026,14453,17,627


In [6]:
#print(len(df.user_id.unique()), len(df.game_id.unique()))

Unique users : 55272

Unique games : 2289

In [7]:
spark_df=spark.createDataFrame(filtered_df[['game_number', 'user_number', 'score']])

In [8]:
spark_df.show()

+-----------+-----------+-----+
|game_number|user_number|score|
+-----------+-----------+-----+
|       1026|      49962|    7|
|       1026|      34793|    8|
|       1026|       2224|   10|
|       1026|      16237|    0|
|       1026|       1989|   10|
|       1026|       4784|    8|
|       1026|      50016|   10|
|       1026|      44720|    8|
|       1026|       7565|    5|
|       1026|      14453|    8|
|       1026|      49444|   10|
|       1026|       2212|    9|
|       1026|      31285|    7|
|       1026|      44638|    1|
|       1026|      21790|    7|
|       1026|      23712|   10|
|       1026|      43451|    4|
|       1026|      26450|    3|
|       1026|      35372|    6|
|       1026|      31585|    6|
+-----------+-----------+-----+
only showing top 20 rows



In [9]:
(training, test) = spark_df.randomSplit([0.8, 0.2])

In [10]:
als = ALS(maxIter=21, 
          regParam=0.45, 
          rank=50,
          userCol="user_number",
          itemCol="game_number",
          ratingCol="score",
          coldStartStrategy="drop")

In [11]:
model = als.fit(training)

In [16]:
model.save('filtered_als_model.pkl')

In [263]:
#model.save('als_model.pkl')

In [264]:
#model=ALSModel.load('als_model.pkl')

In [12]:
predictions = model.transform(test)

In [13]:
evaluator = RegressionEvaluator(metricName="rmse", 
                                labelCol="score",
                                predictionCol="prediction")

In [14]:
rmse = evaluator.evaluate(predictions)

In [15]:
rmse

2.1693379840046565

In [269]:
user_factors = model.userFactors.toPandas()
item_factors = model.itemFactors.toPandas()

In [270]:
item_arr=np.array(item_factors['features'].tolist())
user_arr=np.array(user_factors['features'].tolist())

In [21]:
#np.save('item_arr.npy', item_arr)

In [22]:
#item_arr=np.load('item_arr.npy')

# Create example for model to give prediction to

In [314]:
sample1 = {'Destiny 2: Forsaken': 8,
         "Everybody's Golf": 8,
         "XCOM 2": 9,
         "NBA 2K17": 10,
         "Marvel's Spider-Man":6,
         'Far Cry 5': 8,
         "Madden NFL 16": 6,
         "Tricky Towers": 8,
         "Fallout 4": 9,
         "Persona 5":10}

sample2 = {
         'Far Cry 5': 8,
         #"Madden NFL 16": 6,
         "Tricky Towers": 8,
         "Fallout 4": 9,
         "Persona 5":10
          }

sample3 = {'Fallout 4': 10}

In [305]:
for key in sample2:
    print(df[df['game_id']==key]['game_id'].unique())

['Far Cry 5']
['Madden NFL 16']
['Fallout 4']
['Persona 5']


In [306]:
from pyspark.sql import Row

def insert_user_prefs(prefs={}):
    """Insert a new user's scores into the spark df with the rest
        of the user scores
        
        This is also completely irrelevant for giving a recommendation
        but was a nice practice in working with spark dataframes"""
    for key in prefs:
        nums=[(
            int(df[df['game_id']==key].game_number.unique()),
            len(df.user_id.unique())+1,
            sample[key]
               )]
        
        rdd=sc.parallelize(nums)
        entry=Row('game_number', 'user_number', 'score')
        new=rdd.map(lambda x: entry(*x))

        new_row=spark.createDataFrame(new)

        spark_df=spark_df.union(new_row)
    
    
# 'Game ID':  int(df[df['game_id']==key].game_number.unique()),
# 'USER ID':  len(df.user_id.unique())+1,
# 'SCORE':    sample[key])

In [307]:
def create_user_vector(prefs={}, item_factors=item_arr):
    game_ids=[]
    scores=[]
    for key in prefs:
        game_ids.append(int(df[df['game_id']==key].game_number.unique()))
        scores.append(prefs[key])
    ratings=np.array(scores)
    
    return ratings, item_factors[game_ids], game_ids

In [317]:
scores_vec, user_item_arr, game_ids = create_user_vector(prefs=sample2)

IndexError: index 2118 is out of bounds for axis 0 with size 1778

In [None]:
scores_vec.shape, user_item_arr.shape

In [None]:




X, residuals, rank, s =np.linalg.lstsq(user_item_arr, scores_vec, rcond=None)





In [297]:
user_factors=X

In [298]:
def predict_new_user(user_factors, item_arr):
    new_factor_list = []
    for i in range(len(item_arr)):
        new_factor_list.append(np.dot(user_factors, item_arr[i]))
    new_user_df = pd.DataFrame([new_factor_list], index=['new_user'])
    return np.array(new_factor_list)

In [299]:
test_arr = predict_new_user(user_factors, item_arr)

In [300]:
test_arr = np.delete(test_arr, game_ids)

In [301]:
test_arr.argsort()[-5:][::-1]

array([1715,  524, 1228, 1625,  114])

In [302]:
for ind in test_arr.argsort()[-5:][::-1]:
    print(df[df['game_number']==ind]['game_id'].unique(),)

['Q.U.B.E. 2']
['Tyler: Model 005']
['Operation Warcade']
['Paladins: Champions of the Realm']
['Deus Ex: Mankind Divided']


In [41]:
import src.collaborative_filtering

In [36]:
test_vec= src.collaborative_filtering.make_prefernce_vector(sample1)

In [37]:
test_vec

array([5.06104714, 7.77494894, 3.39601426, ..., 7.48739287, 8.84294797,
       8.51642464])

In [40]:
for ind in test_vec.argsort()[-5:][::-1]:
    print(df[df['game_number']==ind]['game_id'].unique()[0])

Yasai Ninja
Mortal Blitz
ClusterTruck
Dead Alliance
Narcosis


In [43]:
src.collaborative_filtering.give_recommendation(sample2)

['Super Bomberman R',
 'BlazBlue: Central Fiction',
 'LocoCycle',
 'Wayward Sky',
 'Fighter Within']