In [18]:
import pandas as pd
import numpy as np
from src.make_preference_matrix import make_preference_df
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
df=make_preference_df()

In [3]:
df.head()

Unnamed: 0,_id,game_id,score,user_id,game_number,user_number
0,5c09771d933dea68aaeebb68,Grand Theft Auto V,8,J4MESOX4D,910,22984
1,5c09771d933dea68aaeebb69,Grand Theft Auto V,7,CardinalStorm,910,2926
2,5c09771d933dea68aaeebb6a,Grand Theft Auto V,10,DareJedi,910,11238
3,5c09771d933dea68aaeebb6b,Grand Theft Auto V,4,MaxBenevolent,910,27179
4,5c09771d933dea68aaeebb6c,Grand Theft Auto V,3,BVGNOfficial,910,46133


In [4]:
#print(len(df.user_id.unique()), len(df.game_id.unique()))

Unique users : 49732

Unique games : 2009

In [5]:
spark_df=spark.createDataFrame(df[['game_number', 'user_number', 'score']])

In [6]:
spark_df.show()

+-----------+-----------+-----+
|game_number|user_number|score|
+-----------+-----------+-----+
|        910|      22984|    8|
|        910|       2926|    7|
|        910|      11238|   10|
|        910|      27179|    4|
|        910|      46133|    3|
|        910|       2512|    2|
|        910|      44966|    7|
|        910|      38639|    5|
|        910|       8573|    7|
|        910|      48884|    7|
|        910|      49289|   10|
|        910|       3493|   10|
|        910|       1596|   10|
|        910|      31317|    8|
|        910|      44491|    8|
|        910|      40693|    8|
|        910|       6178|   10|
|        910|      29611|    8|
|        910|       1999|   10|
|        910|      19866|   10|
+-----------+-----------+-----+
only showing top 20 rows



In [9]:
(training, test) = spark_df.randomSplit([0.8, 0.2])

als = ALS(maxIter=15, 
          regParam=0.5, 
          rank=100,
          userCol="user_number",
          itemCol="game_number",
          ratingCol="score",
          coldStartStrategy="drop")

In [13]:
#model = als.fit(training)

In [17]:
#model.save('als_model.pkl')

In [19]:
model=ALSModel.load('als_model.pkl')

In [20]:
predictions = model.transform(test)

In [21]:
evaluator = RegressionEvaluator(metricName="rmse", 
                                labelCol="score",
                                predictionCol="prediction")

In [22]:
rmse = evaluator.evaluate(predictions)

In [23]:
rmse

3.1060298612835697

user_factors = model.userFactors.toPandas()
item_factors = model.itemFactors.toPandas()

item_arr=np.array(item_factors['features'].tolist())
user_arr=np.array(user_factors['features'].tolist())

In [27]:
#np.save('item_arr.npy', item_arr)

In [29]:
item_arr=np.load('item_arr.npy')

# Create example for model to give prediction to

In [31]:
sample1 = {'Destiny 2: Forsaken': 8,
         "Everybody's Golf": 8,
         "XCOM 2": 9,
         "NBA 2K17": 10,
         "Marvel's Spider-Man":6}

sample2 = {'Far Cry 5': 8,
         "Madden NFL 16": 6,
         "Tricky Towers": 8,
         "Fallout 4": 9,
         "Persona 5":10}

sample3 = {'Fallout 4': 10}

In [49]:
for key in sample2:
    print(df[df['game_id']==key]['game_id'].unique())

['Far Cry 5']
['Madden NFL 16']
['Tricky Towers']
['Fallout 4']
['Persona 5']


In [50]:
from pyspark.sql import Row

def insert_user_prefs(prefs={}):
    """Insert a new user's scores into the spark df with the rest
        of the user scores
        
        This is also completely irrelevant for giving a recommendation
        but was a nice practice in working with spark dataframes"""
    for key in prefs:
        nums=[(
            int(df[df['game_id']==key].game_number.unique()),
            len(df.user_id.unique())+1,
            sample[key]
               )]
        
        rdd=sc.parallelize(nums)
        entry=Row('game_number', 'user_number', 'score')
        new=rdd.map(lambda x: entry(*x))

        new_row=spark.createDataFrame(new)

        spark_df=spark_df.union(new_row)
    
    
# 'Game ID':  int(df[df['game_id']==key].game_number.unique()),
# 'USER ID':  len(df.user_id.unique())+1,
# 'SCORE':    sample[key])

In [51]:
def create_user_vector(prefs={}, item_factors=item_arr):
    game_ids=[]
    scores=[]
    for key in prefs:
        game_ids.append(int(df[df['game_id']==key].game_number.unique()))
        scores.append(prefs[key])
    ratings=np.array(scores)
    
    return ratings, item_factors[game_ids], game_ids

In [80]:
scores_vec, user_item_arr, game_ids = create_user_vector(prefs=sample3)

In [81]:
scores_vec.shape, user_item_arr.shape

((1,), (1, 100))

In [82]:





X, residuals, rank, s =np.linalg.lstsq(user_item_arr, scores_vec, rcond=None)





In [83]:
user_factors=X

In [84]:
def predict_new_user(user_factors, item_arr):
    new_factor_list = []
    for i in range(len(item_arr)):
        new_factor_list.append(np.dot(user_factors, item_arr[i]))
    new_user_df = pd.DataFrame([new_factor_list], index=['new_user'])
    return np.array(new_factor_list)

In [85]:
test_arr = predict_new_user(user_factors, item_arr)

In [86]:
test_arr = np.delete(test_arr, game_ids)

In [87]:
test_arr.argsort()[-5:][::-1]

array([941, 132, 581, 593, 352])

In [38]:
for ind in test_arr.argsort()[-5:][::-1]:
    print(df[df['game_number']==ind]['game_id'].unique(),)

NameError: name 'test_arr' is not defined

In [41]:
import src.collaborative_filtering

In [36]:
test_vec= src.collaborative_filtering.make_prefernce_vector(sample1)

In [37]:
test_vec

array([5.06104714, 7.77494894, 3.39601426, ..., 7.48739287, 8.84294797,
       8.51642464])

In [40]:
for ind in test_vec.argsort()[-5:][::-1]:
    print(df[df['game_number']==ind]['game_id'].unique()[0])

Yasai Ninja
Mortal Blitz
ClusterTruck
Dead Alliance
Narcosis


In [43]:
src.collaborative_filtering.give_recommendation(sample2)

['Super Bomberman R',
 'BlazBlue: Central Fiction',
 'LocoCycle',
 'Wayward Sky',
 'Fighter Within']