In [1]:
import pandas as pd
import numpy as np
from notebook_src.make_preference_matrix import make_preference_df
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
df=make_preference_df()

In [3]:
df

Unnamed: 0,_id,game_id,score,user_id,game_number,user_number
0,5c09771d933dea68aaeebb68,Grand Theft Auto V,8,J4MESOX4D,1026,25541
1,5c09771d933dea68aaeebb69,Grand Theft Auto V,7,CardinalStorm,1026,3259
2,5c09771d933dea68aaeebb6a,Grand Theft Auto V,10,DareJedi,1026,12477
3,5c09771d933dea68aaeebb6b,Grand Theft Auto V,4,MaxBenevolent,1026,30190
4,5c09771d933dea68aaeebb6c,Grand Theft Auto V,3,BVGNOfficial,1026,51259
5,5c09771d933dea68aaeebb6d,Grand Theft Auto V,2,stedee73,1026,2801
6,5c09771d933dea68aaeebb6e,Grand Theft Auto V,7,NZAnimeManga,1026,49962
7,5c09771d933dea68aaeebb6f,Grand Theft Auto V,5,evolution98,1026,42936
8,5c09771d933dea68aaeebb70,Grand Theft Auto V,7,SotirisKmsGR,1026,9517
9,5c09771d933dea68aaeebb71,Grand Theft Auto V,7,AbradolfLinkler,1026,54319


In [6]:
#print(len(df.user_id.unique()), len(df.game_id.unique()))

Unique users : 55272

Unique games : 2289

In [7]:
spark_df=spark.createDataFrame(df[['game_number', 'user_number', 'score']])

In [8]:
spark_df.show()

+-----------+-----------+-----+
|game_number|user_number|score|
+-----------+-----------+-----+
|       1026|      25541|    8|
|       1026|       3259|    7|
|       1026|      12477|   10|
|       1026|      30190|    4|
|       1026|      51259|    3|
|       1026|       2801|    2|
|       1026|      49962|    7|
|       1026|      42936|    5|
|       1026|       9517|    7|
|       1026|      54319|    7|
|       1026|      54785|   10|
|       1026|       3890|   10|
|       1026|       1776|   10|
|       1026|      34793|    8|
|       1026|      49443|    8|
|       1026|      45211|    8|
|       1026|       6853|   10|
|       1026|      32890|    8|
|       1026|       2224|   10|
|       1026|      22067|   10|
+-----------+-----------+-----+
only showing top 20 rows



In [10]:
(training, test) = spark_df.randomSplit([0.8, 0.2])

In [11]:
als = ALS(maxIter=15, 
          regParam=0.5, 
          rank=100,
          userCol="user_number",
          itemCol="game_number",
          ratingCol="score",
          coldStartStrategy="drop")

In [12]:
model = als.fit(training)

In [13]:
model.save('als_model.pkl')

In [14]:
model=ALSModel.load('als_model.pkl')

In [15]:
predictions = model.transform(test)

In [16]:
evaluator = RegressionEvaluator(metricName="rmse", 
                                labelCol="score",
                                predictionCol="prediction")

In [17]:
rmse = evaluator.evaluate(predictions)

In [18]:
rmse

3.049170885807583

In [19]:
user_factors = model.userFactors.toPandas()
item_factors = model.itemFactors.toPandas()

In [20]:
item_arr=np.array(item_factors['features'].tolist())
user_arr=np.array(user_factors['features'].tolist())

In [21]:
np.save('item_arr.npy', item_arr)

In [22]:
item_arr=np.load('item_arr.npy')

# Create example for model to give prediction to

In [23]:
sample1 = {'Destiny 2: Forsaken': 8,
         "Everybody's Golf": 8,
         "XCOM 2": 9,
         "NBA 2K17": 10,
         "Marvel's Spider-Man":6}

sample2 = {'Far Cry 5': 8,
         "Madden NFL 16": 6,
         "Tricky Towers": 8,
         "Fallout 4": 9,
         "Persona 5":10}

sample3 = {'Fallout 4': 10}

In [24]:
for key in sample2:
    print(df[df['game_id']==key]['game_id'].unique())

['Far Cry 5']
['Madden NFL 16']
['Tricky Towers']
['Fallout 4']
['Persona 5']


In [50]:
from pyspark.sql import Row

def insert_user_prefs(prefs={}):
    """Insert a new user's scores into the spark df with the rest
        of the user scores
        
        This is also completely irrelevant for giving a recommendation
        but was a nice practice in working with spark dataframes"""
    for key in prefs:
        nums=[(
            int(df[df['game_id']==key].game_number.unique()),
            len(df.user_id.unique())+1,
            sample[key]
               )]
        
        rdd=sc.parallelize(nums)
        entry=Row('game_number', 'user_number', 'score')
        new=rdd.map(lambda x: entry(*x))

        new_row=spark.createDataFrame(new)

        spark_df=spark_df.union(new_row)
    
    
# 'Game ID':  int(df[df['game_id']==key].game_number.unique()),
# 'USER ID':  len(df.user_id.unique())+1,
# 'SCORE':    sample[key])

In [51]:
def create_user_vector(prefs={}, item_factors=item_arr):
    game_ids=[]
    scores=[]
    for key in prefs:
        game_ids.append(int(df[df['game_id']==key].game_number.unique()))
        scores.append(prefs[key])
    ratings=np.array(scores)
    
    return ratings, item_factors[game_ids], game_ids

In [80]:
scores_vec, user_item_arr, game_ids = create_user_vector(prefs=sample3)

In [81]:
scores_vec.shape, user_item_arr.shape

((1,), (1, 100))

In [82]:





X, residuals, rank, s =np.linalg.lstsq(user_item_arr, scores_vec, rcond=None)





In [83]:
user_factors=X

In [84]:
def predict_new_user(user_factors, item_arr):
    new_factor_list = []
    for i in range(len(item_arr)):
        new_factor_list.append(np.dot(user_factors, item_arr[i]))
    new_user_df = pd.DataFrame([new_factor_list], index=['new_user'])
    return np.array(new_factor_list)

In [85]:
test_arr = predict_new_user(user_factors, item_arr)

In [86]:
test_arr = np.delete(test_arr, game_ids)

In [87]:
test_arr.argsort()[-5:][::-1]

array([941, 132, 581, 593, 352])

In [38]:
for ind in test_arr.argsort()[-5:][::-1]:
    print(df[df['game_number']==ind]['game_id'].unique(),)

NameError: name 'test_arr' is not defined

In [41]:
import src.collaborative_filtering

In [36]:
test_vec= src.collaborative_filtering.make_prefernce_vector(sample1)

In [37]:
test_vec

array([5.06104714, 7.77494894, 3.39601426, ..., 7.48739287, 8.84294797,
       8.51642464])

In [40]:
for ind in test_vec.argsort()[-5:][::-1]:
    print(df[df['game_number']==ind]['game_id'].unique()[0])

Yasai Ninja
Mortal Blitz
ClusterTruck
Dead Alliance
Narcosis


In [43]:
src.collaborative_filtering.give_recommendation(sample2)

['Super Bomberman R',
 'BlazBlue: Central Fiction',
 'LocoCycle',
 'Wayward Sky',
 'Fighter Within']