In [1]:
import pandas as pd
import numpy as np
from src.make_preference_matrix import make_preference_df
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
df=make_preference_df()

In [3]:
df

Unnamed: 0,_id,game_id,score,user_id,num_username_reviews,num_game_reviews,game_number,user_number
6,5c09771d933dea68aaeebb6e,Grand Theft Auto V,7,NZAnimeManga,18,627,842,424
13,5c09771d933dea68aaeebb75,Grand Theft Auto V,8,Red-Opposom-098,17,627,842,165
18,5c09771d933dea68aaeebb7a,Grand Theft Auto V,10,DSouls_guy,19,627,842,311
24,5c09771d933dea68aaeebb80,Grand Theft Auto V,0,rodericrinehart,74,627,842,464
29,5c09771d933dea68aaeebb85,Grand Theft Auto V,10,MaxPayneIsGod,63,627,842,167
35,5c09771d933dea68aaeebb8b,Grand Theft Auto V,8,Ghoster,61,627,842,251
51,5c09771d933dea68aaeebb9b,Grand Theft Auto V,10,joaopontesvaz,72,627,842,452
52,5c09771d933dea68aaeebb9c,Grand Theft Auto V,8,badgerryan19,102,627,842,416
66,5c09771d933dea68aaeebbaa,Grand Theft Auto V,5,ignore78,25,627,842,348
67,5c09771d933dea68aaeebbab,Grand Theft Auto V,8,blopho,17,627,842,438


In [4]:
#filtered_df = df[df.num_username_reviews>=17]

In [5]:
#filtered_df

In [6]:
#print(len(df.user_id.unique()), len(df.game_id.unique()))

Unique users : 55272

Unique games : 2289

In [7]:
#print(len(filtered_df.user_id.unique()), len(filtered_df.game_id.unique()))

Unique filtered users : 501

Unique filtered games : 1900

Approximate RMSE : 2.2

In [8]:
#filtered_game_titles = list(filtered_df.game_id.unique())

In [9]:
#filtered_game_titles

In [10]:
spark_df=spark.createDataFrame(df[['game_number', 'user_number', 'score']])

In [11]:
spark_df.show()

+-----------+-----------+-----+
|game_number|user_number|score|
+-----------+-----------+-----+
|        842|        424|    7|
|        842|        165|    8|
|        842|        311|   10|
|        842|        464|    0|
|        842|        167|   10|
|        842|        251|    8|
|        842|        452|   10|
|        842|        416|    8|
|        842|        348|    5|
|        842|        438|    8|
|        842|        131|   10|
|        842|        301|    9|
|        842|        180|    7|
|        842|        369|    1|
|        842|        168|    7|
|        842|        317|   10|
|        842|        185|    4|
|        842|        436|    3|
|        842|        484|    6|
|        842|        346|    6|
+-----------+-----------+-----+
only showing top 20 rows



In [12]:
(training, test) = spark_df.randomSplit([0.8, 0.2])

In [13]:
als = ALS(maxIter=21, 
          regParam=0.45, 
          rank=50,
          userCol="user_number",
          itemCol="game_number",
          ratingCol="score",
          coldStartStrategy="drop")

In [14]:
model = als.fit(training)

In [15]:
#model.save('als_model.pkl')

In [16]:
#model=ALSModel.load('als_model.pkl')

In [17]:
predictions = model.transform(test)

In [18]:
evaluator = RegressionEvaluator(metricName="rmse", 
                                labelCol="score",
                                predictionCol="prediction")

In [19]:
rmse = evaluator.evaluate(predictions)

In [20]:
rmse

2.1840105115210537

In [21]:
#model.save('filtered_als_model_2.pkl')

In [22]:
#model=ALSModel.load('filtered_als_model_2.pkl')

In [21]:
user_factors = model.userFactors.toPandas()
item_factors = model.itemFactors.toPandas()

In [22]:
item_arr=np.array(item_factors['features'].tolist())
user_arr=np.array(user_factors['features'].tolist())

In [23]:
np.save('filtered_item_arr.npy', item_arr)

In [24]:
item_arr=np.load('filtered_item_arr.npy')

# Create example for model to give prediction to

In [25]:
sample1 = {'Destiny 2: Forsaken': 8,
         "Everybody's Golf": 8,
         "XCOM 2": 9,
         "NBA 2K17": 10,
         "Marvel's Spider-Man":6,
         'Far Cry 5': 8,
         "Madden NFL 16": 6,
         "Tricky Towers": 8,
         "Fallout 4": 9,
         "Persona 5":10}

sample2 = {
         'Far Cry 5': 8,
         #"Madden NFL 16": 6,
         "Tricky Towers": 8,
         "Fallout 4": 9,
         "Persona 5":10
          }

sample3 = {'Fallout 4': 10}

In [26]:
for key in sample2:
    print(df[df['game_id']==key]['game_id'].unique())

['Far Cry 5']
['Tricky Towers']
['Fallout 4']
['Persona 5']


In [306]:
from pyspark.sql import Row

def insert_user_prefs(prefs={}):
    """Insert a new user's scores into the spark df with the rest
        of the user scores
        
        This is also completely irrelevant for giving a recommendation
        but was a nice practice in working with spark dataframes"""
    for key in prefs:
        nums=[(
            int(df[df['game_id']==key].game_number.unique()),
            len(df.user_id.unique())+1,
            sample[key]
               )]
        
        rdd=sc.parallelize(nums)
        entry=Row('game_number', 'user_number', 'score')
        new=rdd.map(lambda x: entry(*x))

        new_row=spark.createDataFrame(new)

        spark_df=spark_df.union(new_row)
    
    
# 'Game ID':  int(df[df['game_id']==key].game_number.unique()),
# 'USER ID':  len(df.user_id.unique())+1,
# 'SCORE':    sample[key])

In [27]:
def create_user_vector(prefs={}, item_factors=item_arr):
    game_ids=[]
    scores=[]
    for key in prefs:
        game_ids.append(int(df[df['game_id']==key].game_number.unique()))
        scores.append(prefs[key])
    ratings=np.array(scores)
    
    return ratings, item_factors[game_ids], game_ids

In [28]:
scores_vec, user_item_arr, game_ids = create_user_vector(prefs=sample2)

In [29]:
scores_vec.shape, user_item_arr.shape

((4,), (4, 50))

In [30]:




X, residuals, rank, s =np.linalg.lstsq(user_item_arr, scores_vec, rcond=None)





In [31]:
user_factors=X

In [32]:
def predict_new_user(user_factors, item_arr):
    new_factor_list = []
    for i in range(len(item_arr)):
        new_factor_list.append(np.dot(user_factors, item_arr[i]))
    new_user_df = pd.DataFrame([new_factor_list], index=['new_user'])
    return np.array(new_factor_list)

In [33]:
test_arr = predict_new_user(user_factors, item_arr)

In [34]:
test_arr = np.delete(test_arr, game_ids)

In [35]:
test_arr.argsort()[-5:][::-1]

array([1520,  839, 1228,  715,  166])

In [302]:
for ind in test_arr.argsort()[-5:][::-1]:
    print(df[df['game_number']==ind]['game_id'].unique(),)

['Q.U.B.E. 2']
['Tyler: Model 005']
['Operation Warcade']
['Paladins: Champions of the Realm']
['Deus Ex: Mankind Divided']


In [41]:
import src.collaborative_filtering

In [36]:
test_vec= src.collaborative_filtering.make_prefernce_vector(sample1)

In [37]:
test_vec

array([5.06104714, 7.77494894, 3.39601426, ..., 7.48739287, 8.84294797,
       8.51642464])

In [40]:
for ind in test_vec.argsort()[-5:][::-1]:
    print(df[df['game_number']==ind]['game_id'].unique()[0])

Yasai Ninja
Mortal Blitz
ClusterTruck
Dead Alliance
Narcosis


In [43]:
src.collaborative_filtering.give_recommendation(sample2)

['Super Bomberman R',
 'BlazBlue: Central Fiction',
 'LocoCycle',
 'Wayward Sky',
 'Fighter Within']