In [33]:
import pandas as pd
import numpy as np
from src.make_preference_matrix import make_preference_df
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [34]:
df=make_preference_df()

In [35]:
df.head()

Unnamed: 0,_id,game_id,score,user_id,game_number,user_number
0,5c09771d933dea68aaeebb68,Grand Theft Auto V,8,J4MESOX4D,698,18776
1,5c09771d933dea68aaeebb69,Grand Theft Auto V,7,CardinalStorm,698,2369
2,5c09771d933dea68aaeebb6a,Grand Theft Auto V,10,DareJedi,698,9191
3,5c09771d933dea68aaeebb6b,Grand Theft Auto V,4,MaxBenevolent,698,22207
4,5c09771d933dea68aaeebb6c,Grand Theft Auto V,3,BVGNOfficial,698,37691


In [36]:
#print(len(df.user_id.unique()), len(df.game_id.unique()))

Unique users : 40641

Unique games : 1587

In [37]:
spark_df=spark.createDataFrame(df[['game_number', 'user_number', 'score']])

In [38]:
spark_df.show()

+-----------+-----------+-----+
|game_number|user_number|score|
+-----------+-----------+-----+
|        698|      18776|    8|
|        698|       2369|    7|
|        698|       9191|   10|
|        698|      22207|    4|
|        698|      37691|    3|
|        698|       2024|    2|
|        698|      36719|    7|
|        698|      31522|    5|
|        698|       7008|    7|
|        698|      39936|    7|
|        698|      40272|   10|
|        698|       2844|   10|
|        698|       1281|   10|
|        698|      25589|    8|
|        698|      36328|    8|
|        698|      33219|    8|
|        698|       5031|   10|
|        698|      24195|    8|
|        698|       1608|   10|
|        698|      16248|   10|
+-----------+-----------+-----+
only showing top 20 rows



In [39]:
(training, test) = spark_df.randomSplit([0.8, 0.2])

In [40]:
als = ALS(maxIter=15, 
          regParam=0.5, 
          rank=100,
          userCol="user_number",
          itemCol="game_number",
          ratingCol="score",
          coldStartStrategy="drop")

In [41]:
model = als.fit(training)

In [42]:
predictions = model.transform(test)

In [43]:
evaluator = RegressionEvaluator(metricName="rmse", 
                                labelCol="score",
                                predictionCol="prediction")

In [44]:
rmse = evaluator.evaluate(predictions)

In [45]:
rmse

3.004191452272342

In [46]:
user_factors = model.userFactors.toPandas()
item_factors = model.itemFactors.toPandas()

In [47]:
item_arr=np.array(item_factors['features'].tolist())
user_arr=np.array(user_factors['features'].tolist())

# Create example for model to give prediction to

In [79]:
sample1 = {'Destiny 2: Forsaken': 8,
         "Everybody's Golf": 8,
         "XCOM 2": 9,
         "NBA 2K17": 10,
         "Marvel's Spider-Man":6}

sample2 = {'Far Cry 5': 8,
         "Madden NFL 16": 6,
         "Tricky Towers": 8,
         "Fallout 4": 9,
         "Persona 5":10}

sample3 = {'Fallout 4': 10}

In [49]:
for key in sample2:
    print(df[df['game_id']==key]['game_id'].unique())

['Far Cry 5']
['Madden NFL 16']
['Tricky Towers']
['Fallout 4']
['Persona 5']


In [50]:
from pyspark.sql import Row

def insert_user_prefs(prefs={}):
    """Insert a new user's scores into the spark df with the rest
        of the user scores
        
        This is also completely irrelevant for giving a recommendation
        but was a nice practice in working with spark dataframes"""
    for key in prefs:
        nums=[(
            int(df[df['game_id']==key].game_number.unique()),
            len(df.user_id.unique())+1,
            sample[key]
               )]
        
        rdd=sc.parallelize(nums)
        entry=Row('game_number', 'user_number', 'score')
        new=rdd.map(lambda x: entry(*x))

        new_row=spark.createDataFrame(new)

        spark_df=spark_df.union(new_row)
    
    
# 'Game ID':  int(df[df['game_id']==key].game_number.unique()),
# 'USER ID':  len(df.user_id.unique())+1,
# 'SCORE':    sample[key])

In [51]:
def create_user_vector(prefs={}, item_factors=item_arr):
    game_ids=[]
    scores=[]
    for key in prefs:
        game_ids.append(int(df[df['game_id']==key].game_number.unique()))
        scores.append(prefs[key])
    ratings=np.array(scores)
    
    return ratings, item_factors[game_ids], game_ids

In [80]:
scores_vec, user_item_arr, game_ids = create_user_vector(prefs=sample3)

In [81]:
scores_vec.shape, user_item_arr.shape

((1,), (1, 100))

In [82]:
X, residuals, rank, s =np.linalg.lstsq(user_item_arr, scores_vec, rcond=None)

In [83]:
user_factors=X

In [84]:
def predict_new_user(user_factors, item_arr):
    new_factor_list = []
    for i in range(len(item_arr)):
        new_factor_list.append(np.dot(user_factors, item_arr[i]))
    new_user_df = pd.DataFrame([new_factor_list], index=['new_user'])
    return np.array(new_factor_list)

In [85]:
test_arr = predict_new_user(user_factors, item_arr)

In [86]:
test_arr = np.delete(test_arr, game_ids)

In [87]:
test_arr.argsort()[-5:][::-1]

array([941, 132, 581, 593, 352])

In [88]:
for ind in test_arr.argsort()[-5:][::-1]:
    print(df[df['game_number']==ind]['game_id'].unique(),
         test_arr[ind])

['Terraria'] 11.7549551803573
['Night in the Woods'] 11.723471161077088
['Steel Rats'] 11.634178453697391
['F1 2018'] 11.551484758575167
['Astro Bot: Rescue Mission'] 11.542812334587476


In [2]:
import src.spark_als

In [3]:
new_model=src.spark_als.make_model

In [4]:
src.spark_als.evaluate_model(new_model)

AttributeError: 'function' object has no attribute 'transform'