In [214]:
import pyspark as ps
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import Row
from pyspark.ml.recommendation import ALS, ALSModel
import pandas as pd
import numpy as np
from itertools import product
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
spark = SparkSession.builder.master('local[4]').getOrCreate()

In [None]:
# ratings_df = pd.read_csv('data/wa_ratings_data.csv')
# ratings_df = ratings_df.rename(columns={'Unnamed: 0':'Username'})
# ratings_df = ratings_df.set_index('Username')
# ratings_df.drop('Unnamed: 1', axis=1, inplace=True)
# ind = []
# for index in ratings_df.index:
#     if ratings_df.loc[index, :].isnull().all() == True:
#         ind.append(index)
# ratings_df.drop(ind, inplace=True)
# als_data = ratings_df.stack().reset_index().rename(columns={'Username':'user','level_1':'board_game', 0:'rating'})
# board_games = dict(enumerate(ratings_df.columns))
# board_game_index = dict((y,x) for x,y in board_games.iteritems())
# for game in board_game_index.keys():
#     als_data['board_game'].replace(to_replace=game, value=board_game_index[game], inplace=True)
# users = dict(enumerate(ratings_df.index))
# user_index = dict((y,x) for x,y in users.iteritems())
# for user in user_index.keys():
#     als_data['user'].replace(to_replace=user, value=user_index[user], inplace=True)
# users = dict(enumerate(ratings_df.index))
# user_index = dict((y,x) for x,y in users.iteritems())
# for user in user_index.keys():
#     als_data['user'].replace(to_replace=user, value=user_index[user], inplace=True)

In [3]:
board_game_index = np.load('board_game_dict.npy').item()
user_index = np.load('wa_user_dict.npy').item()
als_data = pd.read_csv('als_ready_wa_ratings_data.csv')
board_games = dict((y,x) for x,y in board_game_index.iteritems())

In [4]:
als_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128417 entries, 0 to 128416
Data columns (total 4 columns):
Unnamed: 0    128417 non-null int64
user          128417 non-null int64
board_game    128417 non-null int64
rating        128417 non-null float64
dtypes: float64(1), int64(3)
memory usage: 3.9 MB


In [5]:
als_small_index = np.random.randint(0, high=len(als_data), size=int(len(als_data)*0.1))

In [6]:
als_small_df = als_data.iloc[als_small_index, :]

In [21]:
als_spark_df = spark.createDataFrame(als_data)

In [22]:
als_spark_df.cache()

DataFrame[Unnamed: 0: bigint, user: bigint, board_game: bigint, rating: double]

In [23]:
als_model = ALS(
    itemCol='user',
    userCol='board_game',
    ratingCol='rating',
    nonnegative=True,    
    regParam=0.1,
    rank=100,
    maxIter=10
    )

In [24]:
als_fit_model = als_model.fit(als_spark_df)

In [213]:
als_fit_model.save('als_model')

In [25]:
preds_train_data = als_fit_model.transform(als_spark_df)

In [26]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(preds_train_data)
print("Root-mean-square error, rank=100, maxIter=10 = " + str(rmse))

Root-mean-square error, rank=100, maxIter=10 = 0.552759981777


In [15]:
# als_model_150_20 = ALS(
#     itemCol='user',
#     userCol='board_game',
#     ratingCol='rating',
#     nonnegative=True,    
#     regParam=0.1,
#     rank=150,
#     maxIter=20
#     )

In [16]:
# als_fit_model_150_20 = als_model_150_20.fit(als_small_spark_df)

In [17]:
# preds_train_data_150_20 = als_fit_model_150_20.transform(als_small_spark_df)

In [20]:
# evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
#                                 predictionCol="prediction")
# rmse = evaluator.evaluate(preds_train_data_150_20)
# print("Root-mean-square error, rank=150, maxIter=20 = " + str(rmse))

Root-mean-square error, rank=150, maxIter=20 = 0.165427034306


In [None]:
preds_train_df = preds_train_data.toPandas()

In [108]:
ratings_df = pd.read_csv('data/wa_ratings_data.csv')
ratings_df = ratings_df.rename(columns={'Unnamed: 0':'Username'})
ratings_df = ratings_df.set_index('Username')
ratings_df.drop('Unnamed: 1', axis=1, inplace=True)
null_ind = []
for index in ratings_df.index:
    if ratings_df.loc[index, :].isnull().all() == True:
        null_ind.append(index)
ratings_df.drop(null_ind, inplace=True)
ratings_df.fillna(0, inplace=True)

In [215]:
ratings_df.to_csv('new_wa_ratings_data.csv')

In [110]:
ratings_df.loc[:, 'Wiz-War']

Username
12thManStanding       0.0
2 Salt City Gamers    0.0
253bri                0.0
2CoF                  0.0
5th_Para_Bde          0.0
ADiplomat             0.0
AGKorson              0.0
AGRAYDAY              0.0
AGallela              0.0
AJzer                 0.0
AMcBain               0.0
A_fungus              0.0
AaronElWhite          0.0
AbeLincoln            0.0
Acruix                0.0
Aesdel                0.0
AesopDoom             0.0
Affynity              0.0
AgentDib              0.0
Agrona                0.0
Alayna                0.0
AlexCPG               0.0
Alita99               0.0
Almighty Malachai     0.0
Altauri               0.0
AnNguyen              0.0
AndrewPetty           0.0
AndyLTD               0.0
AndySzy               0.0
AngryRedMan           0.0
                     ... 
webwizardry           0.0
weeemann              0.0
wellerlance           0.0
whisky_bear           0.0
will98683             0.0
willdo                0.0
winteriscoming        0.0
wiz

In [111]:
from scipy.spatial.distance import cosine
import operator

In [112]:
new_user = pd.DataFrame({'new_user': {'Wiz-War':10, 'Terra Mystica':10, 'Twilight Imperium':10}}, index=ratings_df.columns).T
new_user.fillna(0, inplace=True)

In [113]:
new_user.fillna(0, inplace=True)

In [114]:
test = ratings_df.loc['zublord', :]

In [115]:
cos_sim = cosine(new_user, test)

In [116]:
cos_sim

0.92367710119104007

In [219]:
als_model = ALSModel.load('/Users/ericyatskowitz/galvanize_work/MeepleFinder/Erics_Web_App/als_model/')

In [220]:
cos_sim_dict = {}
for ind in ratings_df.index:
    cos_sim_dict[ind] = cosine(ratings_df.loc[ind, :], new_user)
sorted_dict = sorted(cos_sim_dict.items(), key=operator.itemgetter(1))
top_3 = sorted_dict[:3]
top_3_keys = [user_index[top_3[i][0]] for i in xrange(len(top_3))]
user_input_df = []
for user in top_3_keys:
    user_df = pd.DataFrame(list(product([user], als_data['board_game'].unique())))
    user_df = user_df.rename(columns={0:'user', 1:'board_game'})
    user_input_df.append(spark.createDataFrame(user_df))
pred_array = np.zeros((1, len(als_data['board_game'].unique())))
for user in user_input_df:
    pred_array += als_model.transform(user).toPandas()['prediction'].values
    top_3_games = []
for i in xrange(3):
    top_3_games.append(pred_array.argmax())
    pred_array = np.delete(pred_array, pred_array.argmax())
games = []
for ind in top_3_games:
    games.append(board_games[ind])
new_game1 = games[0]
new_game2 = games[1]
new_game3 = games[2]

In [222]:
print new_game1 + ',' + new_game2 + ',' + new_game3

Battlefield: Europe,18PA (First Edition),Bywater's War: Command at Sea XI


In [172]:
top_3_keys

[708, 766, 841]

In [160]:
user_0_df = pd.DataFrame(list(product([0], als_data['board_game'].unique())))
user_0_df = user_0_df.rename(columns={0:'user', 1:'board_game'})
user_0 = spark.createDataFrame(user_0_df)

In [None]:
preds_user_0 = als_fit_model.transform(user_0)

In [None]:
preds_user_0_df = preds_user_0.toPandas()

In [None]:
sorted_preds_user_0 = preds_user_0_df.sort_values('prediction')

In [None]:
top_games = []
rated_games_user_0 = als_data[als_data['user'] == 0]['board_game'].values
games = list(sorted_preds_user_0['board_game'].values)
while len(top_games) < 5:
    game = games.pop()
    if game in rated_games_user_0:
        continue
    else:
        top_games.append(board_games[game])

In [None]:
top_games

In [None]:
sorted_df = predictions_df.sort_values(['user', 'board_game'])

In [None]:
sorted_df

In [None]:
als_model = 