In [1]:
import pyspark as ps
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import Row
from pyspark.ml.recommendation import ALS, ALSModel
import pandas as pd
import numpy as np
from itertools import product
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
spark = SparkSession.builder.master('local[4]').getOrCreate()

In [4]:
ratings_df = pd.read_csv('data/wa_ratings_data.csv')
ratings_df = ratings_df.rename(columns={'Unnamed: 0':'Username'})
ratings_df = ratings_df.set_index('Username')
ratings_df.drop('Unnamed: 1', axis=1, inplace=True)
ind = []
for index in ratings_df.index:
    if ratings_df.loc[index, :].isnull().all() == True:
        ind.append(index)
ratings_df.drop(ind, inplace=True)
als_data = ratings_df.stack().reset_index().rename(columns={'Username':'user','level_1':'board_game', 0:'rating'})
board_games = dict(enumerate(ratings_df.columns))
board_game_index = dict((y,x) for x,y in board_games.iteritems())
for game in board_game_index.keys():
    als_data['board_game'].replace(to_replace=game, value=board_game_index[game], inplace=True)
users = dict(enumerate(ratings_df.index))
user_index = dict((y,x) for x,y in users.iteritems())
for user in user_index.keys():
    als_data['user'].replace(to_replace=user, value=user_index[user], inplace=True)
users = dict(enumerate(ratings_df.index))
user_index = dict((y,x) for x,y in users.iteritems())
for user in user_index.keys():
    als_data['user'].replace(to_replace=user, value=user_index[user], inplace=True)

In [5]:
# board_game_index = np.load('board_game_dict.npy').item()
# user_index = np.load('wa_user_dict.npy').item()
# als_data = pd.read_csv('als_ready_wa_ratings_data.csv')
# board_games = dict((y,x) for x,y in board_game_index.iteritems())

In [97]:
als_data

Unnamed: 0,user,board_game,rating
0,0,279,6.00
1,0,291,5.00
2,0,541,6.00
3,0,792,8.90
4,0,911,6.00
5,0,1346,6.00
6,0,1435,9.00
7,0,1475,7.00
8,0,1673,5.00
9,0,1777,6.00


In [6]:
als_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128417 entries, 0 to 128416
Data columns (total 3 columns):
user          128417 non-null int64
board_game    128417 non-null int64
rating        128417 non-null float64
dtypes: float64(1), int64(2)
memory usage: 2.9 MB


In [7]:
als_spark_df = spark.createDataFrame(als_data)

In [8]:
als_spark_df.cache()

DataFrame[user: bigint, board_game: bigint, rating: double]

In [114]:
als_model = ALS(
    itemCol='board_game',
    userCol='user',
    ratingCol='rating',
    nonnegative=True,    
    regParam=0.1,
    rank=100,
    maxIter=10
    )

In [115]:
als_fit_model = als_model.fit(als_spark_df)

In [None]:
preds_train_data = als_fit_model.transform(als_spark_df)

In [None]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(preds_train_data)
print("Root-mean-square error, rank=100, maxIter=10 = " + str(rmse))

In [None]:
preds_train_df = preds_train_data.toPandas()

In [15]:
ratings_df = pd.read_csv('data/wa_ratings_data.csv')
ratings_df = ratings_df.rename(columns={'Unnamed: 0':'Username'})
ratings_df = ratings_df.set_index('Username')
ratings_df.drop('Unnamed: 1', axis=1, inplace=True)
null_ind = []
for index in ratings_df.index:
    if ratings_df.loc[index, :].isnull().all() == True:
        null_ind.append(index)
ratings_df.drop(null_ind, inplace=True)
ratings_df.fillna(0, inplace=True)

In [11]:
new_user = pd.DataFrame({'new_user': {'Wiz-War':10, 'Terra Mystica':10, 'Twilight Imperium':10}}, index=ratings_df.columns).T
new_user.fillna(0, inplace=True)

In [141]:
from scipy.spatial.distance import cosine
import operator
cos_sim_dict = {}
for ind in ratings_df.index:
    cos_sim_dict[ind] = cosine(ratings_df.loc[ind, :], new_user)
sorted_dict = sorted(cos_sim_dict.items(), key=operator.itemgetter(1))
top_3 = sorted_dict[:3]
top_3_keys = [user_index[top_3[i][0]] for i in xrange(len(top_3))]
user_input_df = []
for user in top_3_keys:
    user_df = pd.DataFrame(list(product([user], als_data['board_game'].unique())))
    user_df = user_df.rename(columns={0:'user', 1:'board_game'})
    user_input_df.append(spark.createDataFrame(user_df))
print top_3_keys
pred_array = np.zeros((1, len(als_data['board_game'].unique())))
for user in user_input_df:
    preds = als_fit_model.transform(user).toPandas()
    preds.sort_values('board_game', inplace=True)
    pred_array += preds['prediction'].values
top_3_games = pred_array[0].argsort()[-10:]
print top_3_games
print sorted(pred_array[0])[-10:]
games = []
for ind in top_3_games:
    games.append(board_games[ind])
new_game1 = games[0]
new_game2 = games[1]
new_game3 = games[2]
new_game4 = games[3]
new_game5 = games[4]
new_game6 = games[5]
new_game7 = games[6]
new_game8 = games[7]
new_game9 = games[8]
new_game10 = games[9]
print new_game1
print new_game2
print new_game3
print new_game4
print new_game5
print new_game6
print new_game7
print new_game8
print new_game9
print new_game10

[708, 766, 841]
[ 1024 16394   474  1025 13574  7735 18978 10508   901 14134]
[29.906192779541016, 29.906192779541016, 29.906192779541016, 29.906192779541016, 30.469478607177734, 30.638707160949707, 31.298493385314941, 31.641600608825684, 31.641600608825684, 31.641600608825684]
Ambush!: Battle Hymn: Leatherneck (aka Battle Hymn: Leatherneck)
Strat-O-Matic Baseball (Strat-o-Matic Edition 1985)
ASL Achtung! Panzer! (aka Achtung! Panzer!)
Ambush!: Shell Shock (aka Shell Shock!)
Puerto Rico: Limited Anniversary Edition (English edition)
Gloomhaven
Twilight Struggle (English first edition, Third Printing)
Luftwaffe (Avalon Hill Second printing)
Alexander the Great (Avalon Hill English First Edition)
Rise and Decline of the Third Reich (First Edition)


In [117]:
als_fit_model.save('als_model')
als_model = ALSModel.load('als_model/')

In [143]:
cos_sim_dict = {}
for ind in ratings_df.index:
    cos_sim_dict[ind] = cosine(ratings_df.loc[ind, :], new_user)
sorted_dict = sorted(cos_sim_dict.items(), key=operator.itemgetter(1))
top_3 = sorted_dict[:3]
top_3_keys = [user_index[top_3[i][0]] for i in xrange(len(top_3))]
user_input_df = []
for user in top_3_keys:
    user_df = pd.DataFrame(list(product([user], als_data['board_game'].unique())))
    user_df = user_df.rename(columns={0:'user', 1:'board_game'})
    user_input_df.append(spark.createDataFrame(user_df))
print top_3_keys
pred_array = np.zeros((1, len(als_data['board_game'].unique())))
for user in user_input_df:
    preds = als_model.transform(user).toPandas()
    preds.sort_values('board_game', inplace=True)
    pred_array += preds['prediction'].values
top_3_games = pred_array[0].argsort()[-10:]
print top_3_games
print sorted(pred_array[0])[-10:]
games = []
for ind in top_3_games:
    games.append(board_games[ind])
new_game1 = games[0]
new_game2 = games[1]
new_game3 = games[2]
new_game4 = games[3]
new_game5 = games[4]
new_game6 = games[5]
new_game7 = games[6]
new_game8 = games[7]
new_game9 = games[8]
new_game10 = games[9]
print new_game1
print new_game2
print new_game3
print new_game4
print new_game5
print new_game6
print new_game7
print new_game8
print new_game9
print new_game10

[708, 766, 841]
[ 1024 16394   474  1025 13574  7735 18978 10508   901 14134]
[29.906192779541016, 29.906192779541016, 29.906192779541016, 29.906192779541016, 30.469478607177734, 30.638707160949707, 31.298493385314941, 31.641600608825684, 31.641600608825684, 31.641600608825684]
Ambush!: Battle Hymn: Leatherneck (aka Battle Hymn: Leatherneck)
Strat-O-Matic Baseball (Strat-o-Matic Edition 1985)
ASL Achtung! Panzer! (aka Achtung! Panzer!)
Ambush!: Shell Shock (aka Shell Shock!)
Puerto Rico: Limited Anniversary Edition (English edition)
Gloomhaven
Twilight Struggle (English first edition, Third Printing)
Luftwaffe (Avalon Hill Second printing)
Alexander the Great (Avalon Hill English First Edition)
Rise and Decline of the Third Reich (First Edition)


In [108]:
pred_array.shape

(1, 20621)

In [None]:
import pandas as pd
ratings_df = pd.read_csv('data/wa_ratings_data.csv')

In [None]:
json.dump?

In [23]:
just_ranking_info = pd.read_csv('just_ranking_info.csv')
just_ranking_info.drop('Unnamed: 0', axis=1, inplace=True)

In [34]:
just_ranking_info

Unnamed: 0,Title,Geek Rating,Avg Rating,Num Ratings
0,Pandemic Legacy: Season 1,8.469,8.66,15951
1,Through the Ages: A New Story of Civilization,8.306,8.77,6279
2,Twilight Struggle,8.226,8.36,26424
3,Terra Mystica,8.153,8.29,22791
4,Star Wars: Rebellion,8.130,8.56,7286
5,Scythe,8.082,8.35,12711
6,7 Wonders Duel,8.029,8.20,18324
7,Caverna: The Cave Farmers,8.022,8.19,16163
8,The Castles of Burgundy,7.994,8.11,24014
9,Puerto Rico,7.991,8.10,47075


In [38]:
just_ranking_info[just_ranking_info['Title'] == 'Mad Gab']['Num Ratings']/66420.

13397    0.015793
Name: Num Ratings, dtype: float64

In [33]:
just_ranking_info['Num Ratings'].apply(lambda x: x/66420.)

0        0.240154
1        0.094535
2        0.397832
3        0.343135
4        0.109696
5        0.191373
6        0.275881
7        0.243345
8        0.361548
9        0.708747
10       0.715387
11       0.090470
12       0.266095
13       0.229238
14       0.182565
15       0.158582
16       0.087699
17       0.623592
18       0.299609
19       0.260117
20       0.299247
21       0.079855
22       0.417992
23       0.281798
24       0.158220
25       0.140078
26       0.059470
27       0.260343
28       0.068263
29       0.116802
           ...   
13370    0.011412
13371    0.001777
13372    0.008988
13373    0.003418
13374    0.004321
13375    0.002620
13376    0.001054
13377    0.005435
13378    0.001987
13379    0.002936
13380    0.009470
13381    0.002138
13382    0.121891
13383    0.005330
13384    0.005495
13385    0.004246
13386    0.004743
13387    0.009741
13388    0.003117
13389    0.004140
13390    0.004321
13391    0.004276
13392    0.006474
13393    0.012014
13394    0

In [148]:
new_user2 = pd.DataFrame({'new_user': {'Pandemic':10, 'Agricola':10, 'Carcassonne':10}}, index=ratings_df.columns).T
new_user2.fillna(0, inplace=True)

In [151]:
input_games = ['Pandemic', 'Agricola', 'Carcassonne']

In [152]:
cos_sim_dict = {}
for ind in ratings_df.index:
    cos_sim_dict[ind] = cosine(ratings_df.loc[ind, :], new_user2)
sorted_dict = sorted(cos_sim_dict.items(), key=operator.itemgetter(1))
top_3 = sorted_dict[:3]
top_3_keys = [user_index[top_3[i][0]] for i in xrange(len(top_3))]
user_input_df = []
for user in top_3_keys:
    user_df = pd.DataFrame(list(product([user], als_data['board_game'].unique())))
    user_df = user_df.rename(columns={0:'user', 1:'board_game'})
    user_input_df.append(spark.createDataFrame(user_df))
pred_array = np.zeros((1, len(als_data['board_game'].unique())))
for user in user_input_df:
    preds = als_model.transform(user).toPandas()
    preds.sort_values('board_game', inplace=True)
    pred_array += preds['prediction'].values
pred_array = pred_array[0]
for i, game in enumerate(pred_array):
    try:
        pred_array[i] *= ((just_ranking_info[just_ranking_info
                           ['Title'] == board_games[i]]
                           ['Num Ratings'].values[0]/66420.)/2+1)
    except IndexError:
        pred_array[i] = 0
top_3_games = pred_array.argsort()[-3:]
print top_3_games
print sorted(pred_array)[-6:][::-1]
games = []
for ind in top_3_games:
    if board_games[ind] not in input_games:
        games.append(board_games[ind])
new_game1 = games[0]
new_game2 = games[1]
new_game3 = games[2]
print new_game1
print new_game2
print new_game3

[13567 12699   792]
[33.882404358894554, 32.930124842363178, 31.352418564163536, 30.862876658581783, 30.046297865522732, 29.778903185843845]


IndexError: list index out of range