In [1]:
import pyspark as ps
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import Row
from pyspark.ml.recommendation import ALS, ALSModel
import pandas as pd
import numpy as np
from itertools import product
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
spark = SparkSession.builder.master('local[4]').getOrCreate()

In [3]:
ratings_df = pd.read_csv('data/wa_ratings_data.csv')
ratings_df = ratings_df.rename(columns={'Unnamed: 0':'Username'})
ratings_df = ratings_df.set_index('Username')
ratings_df.drop('Unnamed: 1', axis=1, inplace=True)
ind = []
for index in ratings_df.index:
    if ratings_df.loc[index, :].isnull().all() == True:
        ind.append(index)
ratings_df.drop(ind, inplace=True)
ratings_df.fillna(0, inplace=True)

In [None]:
# als_data = ratings_df.stack().reset_index().rename(columns={'Username':'user','level_1':'board_game', 0:'rating'})
# board_games = dict(enumerate(ratings_df.columns))
# board_game_index = dict((y,x) for x,y in board_games.iteritems())
# for game in board_game_index.keys():
#     als_data['board_game'].replace(to_replace=game, value=board_game_index[game], inplace=True)
# users = dict(enumerate(ratings_df.index))
# user_index = dict((y,x) for x,y in users.iteritems())
# for user in user_index.keys():
#     als_data['user'].replace(to_replace=user, value=user_index[user], inplace=True)
# users = dict(enumerate(ratings_df.index))
# user_index = dict((y,x) for x,y in users.iteritems())
# for user in user_index.keys():
#     als_data['user'].replace(to_replace=user, value=user_index[user], inplace=True)

In [6]:
board_game_index = np.load('/Users/ericyatskowitz/galvanize_work/MeepleFinder/Erics_Web_App/board_game_dict.npy').item()
user_index = np.load('/Users/ericyatskowitz/galvanize_work/MeepleFinder/Erics_Web_App/wa_user_dict.npy').item()
als_data = pd.read_csv('/Users/ericyatskowitz/galvanize_work/MeepleFinder/Erics_Web_App/als_ready_wa_ratings_data.csv')
board_games = dict((y,x) for x,y in board_game_index.iteritems())

In [9]:
als_data.drop('Unnamed: 0', axis=1, inplace=True)

In [None]:
als_data.info()

In [14]:
als_spark_df = spark.createDataFrame(als_data)

In [15]:
als_spark_df.cache()

DataFrame[user: bigint, board_game: bigint, rating: double]

In [16]:
als_model = ALS(
    itemCol='board_game',
    userCol='user',
    ratingCol='rating',
    nonnegative=True,    
    regParam=0.1,
    rank=100,
    maxIter=10
    )

In [17]:
als_fit_model = als_model.fit(als_spark_df)

In [None]:
preds_train_data = als_fit_model.transform(als_spark_df)

In [None]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(preds_train_data)
print("Root-mean-square error, rank=100, maxIter=10 = " + str(rmse))

In [None]:
preds_train_df = preds_train_data.toPandas()

In [21]:
from scipy.spatial.distance import cosine
import operator
new_user = pd.DataFrame({'new_user': {'Wiz-War':10, 'Terra Mystica':10, 'Twilight Imperium':10}}, index=ratings_df.columns).T
new_user.fillna(0, inplace=True)

In [None]:
from scipy.spatial.distance import cosine
import operator
cos_sim_dict = {}
for ind in ratings_df.index:
    cos_sim_dict[ind] = cosine(ratings_df.loc[ind, :], new_user)
sorted_dict = sorted(cos_sim_dict.items(), key=operator.itemgetter(1))
top_3 = sorted_dict[:3]
top_3_keys = [user_index[top_3[i][0]] for i in xrange(len(top_3))]
user_input_df = []
for user in top_3_keys:
    user_df = pd.DataFrame(list(product([user], als_data['board_game'].unique())))
    user_df = user_df.rename(columns={0:'user', 1:'board_game'})
    user_input_df.append(spark.createDataFrame(user_df))
print top_3_keys
pred_array = np.zeros((1, len(als_data['board_game'].unique())))
for user in user_input_df:
    preds = als_fit_model.transform(user).toPandas()
    preds.sort_values('board_game', inplace=True)
    pred_array += preds['prediction'].values
top_3_games = pred_array[0].argsort()[-10:]
print top_3_games
print sorted(pred_array[0])[-10:]
games = []
for ind in top_3_games:
    games.append(board_games[ind])
new_game1 = games[0]
new_game2 = games[1]
new_game3 = games[2]
new_game4 = games[3]
new_game5 = games[4]
new_game6 = games[5]
new_game7 = games[6]
new_game8 = games[7]
new_game9 = games[8]
new_game10 = games[9]
print new_game1
print new_game2
print new_game3
print new_game4
print new_game5
print new_game6
print new_game7
print new_game8
print new_game9
print new_game10

In [19]:
als_fit_model.save('als_model2')
als_model = ALSModel.load('als_model2/')

In [22]:
cos_sim_dict = {}
for ind in ratings_df.index:
    cos_sim_dict[ind] = cosine(ratings_df.loc[ind, :], new_user)
sorted_dict = sorted(cos_sim_dict.items(), key=operator.itemgetter(1))
top_3 = sorted_dict[:3]
top_3_keys = [user_index[top_3[i][0]] for i in xrange(len(top_3))]
user_input_df = []
for user in top_3_keys:
    user_df = pd.DataFrame(list(product([user], als_data['board_game'].unique())))
    user_df = user_df.rename(columns={0:'user', 1:'board_game'})
    user_input_df.append(spark.createDataFrame(user_df))
print top_3_keys
pred_array = np.zeros((1, len(als_data['board_game'].unique())))
for user in user_input_df:
    preds = als_model.transform(user).toPandas()
    preds.sort_values('board_game', inplace=True)
    pred_array += preds['prediction'].values
top_3_games = pred_array[0].argsort()[-10:]
print top_3_games
print sorted(pred_array[0])[-10:]
games = []
for ind in top_3_games:
    games.append(board_games[ind])
new_game1 = games[0]
new_game2 = games[1]
new_game3 = games[2]
new_game4 = games[3]
new_game5 = games[4]
new_game6 = games[5]
new_game7 = games[6]
new_game8 = games[7]
new_game9 = games[8]
new_game10 = games[9]
print new_game1
print new_game2
print new_game3
print new_game4
print new_game5
print new_game6
print new_game7
print new_game8
print new_game9
print new_game10

[708, 766, 841]
[ 1024 16394   474  1025 13574  7735 18978 10508   901 14134]
[29.906192779541016, 29.906192779541016, 29.906192779541016, 29.906192779541016, 30.469478607177734, 30.638707160949707, 31.298493385314941, 31.641600608825684, 31.641600608825684, 31.641600608825684]
Ambush!: Battle Hymn: Leatherneck (aka Battle Hymn: Leatherneck)
Strat-O-Matic Baseball (Strat-o-Matic Edition 1985)
ASL Achtung! Panzer! (aka Achtung! Panzer!)
Ambush!: Shell Shock (aka Shell Shock!)
Puerto Rico: Limited Anniversary Edition (English edition)
Gloomhaven
Twilight Struggle (English first edition, Third Printing)
Luftwaffe (Avalon Hill Second printing)
Alexander the Great (Avalon Hill English First Edition)
Rise and Decline of the Third Reich (First Edition)


In [23]:
pred_array.shape

(1, 20621)

In [324]:
just_ranking_info2

Unnamed: 0,Title,Geek Rating,Avg Rating,Num Ratings
0,12707,8.469,8.66,15951
1,18208,8.306,8.77,6279
2,18969,8.226,8.36,26424
3,17052,8.153,8.29,22791
4,16126,8.130,8.56,7286
5,14862,8.082,8.35,12711
6,287,8.029,8.20,18324
7,3604,8.022,8.19,16163
8,17261,7.994,8.11,24014
9,13567,7.991,8.10,47075


In [323]:
just_ranking_info = pd.read_csv('/Users/ericyatskowitz/galvanize_work/MeepleFinder/Erics_Web_App/just_ranking_info.csv')
just_ranking_info.drop('Unnamed: 0', axis=1, inplace=True)

ValueError: labels ['Unnamed: 0'] not contained in axis

In [26]:
ratings_df[ratings_df['Gloomhaven']!=0]['Gloomhaven'].mean()

9.464285714285714

In [None]:
just_ranking_info

In [27]:
new_user2 = pd.DataFrame({'new_user': {'Pandemic':10, 'Agricola':10, 'Carcassonne':10}}, index=ratings_df.columns).T
new_user2.fillna(0, inplace=True)

In [28]:
input_games = ['Pandemic', 'Agricola', 'Carcassonne']

In [100]:
games_in_both_dfs = []
for game in ratings_df.columns:
    if game in just_ranking_info.index:
        games_in_both_dfs.append(game)

In [117]:
just_ranking_info.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8416 entries, Pandemic Legacy: Season 1 to Don't Spill the Beans
Data columns (total 4 columns):
Geek Rating    8416 non-null float64
Avg Rating     8416 non-null float64
Num Ratings    8416 non-null int64
Title          8416 non-null object
dtypes: float64(2), int64(1), object(1)
memory usage: 328.8+ KB


In [118]:
just_ranking_info.drop_duplicates(subset=['Title'], keep='first', inplace=True)

In [122]:
just_ranking_info['Title'] = just_ranking_info.index

In [123]:
just_ranking_info

Unnamed: 0_level_0,Geek Rating,Avg Rating,Num Ratings,Title
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Pandemic Legacy: Season 1,8.469,8.66,15951,Pandemic Legacy: Season 1
Through the Ages: A New Story of Civilization,8.306,8.77,6279,Through the Ages: A New Story of Civilization
Twilight Struggle,8.226,8.36,26424,Twilight Struggle
Terra Mystica,8.153,8.29,22791,Terra Mystica
Star Wars: Rebellion,8.130,8.56,7286,Star Wars: Rebellion
Scythe,8.082,8.35,12711,Scythe
7 Wonders Duel,8.029,8.20,18324,7 Wonders Duel
Caverna: The Cave Farmers,8.022,8.19,16163,Caverna: The Cave Farmers
The Castles of Burgundy,7.994,8.11,24014,The Castles of Burgundy
Puerto Rico,7.991,8.10,47075,Puerto Rico


In [97]:
len(just_ranking_info)

13400

In [124]:
just_ranking_info.drop_duplicates(subset=['Title'], keep='first', inplace=True)
for game in board_game_index.keys():
    just_ranking_info['Title'].replace(to_replace=game, value=board_game_index[game], inplace=True)
geek_ratings = just_ranking_info['Geek Rating']
num_ratings = just_ranking_info['Num Ratings']

In [170]:
just_ranking_info.to_csv('just_ranking_info.csv')

In [171]:
just_ranking_info_2 = pd.read_csv('just_ranking_info.csv')

In [172]:
just_ranking_info_2

Unnamed: 0,Title,Geek Rating,Avg Rating,Num Ratings
0,12707,8.469,8.66,15951
1,18208,8.306,8.77,6279
2,18969,8.226,8.36,26424
3,17052,8.153,8.29,22791
4,16126,8.130,8.56,7286
5,14862,8.082,8.35,12711
6,287,8.029,8.20,18324
7,3604,8.022,8.19,16163
8,17261,7.994,8.11,24014
9,13567,7.991,8.10,47075


In [168]:
cos_sim_dict = {}
for ind in ratings_df.index:
    cos_sim_dict[ind] = cosine(ratings_df.loc[ind, :], new_user2)
sorted_dict = sorted(cos_sim_dict.items(), key=operator.itemgetter(1))
top_3 = sorted_dict[:3]
top_3_keys = [user_index[top_3[i][0]] for i in xrange(len(top_3))]
user_input_df = []
for user in top_3_keys:
    user_df = pd.DataFrame(list(product([user], just_ranking_info.index)))
    user_df = user_df.rename(columns={0:'user', 1:'board_game'})
    user_input_df.append(spark.createDataFrame(user_df))
count = 0
for user in user_input_df:
    preds = als_model.transform(user).toPandas()
    preds.set_index('board_game', inplace=True)
    if count == 0:
        pred_array = preds['prediction']
    else:
        pred_array += preds['prediction']
    count += 1
pred_array *= ((num_ratings/66420.)/8+1)*((geek_ratings/8.471)/8+1)
top_3_games = pred_array.sort_values(ascending=False)[:6][::-1].index
games = []
for ind in top_3_games:
    if board_games[ind] not in input_games:
        games.append(board_games[ind])
new_game1 = games[0]
new_game2 = games[1]
new_game3 = games[2]
print new_game1
print new_game2
print new_game3

Int64Index([5997, 7088, 12707, 7735, 6732, 792], dtype='int64', name=u'board_game')
Eclipse
Food Chain Magnate
Pandemic Legacy: Season 1


In [155]:
board_games[12707]

'Pandemic Legacy: Season 1'

In [152]:
pred_array.sort_values(ascending=False)[:6][::-1].index

Int64Index([2550, 19868, 12707, 7088, 792, 6732], dtype='int64', name=u'board_game')

In [61]:
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram

In [56]:
bg_data_with_dummies = pd.read_csv('model_ready_bg_data.csv')

In [57]:
bg_data_with_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13500 entries, 0 to 13499
Columns: 3319 entries, Title to designer_Łukasz Woźniak
dtypes: float64(1), int64(3317), object(1)
memory usage: 341.8+ MB


In [282]:
import datetime

In [280]:
small_bg_data = bg_data_with_dummies.iloc[0:5000,:]

In [281]:
small_bg_data = small_bg_data.set_index('Title')

In [283]:
print datetime.datetime.now()
Y = pdist(small_bg_data, 'cosine')
Y = squareform(Y)
small_bg_data_sim = pd.DataFrame(Y, index=small_bg_data.index)
print datetime.datetime.now()

2017-03-21 14:42:01.887389
2017-03-21 14:42:57.645992


In [289]:
small_bg_data.index[small_bg_data_sim.loc['Power Grid', :].sort_values()[:11].index][1:]

Index([u'Power Grid Deluxe: Europe/North America', u'Funkenschlag: EnBW',
       u'Mégawatts', u'Pampas Railroads', u'Black Gold', u'Svea Rike',
       u'Baltimore & Ohio', u'Continental Divide', u'New England Railways',
       u'Colonia'],
      dtype='object', name=u'Title')

In [217]:
Z = linkage(Y, method = 'average')

In [248]:
bg_data = pd.read_csv('data/boardgame_data.csv')
bg_data = bg_data.drop('Unnamed: 0', axis=1)
ind = []
bg_data['Year Published'] = bg_data['Year Published'].apply(lambda x: x.strip('\(\)'))
for i in xrange(len(bg_data)):
    try:
        int(bg_data['Year Published'][i])
    except ValueError:
        ind.append(i)
for i in ind:
    try:
        bg_data.set_value(i, 'Title', bg_data['Title'][i]+' '+bg_data['Year Published'][i])
        bg_data.set_value(i, 'Year Published', 0)
    except TypeError:
        bg_data.set_value(i, 'Title', bg_data['Year Published'][i])
        bg_data.set_value(i, 'Year Published', 0)
bg_data['Year Published'] = pd.to_numeric(bg_data['Year Published'], errors='ignore')
bg_data['Year Published'] = bg_data['Year Published'].astype(int)
bg_data['Min Playing Time'] = bg_data['Play Time'].apply(lambda x: x.split(',')[0].strip('[]'))
bg_data['Max Playing Time'] = bg_data['Play Time'].apply(lambda x: x.split(',')[1].strip('[]'))
bg_data = bg_data.drop('Play Time', axis=1)
bg_data['Min Playing Time'] = pd.to_numeric(bg_data['Min Playing Time'], errors='coerce')
bg_data['Max Playing Time'] = pd.to_numeric(bg_data['Max Playing Time'], errors='coerce')
bg_data['Min Playing Time'] = bg_data['Min Playing Time'].fillna(0)
bg_data['Max Playing Time'] = bg_data['Max Playing Time'].fillna(0)
bg_data['Min Playing Time'] = bg_data['Min Playing Time'].astype(int)
bg_data['Max Playing Time'] = bg_data['Max Playing Time'].astype(int)
bg_data['Age'] = pd.to_numeric(bg_data['Age'], errors='coerce')
bg_data['Age'] = bg_data['Age'].fillna(bg_data['Age'].mean())
bg_data['Age'] = bg_data['Age'].astype(int)
bg_data['Complexity'] = bg_data['Complexity'].astype(float)
bg_data['Min Players'] = pd.to_numeric(bg_data['Min Players'], errors='coerce')
bg_data['Min Players'] = bg_data['Min Players'].fillna(bg_data['Min Players'].mean())
bg_data['Min Players'] = bg_data['Min Players'].astype(int)
bg_data['Max Players'] = pd.to_numeric(bg_data['Max Players'], errors='coerce')
bg_data['Max Players'] = bg_data['Max Players'].fillna(bg_data['Max Players'].mean())
bg_data['Max Players'] = bg_data['Max Players'].astype(int)
bg_data['Min Best Players'] = bg_data['Best Players'].apply(lambda x: x.split('\xe2\x80\x93')[0].strip('+'))
bg_data['Min Best Players'] = pd.to_numeric(bg_data['Min Best Players'], errors='coerce')
bg_data['Min Best Players'] = bg_data['Min Best Players'].fillna(bg_data['Min Best Players'].mean())
bg_data['Min Best Players'] = bg_data['Min Best Players'].astype(int)
bg_data['Max Best Players'] = bg_data['Best Players']
for i in xrange(len(bg_data)):
    bg_data.set_value(i, 'Designers', bg_data.loc[i, 'Designers'].strip('[]').split(', '))
    bg_data.set_value(i, 'Artists', bg_data.loc[i, 'Artists'].strip('[]').split(', '))
    bg_data.set_value(i, 'Publishers', bg_data.loc[i, 'Publishers'].strip('[]').split(', '))
    bg_data.set_value(i, 'Categories', bg_data.loc[i, 'Categories'].strip('[]').split(', '))
    bg_data.set_value(i, 'Mechanisms', bg_data.loc[i, 'Mechanisms'].strip('[]').split(', '))
    try:
        bg_data.set_value(i, 'Max Best Players', bg_data['Best Players'][i].split('\xe2\x80\x93')[1].strip('+'))
    except IndexError:
        bg_data.set_value(i, 'Max Best Players', bg_data['Best Players'][i].split('\xe2\x80\x93')[0].strip('+'))
bg_data['Max Best Players'] = pd.to_numeric(bg_data['Max Best Players'], errors='coerce')
bg_data['Max Best Players'] = bg_data['Max Best Players'].fillna(bg_data['Max Best Players'].mean())
bg_data['Max Best Players'] = bg_data['Max Best Players'].astype(int)
bg_data = bg_data.drop('Best Players', axis=1)

In [249]:
bg_data.drop(['Designers', 'Artists', 'Publishers'], axis=1, inplace=True)

In [250]:
categories = bg_data['Categories'].str.join(sep='*').str.get_dummies(sep='*')

In [251]:
mechanisms = bg_data['Mechanisms'].str.join(sep='*').str.get_dummies(sep='*')

In [252]:
categories.drop('Memory', axis=1, inplace=True)

In [256]:
bg_data_2 = bg_data_2.set_index('Title')

In [257]:
bg_data_2.drop(['Categories', 'Mechanisms'], axis=1, inplace=True)

In [None]:
Y = pdist(bg_data_2, 'cosine')
Y = squareform(Y)

In [261]:
bg_data_sim = pd.DataFrame(Y, index=bg_data_2.index)

In [279]:
bg_data_2.index[bg_data_sim.loc['7 Wonders Duel', :].sort_values()[:11].index][1:]

Index([u'Time Barons', u'Utopian Rummy', u'Star System', u'Bhazum',
       u'Burger Joint', u'Morels', u'Sparta', u'Lost Cities', u'SeaSim',
       u'The Valkyrie Incident'],
      dtype='object', name=u'Title')

In [290]:
dict1 = np.load('data/us_ratings_data_1.npy').item()
dict2 = np.load('data/us_ratings_data_2.npy').item()
dict3 = np.load('data/us_ratings_data_3.npy').item()
dict4 = np.load('data/us_ratings_data_air_1.npy').item()
dict5 = np.load('data/us_ratings_data_air_2.npy').item()
dict6 = np.load('data/us_ratings_data_old_pro_1.npy').item()
dict7 = np.load('data/us_ratings_data_old_pro_2.npy').item()

In [292]:
def merge_dicts(*dict_args):
    """
    Given any number of dicts, shallow copy and merge into a new dict,
    precedence goes to key value pairs in latter dicts.
    """
    result = {}
    for dictionary in dict_args:
        result.update(dictionary)
    return result

In [293]:
us_ratings_dict = merge_dicts(dict1, dict2, dict3, dict4, dict5, dict6, dict7)

In [301]:
us_ratings_dict['yortek'] != []

True

In [302]:
us_ratings_dict_clean = dict((key, value) for key, value in us_ratings_dict.iteritems() if value != [])

In [303]:
len(us_ratings_dict_clean)

21883

In [304]:
us_ratings_dict_train = dict((key, value) for key, value in us_ratings_dict_clean.iteritems() if value != [])

{'the_redstar_swl': [(u'Pok\xe9mon Trading Card Game', 1.0, u'Feb 2005'),
  (u'Carcassonne: The River', 6.9, u'Mar 2005'),
  (u'Carcassonne', 8.0, u'Nov 2005'),
  (u'Dominoes', 6.0, u'Apr 2005'),
  (u"Memoir '44: Winter/Desert Board Map", 5.0, u'Nov 2005'),
  (u"Memoir '44", 10.0, u'Oct 2005'),
  (u'Tour of Darkness', 9.0, u'Sep 2007'),
  (u'Blokus', 8.0, u'Sep 2009'),
  (u'Ogre', 9.0, u'Oct 2005'),
  (u'SET', 7.5, u'Mar 2005'),
  (u'Dungeons & Dragons Basic Game', 8.0, u'Oct 2005'),
  (u'Savage Worlds', 10.0, u'Sep 2007'),
  (u'Heroscape Master Set: Rise of the Valkyrie', 10.0, u'Mar 2005'),
  (u'Betrayal at House on the Hill', 6.0, u'Mar 2005'),
  (u'Sergeants! On the Eastern Front', 9.0, u'Oct 2005'),
  (u'HeroClix', 5.0, u'Mar 2005'),
  (u'Risk', 7.5, u'Oct 2005'),
  (u'Apples to Apples', 7.0, u'Oct 2005')],
 'razzer99': [(u'Carcassonne: Expansion 2 \u2013 Traders & Builders',
   7.0,
   u'Jul 2009'),
  (u'Catan: Cities & Knights', 7.0, u'Jul 2009'),
  (u'Munchkin Cthulhu', 5.0, u'

In [325]:
bg_ranking_data = pd.read_csv('data/game_rankings.csv')
bg_ranking_data['Year Published'] = bg_ranking_data['Year Published'].fillna(0)
bg_ranking_data['Year Published'] = bg_ranking_data['Year Published'].astype(int)
bg_ranking_data['Title'] = bg_ranking_data['Title'].apply(lambda x: x.strip())
bg_ranking_data.drop(['Rank', 'Year Published'], axis=1, inplace=True)

In [328]:
bg_ranking_data

Unnamed: 0,Title,Geek Rating,Avg Rating,Num Ratings
0,Pandemic Legacy: Season 1,8.469,8.66,15951
1,Through the Ages: A New Story of Civilization,8.306,8.77,6279
2,Twilight Struggle,8.226,8.36,26424
3,Terra Mystica,8.153,8.29,22791
4,Star Wars: Rebellion,8.130,8.56,7286
5,Scythe,8.082,8.35,12711
6,7 Wonders Duel,8.029,8.20,18324
7,Caverna: The Cave Farmers,8.022,8.19,16163
8,The Castles of Burgundy,7.994,8.11,24014
9,Puerto Rico,7.991,8.10,47075


In [313]:
us_ratings = {}
for key in us_ratings_dict_clean.keys():
    count = 0
    for value in us_ratings_dict_clean[key]:
        if count == 0:
            us_ratings[key] = {value[0]: value[1]}
        else:
            us_ratings[key].update({value[0]: value[1]})
        count += 1

In [315]:
us_ratings_df = pd.DataFrame(us_ratings)

In [333]:
count = 0
for game in bg_ranking_data['Title'].values:
    if game not in us_ratings_df.columns:
        count += 1
print count

1190


In [334]:
games_in_both_dfs = []
for game in bg_ranking_data['Title'].values:
    if game in us_ratings_df.columns:
        games_in_both_dfs.append(game)

In [336]:
bg_ranking_data = bg_ranking_data.set_index('Title')

In [338]:
for game in bg_ranking_data.index:
    if game not in games_in_both_dfs:
        bg_ranking_data.drop(game, inplace=True)

In [340]:
bg_ranking_data['Title'] = bg_ranking_data.index

In [341]:
bg_ranking_data.drop_duplicates(subset=['Title'], keep='first', inplace=True)

In [343]:
for game in board_game_index.keys():
    bg_ranking_data['Title'].replace(to_replace=game, value=board_game_index[game], inplace=True)

In [344]:
bg_ranking_data = bg_ranking_data.set_index('Title')

In [346]:
bg_ranking_data.to_csv('just_ranking_info.csv')

In [347]:
geek_ratings = bg_ranking_data['Geek Rating']
num_ratings = bg_ranking_data['Num Ratings']

In [None]:
us_ratings_df.rename(index=user_index, columns=board_game_index, inplace=True)

In [348]:
us_als_data = us_ratings_df.stack().reset_index().rename(columns={'level_0':'user','level_1':'board_game', 0:'rating'})

In [350]:
us_als_data

Unnamed: 0,user,board_game,rating
0,0 1 1 2 3 5 8,A Game of Thrones: The Board Game (Second Edit...,9.5
1,0 1 1 2 3 5 8,Abyss,6.7
2,0 1 1 2 3 5 8,Airlines Europe,7.7
3,0 1 1 2 3 5 8,Arkham Horror (Fantasy Flight First Edition),8.8
4,0 1 1 2 3 5 8,Arkham Horror: Curse of the Dark Pharaoh Expan...,6.0
5,0 1 1 2 3 5 8,Arkham Horror: Dunwich Horror Expansion (Engli...,6.0
6,0 1 1 2 3 5 8,Arkham Horror: Innsmouth Horror Expansion (Fan...,8.0
7,0 1 1 2 3 5 8,Arkham Horror: Kingsport Horror Expansion (Fan...,7.0
8,0 1 1 2 3 5 8,Arkham Horror: The Black Goat of the Woods Exp...,7.0
9,0 1 1 2 3 5 8,Arkham Horror: The King in Yellow Expansion (E...,7.0


In [352]:
us_als_data.to_csv('us_als_data.csv', encoding='utf-8')

In [359]:
board_games = dict(enumerate(us_ratings_df.columns))
board_game_index = dict((y,x) for x,y in board_games.iteritems())
users = dict(enumerate(us_ratings_df.index))
user_index = dict((y,x) for x,y in users.iteritems())

In [364]:
np.save('board_game_dict.npy', board_game_index)
np.save('us_user_dict.npy', user_index)

In [365]:
us_als_data

Unnamed: 0,level_0,board_game,rating
0,0 1 1 2 3 5 8,A Game of Thrones: The Board Game (Second Edit...,9.5
1,0 1 1 2 3 5 8,2031,6.7
2,0 1 1 2 3 5 8,Airlines Europe,7.7
3,0 1 1 2 3 5 8,Arkham Horror (Fantasy Flight First Edition),8.8
4,0 1 1 2 3 5 8,Arkham Horror: Curse of the Dark Pharaoh Expan...,6.0
5,0 1 1 2 3 5 8,Arkham Horror: Dunwich Horror Expansion (Engli...,6.0
6,0 1 1 2 3 5 8,4596,8.0
7,0 1 1 2 3 5 8,Arkham Horror: Kingsport Horror Expansion (Fan...,7.0
8,0 1 1 2 3 5 8,Arkham Horror: The Black Goat of the Woods Exp...,7.0
9,0 1 1 2 3 5 8,Arkham Horror: The King in Yellow Expansion (E...,7.0


In [362]:
for game in board_game_index.keys():
    us_als_data['board_game'].replace(to_replace=game, value=board_game_index[game], inplace=True)

KeyboardInterrupt: 

In [356]:
for user in user_index.keys():
    us_als_data['user'].replace(to_replace=user, value=user_index[user], inplace=True)

KeyError: 'user'