In [90]:
import numpy as np
import csv
import pandas as pd
import pyprind
import scipy.sparse as spr
import graphlab as gl
import gzip
from sklearn.cluster import KMeans
from math import isnan
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error
sns.set_style("whitegrid")
sns.set_context("poster")
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
%matplotlib inline

## Load Data

In [91]:
train_file = 'data/train.csv'
test_file  = 'data/test.csv'
profiles_file = 'data/profiles.csv'
artists_file = 'data/artists.csv'
artists_file_info = 'data/artist_infos.json'

In [93]:
df = pd.read_csv(train_file, header=0)

## Baseline Model

In [94]:
## Train-Test Split
trainlist=[]
validatelist=[]
take=4
i = 0
for k, v in df.groupby('user'):
    if v.artist.count() > 10:
        validatelist.append(v[-take:]) 
        trainlist.append(v[:-take])
    else:
        trainlist.append(v)
traindf=pd.concat(trainlist)
validatedf=pd.concat(validatelist)
print traindf.shape, validatedf.shape

(3254976, 3) (899828, 3)


In [8]:
pd.DataFrame.to_csv(traindf, 'data/traindf')
pd.DataFrame.to_csv(validatedf, 'data/validatedf')

In [62]:
"""traindf = pd.read_csv("data/traindf")
validatedf = pd.read_csv("data/validatedf")"""

In [95]:
ybar = traindf.plays.mean()

uuids=traindf.user.unique()#unique-users
uiids=traindf.artist.unique()#unique-items
uuidmap={v:k for k,v in enumerate(uuids)}#of length U
uiidmap={v:k for k,v in enumerate(uiids)}#of length M

groupby_user = traindf.groupby('user')
user_biases = dict()
user_means = dict()
user_medians = dict()
for id in uuids:
    user_mean = groupby_user.get_group(id).plays.mean()
    user_biases[id] = user_mean - ybar
    user_means[id] = user_mean
    user_medians[id] = groupby_user.get_group(id).plays.median()

groupby_item = traindf.groupby('artist')
item_biases = dict()
item_means = dict()
for id in uiids:
    item_mean = groupby_item.get_group(id).plays.mean()
    item_biases[id] = item_mean - ybar
    item_means[id]  = item_mean
    

train_avgs={'mean':ybar, 'users':user_biases, 'items':item_biases}

predictions={}
user_biases_validate = validatedf['user'].apply(lambda x: user_biases[x])
item_biases_validate = validatedf['artist'].apply(lambda x: item_biases[x])

predictions['baseline'] = (ybar + user_biases_validate + item_biases_validate).values
predictions['user_mean'] = validatedf['user'].apply(lambda x: user_means[x])
predictions['user_median'] = validatedf['user'].apply(lambda x: user_medians[x])



## GraphLab Create

In [96]:
#### Load Data as SFrames ####

In [97]:
sf_train = gl.SFrame.read_csv("data/traindf", header=True, verbose=False, column_type_hints={"plays":int})
sf_train.remove_column('X1')
sf_train.rename({'user':'user_id', "artist": "item_id"})

user_id,item_id,plays
00000c289a1829a808ac09c00 daf10bc3c4e223b ...,fb01635c-51fc-4cad-b71f- 62e18bb3433b ...,0
00000c289a1829a808ac09c00 daf10bc3c4e223b ...,83998f9c-846b-4294-aede- d7735531c901 ...,0
00000c289a1829a808ac09c00 daf10bc3c4e223b ...,b071f9fa-14b0-4217-8e97-e b41da73f598 ...,0
00000c289a1829a808ac09c00 daf10bc3c4e223b ...,144ef525-85e9-40c3-8335-0 2c32d0861f3 ...,0
00000c289a1829a808ac09c00 daf10bc3c4e223b ...,8000598a-5edb-401c-8e6d- 36b167feaf38 ...,0
00000c289a1829a808ac09c00 daf10bc3c4e223b ...,7b885d42-3c41-4f43-9944-a 5855ec5155e ...,0
00000c289a1829a808ac09c00 daf10bc3c4e223b ...,c995a379-60b9-404b- bd97-a7e2de0751d3 ...,0
00000c289a1829a808ac09c00 daf10bc3c4e223b ...,3d6bbeb7-f90e- 4d10-b440-e153c0d10b53 ...,0
00000c289a1829a808ac09c00 daf10bc3c4e223b ...,9fdaa16b-a6c4-4831-b87c- bc9ca8ce7eaa ...,0
00000c289a1829a808ac09c00 daf10bc3c4e223b ...,e8374874-4178-4869-b92e- fef6bf30dc04 ...,0


In [98]:
sf_validate = gl.SFrame.read_csv("data/validatedf", header=True, verbose=False, column_type_hints={"plays":int})
sf_validate.remove_column('X1')
sf_validate.rename({'user':'user_id', "artist": "item_id"})

user_id,item_id,plays
00001411dc427966b17297bf4 d69e7e193135d89 ...,2c916f95-f6ba-46fa- b9d8-dc0d9379f603 ...,0
00001411dc427966b17297bf4 d69e7e193135d89 ...,5aeb21a3-4606-4a0d-8369-5 0c768d1e99a ...,0
00001411dc427966b17297bf4 d69e7e193135d89 ...,7b52603c-a84c- 4a76-8dc8-899bb4dd9fa7 ...,0
00001411dc427966b17297bf4 d69e7e193135d89 ...,c485632c-b784-4ee9-8ea1-c 5fb365681fc ...,0
000063d3fe1cf2ba248b9e3c3 f0334845a27a6bf ...,24f1766e-9635-4d58-a4d4-9 413f9f98a4c ...,0
000063d3fe1cf2ba248b9e3c3 f0334845a27a6bf ...,17167af8-c1da-45cc- bba2-9d23f068b7a3 ...,0
000063d3fe1cf2ba248b9e3c3 f0334845a27a6bf ...,ef58d4c9-0d40-42ba-bfab- 9186c1483edd ...,0
000063d3fe1cf2ba248b9e3c3 f0334845a27a6bf ...,61ed9c9c-79eb- 4e8f-8015-bd599ac0ab49 ...,0
00007a47085b9aab8af55f52e c8846ac479ac4fe ...,328068c9-ee60-408e- 8c24-b1dc1ae08b94 ...,0
00007a47085b9aab8af55f52e c8846ac479ac4fe ...,0a77bec1-12ef-4caa-b36a- f533001fcd29 ...,0


In [99]:
sf_test = gl.SFrame.read_csv(test_file, header=True, verbose=False)
sf_test.rename({'user':'user_id', "artist": "item_id"})

Id,user_id,item_id
1,306e19cce2522fa2d39ff5dfc 870992100ec22d2 ...,4ac4e32b-bd18-402e-adad- ae00e72f8d85 ...
2,9450d351278df4938bdea4ed8 6aec940a4e927ac ...,1f574ab1-a46d-4586-9331-f 0ded23e0411 ...
3,801909d6955f59033c88595d3 d7f8a6a5dcd53cc ...,3eb72791-6322-466b- 87d3-24d74901eb2d ...
4,e3ed47445c127fbeff47fb58f 6bbf2f3b4535d82 ...,61604b45-8a91-4e33-a1b6-4 5d7b1fec4e5 ...
5,a73f46652103f3a5f74291593 10f6928f79644aa ...,5dfdca28-9ddc-4853-933c- 8bc97d87beec ...
6,55f1c89e3a102de38a0bdfcb1 fe660b028c5c0af ...,ef58d4c9-0d40-42ba-bfab- 9186c1483edd ...
7,7ad7619f38bf52de421b795f3 159949422b5bbee ...,a3cb23fc-acd3-4ce0-8f36-1 e5aa6a18432 ...
8,2a1ab291185cfc6fb99655648 1e178666e9731f9 ...,5dfdca28-9ddc-4853-933c- 8bc97d87beec ...
9,3f407fff902ab403f06668f6b e3d10bb9e9a02cd ...,000fc734-b7e1-4a01-92d1-f 544261b43f5 ...
10,2eea5a2b68e6e9e0d9dd6b3c1 c0e1e5840fc9ce3 ...,84783313-2a3f-4f17-ab9a- 425a5d527879 ...


In [100]:
#### Models ####

In [101]:
#### Item Similarity Models (Jaccard similarity) ####
item_sim_model_jaccard = gl.item_similarity_recommender.create(sf_train, user_id="user_id", item_id="item_id", target="plays")

result_item_sim_model_jaccard = item_sim_model_jaccard.predict(sf_validate)

predictions['item_sim_model_jaccard'] = result_item_sim_model_jaccard

In [102]:
#### Item Similarity Models (Cosine similarity) ####
item_sim_model_cosine = gl.item_similarity_recommender.create(sf_train, user_id="user_id", item_id="item_id", target="plays", similarity_type="cosine")

result_item_sim_model_cosine = item_sim_model_cosine.predict(sf_validate)

predictions['item_sim_model_cosine'] = result_item_sim_model_cosine

In [103]:
#### Item Similarity Models (Pearson similarity) ####
item_sim_model_pearson = gl.item_similarity_recommender.create(sf_train, user_id="user_id", item_id="item_id", target="plays", similarity_type="pearson")

result_item_sim_model_pearson = item_sim_model_pearson.predict(sf_validate)

predictions['item_sim_model_pearson'] = result_item_sim_model_pearson

In [104]:
#### Popularity Model  ####
popularity_model = gl.popularity_recommender.create(sf_train, user_id="user_id", item_id="item_id", target="plays")

result_popularity_model = popularity_model.predict(sf_validate)

predictions['popularity_model'] = result_popularity_model

In [109]:
#### Factorization Recommender ####
user_info = gl.SFrame.read_csv(profiles_file, header=True, verbose=False).dropna()
user_info.rename({"user": "user_id"})

item_info = gl.SFrame.read_csv(artists_file, header=True, verbose=False).dropna()
item_info.rename({"artist": "item_id"})

factorization_model = gl.factorization_recommender.create(sf_train, target='plays', \
        user_data=user_info, \
        item_data=item_info, max_iterations=500, num_factors=10, regularization=0.0001)

predictions['factorization_model'] = factorization_model.predict(sf_validate)

In [110]:
#### Alternating Least Squares Model ####
als_model = gl.factorization_recommender.create(sf_train, target='plays', \
        solver='als')

predictions['als_model'] = als_model.predict(sf_validate)

## Comparing Models:

In [113]:
print "Mean Absolute Errors:"
print " "
print "Baseline: \t\t\t %s" % mean_absolute_error(validatedf.plays, predictions['baseline'])
print "User Mean: \t\t\t %s" % mean_absolute_error(validatedf.plays, predictions['user_mean'])
print "User Median: \t\t\t %s" % mean_absolute_error(validatedf.plays, predictions['user_median'])
print "Item Similarity (Jaccard): \t %s" % mean_absolute_error(validatedf.plays, predictions['item_sim_model_jaccard'])
print "Item Similarity (Cosine): \t %s" % mean_absolute_error(validatedf.plays, predictions['item_sim_model_cosine'])
print "Item Similarity (Pearson): \t %s" % mean_absolute_error(validatedf.plays, predictions['item_sim_model_pearson'])
print "Popularity Model: \t\t %s" % mean_absolute_error(validatedf.plays, predictions['popularity_model'])
print "Factorization Model: \t\t %s" % mean_absolute_error(validatedf.plays, predictions['factorization_model'])
print "Alternating Least Squares Model: %s" % mean_absolute_error(validatedf.plays, predictions['als_model'])


Mean Absolute Errors:
 
Baseline: 			 180.750413294
User Mean: 			 162.44725936
User Median: 			 137.265611872
Item Similarity (Jaccard): 	 249.901443691
Item Similarity (Cosine): 	 249.901503881
Item Similarity (Pearson): 	 249.901505264
Popularity Model: 		 249.901503925
Factorization Model: 		 249.901676252
Alternating Least Squares Model: 249.901581465


## Code to store predictions

In [None]:
with open("data/item_sim_model_jaccard.csv", 'w') as soln_fh:
    soln_csv = csv.writer(soln_fh,
                          delimiter=',',
                          quotechar='"',
                          quoting=csv.QUOTE_MINIMAL)
    soln_csv.writerow(['Id', 'plays'])
    loop_size = 4154804
    mybar = pyprind.ProgBar(loop_size)
    for i in range(4154804):
        soln_csv.writerow([i + 1, item_sim_model_jaccard[i]])
        mybar.update()

# MISC
### Old Ridge Regression Code

In [None]:
"""# Design Matrix
from sklearn.linear_model import Ridge
features=np.concatenate([uuids,uiids])
features.shape

def getmats(indf):
    stvals=indf[['user', 'artist', 'plays']].values
    designm=np.zeros((stvals.shape[0], features.shape[0]))
    ratings=np.zeros(stvals.shape[0])
    loop_size = len(stvals)
    mybar = pyprind.ProgBar(loop_size)
    for i, row in enumerate(stvals):
        designm[i,:]=np.concatenate([1*(row[0]==uuids), 1*(row[1]==uiids)])
        ratings[i]=row[2]
        mybar.update()
    return designm, ratings



#Carrying out the ridge regression
designm, ratings = getmats(train_df)

validatedm, validaterats = getmats(validate_df)



#alphas=[0.01, 0.1, 10, 100, 1000]
alphas=[7,8,9,10,11,12,13]
vdict={}
rdict={}
loop_size = len(alphas)
mybar = pyprind.ProgBar(loop_size)
for a in alphas:
    regr=Ridge(alpha=a).fit(designm, ratings)
    vpreds=regr.predict(validatedm)
    mae=mean_absolute_error(validaterats, vpreds)
    vdict[a]=mae
    rdict[a]=regr
    mybar.update()

minerroralpha=min(vdict, key=vdict.get)
print minerroralpha
regr=rdict[minerroralpha] 

regr.intercept_, ybar

testdm, _ = getmats(test_df)

predictions['baseline_r']=regr.predict(testdm)

np.savetxt("baseline_r.csv", predictions['baseline_r'], delimiter=",")"""