In [2]:
import numpy as np
import csv
import pandas as pd
import pyprind
import scipy.sparse as spr
import graphlab as gl
from sklearn.cluster import KMeans
from math import isnan
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error
sns.set_style("whitegrid")
sns.set_context("poster")
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
%matplotlib inline

In [5]:
train_file = 'data/train.csv'
test_file  = 'data/test.csv'
profiles_file = 'data/profiles.csv'
artists_file = 'data/artists.csv'

# Experimentation

### User Profile Matrix
#### Load the User data into a DataFrame

In [3]:
profiles = pd.read_csv(profiles_file, header=0)
user_ids = profiles.user

In [4]:
profile_data = pd.DataFrame({'sex': profiles.sex.apply(lambda s: -1 if type(s) == float and np.isnan(s) else s).values, \
                                 'age': profiles.age.apply(lambda s: int(s) if not isnan(s) else -1).values, \
                                 'country': profiles.country.values}, \
                                index=user_ids.values)

In [5]:
profile_data.head()

Unnamed: 0,age,country,sex
fa40b43298ba3f8aa52e8e8863faf2e2171e0b5d,25,Sweden,f
5909125332c108365a26ccf0ee62636eee08215c,29,Iceland,m
d1867cbda35e0d48e9a8390d9f5e079c9d99ea96,30,United States,m
63268cce0d68127729890c1691f62d5be5abd87c,21,Germany,m
02871cd952d607ba69b64e2e107773012c708113,24,Netherlands,m


In [6]:
columns = list(set(profile_data.sex)) + list(set(profile_data.country)) + [15, 20, 25, 30, 35, 40, 45, 50]

In [7]:
#This was helpful: http://stackoverflow.com/questions/2272149/round-to-5-or-other-number-in-python
def custom_round(x, base=5):
    rounded = int(base * round(float(x)/base))
    if rounded <15:
        return 15
    elif rounded > 50:
        return 50
    else:
        return rounded

#### Create a Matrix with 1 and 0 with a auser has a certaint characteristic

In [8]:
# User Profile Matrix
profile_matrix = np.zeros((len(profile_data.index.values), len(columns)))
loop_size = len(profile_data.index.values)
mybar = pyprind.ProgBar(loop_size)

for i, user in enumerate(profile_data.index.values):
    profile = profile_data.ix[user]

    #Sex indices
    if profile['sex'] == 'm':
        profile_matrix[i, 0] = 1  
    elif profile['sex'] == 'f':
        profile_matrix[i, 1] = 1
        
    # Country indices
    country = profile['country']
    country_col = columns.index(country)
    profile_matrix[i, country_col] = 1
    
    # Age indices
    age = custom_round(profile['age'])
    profile_matrix[i, columns.index(age)] = 1
    
    mybar.update()

In [9]:
user_pos_by_id = {}
for i, user_id in enumerate(user_ids):
    user_pos_by_id[user_id] = i

In [10]:
train_data = pd.read_csv(train_file, header=0)

In [11]:
train = pd.DataFrame({'user': train_data.user.values, \
                        'artist': train_data.artist.values, \
                        'plays': train_data.plays.apply(lambda s: int(s) if not isnan(s) else 0).values}, \
                        index=train_data.user.values)

In [12]:
train.ix["44ce793a6cd9d20f13f4a576a818ef983314bb5d"].head()

Unnamed: 0,artist,plays,user
44ce793a6cd9d20f13f4a576a818ef983314bb5d,a3a92047-be1c-4f3e-8960-c4f8570984df,81,44ce793a6cd9d20f13f4a576a818ef983314bb5d
44ce793a6cd9d20f13f4a576a818ef983314bb5d,5441c29d-3602-4898-b1a1-b77fa23b8e50,70,44ce793a6cd9d20f13f4a576a818ef983314bb5d
44ce793a6cd9d20f13f4a576a818ef983314bb5d,f4857fb9-e255-4dc6-bd01-e4ca7cc68544,21,44ce793a6cd9d20f13f4a576a818ef983314bb5d
44ce793a6cd9d20f13f4a576a818ef983314bb5d,6ffb8ea9-2370-44d8-b678-e9237bbd347b,56,44ce793a6cd9d20f13f4a576a818ef983314bb5d
44ce793a6cd9d20f13f4a576a818ef983314bb5d,63011a8d-0117-4f7e-9991-1ef1f337ff70,13,44ce793a6cd9d20f13f4a576a818ef983314bb5d


### Train/Validate Split

In [13]:
train.head()

Unnamed: 0,artist,plays,user
eb1c57ddc9e0e2d005169d3a1a96e8dd95e3af03,5a8e07d5-d932-4484-a7f7-e700793a9c94,554,eb1c57ddc9e0e2d005169d3a1a96e8dd95e3af03
44ce793a6cd9d20f13f4a576a818ef983314bb5d,a3a92047-be1c-4f3e-8960-c4f8570984df,81,44ce793a6cd9d20f13f4a576a818ef983314bb5d
da9cf3f557161d54b76f24db64be9cc76db008e3,eeb1195b-f213-4ce1-b28c-8565211f8e43,708,da9cf3f557161d54b76f24db64be9cc76db008e3
8fa49ab25d425edcf05d44bfc1d5aea895287d81,a1419808-65d3-4d40-998c-1a0bac65eabc,265,8fa49ab25d425edcf05d44bfc1d5aea895287d81
b85fcaef67d2669cd99b334b5e8c8705263db2cf,a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432,220,b85fcaef67d2669cd99b334b5e8c8705263db2cf


In [14]:
trainlist=[]
validatelist=[]
take=4
i = 0
loop_size = len(train.groupby('user'))
mybar = pyprind.ProgBar(loop_size)
for k, v in train.groupby('user'):
    if v.artist.count() > 10:
        validatelist.append(v[-take:]) 
        trainlist.append(v[:-take])
    else:
        trainlist.append(v)
    mybar.update()
train_df=pd.concat(trainlist)
validate_df=pd.concat(validatelist)

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:02:09


In [15]:
print len(set(train_df.user.values))
print len(set(validate_df.user.values))

233286
224957


In [16]:
# CALCULATE THE NUMBER OF SONGS WE ARE ESTIMATING IN OUR TRIAN_TEST SAMPLE
num_songs_estimating = 0

for user, user_data in train_df.iteritems():
    for artist, plays in user_data.iteritems():
        num_songs_estimating += 1
        
print num_songs_estimating

9764928


In [17]:
KM = KMeans(n_clusters=20, \
            init='k-means++', \
            n_init=10, \
            max_iter=300, \
            tol=0.0001, \
            precompute_distances='auto', \
            verbose=0, \
            random_state=37)
# Calls fit and then predict
predict = KM.fit_predict(profile_matrix)

In [18]:
print "The objective function: %f" % KM.score(profile_matrix) 

The objective function: -206859.457500


In [19]:
# Examine the predicted clusters
print predict[1:10]

[ 4  4  9 17  9 19  1  7 10]


### Global-Median

In [20]:
global_median = np.median(np.array(train_df.plays.values))
print global_median

118.0


### User-Medians

In [21]:
user_medians = {}

uuids = list(set(train_df.user))
loop_size = len(uuids)
mybar = pyprind.ProgBar(loop_size)
for k, user in enumerate(uuids):
    user_medians[user] = np.median(np.array(train_df.ix[user].plays.values))
    mybar.update()

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:01:45


In [22]:
medians_df = pd.DataFrame.from_dict(user_medians, orient='index')
medians_df.rename(columns={0:"user_median"}, inplace=True)
medians_df["global_median"] = global_median
medians_df["user"] = medians_df.index.values

In [23]:
medians_df.head()

Unnamed: 0,user_median,global_median,user
f283c15ed4180e686384dc1de2a5cbf5f95ae269,4.0,118,f283c15ed4180e686384dc1de2a5cbf5f95ae269
5909125332c108365a26ccf0ee62636eee08215c,415.0,118,5909125332c108365a26ccf0ee62636eee08215c
0eae120959c04371c23af09abaf71305ab2a1b3c,237.5,118,0eae120959c04371c23af09abaf71305ab2a1b3c
734f7337c7d33e99fa60a6361a5df8e3fb939ecf,242.0,118,734f7337c7d33e99fa60a6361a5df8e3fb939ecf
02871cd952d607ba69b64e2e107773012c708113,172.0,118,02871cd952d607ba69b64e2e107773012c708113


### Create a Dataframe with actual plays of validation set and the user_medians of the train set

In [24]:
validate_df.head()

Unnamed: 0,artist,plays,user
00001411dc427966b17297bf4d69e7e193135d89,e105c272-b5d7-4135-82ef-d60bded54345,1244,00001411dc427966b17297bf4d69e7e193135d89
00001411dc427966b17297bf4d69e7e193135d89,9fa07bb4-8312-4cd7-a19a-4fdbc5797148,2641,00001411dc427966b17297bf4d69e7e193135d89
00001411dc427966b17297bf4d69e7e193135d89,ffb18e19-64a4-4a65-b4ce-979e00c3c69d,622,00001411dc427966b17297bf4d69e7e193135d89
00001411dc427966b17297bf4d69e7e193135d89,451f9db1-f75f-44f9-b218-f8bdf22035a1,2427,00001411dc427966b17297bf4d69e7e193135d89
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,50eec634-7c42-41ee-9b1f-b41d9ca28b26,8,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf


In [25]:
final_df = validate_df
final_df["user_median"] = 0

In [26]:
final_df.head()

Unnamed: 0,artist,plays,user,user_median
00001411dc427966b17297bf4d69e7e193135d89,e105c272-b5d7-4135-82ef-d60bded54345,1244,00001411dc427966b17297bf4d69e7e193135d89,0
00001411dc427966b17297bf4d69e7e193135d89,9fa07bb4-8312-4cd7-a19a-4fdbc5797148,2641,00001411dc427966b17297bf4d69e7e193135d89,0
00001411dc427966b17297bf4d69e7e193135d89,ffb18e19-64a4-4a65-b4ce-979e00c3c69d,622,00001411dc427966b17297bf4d69e7e193135d89,0
00001411dc427966b17297bf4d69e7e193135d89,451f9db1-f75f-44f9-b218-f8bdf22035a1,2427,00001411dc427966b17297bf4d69e7e193135d89,0
000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,50eec634-7c42-41ee-9b1f-b41d9ca28b26,8,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,0


In [54]:
"""loop_size = len(medians_df.user_median)
mybar = pyprind.ProgBar(loop_size)
test = []
for k, user in enumerate(list(set(medians_df.user))):
    #test.append(medians_df.ix[user].user_median)
    if user in final_df.user:
        final_df.loc[user, "user_median"] = medians_df.ix[user].user_median
    mybar.update()"""


0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:36:04


In [55]:
#final_df.to_csv("user_medians_validation.csv", sep='\t', encoding='utf-8')

In [58]:
final_df = pd.read_csv("user_medians_validation.csv", sep='\t', encoding='utf-8')

In [59]:
final_df.head()

Unnamed: 0.1,Unnamed: 0,artist,plays,user,user_median
0,00001411dc427966b17297bf4d69e7e193135d89,e105c272-b5d7-4135-82ef-d60bded54345,1244,00001411dc427966b17297bf4d69e7e193135d89,854.5
1,00001411dc427966b17297bf4d69e7e193135d89,9fa07bb4-8312-4cd7-a19a-4fdbc5797148,2641,00001411dc427966b17297bf4d69e7e193135d89,854.5
2,00001411dc427966b17297bf4d69e7e193135d89,ffb18e19-64a4-4a65-b4ce-979e00c3c69d,622,00001411dc427966b17297bf4d69e7e193135d89,854.5
3,00001411dc427966b17297bf4d69e7e193135d89,451f9db1-f75f-44f9-b218-f8bdf22035a1,2427,00001411dc427966b17297bf4d69e7e193135d89,854.5
4,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,50eec634-7c42-41ee-9b1f-b41d9ca28b26,8,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,4.0


### Validation Performance
### MAE for User_median: 137

In [60]:
mean_absolute_error(final_df.plays, final_df.user_median)

137.26561187249118

In [62]:
final_df_submit = final_df[["user", "user_median"]]

In [63]:
final_df_submit.head()

Unnamed: 0,user,user_median
0,00001411dc427966b17297bf4d69e7e193135d89,854.5
1,00001411dc427966b17297bf4d69e7e193135d89,854.5
2,00001411dc427966b17297bf4d69e7e193135d89,854.5
3,00001411dc427966b17297bf4d69e7e193135d89,854.5
4,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,4.0


### Making the prediction

In [65]:
test_df = pd.read_csv(test_file, header=0)

In [66]:
test_df.head()

Unnamed: 0,Id,user,artist
0,1,306e19cce2522fa2d39ff5dfc870992100ec22d2,4ac4e32b-bd18-402e-adad-ae00e72f8d85
1,2,9450d351278df4938bdea4ed86aec940a4e927ac,1f574ab1-a46d-4586-9331-f0ded23e0411
2,3,801909d6955f59033c88595d3d7f8a6a5dcd53cc,3eb72791-6322-466b-87d3-24d74901eb2d
3,4,e3ed47445c127fbeff47fb58f6bbf2f3b4535d82,61604b45-8a91-4e33-a1b6-45d7b1fec4e5
4,5,a73f46652103f3a5f7429159310f6928f79644aa,5dfdca28-9ddc-4853-933c-8bc97d87beec


In [None]:
test_df["artist"] = 0
test_df.rename(columns={"artist": "plays"}, inplace=True)
test_df.head()

In [None]:
loop_size = len(medians_df.user_median)
mybar = pyprind.ProgBar(loop_size)
i = 0
for k, user in enumerate(list(set(medians_df.user))):
    if user in test_df["user"].values:
        i += 1
        test_df.loc[user, "plays"] = medians_df.ix[user].user_median
    mybar.update()

0%                          100%
[                              ]

In [89]:
#test_df.user.values
#medians_df.ix["734f7337c7d33e99fa60a6361a5df8e3fb939ecf"].user_median

array(['306e19cce2522fa2d39ff5dfc870992100ec22d2',
       '9450d351278df4938bdea4ed86aec940a4e927ac',
       '801909d6955f59033c88595d3d7f8a6a5dcd53cc', ...,
       'f7fefb72eea0fa7976eb2fc5c0f1fe9f5d1149bd',
       '6af6ded3c4a0894e65aed78d48f979a0ac3ce1ae',
       'de29005de66c93b31a34991b0c72d9f70b9fc313'], dtype=object)

## GraphLab

In [76]:
gl.canvas.set_target('ipynb')

In [77]:
# The below will download a 118 MB file.
sf = gl.SFrame.read_csv(train_file, header=True, verbose=False, column_type_hints={"plays":int})

In [78]:
sf.rename({'user':'user_id', "artist": "item_id"})
#sf.rename({"artist": "item_id"})

user_id,item_id,plays
eb1c57ddc9e0e2d005169d3a1 a96e8dd95e3af03 ...,5a8e07d5-d932-4484-a7f7-e 700793a9c94 ...,554
44ce793a6cd9d20f13f4a576a 818ef983314bb5d ...,a3a92047-be1c- 4f3e-8960-c4f8570984df ...,81
da9cf3f557161d54b76f24db6 4be9cc76db008e3 ...,eeb1195b-f213-4ce1-b28c- 8565211f8e43 ...,708
8fa49ab25d425edcf05d44bfc 1d5aea895287d81 ...,a1419808-65d3-4d40-998c- 1a0bac65eabc ...,265
b85fcaef67d2669cd99b334b5 e8c8705263db2cf ...,a3cb23fc-acd3-4ce0-8f36-1 e5aa6a18432 ...,220
feed7a0dc74c5251283a1505a df453a2061d08f7 ...,1cc5adcd-1422-4b5c-a3cd- 3ecd4f43f506 ...,2113
cbb86d88a8d2d0bab8956807c 6c45cd0c752324b ...,9c9f1380-2516-4fc9-a3e6-f 9f61941d090 ...,127
5641e1e6f04868a61dc29f722 7e34f4640163e9b ...,832a43c7-aa7d-439b- a6b4-4f1afa671c24 ...,305
9f748976d303db79f61bf570d 9549d6335b11b0e ...,2fddb92d-24b2-46a5-bf28-3 aed46f4684c ...,705
056d5d2467dc63c4520963323 e2ebf9576b58229 ...,847e8284-8582-4b0e- 9c26-b042a4f49e57 ...,7


In [79]:
sf_test = gl.SFrame.read_csv(test_file, header=True, verbose=False, column_type_hints={"plays":int})
sf_test.rename({'user':'user_id', "artist": "item_id"})

Id,user_id,item_id
1,306e19cce2522fa2d39ff5dfc 870992100ec22d2 ...,4ac4e32b-bd18-402e-adad- ae00e72f8d85 ...
2,9450d351278df4938bdea4ed8 6aec940a4e927ac ...,1f574ab1-a46d-4586-9331-f 0ded23e0411 ...
3,801909d6955f59033c88595d3 d7f8a6a5dcd53cc ...,3eb72791-6322-466b- 87d3-24d74901eb2d ...
4,e3ed47445c127fbeff47fb58f 6bbf2f3b4535d82 ...,61604b45-8a91-4e33-a1b6-4 5d7b1fec4e5 ...
5,a73f46652103f3a5f74291593 10f6928f79644aa ...,5dfdca28-9ddc-4853-933c- 8bc97d87beec ...
6,55f1c89e3a102de38a0bdfcb1 fe660b028c5c0af ...,ef58d4c9-0d40-42ba-bfab- 9186c1483edd ...
7,7ad7619f38bf52de421b795f3 159949422b5bbee ...,a3cb23fc-acd3-4ce0-8f36-1 e5aa6a18432 ...
8,2a1ab291185cfc6fb99655648 1e178666e9731f9 ...,5dfdca28-9ddc-4853-933c- 8bc97d87beec ...
9,3f407fff902ab403f06668f6b e3d10bb9e9a02cd ...,000fc734-b7e1-4a01-92d1-f 544261b43f5 ...
10,2eea5a2b68e6e9e0d9dd6b3c1 c0e1e5840fc9ce3 ...,84783313-2a3f-4f17-ab9a- 425a5d527879 ...


In [80]:
#http://blog.dato.com/choosing-a-recommender-model

### Train/Validate split

In [91]:
(train_set, test_set) = sf.random_split(0.8)

#Comment that before running final output
sf = train_set
sf_test = test_set

#### Item Similarity Models (Jaccard similarity)

In [82]:
item_sim_model = gl.item_similarity_recommender.create(sf, user_id="user_id", item_id="item_id", target="plays")

result_ismodel = item_sim_model.predict(sf_test)

scaled_result_ismodel = (result_ismodel - result_ismodel.min())/(result_ismodel.max() - result_ismodel.min()) * sf['plays'].max()
print scaled_result_ismodel

[220.98930849550774, 460.5599048125369, 580.8977691713282, 590.9799447444939, 260.2241414171353, 503.05911503801485, 1541.218156743547, 153.83807861770052, 1882.7327619596406, 423.2535054754637, 19.451074662635545, 1804.0541101103718, 35.60992685520734, 295.2800659310563, 962.4527916413724, 308.7455464554072, 178.10332215300096, 1532.8240006663357, 507.61831788854863, 481.28035539324065, 211.39138469075758, 161.8503201461162, 226.13620976344697, 4798.203206305778, 737.3841438972244, 710.8622290677372, 1043.932317919479, 33.8047909015708, 122.89899911043803, 744.5497589476129, 716.4625064218919, 50.307116200797736, 332.98525948813455, 3883.5667787442303, 197.56431341874725, 413.1230104809141, 29.836082257697242, 143.9592871002036, 708.4990002955782, 582.6739582522437, 79.8629295809786, 145.67102756457444, 722.009898273399, 406.2816996262716, 248.10584345899207, 1410.7039933374042, 845.4841844643817, 520.5449003129596, 494.7588393814824, 587.9980067427198, 694.2932049820575, 36.916609325

In [None]:
with open("data/item_sim_jaccard_results.csv", 'w') as soln_fh:
    soln_csv = csv.writer(soln_fh,
                          delimiter=',',
                          quotechar='"',
                          quoting=csv.QUOTE_MINIMAL)
    soln_csv.writerow(['Id', 'plays'])
    loop_size = 4154804
    mybar = pyprind.ProgBar(loop_size)
    for i in range(4154804):
        soln_csv.writerow([i + 1, scaled_result_ismodel[i]])
        mybar.update()

#### Item Similarity Models (Cosine similarity)

In [87]:
item_sim_model2 = gl.item_similarity_recommender.create(sf, user_id="user_id", item_id="item_id", target="plays", similarity_type="cosine")

result_ismodel2 = item_sim_model2.predict(sf_test)

scaled_result_ismodel2 = (result_ismodel2 - result_ismodel2.min())/(result_ismodel2.max() - result_ismodel2.min()) * sf['plays'].max()
print scaled_result_ismodel2

[350.7574879204341, 733.7418092554427, 667.2831471841013, 1244.0566684010228, 321.2029414306743, 932.9411466426824, 4183.495631641452, 299.7596111440166, 3204.504677455189, 737.9235717639305, 42.312769576939296, 3261.850421628807, 64.01617587625799, 550.8284935861909, 1559.0099250632402, 674.1211101739327, 375.7865841436792, 2911.8006561440748, 630.6303595974387, 685.59717990996, 203.01114315807953, 451.16785853779675, 573.0609924855906, 7992.472500210376, 1573.1034557329042, 1711.1679675213786, 3138.1290690352807, 69.35284751061621, 222.04115165594328, 1238.9912775689147, 943.8311918127941, 94.4219782047116, 622.1760205676496, 10081.179628488733, 164.13666893631964, 359.38241485649337, 52.85025629803167, 108.61318909761539, 1471.6967770859708, 1079.8465061599975, 165.01078754109656, 279.9977048455774, 1251.3797568672574, 637.922509733607, 1719.1156378067162, 2565.3478647113925, 1510.5966767717575, 959.8217627119491, 936.6298004403778, 786.4161456299378, 561.5201832031988, 73.429562418

In [None]:
with open("data/item_sim_cosine_results.csv", 'w') as soln_fh:
    soln_csv = csv.writer(soln_fh,
                          delimiter=',',
                          quotechar='"',
                          quoting=csv.QUOTE_MINIMAL)
    soln_csv.writerow(['Id', 'plays'])
    loop_size = 4154804
    mybar = pyprind.ProgBar(loop_size)
    for i in range(4154804):
        soln_csv.writerow([i + 1, scaled_result_ismodel2[i]])
        mybar.update()

#### Item Similarity Models (Pearson similarity)

In [89]:
item_sim_model3 = gl.item_similarity_recommender.create(sf, user_id="user_id", item_id="item_id", target="plays", similarity_type="pearson")

result_ismodel3 = item_sim_model3.predict(sf_test)

scaled_result_ismodel3 = (result_ismodel3 - result_ismodel3.min())/(result_ismodel3.max() - result_ismodel3.min()) * sf['plays'].max()
print scaled_result_ismodel3

[714.9953049211526, 158.49975667200852, 456.0762808767589, 1069.9057474827334, 393.36385328888645, 765.324479976131, 3888.0069225631682, 134.8929559329846, 2674.8563336823686, 381.26944485414685, 71.355403533259, 2621.974413043326, 0.0, 660.9148009226828, 836.6521940328792, 992.3597653745004, 726.5478934962476, 2214.8697018381777, 457.1461315342248, 623.6653654997014, 287.6353815256864, 826.1241866080624, 97.33115877242568, 7099.412934247599, 1218.4668556957565, 1401.6904449331105, 2846.2167631107804, 0.0, 35.015003856922725, 1296.733765428936, 626.6732660126806, 0.0, 299.5758053657405, 8121.721687239829, 0.0, 544.0374776340823, 0.0, 402.4179196674946, 844.5527588078842, 506.1010390326096, 479.4471514773242, 128.52792551014375, 1404.4240669224603, 620.299510015522, 310.5371810121976, 1894.5006339734182, 989.9512963009379, 633.4093222477247, 730.9262839832588, 797.8015661778081, 162.8383093254406, 174.640223648631, 1163.0474196772998, 3476.1762083916674, 560.2207163845829, 587.439401645

In [None]:
with open("data/item_sim_pearson_results.csv", 'w') as soln_fh:
    soln_csv = csv.writer(soln_fh,
                          delimiter=',',
                          quotechar='"',
                          quoting=csv.QUOTE_MINIMAL)
    soln_csv.writerow(['Id', 'plays'])
    loop_size = 4154804
    mybar = pyprind.ProgBar(loop_size)
    for i in range(4154804):
        soln_csv.writerow([i + 1, scaled_result_ismodel3[i]])
        mybar.update()

#### Popularity Model

In [None]:
#Popularity Model
popularity_model = gl.popularity_recommender.create(sf, user_id="user_id", item_id="item_id", target="plays")

result_pm = popularity_model.predict(sf_test)

scaled_result_pm = (result_pm - result_pm.min())/(result_pm.max() - result_pm.min()) * sf['plays'].max()
#Scaled results look very strange. I stick to the unnormilzed once.
#print scaled_result_pm

In [90]:
result_pm

dtype: float
Rows: 4154804
[349.8149028933809, 184.20293978188715, 254.09487534626038, 286.12068965517244, 340.1960881406289, 216.83710407239818, 254.99787117154236, 340.1960881406289, 288.88788040576617, 141.04631578947368, 386.2808277027027, 199.76539179104478, 133.38118811881188, 323.3389679715302, 245.81038374717832, 386.3238395477769, 341.7984685234126, 173.59254190899762, 181.52311790993468, 181.78495692511598, 248.31994635798983, 362.22479580641226, 226.9754782254782, 379.63953488372096, 252.41912358642972, 397.16, 214.1608606557377, 191.73986194995686, 206.35928143712576, 446.4553586788233, 204.83849452120057, 165.2882298424467, 170.4765683114307, 318.022884283247, 146.48602719033232, 274.41233964569335, 255.96879675144262, 334.2321340561853, 341.7984685234126, 131.00301204819277, 325.13953488372096, 235.8340807174888, 461.8585320676082, 308.3623324711968, 107.41271551724138, 264.4872425453428, 273.3251993067591, 159.99216369177188, 166.60046838407493, 342.77262485918135, 297.0

In [None]:
with open("data/popularity_model_results.csv", 'w') as soln_fh:
    soln_csv = csv.writer(soln_fh,
                          delimiter=',',
                          quotechar='"',
                          quoting=csv.QUOTE_MINIMAL)
    soln_csv.writerow(['Id', 'plays'])
    loop_size = 4154804
    mybar = pyprind.ProgBar(loop_size)
    for i in range(4154804):
        soln_csv.writerow([i + 1, result_pm[i]])
        mybar.update()

#### Factorization Recommender

In [92]:
user_info = gl.SFrame.read_csv(profiles_file, header=True, verbose=False).dropna()
user_info.rename({"user": "user_id"})

item_info = gl.SFrame.read_csv(artists_file, header=True, verbose=False).dropna()
item_info.rename({"artist": "item_id"})

regularization_vals = [0.1, 0.001, 0.0001, 0.00001, 0.000001]

factorization_model = [gl.factorization_recommender.create(sf, target='plays', \
        user_data=user_info, \
        item_data=item_info, max_iterations=50, num_factors=5, regularization=r)
          for r in regularization_vals]

In [None]:
result_fm = factorization_model.predict(sf_test)

scaled_result_fm = (result_fm - result_fm.min())/(result_fm.max() - result_fm.min()) * sf['plays'].max()
print scaled_result_fm

with open("data/factorization_model_results.csv", 'w') as soln_fh:
    soln_csv = csv.writer(soln_fh,
                          delimiter=',',
                          quotechar='"',
                          quoting=csv.QUOTE_MINIMAL)
    soln_csv.writerow(['Id', 'plays'])
    loop_size = 4154804
    mybar = pyprind.ProgBar(loop_size)
    for i in range(4154804):
        soln_csv.writerow([i + 1, scaled_result_fm[i]])
        mybar.update()

In [None]:
result_fm

#### Alternating Least Squares Model

In [None]:
als_model = gl.factorization_recommender.create(sf, target='plays', solver = 'als')

result_als = als_model.predict(sf_test)

scaled_result_als = (result_als - result_als.min())/(result_als.max() - result_als.min()) * sf['plays'].max()
print scaled_result_als

with open("data/als_model_results.csv", 'w') as soln_fh:
    soln_csv = csv.writer(soln_fh,
                          delimiter=',',
                          quotechar='"',
                          quoting=csv.QUOTE_MINIMAL)
    soln_csv.writerow(['Id', 'plays'])
    loop_size = 4154804
    mybar = pyprind.ProgBar(loop_size)
    for i in range(4154804):
        soln_csv.writerow([i + 1, scaled_result_als[i]])
        mybar.update()

In [None]:
result_als

### Comparing the Models

In [104]:

#gl.recommender.util.compare_models(sf_test, [item_sim_model, item_sim_model2, item_sim_model3], model_names=["ItemSimilarity1", "ItemSimilarity2", "ItemSimilarity3"])

## Ridge Regression

In [20]:
# Design Matrix
from sklearn.linear_model import Ridge
features=np.concatenate([uuids,uiids])
features.shape

(235286,)

In [19]:
def getmats(indf):
    stvals=indf[['user', 'artist', 'plays']].values
    designm=np.zeros((stvals.shape[0], features.shape[0]))
    ratings=np.zeros(stvals.shape[0])
    loop_size = len(stvals)
    mybar = pyprind.ProgBar(loop_size)
    for i, row in enumerate(stvals):
        designm[i,:]=np.concatenate([1*(row[0]==uuids), 1*(row[1]==uiids)])
        ratings[i]=row[2]
        mybar.update()
    return designm, ratings

In [None]:
#Carrying out the ridge regression
designm, ratings = getmats(train_df)

0%                          100%
[                              ]

In [None]:
validatedm, validaterats = getmats(validate_df)

In [122]:
#alphas=[0.01, 0.1, 10, 100, 1000]
alphas=[7,8,9,10,11,12,13]
vdict={}
rdict={}
loop_size = len(alphas)
mybar = pyprind.ProgBar(loop_size)
for a in alphas:
    regr=Ridge(alpha=a).fit(designm, ratings)
    vpreds=regr.predict(validatedm)
    mae=mean_absolute_error(validaterats, vpreds)
    vdict[a]=mae
    rdict[a]=regr
    mybar.update()

0%   100%
[#######] | ETA: 00:00:00
Total time elapsed: 00:06:58


In [123]:
minerroralpha=min(vdict, key=vdict.get)
print minerroralpha
regr=rdict[minerroralpha] 

11


In [124]:
regr.intercept_, ybar

(252.07751787456544, 252.7676107730693)

In [None]:
testdm, _ = getmats(test_df)

In [164]:
predictions['baseline_r']=regr.predict(testdm)

In [242]:
np.savetxt("baseline_r.csv", predictions['baseline_r'], delimiter=",")