In [2]:
import numpy as np
import csv
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
%matplotlib inline
import pyprind

In [3]:
train_file = 'data/train.csv'
test_file  = 'data/test.csv'
df = pd.read_csv(train_file, header=0)

In [4]:
trainlist=[]
validatelist=[]
take=4
i = 0
for k, v in df.groupby('user'):
    if v.artist.count() > 10:
        validatelist.append(v[-take:]) 
        trainlist.append(v[:-take])
    else:
        trainlist.append(v)
train_df=pd.concat(trainlist)
validate_df=pd.concat(validatelist)

In [5]:
print train_df.shape, validate_df.shape

(3254976, 3) (899828, 3)


In [6]:
test_df = pd.read_csv(test_file, header=0)

In [7]:
test_df["plays"] = 0
#test_df.head()

### Data Distribution

In [8]:
"""#your code here
artists_user = test_df.groupby('user')['artist'].count()
plt.hist(artists_user, alpha=0.5, bins=np.arange(1,50,1), label="artists by user");
plt.xlabel('# artists')
plt.ylabel('# users')
#plt.xticks(np.arange(0,90,5))
plt.legend()"""

'#your code here\nartists_user = test_df.groupby(\'user\')[\'artist\'].count()\nplt.hist(artists_user, alpha=0.5, bins=np.arange(1,50,1), label="artists by user");\nplt.xlabel(\'# artists\')\nplt.ylabel(\'# users\')\n#plt.xticks(np.arange(0,90,5))\nplt.legend()'

In [9]:
"""np.mean(df.groupby('user')['artist'].count())"""

"np.mean(df.groupby('user')['artist'].count())"

In [10]:
"""#your code here
users_artist = df.groupby('artist')['user'].count()
plt.hist(users_artist, alpha=0.5, bins=np.arange(1,35000,1000), label="users by artist");
plt.xlabel('# users')
plt.ylabel('# artists')
plt.legend()"""

'#your code here\nusers_artist = df.groupby(\'artist\')[\'user\'].count()\nplt.hist(users_artist, alpha=0.5, bins=np.arange(1,35000,1000), label="users by artist");\nplt.xlabel(\'# users\')\nplt.ylabel(\'# artists\')\nplt.legend()'

In [11]:
np.mean(train_df.groupby('artist')['user'].count())

1627.488

In [12]:
"""pd.DataFrame.to_csv(traindf, 'data/traindf')
pd.DataFrame.to_csv(validatedf, 'data/validatedf')"""

"pd.DataFrame.to_csv(traindf, 'data/traindf')\npd.DataFrame.to_csv(validatedf, 'data/validatedf')"

In [13]:
ybar = train_df.plays.mean()
ybar

253.87795148105548

In [14]:
uuids=train_df.user.unique()#unique-users
uiids=train_df.artist.unique()#unique-items
uuidmap={v:k for k,v in enumerate(uuids)}#of length U
uiidmap={v:k for k,v in enumerate(uiids)}#of length M

In [15]:
groupby_user = train_df.groupby('user')
user_biases = dict()
user_means = dict()
user_medians = dict()
loop_size = len(uuids)
mybar = pyprind.ProgBar(loop_size)
for id in uuids:
    user_mean = groupby_user.get_group(id).plays.mean()
    user_biases[id] = user_mean - ybar
    user_means[id] = user_mean
    user_medians[id] = groupby_user.get_group(id).plays.median()
    mybar.update()

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:03:30


In [16]:
groupby_item = train_df.groupby('artist')
item_biases = dict()
item_means = dict()
loop_size = len(uiids)
mybar = pyprind.ProgBar(loop_size)
for id in uiids:
    item_mean = groupby_item.get_group(id).plays.mean()
    item_biases[id] = item_mean - ybar
    item_means[id]  = item_mean
    mybar.update()

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:02


In [17]:
train_avgs={'mean':ybar, 'users':user_biases, 'items':item_biases}

### Baseline Model 

In [19]:
predictions={}
user_biases_validate = validate_df['user'].apply(lambda x: user_biases[x])
item_biases_validate = validate_df['artist'].apply(lambda x: item_biases[x])
predictions['baseline'] = (ybar + user_biases_validate + item_biases_validate).values

In [20]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(validate_df.plays, predictions['baseline'])

180.75041329445514

In [21]:
predictions['user_mean'] = validate_df['user'].apply(lambda x: user_means[x])
mean_absolute_error(validate_df.plays, predictions['user_mean'])

162.44725936000501

In [22]:
predictions['user_median'] = validate_df['user'].apply(lambda x: user_medians[x])
mean_absolute_error(validate_df.plays, predictions['user_median'])

137.26561187249118

### Ridge Regression

In [20]:
# Design Matrix
from sklearn.linear_model import Ridge
features=np.concatenate([uuids,uiids])
features.shape

(235286,)

In [19]:
def getmats(indf):
    stvals=indf[['user', 'artist', 'plays']].values
    designm=np.zeros((stvals.shape[0], features.shape[0]))
    ratings=np.zeros(stvals.shape[0])
    loop_size = len(stvals)
    mybar = pyprind.ProgBar(loop_size)
    for i, row in enumerate(stvals):
        designm[i,:]=np.concatenate([1*(row[0]==uuids), 1*(row[1]==uiids)])
        ratings[i]=row[2]
        mybar.update()
    return designm, ratings

In [None]:
#Carrying out the ridge regression
designm, ratings = getmats(train_df)

0%                          100%
[                              ]

In [None]:
validatedm, validaterats = getmats(validate_df)

In [122]:
#alphas=[0.01, 0.1, 10, 100, 1000]
alphas=[7,8,9,10,11,12,13]
vdict={}
rdict={}
loop_size = len(alphas)
mybar = pyprind.ProgBar(loop_size)
for a in alphas:
    regr=Ridge(alpha=a).fit(designm, ratings)
    vpreds=regr.predict(validatedm)
    mae=mean_absolute_error(validaterats, vpreds)
    vdict[a]=mae
    rdict[a]=regr
    mybar.update()

0%   100%
[#######] | ETA: 00:00:00
Total time elapsed: 00:06:58


In [123]:
minerroralpha=min(vdict, key=vdict.get)
print minerroralpha
regr=rdict[minerroralpha] 

11


In [124]:
regr.intercept_, ybar

(252.07751787456544, 252.7676107730693)

In [None]:
testdm, _ = getmats(test_df)

In [164]:
predictions['baseline_r']=regr.predict(testdm)

In [242]:
np.savetxt("baseline_r.csv", predictions['baseline_r'], delimiter=",")