In [31]:
import numpy as np
import csv
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import scipy.sparse as sp
from sklearn.linear_model import Ridge
%matplotlib inline

In [2]:
train_file = 'data/train.csv'
test_file  = 'data/test.csv'
df = pd.read_csv(train_file, header=0)

In [3]:
df = pd.read_csv(train_file, header=0)
testdf = pd.read_csv(test_file, header=0)

In [5]:
traindf = pd.DataFrame.from_csv('data/traindf2')
validatedf = pd.DataFrame.from_csv('data/validatedf2')

In [6]:
uuids = df.user.unique()#unique-users
uiids = df.artist.unique()#unique-items
uuidmap={v:k for k,v in enumerate(uuids)}#of length U
uiidmap={v:k for k,v in enumerate(uiids)}#of length M

In [8]:
ybar = traindf.plays.mean()
global_median = traindf.plays.median()

In [9]:
#your code here
groupby_user = traindf.groupby('user')
user_biases = dict()
user_means = dict()
user_medians = dict()
user_biases_median = dict()
for id in uuids:
    group = groupby_user.get_group(id)
    user_mean = group.plays.mean()
    user_biases[id] = user_mean - ybar
    user_means[id] = user_mean
    user_medians[id] = group.plays.median()
    user_biases_median[id] = group.plays.median() - global_median

In [10]:
groupby_item = traindf.groupby('artist')
item_biases = dict()
item_means = dict()
item_medians = dict()
item_biases_median = dict()
for id in uiids:
    group = groupby_item.get_group(id)
    item_mean = group.plays.mean()
    item_biases[id] = item_mean - ybar
    item_means[id]  = item_mean
    item_medians[id] = group.plays.median()
    item_biases_median[id] = group.plays.median() - global_median

In [43]:
#given the initial data frame, and the number of latent factors lshape
def design_p(indf, lshape, inps):
    #the number of columns in the added part of the feature matrix
    qshape=lshape*len(uuids)
    #the number of columns we are coming in with from the previous regression
    pshape=lshape*len(uiids)
    #the number of features from the baseline regression
    fshape=2
    #userid and itemid along with star rating from the input dataframe
    stvals=indf[['user', 'artist', 'plays']].values
    #the design matrix of size N rows X M+U+L*U columns
    designp=np.zeros(stvals.shape[0], dtype=object)
    #ratings column vector of N rows
    plays=np.zeros(stvals.shape[0])
    #for each row in the dataframe:
    for i, row in enumerate(stvals):
        newrow = np.zeros(qshape + pshape)
        #get userid, restaurant id and rating from the row
        user=row[0]
        artist=row[1]
        plays[i]=row[2]
        #use the index corresponding to the userid and L to figure how many slots
        #in the design matrix to take up and where to take them up from
        #for e.g., if index is 2(ie third index) and L=2,this will be from index 4 on
        #(which is the 5th index. )
        posq=uuidmap[user]*lshape
        putinat=fshape+posq

        #use the index corresponding to the business id to get the indexes of the incoming p
        posp=uiidmap[artist]*lshape
        #fill the baseline part of the design matrix in for this row
        
        #designp[i,:-qshape]=np.concatenate([1*(userid==uuids), 1*(bizid==uiids)])
        newrow[:2] =[user_medians[user],item_medians[artist]]
        #Fill L of the slots from putinat onwards to L elements from the p matrix
        #if the index is 4(the fifth index) and L=2, this is the 8th and 9th element of the
        #p coefficients
        newrow[putinat:putinat+lshape]=inps[posp:posp+lshape]
        newrow = sp.coo_matrix(newrow)
        designp[i] = newrow
    #return the constructed design matrix and ratings
    return sp.vstack(designp), plays



In [47]:

#given the initial data frame, and the number of latent factors lshape
def design_q(indf, lshape, inqs):
    #the number of columns in the added part of the feature matrix
    pshape=lshape*len(uiids)
    #the number of columns we are coming in with from the previous regression
    qshape=lshape*len(uuids)
    #the number of features from the baseline regression
    fshape=2
    #userid and itemid along with star rating from the input dataframe
    stvals=indf[['user', 'artist','plays']].values
    #the design matrix of size N rows X M+U+L*U columns
    designq=np.zeros(stvals.shape[0], dtype=object)
    #ratings column vector of N rows
    plays=np.zeros(stvals.shape[0])
    #for each row in the dataframe:
    for i, row in enumerate(stvals):
        newrow = np.zeros(qshape + pshape)
        #get userid, restaurant id and rating from the row
        user=row[0]
        artist=row[1]
        #set the ith element of the rating vector to the rating from the matching row
        plays[i]=row[2]
        #use the index corresponding to the bizid and L to figure how many slots
        #in the design matrix to take up and where to take them up from
        #for e.g., if index is 2(ie third index) and L=2,this will be from index 4 on
        #(which is the 5th index. )
        posp=uiidmap[artist]*lshape
        putinat=fshape+posp

        #use the index corresponding to the userid to get the indexes of the incoming p
        posq=uuidmap[user]*lshape
        
        newrow[:2] =[user_medians[user],item_medians[artist]]

        #Fill L of the slots from putinat onwards to L elements from the p matrix
        #if the index is 4(the fifth index) and L=2, this is the 8th and 9th element of the
        #p coefficients
        newrow[putinat:putinat+lshape]=inqs[posp:posp+lshape]
        newrow = sp.coo_matrix(newrow)
        designq[i] = newrow
        
    #return the constructed design matrix and ratings
    return sp.vstack(designq), plays



In [45]:
L=1
initps=np.random.rand(L*len(uiids))
initqs=np.random.rand(L*len(uuids))

In [57]:
#NOTICE THE TWO ALPHAS BELOW AND READ THE EXPLANATION ABOVE
from sklearn.metrics import mean_squared_error 
vdict2={}
rdict2={}
convdict={}
maxiters=100
alpha=1
inps=initps
inqs=initqs
sums=[]
conv=[]
reachedit=0
for it in range(maxiters):
    #create design_p with inps randomly chosen
    designp, rats=design_p(traindf.head(10000), L, inps)
    #fit
    regrp=Ridge(alpha=alpha).fit(designp, rats)
    inqsold=inqs
    inqs=regrp.coef_[-inqs.shape[0]:]
    #use regression coefficients as the new inqs
    designq, rats=design_q(traindf.head(10000), L, inqs)
    regrq=Ridge(alpha=alpha).fit(designq, rats)
    inpsold=inps
    inps=regrq.coef_[-inps.shape[0]:]
    #just to see how far from 0 these are
    sums.append((inqs.sum(), inps.sum()))
    #see if the coefficients are converging
    pconv=mean_squared_error(inpsold, inps)
    qconv=mean_squared_error(inqsold, inqs)
    conv.append((pconv, qconv))
    if it > 9 and it % 10 ==0:
        print "Iteration ",it, pconv, qconv
    reachedit=it
    if pconv < 0.005 and qconv < 0.005:
        break



In [60]:
#fit once more using the new inps
designp, rats=design_p(traindf.head(10000), L, inps)
regrp=Ridge(alpha=alpha).fit(designp, rats)

In [61]:
from sklearn.metrics import mean_absolute_error
valdesignp, validaterats = design_p(traindf.head(10000), L, inps)
vpreds=regrp.predict(valdesignp)
mean_absolute_error(rats, vpreds)

226.31874280199739

In [None]:
#Now predict on the validation set
valdesignp, validaterats = design_p(validatedf, L, inps)
vpreds=regrp.predict(valdesignp)
rmse=get_rmse(validaterats, vpreds)
vdict2[a]=rmse
rdict2[a]=regrp
convdict[a]=(reachedit, conv, sums)