## Import the Librairies

In [103]:
import pandas as pd 
import numpy as np
import pyprind
from scipy import stats
%matplotlib inline

## Loading the Data

In [2]:
train = pd.read_csv('Data/train.csv')
train0 = train

In [14]:
train = train0.loc[np.in1d(train0.artist, train0.artist.unique()[:5])] # Only use the 20 first artists

In [4]:
test = pd.read_csv('Data/test.csv')

## Description of the Algorithm

0. Compute the baseline
1. Compute the Common Support as : $NCommon_{ij}$ How many users rated/played both music
2. Compute the Rho_Correlation : $\rho(\bf{Y_{u_i}} - \bf{\overline{Y_u}} ; \bf{Y_{u_j}} - \bf{\overline{Y_u}})$
3. Compute the Similarity : $ \frac{N_Common \rho_{mj}}{N_Common + reg}$
4. Apply the Central Dogma : $Y_{um} = Y_{um}^{Baseline} + \frac{\sum_{j \in S^k(m)} s_{mj} (Y_{uj} - Y_{um}^{Baseline})}{\sum_{j \in S^k(m)} s_{mj}}$

## Step 0 : Compute the Baseline

In [147]:
u = test.loc[2827,'user']
m = test.loc[2827, 'artist']

In [148]:
def computeBaseline(u,m,train):
    Y_bar = train.plays.median()
    Y_u = train.loc[train.user == u].plays.median()
    Y_m = train.loc[train.artist == m].plays.median()
    Y_baseline = Y_bar + Y_u - Y_bar + Y_m - Y_bar
    return Y_baseline

In [297]:
#computeBaseline(u,m,train)

In [296]:
#test.loc[np.in1d(test.artist, train.artist) & np.in1d(test.user, train.user)] # Only use the 20 first artists

## Step 1 : Compute the Common Support

In [49]:
unique_artist = train.artist.unique()

In [132]:
N_support = np.empty(shape = (len(unique_artist),len(unique_artist)))
support=[[[] for i in range(len(unique_artist))] for i in range(len(unique_artist))]

prog_bar = pyprind.ProgBar(len(unique_artist))
for i,artist1 in enumerate(unique_artist):
    prog_bar.update()
    for j,artist2 in enumerate(unique_artist):
        if(i<=j):
            if(artist1 == artist2):
                support[i][j]   = train.loc[train.artist == artist1]
                N_support[i,j]  = train.loc[train.artist == artist1].shape[0]
            if(artist1 != artist2):
                support[i][j]  = set(train.loc[(train.artist == artist1), 'user']).intersection(set(train.loc[(train.artist == artist2), 'user']))
                N_support[i,j] = len(support[i][j])
                # Symetry
                support[j][i]  = support[i][j]
                N_support[j,i] = N_support[i,j]

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:00:00


## Step 2 : Compute the correlation coefficient

##### Compute the User Average

In [18]:
prog_bar = pyprind.ProgBar(len(train.user.unique()))
user_avg = pd.DataFrame(columns=['User', 'AVG'])
for u in train.user.unique():
    user_avg = user_avg.append({'User': u, 'AVG' : train.loc[train.user == u].plays.median()}, ignore_index=True)
    prog_bar.update()

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:08:30


##### Compute the similarity

In [141]:
def computeSim(artist1, artist2, unique_artist, support, N_support, user_avg, train, reg = 3):
    commonUser = support[unique_artist.tolist().index(artist1)][unique_artist.tolist().index(artist2)]
    N_Common = N_support[unique_artist.tolist().index(artist1)][unique_artist.tolist().index(artist2)]
    user_artist1 = []
    user_artist2 = []
    for i in commonUser:
        user_artist1.append(int(train.loc[(train.user == i) & (train.artist == artist1)].plays) - float(user_avg.loc[user_avg.User == i, 'AVG']))
        user_artist2.append(int(train.loc[(train.user == i) & (train.artist == artist2)].plays) - float(user_avg.loc[user_avg.User == i, 'AVG']))
    rho = scipy.stats.pearsonr(user_artist1, user_artist2)[0]
    rho_shrunk = N_Common * rho / (N_Common + reg) 
    return((1-rho_shrunk)/2)

In [142]:
sim = np.empty(shape = (len(unique_artist),len(unique_artist)))
prog_bar = pyprind.ProgBar(len(unique_artist))
for i,artist1 in enumerate(unique_artist):
    prog_bar.update()
    for j,artist2 in enumerate(unique_artist):
        if(i<=j):
            if(artist1 == artist2):
                sim[i,j]  = 1
            if(artist1 != artist2):
                sim[i,j] = computeSim(artist1, artist2, unique_artist, support, N_support, user_avg, train)
                sim[j,i] = sim[i,j] 

0%  100%
[#####] | ETA: 00:00:00
Total time elapsed: 00:00:58


In [143]:
sim

array([[ 1.        ,  0.83275199,  0.65431326,  0.91161113,  0.93058629],
       [ 0.83275199,  1.        ,  0.93132009,  0.93181818,  0.95188342],
       [ 0.65431326,  0.93132009,  1.        ,  0.94631708,  0.99808896],
       [ 0.91161113,  0.93181818,  0.94631708,  1.        ,  0.95920893],
       [ 0.93058629,  0.95188342,  0.99808896,  0.95920893,  1.        ]])

## Step 3 : Apply The Final Algo

In [295]:
def MakePrediction(u,m,unique_artist,train,k):
    # Get the base artist
    base_artist = train.loc[train.user == u].artist.unique()
    
    # Construct the dataframe with the similarities and the base_artist, sort it DESC
    a = np.where(np.in1d(unique_artist, m))[0]
    b = np.where(np.in1d(unique_artist, base_artist))[0]
    sim_base = sim[a][0][b]

    res = pd.DataFrame(columns = ['artist_id', 'artist_name', 'sim'])
    res.loc[:, 'sim']  = sim_base
    res.artist_id = b
    res.artist_name = base_artist
    res = res.sort_values(by = 'sim', ascending = False)
    
    # Apply the final Algo
    num = 0
    denom = 0
    Yum_base = computeBaseline(u, m, train)
    for i in res.index[:k]:
        Yuj = int(train[(train.user == u) & (train.artist == res.loc[i, 'artist_name'])].plays)
        num += res.loc[i, 'sim']*(Yuj-Yum_base)
        denom += res.loc[i, 'sim']

    Y_um = Yum_base + num/denom
    
    return Y_um