## Import the Libraries

In [29]:
import pandas as pd 
import numpy as np
import boto
import pyprind
from scipy import stats
import scipy
#%matplotlib inline

## Boto Stuff

In [30]:
# Credentials are deliberately not written in the notebook: called without
# arguments, boto resolves them from the environment (AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY) or from its configuration file (e.g. ~/.boto).
s3 = boto.connect_s3()

In [31]:
# Open the `practicals3` bucket through the S3 connection `s3`.
s3_bucket_p2 = s3.get_bucket('practicals3')

In [46]:
# Key object for `res.csv` in the bucket; the prediction loop at the end of
# the notebook uploads the local results file through it.
k = s3_bucket_p2.new_key('res.csv')

## Loading the Data

In [3]:
#train = pd.read_csv('Data/train.csv')
#test = pd.read_csv('Data/test.csv')
#train0 = train

In [27]:
# Read the train and test tables (in that order) from the `practicals3`
# S3 bucket over HTTPS.
train, test = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://s3-us-west-1.amazonaws.com/practicals3/train.csv',
        'https://s3-us-west-1.amazonaws.com/practicals3/test.csv',
    )
)

In [3]:
#train = train0.loc[np.in1d(train0.artist, train0.artist.unique()[:20])] # Only use the 20 first artists
#train = train0

In [4]:
#test = pd.read_csv('Data/test.csv')

In [5]:
# Distinct values of the `artist` column of the training data.
unique_artist = train['artist'].unique()

## Description of the Algorithm

0. Compute the baseline
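1. Compute the Common Support $N^{Common}_{mj}$ : the number of users who played both artist $m$ and artist $j$
2. Compute the Rho_Correlation over those common users : $\rho_{mj} = \rho(\bf{Y_{um}} - \bf{\overline{Y_u}} ; \bf{Y_{uj}} - \bf{\overline{Y_u}})$, where $\overline{Y_u}$ is the median number of plays of user $u$
3. Compute the Similarity : $s_{mj} = \frac{N^{Common}_{mj} \rho_{mj}}{N^{Common}_{mj} + reg}$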
4. Apply the Central Dogma : $Y_{um} = Y_{um}^{Baseline} + \frac{\sum_{j \in S^k(m)} s_{mj} (Y_{uj} - Y_{um}^{Baseline})}{\sum_{j \in S^k(m)} s_{mj}}$

# I. Approach: filter on the given (u, m) pair instead of computing the full support for every pair of artists

## Adding IDs ...

To speed up the lookups performed by the algorithms below, we replace the artist and user text labels with integer IDs and filter on those IDs instead.

##### Artist

In [37]:
# Lookup table: one row per distinct training artist, indexed by the artist
# label, with `Id` holding its position in order of first appearance.
# The table is built in one go rather than by filling an empty frame through
# `.loc`, which depends on pandas' enlargement-on-assignment behaviour.
artist_id = pd.DataFrame({'artist': train.artist.unique()})
artist_id['Id'] = artist_id.index
artist_id.index = artist_id.artist.values

# `.map` looks every label up in the table; a label that is absent from the
# training data (possible in `test`) becomes NaN instead of raising KeyError.
train.loc[:, 'artist_id'] = train.artist.map(artist_id['Id']).values
test.loc[:, 'artist_id'] = test.artist.map(artist_id['Id']).values

##### User

In [38]:
# Lookup table: one row per distinct training user, indexed by the user
# label, with `Id` holding its position in order of first appearance.
# The table is built in one go rather than by filling an empty frame through
# `.loc`, which depends on pandas' enlargement-on-assignment behaviour.
user_id = pd.DataFrame({'user': train.user.unique()})
user_id['Id'] = user_id.index
user_id.index = user_id.user.values

# `.map` looks every label up in the table; a label that is absent from the
# training data (possible in `test`) becomes NaN instead of raising KeyError.
train.loc[:, 'user_id'] = train.user.map(user_id['Id']).values
test.loc[:, 'user_id'] = test.user.map(user_id['Id']).values

## Step 0 : Compute the Baseline

In [8]:
# u    = test.loc[0, 'user_id']
# m    = test.loc[0, 'artist_id']
# u =0
# m=0

In [9]:
def computeBaseline(u,m,train):
    """Baseline estimate of the number of plays of artist ``m`` by user ``u``.

    The baseline is ``Y_bar + (Y_u - Y_bar) + (Y_m - Y_bar)`` where ``Y_bar``
    is the median of all plays in ``train``, ``Y_u`` the median plays of user
    ``u`` and ``Y_m`` the median plays of artist ``m``.

    Parameters
    ----------
    u : value compared against ``train.user_id``.
    m : value compared against ``train.artist_id``.
    train : DataFrame with ``user_id``, ``artist_id`` and ``plays`` columns.

    Returns
    -------
    The baseline number of plays. When ``u`` (or ``m``) has no row in
    ``train`` its median is undefined, so the global median is used for that
    term instead of letting NaN propagate into the prediction.
    """
    Y_bar = train.plays.median()
    Y_u = train.loc[train.user_id == u].plays.median()
    Y_m = train.loc[train.artist_id == m].plays.median()

    # The median of an empty selection is NaN: treat an unseen user/artist as
    # having no deviation from the global median.
    if pd.isna(Y_u):
        Y_u = Y_bar
    if pd.isna(Y_m):
        Y_m = Y_bar

    Y_baseline = Y_bar + (Y_u - Y_bar) + (Y_m - Y_bar)
    return Y_baseline

In [10]:
#computeBaseline(u,m,train)

## Step 1 : Get the Support

In [11]:
def GetSupport(u,m,train):
    """Common support between artist ``m`` and every artist played by user ``u``.

    Parameters
    ----------
    u : value compared against ``train.user_id``.
    m : value compared against ``train.artist_id``.
    train : DataFrame with ``user_id`` and ``artist_id`` columns.

    Returns
    -------
    (N_support, support, potential_artists)
        ``potential_artists`` is the array of distinct ``artist_id`` values of
        the rows of user ``u``; ``support[j]`` is the set of ``user_id`` values
        that appear both with artist ``m`` and with ``potential_artists[j]``;
        ``N_support[j]`` is the size of that set (stored in a float array).
    """
    potential_artists = train.loc[train.user_id == u, 'artist_id'].unique()

    # The users of artist m do not depend on the loop variable: build the set
    # once instead of rescanning `train` on every iteration.
    users_of_m = set(train.loc[(train.artist_id == m), 'user_id'])

    N_support = np.empty(shape = (len(potential_artists)))
    support = [[] for i in range(len(potential_artists))]
    prog_bar = pyprind.ProgBar(len(potential_artists))

    # Loop on the artist
    for j,artist2 in enumerate(potential_artists):
        prog_bar.update()
        users_of_artist2 = set(train.loc[(train.artist_id == artist2), 'user_id'])
        support[j]   = users_of_m.intersection(users_of_artist2)
        N_support[j] = len(support[j])

    return (N_support, support, potential_artists)

In [12]:
#N_support, support, base_artist = GetSupport(u,m,train)

## Step2 : Compute the similarity Matrix

##### Compute the user average

In [13]:
# prog_bar = pyprind.ProgBar(len(train.user.unique()))
# user_avg = pd.DataFrame(columns=['User', 'AVG'])
# for u in train.user_id.unique():
#     user_avg = user_avg.append({'User': u, 'AVG' : train.loc[train.user_id == u].plays.median()}, ignore_index=True)
#     prog_bar.update()

In [14]:
# user_avg = pd.read_csv('user_avg2.csv')
# user_avg.index = user_avg.User.values

##### Similarity Vector

In [15]:
def computeSim(artist1, artist2, unique_artist, support, N_support, train_small, train, reg = 3, verbose=False):
    """Shrunk-correlation distance between ``artist1`` and ``artist2``.

    For every user in the common support of the two artists, the plays of each
    artist are centred on that user's median plays (computed on ``train``).
    The Pearson correlation ``rho`` of the two centred vectors is shrunk to
    ``N_Common * rho / (N_Common + reg)`` and the function returns
    ``(1 - rho_shrunk) / 2``: 0 for a perfect positive correlation, 0.5 for no
    correlation and 1 for a perfect negative correlation.

    Parameters
    ----------
    artist1, artist2 : ``artist_id`` values of the two artists.
    unique_artist : array of artist ids; the position of ``artist2`` in it is
        used to index ``support`` and ``N_support``.
    support : sequence of collections of ``user_id`` values (common users).
    N_support : sequence with the size of each entry of ``support``.
    train_small : DataFrame holding the plays of both artists (``user_id``,
        ``artist_id``, ``plays`` columns).
    train : full training DataFrame, used for the per-user medians.
    reg : shrinkage strength.
    verbose : when true, show a pyprind progress bar over the common users.

    Returns
    -------
    float
        ``(1 - rho_shrunk) / 2``. When the correlation is undefined (fewer
        than two distinct centred values for either artist, which includes
        fewer than two common users) ``rho`` is taken to be 0.
    """
    # Look the position up once; it indexes both parallel sequences.
    position = unique_artist.tolist().index(artist2)
    commonUser = support[position]
    N_Common = N_support[position]
    users = list(commonUser)

    user_artist1 = []
    user_artist2 = []
    if(verbose):
        prog_bar = pyprind.ProgBar(len(commonUser))

    # One pass per table instead of three full scans per common user.
    # All masks are built on the frame they filter, so no index alignment
    # between `train` and `train_small` is needed.
    user_median = train.loc[train.user_id.isin(users)].groupby('user_id').plays.median()
    small_common = train_small.loc[train_small.user_id.isin(users)]
    plays_artist1 = small_common.loc[small_common.artist_id == artist1].set_index('user_id').plays
    plays_artist2 = small_common.loc[small_common.artist_id == artist2].set_index('user_id').plays

    for i in users:
        if(verbose):
            prog_bar.update()
        u_avg = user_median.loc[i]
        user_artist1.append(int(plays_artist1.loc[i]) - u_avg)
        user_artist2.append(int(plays_artist2.loc[i]) - u_avg)

    # Pearson's r needs at least two points and non-constant inputs; in those
    # cases there is no evidence of correlation, so use rho = 0.
    if len(set(user_artist1)) < 2 or len(set(user_artist2)) < 2:
        rho = 0.0
    else:
        rho = scipy.stats.pearsonr(user_artist1, user_artist2)[0]
        if np.isnan(rho):
            rho = 0.0

    rho_shrunk = N_Common * rho / (N_Common + reg)
    return((1-rho_shrunk)/2)

In [16]:
# sim = np.empty(shape = (len(base_artist)))
# artist1 = m
# prog_bar = pyprind.ProgBar(len(base_artist))

# #train_small
# ix = np.in1d(train.artist_id, np.append(m, base_artist))
# train_small = train.loc[ix]

# for j,artist2 in enumerate(base_artist):
#     sim[j] = computeSim(artist1, artist2, base_artist, support, N_support,train_small, train)
#     prog_bar.update()

## Step3 : Adjust the Baseline - Get the Final Prediction

In [17]:
# #input:
# k = 3

# res = pd.DataFrame(columns = ['artist_id', 'sim'])
# res.loc[:, 'artist_id'] = base_artist
# res.loc[:, 'sim'] = sim
# res = res.sort_values(by = 'sim', ascending = False)

# # Apply the final Algo
# num = 0
# denom = 0
# Yum_base = computeBaseline(u, m, train)

# for i in res.index[:k]:
#     Yuj = int(train[(train.user_id == u) & (train.artist_id == res.loc[i, 'artist_id'])].plays)
#     num += res.loc[i, 'sim']*(Yuj-Yum_base)
#     denom += res.loc[i, 'sim']

# Y_um = Yum_base + num/denom

## Main

In [19]:
def MakePrediction(u,m,train,k=5):
    """Predict the number of plays of artist ``m`` by user ``u``.

    Steps: compute the common support between ``m`` and every artist played by
    ``u`` (``GetSupport``), score each of those artists against ``m``
    (``computeSim``), keep the ``k`` most similar ones and correct the baseline
    (``computeBaseline``) with the similarity-weighted average of
    ``Y_uj - baseline`` over those neighbours.

    Parameters
    ----------
    u : ``user_id`` of the user.
    m : ``artist_id`` of the artist to predict.
    train : DataFrame with ``user_id``, ``artist_id`` and ``plays`` columns.
    k : number of neighbour artists used for the correction.

    Returns
    -------
    The predicted number of plays. When the user has no usable neighbour
    (no artist played in ``train``, or only undefined similarities) the
    baseline is returned unchanged.
    """
    # Compute Support
    print('Compute Support')
    N_support, support, base_artist = GetSupport(u,m,train)

    # Compute the sim
    print('Compute Sim')
    distance = np.empty(shape = (len(base_artist)))
    artist1 = m
    prog_bar = pyprind.ProgBar(len(base_artist))

    # Restrict the table to the artists involved in the comparisons.
    train_small = train.loc[train.artist_id.isin(np.append(m, base_artist))]

    for j,artist2 in enumerate(base_artist):
        distance[j] = computeSim(artist1, artist2, base_artist, support, N_support,train_small, train)
        prog_bar.update()

    # computeSim returns (1 - rho_shrunk) / 2, i.e. 0 for the most correlated
    # artists and 1 for the most anti-correlated ones. Ranking that value in
    # descending order and using it as a weight would favour the *least*
    # similar artists, so turn it into a similarity in [0, 1] first.
    res = pd.DataFrame({'artist_id': base_artist, 'sim': 1 - distance})
    # An undefined similarity would turn the whole prediction into NaN.
    res = res.dropna(subset = ['sim'])
    res = res.sort_values(by = 'sim', ascending = False)

    # Apply the final Algo
    print('Apply Final Algo')
    num = 0
    denom = 0
    Yum_base = computeBaseline(u, m, train)

    # Plays of user u, indexed by artist: one scan instead of one per neighbour.
    user_plays = train.loc[train.user_id == u].set_index('artist_id').plays

    for i in res.index[:k]:
        Yuj = int(user_plays.loc[res.loc[i, 'artist_id']])
        num += res.loc[i, 'sim']*(Yuj-Yum_base)
        denom += res.loc[i, 'sim']

    # Without neighbours (or with only zero weights) there is nothing to
    # correct the baseline with; avoid the 0/0 division.
    if denom == 0:
        return Yum_base

    Y_um = Yum_base + num/denom

    return Y_um

In [45]:
# Predict every row of `test`. Every 100 predictions, and once more after the
# last row, the results so far are written to `res.csv` and uploaded through
# the S3 key `k`.
final_res = pd.DataFrame(columns = ['Id', 'Prediction'])
records = []  # rows are collected in a list: DataFrame.append is quadratic and gone in pandas 2.0
n_rows = len(test)

for count, (_, row) in enumerate(test.iterrows(), start = 1):
    print('*********' + str(row['Id']) + '*********')
    temp = MakePrediction(row['user_id'], row['artist_id'], train)
    records.append({'Id': row['Id'], 'Prediction': temp})

    # Checkpoint on every 100th row; the extra checkpoint on the last row
    # keeps the final partial batch from being lost.
    if (count % 100 == 0) or (count == n_rows):
        print('done')
        final_res = pd.DataFrame(records, columns = ['Id', 'Prediction'])
        final_res.to_csv('res.csv', index = False)
        k.set_contents_from_filename('res.csv')
        
#print("Saving")
