# Recommender system demo

This notebook implements a demo of a *content-based* recommender system.

Idea: build a vector for a user from their listening history (a weighted
average of the normalized track vectors), then compute cosine similarities
between that vector and every track in the database to decide which tracks to
recommend.

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the per-track feature table; rows are keyed by `track_id`.
features_path = '../data/features.csv'
tracks_db = pd.read_csv(features_path, index_col='track_id')
tracks_db.head()

Unnamed: 0_level_0,title,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,artists_ids,genres
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0aQA9DP54h37OevE7hRc2a,,0.653,0.796,7,-4.05,0,0.245,0.24,0.0,0.108,0.405,160.026,252712,4,"[""6fOMl44jA4Sp5b9PpYCkzz""]","[[""hip hop"", ""pop rap"", ""rap""]]"
4ixBAI0E1M6df9mP27etZK,,0.731,0.494,4,-9.567,1,0.0467,0.201,0.0,0.16,0.477,120.045,228056,4,"[""46VwdSiSccsgzNhJP9sGhQ"", ""2272CVpxVafY3L0UHu...","[[""icelandic hip hop"", ""icelandic pop""]]"
3jO5IAvslWxaTxswokg66i,,0.859,0.464,3,-7.328,1,0.289,0.208,0.0,0.114,0.37,134.966,195453,4,"[""2LRoIwlKmHjgvigdNGBHNo"", ""5J7rXWjtn5HzUkJ4Je...","[[""colombian pop"", ""latin"", ""pop reggaeton"", ""..."
5iS3TEmmRcDJU2pTJQ98zB,,0.482,0.617,10,-10.288,0,0.0869,0.775,0.583,0.107,0.35,174.04,300861,4,"[""3MaQwUQBigrjcMYHk8bDKm"", ""5Y24oYaTX3VOvtAcG2...","[[""finnish dance pop"", ""finnish hip hop"", ""fin..."
29xPIFa0othUcFrEGyJCZu,,0.824,0.684,1,-7.185,1,0.407,0.0236,0.00853,0.0762,0.509,141.143,186973,4,"[""2s7fZ34RK8RUJQ2ewsh6I5"", ""3tTvSeZiFDP3CY5EdP...","[[""francoton"", ""french hip hop"", ""pop urbaine""..."


Let's pretend we have some `n_streams` values.

In [3]:
# Simulate per-track stream counts in [0, 1_000_000).
# A seeded generator keeps the fake values identical across
# "Restart Kernel -> Run All", and the upper bound is a real integer
# instead of the float 1e6.
rng = np.random.default_rng(42)
tracks_db = tracks_db.assign(
    n_streams=rng.integers(0, 1_000_000, size=len(tracks_db)))
tracks_db.head()

Unnamed: 0_level_0,title,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,artists_ids,genres,n_streams
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0aQA9DP54h37OevE7hRc2a,,0.653,0.796,7,-4.05,0,0.245,0.24,0.0,0.108,0.405,160.026,252712,4,"[""6fOMl44jA4Sp5b9PpYCkzz""]","[[""hip hop"", ""pop rap"", ""rap""]]",44315
4ixBAI0E1M6df9mP27etZK,,0.731,0.494,4,-9.567,1,0.0467,0.201,0.0,0.16,0.477,120.045,228056,4,"[""46VwdSiSccsgzNhJP9sGhQ"", ""2272CVpxVafY3L0UHu...","[[""icelandic hip hop"", ""icelandic pop""]]",45751
3jO5IAvslWxaTxswokg66i,,0.859,0.464,3,-7.328,1,0.289,0.208,0.0,0.114,0.37,134.966,195453,4,"[""2LRoIwlKmHjgvigdNGBHNo"", ""5J7rXWjtn5HzUkJ4Je...","[[""colombian pop"", ""latin"", ""pop reggaeton"", ""...",484011
5iS3TEmmRcDJU2pTJQ98zB,,0.482,0.617,10,-10.288,0,0.0869,0.775,0.583,0.107,0.35,174.04,300861,4,"[""3MaQwUQBigrjcMYHk8bDKm"", ""5Y24oYaTX3VOvtAcG2...","[[""finnish dance pop"", ""finnish hip hop"", ""fin...",241342
29xPIFa0othUcFrEGyJCZu,,0.824,0.684,1,-7.185,1,0.407,0.0236,0.00853,0.0762,0.509,141.143,186973,4,"[""2s7fZ34RK8RUJQ2ewsh6I5"", ""3tTvSeZiFDP3CY5EdP...","[[""francoton"", ""french hip hop"", ""pop urbaine""...",502823


### Preprocessing

Let's keep useful features only.

In [4]:
# Columns kept for the recommender: the numeric audio descriptors, the three
# categorical descriptors (encoded in the next step) and the popularity proxy.
feature_names = [
    'danceability', 'energy', 'key', 'loudness',
    'mode', 'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature',
    'n_streams',
]
tracks_db = tracks_db.loc[:, feature_names]
tracks_db.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,n_streams
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0aQA9DP54h37OevE7hRc2a,0.653,0.796,7,-4.05,0,0.245,0.24,0.0,0.108,0.405,160.026,252712,4,44315
4ixBAI0E1M6df9mP27etZK,0.731,0.494,4,-9.567,1,0.0467,0.201,0.0,0.16,0.477,120.045,228056,4,45751
3jO5IAvslWxaTxswokg66i,0.859,0.464,3,-7.328,1,0.289,0.208,0.0,0.114,0.37,134.966,195453,4,484011
5iS3TEmmRcDJU2pTJQ98zB,0.482,0.617,10,-10.288,0,0.0869,0.775,0.583,0.107,0.35,174.04,300861,4,241342
29xPIFa0othUcFrEGyJCZu,0.824,0.684,1,-7.185,1,0.407,0.0236,0.00853,0.0762,0.509,141.143,186973,4,502823


Categorical variables have to be represented properly.

In [5]:
# One-hot encode the categorical descriptors. `get_dummies` with `columns=`
# removes the listed columns and appends their `<name>_<value>` indicator
# columns after the remaining ones, in the order the columns are listed.
cat_features = ['key', 'mode', 'time_signature']
tracks_db = pd.get_dummies(tracks_db, columns=cat_features)
tracks_db.head()

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key_9,key_10,key_11,mode_0,mode_1,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0aQA9DP54h37OevE7hRc2a,0.653,0.796,-4.05,0.245,0.24,0.0,0.108,0.405,160.026,252712,...,0,0,0,1,0,0,0,0,1,0
4ixBAI0E1M6df9mP27etZK,0.731,0.494,-9.567,0.0467,0.201,0.0,0.16,0.477,120.045,228056,...,0,0,0,0,1,0,0,0,1,0
3jO5IAvslWxaTxswokg66i,0.859,0.464,-7.328,0.289,0.208,0.0,0.114,0.37,134.966,195453,...,0,0,0,0,1,0,0,0,1,0
5iS3TEmmRcDJU2pTJQ98zB,0.482,0.617,-10.288,0.0869,0.775,0.583,0.107,0.35,174.04,300861,...,0,1,0,1,0,0,0,0,1,0
29xPIFa0othUcFrEGyJCZu,0.824,0.684,-7.185,0.407,0.0236,0.00853,0.0762,0.509,141.143,186973,...,0,0,0,0,1,0,0,0,1,0


All the features should be in the same range for vector operations.

In [6]:
# Min-max scale every column. The scaler is fitted on the raw NumPy values and
# kept around so that user tracks can be transformed the same way later on.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(tracks_db.to_numpy())
tracks_db_scaled = pd.DataFrame(
    scaled_values, index=tracks_db.index, columns=tracks_db.columns)
tracks_db_scaled.head()

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key_9,key_10,key_11,mode_0,mode_1,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0aQA9DP54h37OevE7hRc2a,0.662944,0.795214,0.836288,0.253623,0.241449,0.0,0.096867,0.409505,0.695481,0.175277,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4ixBAI0E1M6df9mP27etZK,0.742132,0.492049,0.684643,0.048344,0.202213,0.0,0.150113,0.482305,0.521722,0.155861,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3jO5IAvslWxaTxswokg66i,0.872081,0.461934,0.746186,0.299172,0.209256,0.0,0.10301,0.374115,0.586569,0.130187,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5iS3TEmmRcDJU2pTJQ98zB,0.48934,0.615524,0.664825,0.089959,0.779678,0.588889,0.095843,0.353893,0.756387,0.213194,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
29xPIFa0othUcFrEGyJCZu,0.836548,0.682782,0.750117,0.421325,0.023742,0.008616,0.064305,0.514661,0.613415,0.123509,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


### Getting recommendations

Let's pretend we have some listening history. Each time the user listens to
a track, it gets added to their dataframe.

In [7]:
# Fake listening history: one track drawn from the database (fixed seed so the
# same track is picked on every run).
tracks_user = tracks_db.sample(n=1, random_state=7)
tracks_user.head()

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key_9,key_10,key_11,mode_0,mode_1,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3ayMReTwLoo6jAg7adDqyz,0.828,0.79,-3.021,0.0719,0.167,1e-05,0.353,0.379,104.016,247126,...,0,1,0,1,0,0,0,0,1,0


First we need to build the user's vector. Let's create it with an EMA
(exponential moving average), so that more recent tracks contribute more to
the vector.

In [8]:
class UserVector:
    """Exponentially weighted profile vector built from a user's tracks.

    The vector is an exponential moving average (EMA) of the scaled track
    rows with the ``n_streams`` column removed, so later rows weigh more
    than earlier ones.

    Parameters
    ----------
    tracks : pandas.DataFrame
        Listening history, one row per track, in listening order. Must
        contain an ``n_streams`` column.
    scaler : object
        Already fitted transformer exposing ``transform(array)``. It is
        stored on the instance and reused by :meth:`add_track`.
    beta : float, default 0.9
        EMA decay factor: weight kept from the previous vector at each step.

    Attributes
    ----------
    vec : numpy.ndarray
        Current user vector, shape ``(1, n_columns - 1)``.
    """

    def __init__(self, tracks, scaler, beta=0.9):
        self.beta = beta
        # Keep the scaler on the instance so add_track() scales new tracks
        # exactly like the initial history, instead of depending on a
        # notebook-level global that happens to be called `scaler`.
        self.scaler = scaler
        self.popularity_col_idx = self._get_popularity_col_idx(tracks)
        self.vec = self.build_vec(tracks, scaler)

    def build_vec(self, tracks, scaler):
        """Return the EMA of the scaled rows of ``tracks`` as a ``(1, d)`` array."""
        vec = scaler.transform(tracks.to_numpy())

        # Popularity is not part of the audio profile: drop the n_streams column.
        vec = np.delete(vec, self.popularity_col_idx, 1)

        # EMA over the rows, oldest first. Starting from a zero array (rather
        # than the scalar 0) keeps the accumulator an ndarray even when `vec`
        # has no rows, so the reshape below cannot fail on a scalar.
        vec_weighted = np.zeros(vec.shape[1], dtype=vec.dtype)
        for track_vec in vec:
            vec_weighted = self.beta*vec_weighted + (1-self.beta)*track_vec

        return vec_weighted.reshape(1, -1)

    def add_track(self, track):
        """Fold one newly listened track into the user vector (one EMA step).

        ``track`` is a single row (e.g. ``tracks.loc[track_id]``) with the
        same columns, in the same order, as the frame given to ``__init__``.
        """
        track_vec = track.to_numpy().reshape(1, -1)
        track_vec = self.scaler.transform(track_vec)

        # Popularity is not part of the audio profile: drop the n_streams column.
        track_vec = np.delete(track_vec, self.popularity_col_idx, 1)

        self.vec = self.beta*self.vec + (1-self.beta)*track_vec

    def _get_popularity_col_idx(self, df):
        """Return the positional index of the ``n_streams`` column of ``df``."""
        return np.nonzero(df.columns == 'n_streams')[0][0]

When computing similarity we take into account both audio features and
popularity (`popularity_rate=-0.5` rewards less known tracks, `popularity_rate=0.5`
rewards the most popular tracks).

In [9]:
def get_similarites(tracks_db_scaled, user_vector, popularity_rate=0):
    """Score every track of the scaled database against a user vector.

    The score is the cosine similarity between the track's columns (without
    ``n_streams``) and ``user_vector.vec``, plus ``popularity_rate`` times the
    track's scaled ``n_streams`` value.

    Returns an array of shape ``(n_tracks, 1)``, in the row order of
    ``tracks_db_scaled``. The input frame is not modified.
    """
    popularity = tracks_db_scaled['n_streams'].to_numpy().reshape(-1, 1)
    audio_features = tracks_db_scaled.drop(columns=['n_streams'])

    audio_scores = cosine_similarity(audio_features, user_vector.vec)

    return audio_scores + popularity_rate*popularity

In [10]:
# Build the user's profile and score every track against it
# (popularity_rate=0: audio similarity only).
user_vector = UserVector(tracks_user, scaler)
similarities = get_similarites(
    tracks_db_scaled, user_vector, popularity_rate=0)
tracks_db_sim = tracks_db.assign(similarity=similarities)

# NOTE: tracks already in the user's history are not filtered out here, so the
# track the profile was built from shows up in the ranking as well.

# Ten best-scoring tracks
tracks_db_sim.sort_values(by='similarity', ascending=False).head(10)

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key_10,key_11,mode_0,mode_1,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5,similarity
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3ayMReTwLoo6jAg7adDqyz,0.828,0.79,-3.021,0.0719,0.167,1e-05,0.353,0.379,104.016,247126,...,1,0,1,0,0,0,0,1,0,1.0
3HF2E09nlXRlLQO2TJtQJh,0.752,0.743,-5.006,0.0749,0.142,0.0,0.256,0.386,116.013,177040,...,1,0,1,0,0,0,0,1,0,0.997896
5KPKjcE2trO2hBWRzkb2Mo,0.781,0.854,-4.115,0.074,0.0263,0.000216,0.344,0.41,103.021,160552,...,1,0,1,0,0,0,0,1,0,0.99705
3S9U5VgWx3CT0xUuhaufEK,0.782,0.902,-4.335,0.0615,0.217,0.0,0.374,0.434,129.944,194341,...,1,0,1,0,0,0,0,1,0,0.997
6WkJ2OK163XXS2oARUC9JM,0.72,0.741,-6.529,0.0287,0.101,1.4e-05,0.289,0.357,119.896,198053,...,1,0,1,0,0,0,0,1,0,0.996975
3AqwdLkl62kRTE8TwXxeIM,0.73,0.748,-5.609,0.165,0.13,0.0,0.348,0.305,109.969,145187,...,1,0,1,0,0,0,0,1,0,0.996909
3znlxKwQTRVrrerCcZPus2,0.828,0.714,-6.852,0.108,0.185,3e-06,0.244,0.407,95.967,143179,...,1,0,1,0,0,0,0,1,0,0.996901
0A5gdlrpAuQqZ2iFgnqBFW,0.813,0.761,-5.687,0.085,0.0529,0.0119,0.337,0.29,124.957,173067,...,1,0,1,0,0,0,0,1,0,0.996553
066QkK1heGoXBUPBbcOhEG,0.823,0.842,-7.539,0.0477,0.0359,0.0,0.381,0.396,91.995,216520,...,1,0,1,0,0,0,0,1,0,0.996444
2LOkdHHNKdEx24cQELBaD8,0.767,0.869,-3.87,0.0645,0.276,0.0,0.413,0.474,125.007,245798,...,1,0,1,0,0,0,0,1,0,0.996415


It's very easy to extend user's vector with a newly listened track.

In [11]:
# Fold one more listened track into the profile, then score the database again.
new_track_id = '4YeP3VXJQbXzee7dSiGYS6'
user_vector.add_track(tracks_db.loc[new_track_id])

similarities = get_similarites(
    tracks_db_scaled, user_vector, popularity_rate=0)
tracks_db_sim = tracks_db.assign(similarity=similarities)

# NOTE: tracks already in the user's history are not filtered out here.

# Ten best-scoring tracks
tracks_db_sim.sort_values(by='similarity', ascending=False).head(10)

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key_10,key_11,mode_0,mode_1,time_signature_0,time_signature_1,time_signature_3,time_signature_4,time_signature_5,similarity
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7KOlJ92bu51cltsD9KU5I7,0.865,0.687,-3.744,0.217,0.242,0.0,0.158,0.486,139.023,242407,...,1,0,1,0,0,0,0,1,0,0.998291
5zyEUkL2SNiJQ4iQVnhWkk,0.858,0.731,-4.39,0.248,0.0951,0.000213,0.166,0.585,121.488,132827,...,1,0,1,0,0,0,0,1,0,0.998229
4V5Dv8IjHNTk2YJzU9vgsW,0.843,0.671,-5.708,0.22,0.0517,0.0,0.286,0.56,137.932,169421,...,1,0,1,0,0,0,0,1,0,0.998028
6zy9jttMkpMU5gaKoRYphQ,0.887,0.631,-5.77,0.286,0.176,0.0,0.145,0.476,126.928,255190,...,1,0,1,0,0,0,0,1,0,0.998021
1883mCuSqnfnrWoq8i3myH,0.88,0.569,-6.517,0.221,0.0809,5.8e-05,0.214,0.474,134.924,190187,...,1,0,1,0,0,0,0,1,0,0.997993
7clRpfJrnZMKIedZ5osP1x,0.831,0.666,-7.428,0.257,0.0337,0.000369,0.192,0.465,138.0,224348,...,1,0,1,0,0,0,0,1,0,0.997919
7hGMh0mpoxgju622XPhgEc,0.914,0.732,-5.432,0.243,0.093,0.0,0.0894,0.495,122.987,203902,...,1,0,1,0,0,0,0,1,0,0.997753
1NIZfoTuiAHIm9VUAwsPZy,0.862,0.655,-5.938,0.255,0.0948,0.0,0.104,0.416,129.869,149183,...,1,0,1,0,0,0,0,1,0,0.997736
6csZYoffpZ7iuSw83x2zVy,0.913,0.788,-2.889,0.263,0.0546,0.00064,0.168,0.544,120.934,143314,...,1,0,1,0,0,0,0,1,0,0.997727
0UebZnJi6DMrj34wgXoVkF,0.798,0.645,-6.844,0.284,0.163,0.0,0.247,0.457,147.017,182941,...,1,0,1,0,0,0,0,1,0,0.997617
