In [1]:
# TODO:
# * taking time into account
# * taking popularity into account

# Recommender system demo

This notebook implements a demo of *content-based* recommender system. 

Idea: create a vector for user based on his listening history (by summing 
normalized tracks vector). Calculate cosine similarities to determine tracks in 
database to recommend.

In [2]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime

In [3]:
tracks_db = pd.read_csv('../data/features.csv', index_col='track_id')
tracks_db.head()

Unnamed: 0_level_0,title,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,artists_ids,genres
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0aQA9DP54h37OevE7hRc2a,,0.653,0.796,7,-4.05,0,0.245,0.24,0.0,0.108,0.405,160.026,252712,4,"[""6fOMl44jA4Sp5b9PpYCkzz""]","[[""hip hop"", ""pop rap"", ""rap""]]"
4ixBAI0E1M6df9mP27etZK,,0.731,0.494,4,-9.567,1,0.0467,0.201,0.0,0.16,0.477,120.045,228056,4,"[""46VwdSiSccsgzNhJP9sGhQ"", ""2272CVpxVafY3L0UHu...","[[""icelandic hip hop"", ""icelandic pop""]]"
3jO5IAvslWxaTxswokg66i,,0.859,0.464,3,-7.328,1,0.289,0.208,0.0,0.114,0.37,134.966,195453,4,"[""2LRoIwlKmHjgvigdNGBHNo"", ""5J7rXWjtn5HzUkJ4Je...","[[""colombian pop"", ""latin"", ""pop reggaeton"", ""..."
5iS3TEmmRcDJU2pTJQ98zB,,0.482,0.617,10,-10.288,0,0.0869,0.775,0.583,0.107,0.35,174.04,300861,4,"[""3MaQwUQBigrjcMYHk8bDKm"", ""5Y24oYaTX3VOvtAcG2...","[[""finnish dance pop"", ""finnish hip hop"", ""fin..."
29xPIFa0othUcFrEGyJCZu,,0.824,0.684,1,-7.185,1,0.407,0.0236,0.00853,0.0762,0.509,141.143,186973,4,"[""2s7fZ34RK8RUJQ2ewsh6I5"", ""3tTvSeZiFDP3CY5EdP...","[[""francoton"", ""french hip hop"", ""pop urbaine""..."


### Version 1: using pure audio features only

In [4]:
feature_names = [
    'danceability', 'energy', 'key', 'loudness',	
    'mode',	'speechiness', 'acousticness', 'instrumentalness',
    'liveness',	'valence', 'tempo', 'duration_ms', 'time_signature']
tracks_db = tracks_db[feature_names]
tracks_db.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0aQA9DP54h37OevE7hRc2a,0.653,0.796,7,-4.05,0,0.245,0.24,0.0,0.108,0.405,160.026,252712,4
4ixBAI0E1M6df9mP27etZK,0.731,0.494,4,-9.567,1,0.0467,0.201,0.0,0.16,0.477,120.045,228056,4
3jO5IAvslWxaTxswokg66i,0.859,0.464,3,-7.328,1,0.289,0.208,0.0,0.114,0.37,134.966,195453,4
5iS3TEmmRcDJU2pTJQ98zB,0.482,0.617,10,-10.288,0,0.0869,0.775,0.583,0.107,0.35,174.04,300861,4
29xPIFa0othUcFrEGyJCZu,0.824,0.684,1,-7.185,1,0.407,0.0236,0.00853,0.0762,0.509,141.143,186973,4


Categorical variables have to be represented properly.

In [5]:
cat_features = ['key', 'mode', 'time_signature']
for cat_feature in cat_features:
    dummies = pd.get_dummies(tracks_db[cat_feature], prefix=f'{cat_feature}_')
    tracks_db = pd.concat([tracks_db, dummies], axis=1)

tracks_db.drop(columns=cat_features, inplace=True)
tracks_db.head()

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key__9,key__10,key__11,mode__0,mode__1,time_signature__0,time_signature__1,time_signature__3,time_signature__4,time_signature__5
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0aQA9DP54h37OevE7hRc2a,0.653,0.796,-4.05,0.245,0.24,0.0,0.108,0.405,160.026,252712,...,0,0,0,1,0,0,0,0,1,0
4ixBAI0E1M6df9mP27etZK,0.731,0.494,-9.567,0.0467,0.201,0.0,0.16,0.477,120.045,228056,...,0,0,0,0,1,0,0,0,1,0
3jO5IAvslWxaTxswokg66i,0.859,0.464,-7.328,0.289,0.208,0.0,0.114,0.37,134.966,195453,...,0,0,0,0,1,0,0,0,1,0
5iS3TEmmRcDJU2pTJQ98zB,0.482,0.617,-10.288,0.0869,0.775,0.583,0.107,0.35,174.04,300861,...,0,1,0,1,0,0,0,0,1,0
29xPIFa0othUcFrEGyJCZu,0.824,0.684,-7.185,0.407,0.0236,0.00853,0.0762,0.509,141.143,186973,...,0,0,0,0,1,0,0,0,1,0


All the features should be in the same range.

In [6]:
scaler = MinMaxScaler()
tracks_db_scaled = pd.DataFrame(
    scaler.fit_transform(tracks_db.to_numpy()), columns=tracks_db.columns)
tracks_db_scaled.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key__9,key__10,key__11,mode__0,mode__1,time_signature__0,time_signature__1,time_signature__3,time_signature__4,time_signature__5
0,0.662944,0.795214,0.836288,0.253623,0.241449,0.0,0.096867,0.409505,0.695481,0.175277,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.742132,0.492049,0.684643,0.048344,0.202213,0.0,0.150113,0.482305,0.521722,0.155861,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.872081,0.461934,0.746186,0.299172,0.209256,0.0,0.10301,0.374115,0.586569,0.130187,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.48934,0.615524,0.664825,0.089959,0.779678,0.588889,0.095843,0.353893,0.756387,0.213194,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.836548,0.682782,0.750117,0.421325,0.023742,0.008616,0.064305,0.514661,0.613415,0.123509,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


Let's pretend we have a record of user's listening history (no timestamps).

In [7]:
tracks_user = tracks_db.sample(5)
tracks_user.head()

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key__9,key__10,key__11,mode__0,mode__1,time_signature__0,time_signature__1,time_signature__3,time_signature__4,time_signature__5
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1nqaSoiLuqevDes4S18sO7,0.845,0.787,-2.683,0.0374,0.261,3e-06,0.102,0.882,124.983,224997,...,0,0,0,0,1,0,0,0,1,0
4NIOFeptzAsbkOUnyBE6FY,0.715,0.6,-7.9,0.0344,0.314,6e-06,0.0991,0.366,119.0,180748,...,0,0,0,0,1,0,0,0,1,0
7fTsw0IKbbPDN19keS32tF,0.853,0.777,-4.023,0.0684,0.101,0.000615,0.332,0.547,94.031,239373,...,0,0,0,1,0,0,0,0,1,0
3yQbBL8hsYtUhXSrmbKq8X,0.319,0.509,-8.868,0.0305,0.642,0.0,0.39,0.427,144.55,223240,...,0,1,0,0,1,0,0,1,0,0
2h5vv3kvkeCxFjKUT7sLvu,0.661,0.699,-5.876,0.302,0.0953,0.0,0.1,0.598,92.845,185983,...,0,0,0,0,1,0,0,0,1,0


In [8]:
def get_similarites(tracks_db_scaled, tracks_user, scaler):
    user_vector = tracks_user.to_numpy()
    user_vector_scaled = scaler.transform(user_vector)
    user_vector_scaled = np.sum(user_vector_scaled, axis=0, keepdims=True)

    return cosine_similarity(
        tracks_db_scaled, user_vector_scaled)

In [9]:
similarities = get_similarites(tracks_db_scaled, tracks_user, scaler)
track_db = tracks_db.assign(similarity=similarities)
track_db.head()

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key__10,key__11,mode__0,mode__1,time_signature__0,time_signature__1,time_signature__3,time_signature__4,time_signature__5,similarity
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0aQA9DP54h37OevE7hRc2a,0.653,0.796,-4.05,0.245,0.24,0.0,0.108,0.405,160.026,252712,...,0,0,1,0,0,0,0,1,0,0.728231
4ixBAI0E1M6df9mP27etZK,0.731,0.494,-9.567,0.0467,0.201,0.0,0.16,0.477,120.045,228056,...,0,0,0,1,0,0,0,1,0,0.84178
3jO5IAvslWxaTxswokg66i,0.859,0.464,-7.328,0.289,0.208,0.0,0.114,0.37,134.966,195453,...,0,0,0,1,0,0,0,1,0,0.838337
5iS3TEmmRcDJU2pTJQ98zB,0.482,0.617,-10.288,0.0869,0.775,0.583,0.107,0.35,174.04,300861,...,1,0,1,0,0,0,0,1,0,0.708872
29xPIFa0othUcFrEGyJCZu,0.824,0.684,-7.185,0.407,0.0236,0.00853,0.0762,0.509,141.143,186973,...,0,0,0,1,0,0,0,1,0,0.888753


In [10]:
track_db.sort_values('similarity', ascending=False).iloc[:10]

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key__10,key__11,mode__0,mode__1,time_signature__0,time_signature__1,time_signature__3,time_signature__4,time_signature__5,similarity
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2xjd6t95oEuxTn9vjWkC0n,0.918,0.814,-0.641,0.0582,0.449,0.000951,0.378,0.743,128.003,204561,...,0,0,0,1,0,0,0,1,0,0.923281
5jiGCDUwv8gqWLpWMv4hok,0.904,0.905,-0.753,0.157,0.233,0.000294,0.241,0.796,120.089,196141,...,0,0,0,1,0,0,0,1,0,0.922718
6V2yiB8b0769W0MV1pQ95M,0.913,0.86,-1.383,0.131,0.25,0.000226,0.261,0.736,120.022,230000,...,0,0,0,1,0,0,0,1,0,0.922332
6WBEzul5kA06o2OR1pXqIt,0.766,0.905,-2.249,0.0676,0.353,1.6e-05,0.34,0.82,135.019,217040,...,0,0,0,1,0,0,0,1,0,0.921959
1U3CZqgtekCchvUXV6o9l1,0.891,0.891,-2.139,0.0561,0.314,0.036,0.244,0.721,117.057,249733,...,0,0,0,1,0,0,0,1,0,0.921912
3pM81OKNrV6d2zJb76nQ6M,0.887,0.884,0.076,0.0716,0.404,0.00722,0.0895,0.963,130.009,206860,...,0,0,0,1,0,0,0,1,0,0.921635
3ds1VoDnWeSJ3XbkSamt94,0.871,0.931,-3.647,0.0503,0.305,0.00275,0.232,0.767,125.03,225600,...,0,0,0,1,0,0,0,1,0,0.921422
38I8OemgIPYHdP5uBNY8BO,0.871,0.931,-3.647,0.0504,0.305,0.00275,0.232,0.766,125.026,225600,...,0,0,0,1,0,0,0,1,0,0.921419
0j6LeC4rexb63tw2IaY5Gq,0.765,0.95,-2.409,0.0913,0.441,1.3e-05,0.342,0.882,130.011,178619,...,0,0,0,1,0,0,0,1,0,0.921119
6FMYZDm6d2bNSK4Q5FI8SC,0.76,0.964,-1.922,0.106,0.261,1.3e-05,0.264,0.921,173.95,147168,...,0,0,0,1,0,0,0,1,0,0.921096


### Version 2: adding time

In [17]:
# release_dates = pd.date_range(
#     end=datetime.today(), periods=len(track_db)).tolist()
# tracks_db = track_db.assign(release_date=release_dates)
# tracks_db_scaled = tracks_db_scaled.assign(release_date=release_dates)
# tracks_db.head()

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key__11,mode__0,mode__1,time_signature__0,time_signature__1,time_signature__3,time_signature__4,time_signature__5,similarity,release_date
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0aQA9DP54h37OevE7hRc2a,0.653,0.796,-4.05,0.245,0.24,0.0,0.108,0.405,160.026,252712,...,0,1,0,0,0,0,1,0,0.795361,1872-05-18 12:49:57.539606
4ixBAI0E1M6df9mP27etZK,0.731,0.494,-9.567,0.0467,0.201,0.0,0.16,0.477,120.045,228056,...,0,0,1,0,0,0,1,0,0.826087,1872-05-19 12:49:57.539606
3jO5IAvslWxaTxswokg66i,0.859,0.464,-7.328,0.289,0.208,0.0,0.114,0.37,134.966,195453,...,0,0,1,0,0,0,1,0,0.823895,1872-05-20 12:49:57.539606
5iS3TEmmRcDJU2pTJQ98zB,0.482,0.617,-10.288,0.0869,0.775,0.583,0.107,0.35,174.04,300861,...,0,1,0,0,0,0,1,0,0.633268,1872-05-21 12:49:57.539606
29xPIFa0othUcFrEGyJCZu,0.824,0.684,-7.185,0.407,0.0236,0.00853,0.0762,0.509,141.143,186973,...,0,0,1,0,0,0,1,0,0.922411,1872-05-22 12:49:57.539606


In [11]:
tracks_user = tracks_db.sample(5)
tracks_user.head()

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key__9,key__10,key__11,mode__0,mode__1,time_signature__0,time_signature__1,time_signature__3,time_signature__4,time_signature__5
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2CfJo2Paeti7GwinvbO9QE,0.706,0.879,-3.927,0.0392,0.173,0.0,0.329,0.95,130.024,205713,...,0,0,1,0,1,0,0,0,1,0
2tZA8zNo96UD8rdq5vC2CD,0.67,0.84,-4.296,0.0402,0.0655,0.0,0.143,0.711,126.001,181381,...,0,0,0,0,1,0,0,0,1,0
32lexhcz93AinKK0qLTZaZ,0.799,0.676,-5.644,0.169,0.41,0.000777,0.325,0.91,104.956,175200,...,0,0,0,0,1,0,0,0,1,0
5zGjTtwwLfICEeCdjRIz5Y,0.855,0.801,-4.455,0.093,0.0653,0.0622,0.102,0.747,120.019,149345,...,0,0,1,0,1,0,0,0,1,0
5B7AL4ZcRVNsUzR8WUVxmT,0.376,0.77,-5.51,0.0623,0.107,0.0,0.171,0.365,96.14,203573,...,0,0,0,1,0,0,0,0,1,0


In [None]:
def get_similarites(tracks_db_scaled, tracks_user, scaler):
    user_vector = tracks_user.to_numpy()
    user_vector = scaler.transform(user_vector)

    beta = 0.9

    # 1st method - without bias correction
    weights = np.logspace(0, user_vector.shape[0]-1, 
                          num=user_vector.shape[0], base=beta)
    weights *= (1-beta)[::-1].reshape(-1, 1)
    user_vector *= weights

    return cosine_similarity(
        tracks_db_scaled, user_vector)

In [None]:
def get_similarites(tracks_db_scaled, tracks_user, scaler):
    user_vector = tracks_user.to_numpy()
    user_vector = scaler.transform(user_vector)

    beta = 0.9

    # 2nd method - with bias correction   
    user_vector_weighted = 0
    for i in user_vector.shape[0]:
        user_vector_weighted = (
            (beta*user_vector_weighted + (1-beta)*user_vector[i]) / (1-beta**i))    

    return cosine_similarity(
        tracks_db_scaled, user_vector_weighted)

In [None]:
similarities = get_similarites(tracks_db_scaled, tracks_user, scaler)
track_db = tracks_db.assign(similarity=similarities)
track_db.head()

In [None]:
track_db.sort_values('similarity', ascending=False).iloc[:10]

### Version 3: adding popularity

In [None]:
tracks_user = tracks_db.sample(5)
tracks_user.head()

In [None]:
def get_similarites(tracks_db_scaled, tracks_user, scaler, 
        popularity_rate=1):
    user_vector = tracks_user.to_numpy()
    user_vector = scaler.transform(user_vector)

    beta = 0.9

    # 2nd method - with bias correction   
    user_vector_weighted = 0
    for i in user_vector.shape[0]:
        user_vector_weighted = (
            (beta*user_vector_weighted + (1-beta)*user_vector[i]) / (1-beta*i))    

    similarity = cosine_similarity(
        tracks_db_scaled, user_vector_weighted)
    popularitiyes = track_db_scaled.
    similiraty += 