In [1]:
# TODO:
# * taking time into account
# * taking popularity into account

# Recommender system demo

In [2]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime

In [4]:
tracks_db = pd.read_csv('../data/features.csv', index_col='track_id')
tracks_db.head()

Unnamed: 0_level_0,title,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,artists_ids,genres
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0aQA9DP54h37OevE7hRc2a,,0.653,0.796,7,-4.05,0,0.245,0.24,0.0,0.108,0.405,160.026,252712,4,"[""6fOMl44jA4Sp5b9PpYCkzz""]","[[""hip hop"", ""pop rap"", ""rap""]]"
4ixBAI0E1M6df9mP27etZK,,0.731,0.494,4,-9.567,1,0.0467,0.201,0.0,0.16,0.477,120.045,228056,4,"[""46VwdSiSccsgzNhJP9sGhQ"", ""2272CVpxVafY3L0UHu...","[[""icelandic hip hop"", ""icelandic pop""]]"
3jO5IAvslWxaTxswokg66i,,0.859,0.464,3,-7.328,1,0.289,0.208,0.0,0.114,0.37,134.966,195453,4,"[""2LRoIwlKmHjgvigdNGBHNo"", ""5J7rXWjtn5HzUkJ4Je...","[[""colombian pop"", ""latin"", ""pop reggaeton"", ""..."
5iS3TEmmRcDJU2pTJQ98zB,,0.482,0.617,10,-10.288,0,0.0869,0.775,0.583,0.107,0.35,174.04,300861,4,"[""3MaQwUQBigrjcMYHk8bDKm"", ""5Y24oYaTX3VOvtAcG2...","[[""finnish dance pop"", ""finnish hip hop"", ""fin..."
29xPIFa0othUcFrEGyJCZu,,0.824,0.684,1,-7.185,1,0.407,0.0236,0.00853,0.0762,0.509,141.143,186973,4,"[""2s7fZ34RK8RUJQ2ewsh6I5"", ""3tTvSeZiFDP3CY5EdP...","[[""francoton"", ""french hip hop"", ""pop urbaine""..."


### Version 1: using pure audio features only

In [5]:
feature_names = [
    'danceability', 'energy', 'key', 'loudness',	
    'mode',	'speechiness', 'acousticness', 'instrumentalness',
    'liveness',	'valence', 'tempo', 'duration_ms', 'time_signature']
tracks_db = tracks_db[feature_names]
tracks_db.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0aQA9DP54h37OevE7hRc2a,0.653,0.796,7,-4.05,0,0.245,0.24,0.0,0.108,0.405,160.026,252712,4
4ixBAI0E1M6df9mP27etZK,0.731,0.494,4,-9.567,1,0.0467,0.201,0.0,0.16,0.477,120.045,228056,4
3jO5IAvslWxaTxswokg66i,0.859,0.464,3,-7.328,1,0.289,0.208,0.0,0.114,0.37,134.966,195453,4
5iS3TEmmRcDJU2pTJQ98zB,0.482,0.617,10,-10.288,0,0.0869,0.775,0.583,0.107,0.35,174.04,300861,4
29xPIFa0othUcFrEGyJCZu,0.824,0.684,1,-7.185,1,0.407,0.0236,0.00853,0.0762,0.509,141.143,186973,4


In [6]:
cat_features = ['key', 'mode', 'time_signature']
for cat_feature in cat_features:
    dummies = pd.get_dummies(tracks_db[cat_feature], prefix=f'{cat_feature}_')
    tracks_db = pd.concat([tracks_db, dummies], axis=1)

tracks_db.drop(columns=cat_features, inplace=True)
tracks_db.head()

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key__9,key__10,key__11,mode__0,mode__1,time_signature__0,time_signature__1,time_signature__3,time_signature__4,time_signature__5
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0aQA9DP54h37OevE7hRc2a,0.653,0.796,-4.05,0.245,0.24,0.0,0.108,0.405,160.026,252712,...,0,0,0,1,0,0,0,0,1,0
4ixBAI0E1M6df9mP27etZK,0.731,0.494,-9.567,0.0467,0.201,0.0,0.16,0.477,120.045,228056,...,0,0,0,0,1,0,0,0,1,0
3jO5IAvslWxaTxswokg66i,0.859,0.464,-7.328,0.289,0.208,0.0,0.114,0.37,134.966,195453,...,0,0,0,0,1,0,0,0,1,0
5iS3TEmmRcDJU2pTJQ98zB,0.482,0.617,-10.288,0.0869,0.775,0.583,0.107,0.35,174.04,300861,...,0,1,0,1,0,0,0,0,1,0
29xPIFa0othUcFrEGyJCZu,0.824,0.684,-7.185,0.407,0.0236,0.00853,0.0762,0.509,141.143,186973,...,0,0,0,0,1,0,0,0,1,0


In [7]:
scaler = MinMaxScaler()
tracks_db_scaled = pd.DataFrame(
    scaler.fit_transform(tracks_db.to_numpy()), columns=tracks_db.columns)
tracks_db_scaled.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key__9,key__10,key__11,mode__0,mode__1,time_signature__0,time_signature__1,time_signature__3,time_signature__4,time_signature__5
0,0.662944,0.795214,0.836288,0.253623,0.241449,0.0,0.096867,0.409505,0.695481,0.175277,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.742132,0.492049,0.684643,0.048344,0.202213,0.0,0.150113,0.482305,0.521722,0.155861,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.872081,0.461934,0.746186,0.299172,0.209256,0.0,0.10301,0.374115,0.586569,0.130187,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.48934,0.615524,0.664825,0.089959,0.779678,0.588889,0.095843,0.353893,0.756387,0.213194,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.836548,0.682782,0.750117,0.421325,0.023742,0.008616,0.064305,0.514661,0.613415,0.123509,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [8]:
tracks_user = tracks_db.sample(5)
tracks_user.head()

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key__9,key__10,key__11,mode__0,mode__1,time_signature__0,time_signature__1,time_signature__3,time_signature__4,time_signature__5
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20JtELGPfOAMq7FpUt8eSM,0.866,0.399,-10.938,0.0745,0.117,0.0,0.0689,0.713,129.047,223256,...,0,0,0,0,1,0,0,0,1,0
6NidE9hFNlSMBaTZ4NtgP6,0.772,0.595,-7.576,0.0629,0.406,0.0,0.25,0.37,141.97,170267,...,0,0,0,0,1,0,0,0,1,0
073yIM7v2j6XadD4Nkn0Fh,0.764,0.745,-4.563,0.0475,0.0488,0.0,0.172,0.727,93.013,228933,...,0,0,0,1,0,0,0,0,1,0
3iwZ8hg0q1kTBB5kjg0nO4,0.367,0.623,-9.436,0.0499,0.0199,0.000731,0.159,0.511,128.268,205467,...,0,0,0,0,1,0,0,0,1,0
0vY6s9ixUj40b7IntTBARV,0.62,0.743,-8.102,0.273,0.0423,0.0,0.407,0.412,129.245,183093,...,0,0,0,0,1,0,0,0,0,1


In [9]:
def get_similarites(tracks_db_scaled, tracks_user, scaler):
    user_vector = tracks_user.to_numpy()
    user_vector_scaled = scaler.transform(user_vector)
    user_vector_scaled = np.sum(user_vector_scaled, axis=0, keepdims=True)

    return cosine_similarity(
        tracks_db_scaled, user_vector_scaled)

In [10]:
similarities = get_similarites(tracks_db_scaled, tracks_user, scaler)
track_db = tracks_db.assign(similarity=similarities)
track_db.head()

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key__10,key__11,mode__0,mode__1,time_signature__0,time_signature__1,time_signature__3,time_signature__4,time_signature__5,similarity
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0aQA9DP54h37OevE7hRc2a,0.653,0.796,-4.05,0.245,0.24,0.0,0.108,0.405,160.026,252712,...,0,0,1,0,0,0,0,1,0,0.795361
4ixBAI0E1M6df9mP27etZK,0.731,0.494,-9.567,0.0467,0.201,0.0,0.16,0.477,120.045,228056,...,0,0,0,1,0,0,0,1,0,0.826087
3jO5IAvslWxaTxswokg66i,0.859,0.464,-7.328,0.289,0.208,0.0,0.114,0.37,134.966,195453,...,0,0,0,1,0,0,0,1,0,0.823895
5iS3TEmmRcDJU2pTJQ98zB,0.482,0.617,-10.288,0.0869,0.775,0.583,0.107,0.35,174.04,300861,...,1,0,1,0,0,0,0,1,0,0.633268
29xPIFa0othUcFrEGyJCZu,0.824,0.684,-7.185,0.407,0.0236,0.00853,0.0762,0.509,141.143,186973,...,0,0,0,1,0,0,0,1,0,0.922411


In [11]:
track_db.sort_values('similarity', ascending=False).iloc[:10]

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key__10,key__11,mode__0,mode__1,time_signature__0,time_signature__1,time_signature__3,time_signature__4,time_signature__5,similarity
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1BApx7glGPbKRZ4Tqomd2f,0.86,0.929,-0.372,0.0934,0.108,0.0,0.324,0.751,130.001,362240,...,0,0,0,1,0,0,0,1,0,0.941732
7BqoNoJiAv6PWyG9caE58y,0.82,0.84,-3.033,0.208,0.0286,0.0,0.313,0.783,155.977,254241,...,0,0,0,1,0,0,0,1,0,0.941368
6V2yiB8b0769W0MV1pQ95M,0.913,0.86,-1.383,0.131,0.25,0.000226,0.261,0.736,120.022,230000,...,0,0,0,1,0,0,0,1,0,0.941274
5jiGCDUwv8gqWLpWMv4hok,0.904,0.905,-0.753,0.157,0.233,0.000294,0.241,0.796,120.089,196141,...,0,0,0,1,0,0,0,1,0,0.94116
0pSBuHjILhNEo55xK1zrRt,0.804,0.886,-2.512,0.04,0.0837,0.0,0.266,0.789,144.946,238587,...,0,0,0,1,0,0,0,1,0,0.941152
7meQTf97i7gyAzeHTLIytv,0.739,0.832,-3.244,0.151,0.119,1.3e-05,0.245,0.776,176.04,377580,...,0,0,0,1,0,0,0,1,0,0.940739
53uKGbcMjEc2gIWbfmvpdS,0.74,0.83,-3.243,0.153,0.119,1.3e-05,0.243,0.78,176.054,377580,...,0,0,0,1,0,0,0,1,0,0.94073
5vDvBD3ReINP8dXUplj477,0.792,0.743,-2.806,0.0851,0.107,0.0,0.183,0.742,150.024,195637,...,0,0,0,1,0,0,0,1,0,0.940538
14mHfNsSHN0FbLftyjrV9g,0.792,0.743,-2.806,0.0851,0.107,0.0,0.183,0.742,150.024,195637,...,0,0,0,1,0,0,0,1,0,0.940538
4GGwM2Krtlx9G9ii1RaEOx,0.795,0.715,-3.583,0.0836,0.142,2e-06,0.179,0.751,165.95,304733,...,0,0,0,1,0,0,0,1,0,0.940459


### Version 2: adding time

In [17]:
release_dates = pd.date_range(
    end=datetime.today(), periods=len(track_db)).tolist()
tracks_db = track_db.assign(release_date=release_dates)
tracks_db_scaled = tracks_db_scaled.assign(release_date=release_dates)
tracks_db.head()

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key__11,mode__0,mode__1,time_signature__0,time_signature__1,time_signature__3,time_signature__4,time_signature__5,similarity,release_date
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0aQA9DP54h37OevE7hRc2a,0.653,0.796,-4.05,0.245,0.24,0.0,0.108,0.405,160.026,252712,...,0,1,0,0,0,0,1,0,0.795361,1872-05-18 12:49:57.539606
4ixBAI0E1M6df9mP27etZK,0.731,0.494,-9.567,0.0467,0.201,0.0,0.16,0.477,120.045,228056,...,0,0,1,0,0,0,1,0,0.826087,1872-05-19 12:49:57.539606
3jO5IAvslWxaTxswokg66i,0.859,0.464,-7.328,0.289,0.208,0.0,0.114,0.37,134.966,195453,...,0,0,1,0,0,0,1,0,0.823895,1872-05-20 12:49:57.539606
5iS3TEmmRcDJU2pTJQ98zB,0.482,0.617,-10.288,0.0869,0.775,0.583,0.107,0.35,174.04,300861,...,0,1,0,0,0,0,1,0,0.633268,1872-05-21 12:49:57.539606
29xPIFa0othUcFrEGyJCZu,0.824,0.684,-7.185,0.407,0.0236,0.00853,0.0762,0.509,141.143,186973,...,0,0,1,0,0,0,1,0,0.922411,1872-05-22 12:49:57.539606


In [18]:
tracks_user = tracks_db.sample(5)
tracks_user.head()

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key__11,mode__0,mode__1,time_signature__0,time_signature__1,time_signature__3,time_signature__4,time_signature__5,similarity,release_date
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6Sy0fYlJtIVYeCTiIj4RJc,0.686,0.359,-10.579,0.0363,0.535,0.0,0.337,0.683,89.997,198720,...,0,0,1,0,0,0,1,0,0.803195,1988-03-04 12:49:57.539606
4SeuiNnF2n9yA0KztuIFc8,0.723,0.559,-8.098,0.164,0.194,0.0,0.122,0.505,154.956,198467,...,0,0,1,0,0,0,1,0,0.836079,1995-02-24 12:49:57.539606
2n1xrggQtAGEV1AgzvooGB,0.739,0.317,-8.668,0.0905,0.391,0.00487,0.243,0.806,71.165,162373,...,0,0,1,0,0,0,1,0,0.80681,1933-09-17 12:49:57.539606
3S2kxneiC4nGxEWGs1S2D5,0.616,0.957,-3.134,0.194,0.0435,0.0,0.256,0.564,78.013,544547,...,0,0,1,0,0,0,1,0,0.916195,1906-06-06 12:49:57.539606
70zg99pT51vB4wlMS7e4q7,0.775,0.633,-3.332,0.133,0.201,0.0,0.219,0.394,140.008,250333,...,0,1,0,0,0,0,1,0,0.708592,1881-07-14 12:49:57.539606


In [None]:
def get_similarites(tracks_db_scaled, tracks_user, scaler):
    user_vector = tracks_user.to_numpy()
    user_vector_scaled = scaler.transform(user_vector)

    # TODO: weight each row by how far it is from today 
    # (1 if today, ~0 when long time ago)
    user_vector_scaled = np.sum(user_vector_scaled, axis=0, keepdims=True)

    return cosine_similarity(
        tracks_db_scaled, user_vector_scaled)