# Recommender system demo

This notebook implements a demo of a *content-based* recommender system.

Idea: build a vector for a user from their listening history (by averaging
the normalized track vectors), then compute cosine similarities against the
tracks in the database to decide which ones to recommend.

In [1]:
import pandas as pd
import numpy as np

import processing

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

from db.gateway import ChartTracksGateway

In [2]:
# Load the track table through the project-local `processing` helper.
# The preview below shows one row per track id with its title, audio
# features and a `genres` list.
tracks_db = processing.get_tracks_with_genres()
tracks_db.head()

Unnamed: 0_level_0,title,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2rPE9A1vEgShuZxxzR2tZH,"thank u, next",0.724,0.647,1.0,-5.642,1.0,0.0658,0.28,0.0,0.102,0.435,106.96,207333.0,4.0,"[pop, dance pop]"
5p7ujcrUXASCNwRaWNHR1C,Without Me,0.752,0.488,6.0,-7.05,1.0,0.0705,0.297,9e-06,0.0936,0.533,136.041,201661.0,4.0,"[indie poptimism, pop, etherpop, dance pop, el..."
1A6OTy97kk0mMdm78rHsm8,Sunflower - Spider-Man: Into the Spider-Verse,0.753,0.498,2.0,-5.61,1.0,0.0504,0.551,0.0,0.0706,0.927,89.95,158053.0,4.0,"[dfw rap, rap, melodic rap, trap]"
4w8niZpiMy6qz1mntFA5uM,"Taki Taki (with Selena Gomez, Ozuna & Cardi B)",0.842,0.801,8.0,-4.167,0.0,0.228,0.157,5e-06,0.0642,0.617,95.881,212500.0,4.0,"[pop, electronic trap, puerto rican pop, post-..."
6zeeWid2sgw4lap2jV61PZ,Better,0.596,0.552,0.0,-10.278,0.0,0.097,0.0765,0.334,0.104,0.112,97.949,229320.0,4.0,"[pop, pop r&b]"


Join `n_streams` values.

In [3]:
# Fetch all chart entries through the project's gateway object.
# The next cell reads the `track_id` and `n_streams` columns of the result.
chart_tracks_gw = ChartTracksGateway()
chart_tracks_db = chart_tracks_gw.fetch_all()

In [4]:
# Total number of streams per track, summed over all of its chart entries.
n_streams = (
    chart_tracks_db[['track_id', 'n_streams']]
    .groupby(by='track_id')
    .sum()
)

# Dropping a previously joined `n_streams` column first keeps this cell
# re-runnable: joining onto a frame that already has the column raises a
# ValueError (overlapping column names). On the first run the drop is a no-op.
# The inner join keeps only tracks that have chart data; rows with missing
# values are then discarded.
tracks_db = (
    tracks_db
    .drop(columns='n_streams', errors='ignore')
    .join(n_streams, how='inner')
    .dropna()
)

tracks_db.head()

Unnamed: 0,title,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,genres,n_streams
2rPE9A1vEgShuZxxzR2tZH,"thank u, next",0.724,0.647,1.0,-5.642,1.0,0.0658,0.28,0.0,0.102,0.435,106.96,207333.0,4.0,"[pop, dance pop]",561076752
5p7ujcrUXASCNwRaWNHR1C,Without Me,0.752,0.488,6.0,-7.05,1.0,0.0705,0.297,9e-06,0.0936,0.533,136.041,201661.0,4.0,"[indie poptimism, pop, etherpop, dance pop, el...",973406239
1A6OTy97kk0mMdm78rHsm8,Sunflower - Spider-Man: Into the Spider-Verse,0.753,0.498,2.0,-5.61,1.0,0.0504,0.551,0.0,0.0706,0.927,89.95,158053.0,4.0,"[dfw rap, rap, melodic rap, trap]",187988079
4w8niZpiMy6qz1mntFA5uM,"Taki Taki (with Selena Gomez, Ozuna & Cardi B)",0.842,0.801,8.0,-4.167,0.0,0.228,0.157,5e-06,0.0642,0.617,95.881,212500.0,4.0,"[pop, electronic trap, puerto rican pop, post-...",893331882
6zeeWid2sgw4lap2jV61PZ,Better,0.596,0.552,0.0,-10.278,0.0,0.097,0.0765,0.334,0.104,0.112,97.949,229320.0,4.0,"[pop, pop r&b]",547708799


### Preprocessing

Let's keep useful features only.

In [5]:
# Columns kept for the recommender: the audio descriptors plus the
# total stream count. Everything else (e.g. title, genres) is discarded.
feature_names = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'time_signature',
    'n_streams',
]
tracks_db = tracks_db[feature_names]
tracks_db.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,n_streams
2rPE9A1vEgShuZxxzR2tZH,0.724,0.647,1.0,-5.642,1.0,0.0658,0.28,0.0,0.102,0.435,106.96,207333.0,4.0,561076752
5p7ujcrUXASCNwRaWNHR1C,0.752,0.488,6.0,-7.05,1.0,0.0705,0.297,9e-06,0.0936,0.533,136.041,201661.0,4.0,973406239
1A6OTy97kk0mMdm78rHsm8,0.753,0.498,2.0,-5.61,1.0,0.0504,0.551,0.0,0.0706,0.927,89.95,158053.0,4.0,187988079
4w8niZpiMy6qz1mntFA5uM,0.842,0.801,8.0,-4.167,0.0,0.228,0.157,5e-06,0.0642,0.617,95.881,212500.0,4.0,893331882
6zeeWid2sgw4lap2jV61PZ,0.596,0.552,0.0,-10.278,0.0,0.097,0.0765,0.334,0.104,0.112,97.949,229320.0,4.0,547708799


Categorical variables have to be represented properly.

In [6]:
# One-hot encode the categorical descriptors. With `columns=`, `get_dummies`
# replaces each listed column by indicator columns named `<column>_<value>`
# and appends them after the remaining (untouched) columns.
cat_features = ['key', 'mode', 'time_signature']
tracks_db = pd.get_dummies(tracks_db, columns=cat_features)
tracks_db.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key_8.0,key_9.0,key_10.0,key_11.0,mode_0.0,mode_1.0,time_signature_1.0,time_signature_3.0,time_signature_4.0,time_signature_5.0
2rPE9A1vEgShuZxxzR2tZH,0.724,0.647,-5.642,0.0658,0.28,0.0,0.102,0.435,106.96,207333.0,...,0,0,0,0,0,1,0,0,1,0
5p7ujcrUXASCNwRaWNHR1C,0.752,0.488,-7.05,0.0705,0.297,9e-06,0.0936,0.533,136.041,201661.0,...,0,0,0,0,0,1,0,0,1,0
1A6OTy97kk0mMdm78rHsm8,0.753,0.498,-5.61,0.0504,0.551,0.0,0.0706,0.927,89.95,158053.0,...,0,0,0,0,0,1,0,0,1,0
4w8niZpiMy6qz1mntFA5uM,0.842,0.801,-4.167,0.228,0.157,5e-06,0.0642,0.617,95.881,212500.0,...,1,0,0,0,1,0,0,0,1,0
6zeeWid2sgw4lap2jV61PZ,0.596,0.552,-10.278,0.097,0.0765,0.334,0.104,0.112,97.949,229320.0,...,0,0,0,0,1,0,0,0,1,0


All the features should be in the same range for vector operations.

In [7]:
# Rescale every column to [0, 1]. The scaler is fitted on a plain NumPy
# array and kept around so that user tracks can be transformed with the
# same per-column ranges later on.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(tracks_db.to_numpy())

# Wrap the scaled array back into a frame with the original labels.
tracks_db_scaled = pd.DataFrame(
    scaled_values,
    index=tracks_db.index,
    columns=tracks_db.columns,
)
tracks_db_scaled.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key_8.0,key_9.0,key_10.0,key_11.0,mode_0.0,mode_1.0,time_signature_1.0,time_signature_3.0,time_signature_4.0,time_signature_5.0
2rPE9A1vEgShuZxxzR2tZH,0.713847,0.647347,0.801273,0.045286,0.281687,0.0,0.087405,0.424997,0.358663,0.139542,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5p7ujcrUXASCNwRaWNHR1C,0.744546,0.484821,0.762144,0.05027,0.29879,9e-06,0.078654,0.528014,0.531148,0.135076,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1A6OTy97kk0mMdm78rHsm8,0.745642,0.495042,0.802162,0.028953,0.554324,0.0,0.054693,0.942184,0.257774,0.100735,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4w8niZpiMy6qz1mntFA5uM,0.843219,0.804763,0.842263,0.217308,0.157944,5e-06,0.048026,0.616315,0.292952,0.143611,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
6zeeWid2sgw4lap2jV61PZ,0.573512,0.55024,0.672438,0.078375,0.076958,0.339431,0.089488,0.085462,0.305218,0.156857,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


### Getting recommendations

Let's pretend we have some listening history. Each time the user listens to
a track, it gets added to their dataframe.

In [8]:
# Simulate a listening history by drawing a single track from the
# (unscaled) feature table; the fixed `random_state` makes the draw
# reproducible.
tracks_user = tracks_db.sample(1, random_state=7)
tracks_user.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key_8.0,key_9.0,key_10.0,key_11.0,mode_0.0,mode_1.0,time_signature_1.0,time_signature_3.0,time_signature_4.0,time_signature_5.0
5WvAo7DNuPRmk4APhdPzi8,0.552,0.76,-4.706,0.342,0.0733,0.0,0.0865,0.639,135.702,260000.0,...,0,0,0,0,0,1,0,0,0,1


First we need to build the user's vector. Let's create it with an exponential
moving average (EMA), so that more recent tracks contribute more to the vector.

In [9]:
class UserVector:
    """Taste vector of a user, built from the tracks they listened to.

    Every track is scaled with the supplied (already fitted) scaler, its
    ``n_streams`` column is removed, and the remaining feature vectors are
    combined with an exponential moving average (EMA), so later rows weigh
    more than earlier ones.

    Attributes:
        scaler: object exposing ``transform(array)``; applied to raw tracks.
        beta: EMA decay factor; weight kept from the previous vector.
        popularity_col_idx: position of the ``n_streams`` column in the
            input columns.
        vec: current user vector, array of shape ``(1, n_columns - 1)``.
    """

    def __init__(self, tracks, scaler, beta=0.9):
        """Build the vector from an initial listening history.

        Args:
            tracks: DataFrame of raw (unscaled) tracks, oldest first, with an
                ``n_streams`` column.
            scaler: fitted scaler used to transform the raw tracks.
            beta: EMA decay factor.
        """
        self.scaler = scaler
        self.beta = beta
        self.popularity_col_idx = self._get_popularity_col_idx(tracks)
        self.vec = self.build_vec(tracks)

    def build_vec(self, tracks):
        """Return the EMA of the scaled track vectors as a ``(1, n)`` array.

        An empty history yields an all-zero vector instead of failing.
        """
        features = self._to_feature_matrix(tracks.to_numpy())

        # Start from a zero *array* (not the scalar 0) so the result always
        # has a well-defined shape, even when there are no rows to fold in.
        vec_weighted = np.zeros(features.shape[1], dtype=features.dtype)
        for track_vec in features:
            vec_weighted = self.beta*vec_weighted + (1-self.beta)*track_vec

        return vec_weighted.reshape(1, -1)

    def add_track(self, track):
        """Fold one newly listened raw track into the user vector.

        Args:
            track: a single raw track (e.g. one row of the track table).
        """
        track_vec = self._to_feature_matrix(track.to_numpy().reshape(1, -1))
        self.vec = self.beta*self.vec + (1-self.beta)*track_vec

    def _to_feature_matrix(self, raw):
        """Scale a 2-D array of raw tracks and drop the ``n_streams`` column."""
        if raw.shape[0] == 0:
            # Nothing to scale; skip the scaler (which may reject empty input)
            # and return an empty matrix of the right width.
            return np.empty((0, raw.shape[1] - 1))

        scaled = self.scaler.transform(raw)
        return np.delete(scaled, self.popularity_col_idx, 1)

    def _get_popularity_col_idx(self, df):
        """Return the positional index of the ``n_streams`` column in ``df``."""
        return np.nonzero(df.columns == 'n_streams')[0][0]

When computing similarity we take into account both audio features and
popularity (a negative `popularity_rate`, e.g. `-0.5`, rewards lesser-known tracks;
a positive one, e.g. `0.5`, rewards the most popular tracks).

In [10]:
def get_similarities(tracks_db_scaled, user_vector, popularity_rate=0):
    """Score every track in the database against a user's vector.

    The score is the cosine similarity between the track's audio features
    and ``user_vector.vec``, plus ``popularity_rate`` times the track's
    scaled ``n_streams`` value.

    Args:
        tracks_db_scaled: DataFrame of scaled tracks with an ``n_streams`` column.
        user_vector: object whose ``vec`` attribute holds the user's vector.
        popularity_rate: weight of the popularity term (0 disables it).

    Returns:
        Array with one row per track and a single column of scores.
    """
    # Popularity as a column vector so it lines up with the similarity column.
    popularity = tracks_db_scaled['n_streams'].to_numpy().reshape(-1, 1)

    # Popularity is not an audio feature; keep it out of the cosine term.
    audio_features = tracks_db_scaled.drop(columns='n_streams')
    audio_similarities = cosine_similarity(audio_features, user_vector.vec)

    return audio_similarities + popularity_rate * popularity

In [11]:
# Build the user's vector and score every track in the database against it.
user_vector = UserVector(tracks_user, scaler)
similarities = get_similarities(
    tracks_db_scaled, user_vector, popularity_rate=0)
tracks_db_sim = tracks_db.assign(similarity=similarities)

# Tracks already in the user's history are left in the ranking; the line
# that would remove them is disabled:
# tracks_db = tracks_db.drop(tracks_user.index)

# Ten best matches, highest similarity first.
tracks_db_sim.sort_values('similarity', ascending=False).head(10)

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key_9.0,key_10.0,key_11.0,mode_0.0,mode_1.0,time_signature_1.0,time_signature_3.0,time_signature_4.0,time_signature_5.0,similarity
5WvAo7DNuPRmk4APhdPzi8,0.552,0.76,-4.706,0.342,0.0733,0.0,0.0865,0.639,135.702,260000.0,...,0,0,0,0,1,0,0,0,1,1.0
3KbCjPkewCmJt09FYtyVK5,0.599,0.825,-6.484,0.373,0.0734,0.0,0.288,0.419,121.115,227687.0,...,0,0,0,0,1,0,0,0,1,0.989159
5R4Yrc2j4jw1itr4hKcN26,0.575,0.619,-6.663,0.322,0.0208,0.0,0.103,0.296,110.109,212204.0,...,0,0,0,0,1,0,0,0,1,0.984609
3nbtUeypf5UJvxBNA9KCJD,0.604,0.705,-5.749,0.404,0.615,0.0,0.0973,0.662,117.412,234518.0,...,0,0,0,0,1,0,0,0,1,0.971255
1z48NPFPNJEV4UAg83OFd2,0.382,0.331,-9.096,0.0347,0.782,2e-06,0.302,0.277,112.821,194836.0,...,0,0,0,0,1,0,0,0,1,0.895616
7sTtHHrD0zDpmzQzH3zegz,0.455,0.292,-6.792,0.0563,0.889,0.0,0.106,0.43,84.339,178467.0,...,0,0,0,0,1,0,0,0,1,0.888794
4bTZeO72FwMa6wKOiqoynL,0.725,0.79,-2.331,0.197,0.0711,1e-06,0.0602,0.763,175.988,287773.0,...,0,0,0,0,1,0,0,1,0,0.817396
2ROeNgkjxZJE3LJhlkZVGo,0.615,0.915,-4.107,0.403,0.0465,0.0,0.241,0.87,117.664,171905.0,...,0,0,0,0,1,0,0,0,1,0.815663
2Y7vHNr6cQhfmqplHqSHJJ,0.637,0.763,-5.213,0.371,0.12,4e-05,0.0857,0.604,137.974,252094.0,...,0,0,0,1,0,0,0,0,1,0.814661
4Rif4wqdvzy84dhP1apC4Y,0.591,0.888,-3.161,0.274,0.105,2.6e-05,0.0928,0.574,126.881,245000.0,...,0,0,0,0,1,0,0,0,1,0.814068


It's very easy to extend the user's vector with a newly listened track.

In [12]:
# Look up one track by its id in the unscaled feature table and fold it into
# the user's vector (`add_track` applies the scaler itself).
user_vector.add_track(tracks_db.loc['5nTuQBSjMML1tcgheI2YsY'])