# Understanding distance and similarity metrics using tracks' audio features
The purpose of this notebook is to explore distance and similarity metrics using tracks' audio-feature data, in order to better understand them and choose the right metric for measuring similarity between tracks.

In [14]:
import MySQLdb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append('../')
import myenvvar
import datetime as dt
from sklearn.neighbors import DistanceMetric

In [2]:
# Open the MySQL connection; every setting is read from myenvvar.db_vars.
# Each pair maps a MySQLdb keyword argument to its key in db_vars
# (note that `passwd` is stored under 'password').
conn = MySQLdb.Connection(**{
    kwarg: myenvvar.db_vars[key]
    for kwarg, key in (
        ('host', 'host'),
        ('user', 'user'),
        ('passwd', 'password'),
        ('port', 'port'),
        ('db', 'db'),
    )
})
# Use utf8 as the character set for this connection.
conn.set_character_set('utf8')

# Read tracks' audio features

In [8]:
# Pull the whole audio_features table into a DataFrame.
q = "SELECT * FROM audio_features"
audio_features_df = pd.read_sql(sql=q, con=conn)
# Show (rows, columns) as the cell output.
audio_features_df.shape

(2100, 17)

In [9]:
# Preview the first five rows of the table as loaded.
audio_features_df.head(n=5)

Unnamed: 0,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,total_available_markets,popularity,added_datetime
0,00bOhb4584JjyfTiXX81mO,0.629,0.387,10,-13.667,1,0.0386,0.424,0.0,0.0479,0.497,76.003,275640,4,79,58,2020-06-06 16:50:49
1,00FldKRY0RvsKorwsMzNt3,0.605,0.856,2,-6.094,0,0.0666,0.0346,0.873,0.0874,0.215,152.005,202105,4,79,37,2020-06-06 16:50:49
2,00HqKJWFv3GS9cPfEB1WQm,0.678,0.795,4,-6.309,0,0.131,0.0101,0.0159,0.0981,0.648,118.006,215593,4,79,55,2020-06-10 08:09:43
3,00KCwnrvIXX8GRU3ZMOIBW,0.825,0.692,6,-6.084,0,0.0627,0.172,0.001,0.0861,0.602,123.985,207821,4,2,61,2020-06-12 08:44:13
4,00VpefDueq3OB9zkZHLDVI,0.636,0.963,7,-7.309,1,0.0636,0.00342,0.194,0.113,0.766,124.971,216025,4,2,26,2020-06-08 07:47:57


# Drop columns and scale features

In [10]:
def preprocess_audio_features_df(afdf):
    """Prepare the raw audio-features table for distance computations.

    Drops the columns listed in ``cols_to_drop``, makes ``track_id`` the index
    and min-max scales ``loudness`` and ``tempo`` to the [0, 1] range. All
    other columns are passed through as they are.

    Parameters
    ----------
    afdf : pandas.DataFrame
        Frame holding ``track_id``, ``loudness``, ``tempo`` and every column
        named in ``cols_to_drop``.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``track_id``; ``afdf`` itself is not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    cols_to_drop = ['key', 'time_signature', 'total_available_markets', 'mode', 'duration_ms','popularity','added_datetime']
    # drop() and set_index() both return new objects, so the caller's frame is left untouched
    afdf = afdf.drop(cols_to_drop, axis=1).set_index('track_id')
    # normalize / scale features
    for col in ['loudness', 'tempo']:
        col_min = afdf[col].min()
        col_range = afdf[col].max() - col_min
        if col_range == 0:
            # Constant column: plain min-max scaling would compute 0 / 0 and fill
            # the column with NaN. Map every value to 0.0 instead (missing values
            # stay missing).
            afdf[col] = (afdf[col] - col_min).astype(float)
        else:
            afdf[col] = (afdf[col] - col_min) / col_range
    return afdf

In [11]:
# Replace the raw table with its preprocessed form.
# NOTE: this rebinds audio_features_df, so re-running the cell without first
# reloading the raw data fails (the columns to drop are already gone).
audio_features_df = audio_features_df.pipe(preprocess_audio_features_df)

In [12]:
# Preview the first five rows after preprocessing.
audio_features_df.head(n=5)

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
00bOhb4584JjyfTiXX81mO,0.629,0.387,0.48214,0.0386,0.424,0.0,0.0479,0.497,0.219707
00FldKRY0RvsKorwsMzNt3,0.605,0.856,0.739271,0.0666,0.0346,0.873,0.0874,0.215,0.631622
00HqKJWFv3GS9cPfEB1WQm,0.678,0.795,0.731971,0.131,0.0101,0.0159,0.0981,0.648,0.447355
00KCwnrvIXX8GRU3ZMOIBW,0.825,0.692,0.73961,0.0627,0.172,0.001,0.0861,0.602,0.47976
00VpefDueq3OB9zkZHLDVI,0.636,0.963,0.698017,0.0636,0.00342,0.194,0.113,0.766,0.485104


In [17]:
# Keep the track ids (the DataFrame index) as an array, in row order.
track_ids = audio_features_df.index.values
# Peek at the first five ids.
track_ids[:5]

array(['00bOhb4584JjyfTiXX81mO', '00FldKRY0RvsKorwsMzNt3',
       '00HqKJWFv3GS9cPfEB1WQm', '00KCwnrvIXX8GRU3ZMOIBW',
       '00VpefDueq3OB9zkZHLDVI'], dtype=object)

# Distance and Similarity metrics:

## "euclidean"

In [18]:
# Build a Euclidean distance metric object.
# NOTE(review): this notebook imports DistanceMetric from sklearn.neighbors;
# newer scikit-learn releases expose it from sklearn.metrics — verify against
# the installed version.
dist = DistanceMetric.get_metric('euclidean')
# Pairwise distances over the rows of the preprocessed feature table;
# presumably an (n_tracks, n_tracks) matrix in the same order as the index —
# confirm against the DistanceMetric.pairwise docs.
dist_mat = dist.pairwise(audio_features_df)

In [None]:
for i in range(len(track_ids)):
    