In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("asaniczka/top-spotify-songs-in-73-countries-daily-updated")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/asaniczka/top-spotify-songs-in-73-countries-daily-updated/versions/456


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
df = pd.read_csv(path + '/universal_top_spotify_songs.csv')

df.head()

Unnamed: 0,spotify_id,name,artists,daily_rank,daily_movement,weekly_movement,country,snapshot_date,popularity,is_explicit,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,2plbrEY59IikOBgBGLjaoe,Die With A Smile,"Lady Gaga, Bruno Mars",1,1,0,,2025-01-03,100,False,...,6,-7.777,0,0.0304,0.308,0.0,0.122,0.535,157.969,3
1,4wJ5Qq0jBN4ajy7ouZIV1c,APT.,"ROSÉ, Bruno Mars",2,-1,0,,2025-01-03,88,False,...,0,-4.477,0,0.26,0.0283,0.0,0.355,0.939,149.027,4
2,6dOtVTDdiauQNBQEDOtlAB,BIRDS OF A FEATHER,Billie Eilish,3,0,3,,2025-01-03,97,False,...,2,-10.171,1,0.0358,0.2,0.0608,0.117,0.438,104.978,4
3,7ne4VBA60CxGM75vw0EYad,That’s So True,Gracie Abrams,4,0,1,,2025-01-03,96,True,...,1,-4.169,1,0.0368,0.214,0.0,0.159,0.372,108.548,4
4,7tI8dRuH2Yc6RuoTjxo4dU,Who,Jimin,5,0,3,,2025-01-03,91,False,...,0,-3.743,0,0.032,0.00289,0.0,0.193,0.838,116.034,4


In [4]:
df.columns

Index(['spotify_id', 'name', 'artists', 'daily_rank', 'daily_movement',
       'weekly_movement', 'country', 'snapshot_date', 'popularity',
       'is_explicit', 'duration_ms', 'album_name', 'album_release_date',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature'],
      dtype='object')

In [5]:
df = df[df['country'].isna()]
df.drop(['country'], axis=1, inplace=True)

df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(['country'], axis=1, inplace=True)


Unnamed: 0,spotify_id,name,artists,daily_rank,daily_movement,weekly_movement,snapshot_date,popularity,is_explicit,duration_ms,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,2plbrEY59IikOBgBGLjaoe,Die With A Smile,"Lady Gaga, Bruno Mars",1,1,0,2025-01-03,100,False,251667,...,6,-7.777,0,0.0304,0.308,0.0,0.122,0.535,157.969,3
1,4wJ5Qq0jBN4ajy7ouZIV1c,APT.,"ROSÉ, Bruno Mars",2,-1,0,2025-01-03,88,False,169917,...,0,-4.477,0,0.26,0.0283,0.0,0.355,0.939,149.027,4
2,6dOtVTDdiauQNBQEDOtlAB,BIRDS OF A FEATHER,Billie Eilish,3,0,3,2025-01-03,97,False,210373,...,2,-10.171,1,0.0358,0.2,0.0608,0.117,0.438,104.978,4
3,7ne4VBA60CxGM75vw0EYad,That’s So True,Gracie Abrams,4,0,1,2025-01-03,96,True,166300,...,1,-4.169,1,0.0368,0.214,0.0,0.159,0.372,108.548,4
4,7tI8dRuH2Yc6RuoTjxo4dU,Who,Jimin,5,0,3,2025-01-03,91,False,170887,...,0,-3.743,0,0.032,0.00289,0.0,0.193,0.838,116.034,4


In [6]:
print(df.isna().sum())

df.dropna(axis=0, inplace=True)

spotify_id            0
name                  0
artists               0
daily_rank            0
daily_movement        0
weekly_movement       0
snapshot_date         0
popularity            0
is_explicit           0
duration_ms           0
album_name            7
album_release_date    7
danceability          0
energy                0
key                   0
loudness              0
mode                  0
speechiness           0
acousticness          0
instrumentalness      0
liveness              0
valence               0
tempo                 0
time_signature        0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(axis=0, inplace=True)


In [7]:
df = df.drop_duplicates(subset=['spotify_id'])

In [8]:
df = df.drop_duplicates(subset=['name', 'artists'], keep='first')

df = df.reset_index(drop=True)

In [9]:
num_features = [
    "danceability",
    "energy",
    'key',
    "loudness",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
]

In [10]:
scaler = StandardScaler()

df[num_features] = scaler.fit_transform(df[num_features])

In [11]:
features = [
    'name',
    'artists',
    'popularity',
    'danceability',
    'energy',
    'loudness',
    'duration_ms',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'album_release_date'
]

df['combined_features'] = df[features].apply(lambda x: ' '.join(x.astype(str)), axis=1)

In [12]:
tfidf = TfidfVectorizer(stop_words='english')

tfidf_matrix = tfidf.fit_transform(df['combined_features'])

In [13]:
similarity = cosine_similarity(tfidf_matrix)

In [14]:
def recommend(song_id, similarity, df, top_n=10):
    song_idx = df.index[df['spotify_id'] == song_id][0]
    similar_indices = similarity[song_idx].argsort()[::-1][1:top_n+1]
    recommendations = df.iloc[similar_indices]
    return recommendations[['spotify_id', 'name', 'artists']]

In [15]:
recommendations = recommend('2plbrEY59IikOBgBGLjaoe', similarity, df, 20)

print(recommendations)

                 spotify_id                             name  \
159  19KlZwqlT3fguP2BeHF1Q1                          Disease   
516  7mXuWTczZNxG5EDcjFEuJR                        LADY GAGA   
1    4wJ5Qq0jBN4ajy7ouZIV1c                             APT.   
35   0KKkJNfGyhkQ5aFogxQAPU               That's What I Like   
205  0nJW01T7XtvILxQgC5J7Wh              When I Was Your Man   
39   7BqBn9nzAq8spo5e7cZ0dJ             Just the Way You Are   
21   3w3y8KPTfNeOKPiqUTakBh             Locked out of Heaven   
230  09LrGvT9KsACH66RHYMDyR                      Don’t Smile   
331  5og4Qzt92jJzVDkOtSEilb             But Daddy I Love Him   
312  6IwmDRpswDujeciIBTiEOL                       love hotel   
484  0FlwhvrncUKrEAhzunmCKm                          Forever   
552  21vc2kQZMS00cAyNT82a1M                         Vultures   
323  7DpUoxGSdlDHfqCYj0otzU                      BITTERSUITE   
243  0rtDE9zfXbamTlRUSwY7zy                  Belong Together   
10   35ISBknsCeZQtq66xABI9g             

In [16]:
X = df[num_features].values
y = df['popularity'].values

In [17]:
def triplet_loss(margin=.5):
    def loss(y_true, y_pred):
        anchor, positive, negative = tf.split(y_pred, num_or_size_splits=3, axis=1)

        positive_distance = tf.reduce_sum(tf.square(anchor - positive), axis=1)
        negative_distance = tf.reduce_sum(tf.square(anchor - negative), axis=1)

        basic_loss = positive_distance - negative_distance + margin
        loss = tf.reduce_mean(tf.maximum(basic_loss, 0.0))

        return loss

    return loss

In [18]:
def create_triplet(data, labels):
    triplets = []
    unique_labels = np.unique(labels)

    for label in unique_labels:
        positive = np.where(labels == label)[0]
        negative = np.where(labels != label)[0]

        for anchor in positive:
            positive_idx = np.random.choice(positive)
            negative_idx = np.random.choice(negative)
            triplets.append((data[anchor], data[positive_idx], data[negative_idx]))

    return np.array(triplets)

In [19]:
triplets = create_triplet(X, y)
anchor, positive, negative = zip(*triplets)

anchor = np.array(anchor)
positive = np.array(positive)
negative = np.array(negative)

In [20]:
def build_encoder():
    model = tf.keras.models.Sequential([
        tf.keras.layers.Input(shape=(X.shape[1],)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(X.shape[1], activation='relu'),
        tf.keras.layers.Lambda(lambda x: tf.nn.l2_normalize(x, axis=1))
    ])

    return model

encoder = build_encoder()

In [21]:
anchor_input = tf.keras.layers.Input(shape=(X.shape[1],))
positive_input = tf.keras.layers.Input(shape=(X.shape[1],))
negative_input = tf.keras.layers.Input(shape=(X.shape[1],))

anchor_embedding = encoder(anchor_input)
positive_embedding = encoder(positive_input)
negative_embedding = encoder(negative_input)

output = tf.keras.layers.Concatenate()([anchor_embedding, positive_embedding, negative_embedding])

model = tf.keras.Model(inputs=[anchor_input, positive_input, negative_input], outputs=output)

model.compile(optimizer='adam', loss=triplet_loss(margin=0.01))
model.summary()

In [22]:
model.fit([anchor, positive, negative], np.zeros((anchor.shape[0], 1)), epochs=10, batch_size=256)

Epoch 1/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.2425
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.2181 
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1935 
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.1843 
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.1629 
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.1746 
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.1545 
Epoch 8/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.1486 
Epoch 9/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.1342 
Epoch 10/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.1300 


<keras.src.callbacks.history.History at 0x7cd49a90d600>

In [23]:
embeddings = model.predict([anchor, positive, negative])

print(embeddings)

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[[0.12583254 0.         0.43882966 ... 0.         0.7431618  0.26214415]
 [0.2659357  0.44861248 0.5211474  ... 0.         0.         0.46976817]
 [0.3492183  0.         0.01261611 ... 0.         0.         0.        ]
 ...
 [0.         0.16873989 0.2289216  ... 0.         0.01375306 0.        ]
 [0.         0.         0.15271324 ... 0.247483   0.07050628 0.        ]
 [0.43483394 0.         0.         ... 0.         0.         0.13773139]]


In [24]:
embeddings.shape

(607, 30)

In [25]:
model_similarity = cosine_similarity(embeddings)

In [26]:
recommendations = recommend('2plbrEY59IikOBgBGLjaoe', model_similarity, df, 20)

print(recommendations)

                 spotify_id                                        name  \
563  1hjRhYpWyqDpPahmSlUTlc                    Style (Taylor's Version)   
199  2nLtzopw4rPReszdYBJU6h                                        Numb   
359  7Mts0OfPorF4iwOomvfqn1                              So High School   
48   7sd6zMrgGpEa7NkQm9TRrg                                       NADIE   
343  3eh51r6rFWAlGQRlHx9QnQ                                    Push Ups   
202  7MXVkk9YMctZqd1Srtv4MB                                     Starboy   
282  0bBnrokPXtfwXQarqCu1Gz                                   Road Rage   
56   4iZ4pt7kvcaH6Yo8UoZ4s2                                      Snooze   
208  3kXoKlD84c6OmIcOLfrfEs                                   September   
489  2LBqCSwhJGcFQeTHMVGwy3                                 Die For You   
397  1yfKakY4rvI17lk20ekuRA                             Crocodile Tearz   
543  4E63weMCaNZuGPEFMnuEi8                                        exes   
369  0g4fMVo4JjwnIpTfFfLd

In [28]:
results = pd.concat([df['spotify_id'], pd.DataFrame(model_similarity)], axis=1)

results.head()

Unnamed: 0,spotify_id,0,1,2,3,4,5,6,7,8,...,597,598,599,600,601,602,603,604,605,606
0,2plbrEY59IikOBgBGLjaoe,1.0,0.505074,0.602579,0.605785,0.660477,0.453132,0.480302,0.501089,0.700321,...,0.499218,0.555937,0.505696,0.497833,0.336066,0.527565,0.535964,0.758907,0.621222,0.581657
1,4wJ5Qq0jBN4ajy7ouZIV1c,0.505074,1.0,0.596854,0.808874,0.570983,0.577174,0.512853,0.481423,0.545337,...,0.710999,0.605122,0.634022,0.689453,0.465965,0.631527,0.571379,0.6686,0.590915,0.556812
2,6dOtVTDdiauQNBQEDOtlAB,0.602579,0.596854,1.0,0.608851,0.692989,0.672445,0.573109,0.66718,0.737542,...,0.720057,0.75296,0.605609,0.666366,0.570493,0.675039,0.84551,0.689941,0.728588,0.716812
3,7ne4VBA60CxGM75vw0EYad,0.605785,0.808874,0.608851,1.0,0.793698,0.646553,0.630808,0.702083,0.625999,...,0.6891,0.514203,0.597661,0.661143,0.4831,0.577585,0.492418,0.69454,0.667911,0.606654
4,7tI8dRuH2Yc6RuoTjxo4dU,0.660477,0.570983,0.692989,0.793698,1.0,0.714012,0.447998,0.830663,0.73485,...,0.618222,0.578618,0.632949,0.577321,0.614282,0.674477,0.562995,0.570112,0.739585,0.624998


In [30]:
results.to_csv('results.csv', index=False)