In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv(r"C:\Users\LENOVO\Downloads\archive (18)\spotify_history.csv")
df.head()


Unnamed: 0,spotify_track_uri,ts,platform,ms_played,track_name,artist_name,album_name,reason_start,reason_end,shuffle,skipped
0,2J3n32GeLmMjwuAzyhcSNe,2013-07-08 02:44:34,web player,3185,"Say It, Just Say It",The Mowgli's,Waiting For The Dawn,autoplay,clickrow,False,False
1,1oHxIPqJyvAYHy0PVrDU98,2013-07-08 02:45:37,web player,61865,Drinking from the Bottle (feat. Tinie Tempah),Calvin Harris,18 Months,clickrow,clickrow,False,False
2,487OPlneJNni3NWC8SYqhW,2013-07-08 02:50:24,web player,285386,Born To Die,Lana Del Rey,Born To Die - The Paradise Edition,clickrow,unknown,False,False
3,5IyblF777jLZj1vGHG2UD3,2013-07-08 02:52:40,web player,134022,Off To The Races,Lana Del Rey,Born To Die - The Paradise Edition,trackdone,clickrow,False,False
4,0GgAAB0ZMllFhbNc3mAodO,2013-07-08 03:17:52,web player,0,Half Mast,Empire Of The Sun,Walking On A Dream,clickrow,nextbtn,False,False


In [3]:
# Convert timestamp to datetime
df['ts'] = pd.to_datetime(df['ts'])

# Aggregate by track
agg = df.groupby('spotify_track_uri').agg(
    play_count=('ts','count'),
    first_play=('ts','min'),
    last_play=('ts','max'),
    avg_ms_played=('ms_played','mean'),
    skip_rate=('skipped','mean'),
    shuffle_rate=('shuffle','mean')
).reset_index()

# Recency
agg['recency_days'] = (agg['last_play'] - agg['first_play']).dt.days

# Target label
agg['repeat_within_30d'] = ((agg['play_count'] >= 2) & (agg['recency_days'] <= 30)).astype(int)

agg.head()


Unnamed: 0,spotify_track_uri,play_count,first_play,last_play,avg_ms_played,skip_rate,shuffle_rate,recency_days,repeat_within_30d
0,003d3VbyJTZiiOYT2W7fnQ,1,2021-01-19 06:08:45,2021-01-19 06:08:45,290933.0,0.0,1.0,0,0
1,003vvx7Niy0yvhvHt4a68B,36,2020-08-15 22:23:37,2024-09-27 03:04:15,164376.722222,0.027778,0.916667,1503,0
2,0048lYktR2LGsDBFnE7ohH,1,2020-07-07 00:44:06,2020-07-07 00:44:06,338.0,0.0,0.0,0,0
3,005Dlt8Xaz3DkaXiRJgdiS,1,2024-04-15 02:09:29,2024-04-15 02:09:29,472194.0,0.0,0.0,0,0
4,008wXvCVu8W8vCbq5VQDlC,1,2020-05-01 14:50:44,2020-05-01 14:50:44,13265.0,0.0,0.0,0,0


In [5]:
from sklearn.model_selection import train_test_split

# Features we will use
feature_cols = ['play_count','recency_days','avg_ms_played','skip_rate','shuffle_rate']

# Ensure no NaNs (simple fill)
X = agg[feature_cols].fillna(0)
y = agg['repeat_within_30d']

# If dataset is extremely imbalanced, we will stratify only if both classes present
stratify_arg = y if (y.nunique() > 1) else None

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=stratify_arg
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
print("Positive label ratio in train:", y_train.mean(), "in test:", y_test.mean())


Train shape: (13221, 5) Test shape: (3306, 5)
Positive label ratio in train: 0.11300204220558203 in test: 0.11282516636418632


In [7]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=100, max_depth=6, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:,1]

# Metrics
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, zero_division=0)
rec = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
roc_auc = roc_auc_score(y_test, y_proba) if len(set(y_test))>1 else None
cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1-score:", f1)
if roc_auc is not None:
    print("ROC AUC:", roc_auc)
print("Confusion matrix:\\n", cm)


Accuracy: 0.999395039322444
Precision: 0.9946666666666667
Recall: 1.0
F1-score: 0.9973262032085561
ROC AUC: 1.0
Confusion matrix:\n [[2931    2]
 [   0  373]]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [9]:
import joblib, os
os.makedirs('models', exist_ok=True)
joblib.dump(model, 'models/xgb_repeat_model.joblib')
print("Saved model to models/xgb_repeat_model.joblib")


Saved model to models/xgb_repeat_model.joblib


In [11]:
# Prepare full X_all (same order as agg)
X_all = agg[feature_cols].fillna(0)
agg['repeat_probability'] = model.predict_proba(X_all)[:,1]

# If you have track_name, artist_name columns in original df, merge first play metadata:
# Create a small metadata table from original df
meta = df.sort_values('ts').groupby('spotify_track_uri').agg({
    'track_name':'first',
    'artist_name':'first',
    'album_name':'first'
}).reset_index()

# Merge
results = agg.merge(meta, on='spotify_track_uri', how='left')

# Top 10 likely-to-repeat tracks
top10 = results.sort_values('repeat_probability', ascending=False).head(10)
top10_display = top10[['spotify_track_uri','track_name','artist_name','play_count','recency_days','repeat_probability']]
top10_display.reset_index(drop=True, inplace=True)
top10_display


Unnamed: 0,spotify_track_uri,track_name,artist_name,play_count,recency_days,repeat_probability
0,2eZ1S67UGlyCB5388TvxHf,Hoy,David Bisbal,4,3,0.999945
1,4HiVpz0nbHCzcIMxVDyLNR,Wild Love,James Bay,4,2,0.999913
2,3Dr1BaQdxLScuXu1RTeCPY,Juan Charrasqueado,Victor Cordero,4,2,0.999913
3,64DxdXTNY58CVu2j5QjCIk,Does Your Mother Know - From 'Mamma Mia!' Orig...,Christine Baranski,5,3,0.999911
4,6lMmEPTbkiVeWixfw34yIF,Desde Que Te Perdí,Kevin Johansen,4,22,0.999908
5,5SQKCi4FIW4mpi9pj1vJ3Y,Qualified,Dr. John,4,8,0.999908
6,6aVssog265SY3b7EuhLDPf,For No One - 2022 Mix,The Beatles,4,7,0.999906
7,6MvGg9X7lIt64WW28Nxfxo,Texas Flood,Stevie Ray Vaughan,4,3,0.999896
8,5EskUV4Rg1VMNVPB32xUme,Nómadas,La Maravillosa Orquesta del Alcohol,4,2,0.999896
9,35Fjjqjlq3YRJbK8QM7OGK,El Agua Clara,Contacto Norte,4,9,0.999896


In [13]:
results.to_csv('repeat_probabilities_all_tracks.csv', index=False)
top10_display.to_csv('top10_recommendations.csv', index=False)
print("Saved: repeat_probabilities_all_tracks.csv and top10_recommendations.csv")


Saved: repeat_probabilities_all_tracks.csv and top10_recommendations.csv
