Import libraries & load data

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# Load the dataset with engineered features
df = pd.read_csv("../data/processed/spotify_uk_features.csv")

df.head()


Unnamed: 0,title,rank,date,artist,url,region,chart,trend,streams,is_top100,days_on_chart,trend_num,log_streams,prev_rank,rank_change
0,#AYFKM (Freestyle),200,2019-09-02,Ecko,https://open.spotify.com/track/0eFPzxF2X9hlNcg...,Argentina,top200,NEW_ENTRY,21858.0,0,1,2,9.992368,,
1,#AYFKM (Freestyle),196,2019-09-03,Ecko,https://open.spotify.com/track/0eFPzxF2X9hlNcg...,Argentina,top200,MOVE_UP,23359.0,0,2,1,10.05878,200.0,4.0
2,#AYFKM (Freestyle),193,2019-09-04,Ecko,https://open.spotify.com/track/0eFPzxF2X9hlNcg...,Argentina,top200,MOVE_UP,23800.0,0,3,1,10.077483,196.0,3.0
3,#BrooklynBloodPop!,6,2021-05-14,SyKo,https://open.spotify.com/track/7K9Z3yFNNLv5kwT...,Argentina,viral50,NEW_ENTRY,0.0,1,1,2,0.0,,
4,#BrooklynBloodPop!,6,2021-05-15,SyKo,https://open.spotify.com/track/7K9Z3yFNNLv5kwT...,Argentina,viral50,SAME_POSITION,0.0,1,2,0,0.0,6.0,0.0


In [25]:
import os

file_path = "../data/processed/spotify_uk_features.csv"

print("File exists:", os.path.exists(file_path))
print("File size (bytes):", os.path.getsize(file_path))


File exists: True
File size (bytes): 78692267


Define features and label

In [26]:
feature_cols = [
    "rank",
    "streams",
    "log_streams",
    "days_on_chart",
    "trend_num",
    "prev_rank",
    "rank_change"
]

X = df[feature_cols]
y = df["is_top100"]


Time-based train/test split

You must NOT mix old data and new data randomly — that causes future information leaking into the past.

In [27]:
# Train a Logistic Regression model
df["date"] = pd.to_datetime(df["date"])

# Split the data into training and testing sets based on a date cutoff
cutoff = pd.Timestamp("2021-01-01")

train = df[df["date"] < cutoff]
test  = df[df["date"] >= cutoff]

X_train = train[feature_cols]
y_train = train["is_top100"]

X_test = test[feature_cols]
y_test = test["is_top100"]


len(train), len(test)


(364273, 91035)

In [28]:
print("Total rows:", len(df))
print("Train rows:", len(train))
print("Test rows:", len(test))

print("Date min:", df["date"].min())
print("Date max:", df["date"].max())


Total rows: 455308
Train rows: 364273
Test rows: 91035
Date min: 2017-01-01 00:00:00
Date max: 2021-12-31 00:00:00


Train Logistic Regression

In [29]:
from sklearn.impute import SimpleImputer

# Handle missing values in X_train
imputer = SimpleImputer(strategy="mean")
X_train_imputed = imputer.fit_transform(X_train)

log_reg = LogisticRegression(max_iter=2000)
log_reg.fit(X_train_imputed, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,2000


Evaluate Logistic Regression

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("log_reg", LogisticRegression(max_iter=2000))
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]


Inspect Logistic Regression weights: This shows how eacht features affects prediction.

In [31]:
# Print feature coefficients
for feature, coef in zip(feature_cols, log_reg.coef_[0]):
    print(f"{feature}: {coef:.4f}")

rank: -9.8639
streams: 0.0000
log_streams: 0.0630
days_on_chart: 0.0001
trend_num: -0.0503
prev_rank: -0.1786
rank_change: 0.1793


Train stronger model: Random Forest

In [32]:
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


Evaluate Random Forest

In [33]:
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:,1]

print("Random Forest:")
print(classification_report(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_rf))

Random Forest:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     36400
           1       1.00      1.00      1.00     54635

    accuracy                           1.00     91035
   macro avg       1.00      1.00      1.00     91035
weighted avg       1.00      1.00      1.00     91035

ROC-AUC: 1.0


Feature importance: shows which features matter most

In [34]:
import numpy as np

importances = rf.feature_importances_
sorted_idx = np.argsort(importances)[::-1]

for idx in sorted_idx:
    print(f"{feature_cols[idx]}: {importances[idx]:.4f}")

rank: 0.5629
prev_rank: 0.2670
streams: 0.0788
log_streams: 0.0602
rank_change: 0.0204
days_on_chart: 0.0091
trend_num: 0.0017


Saveing model

In [None]:
import pickle

with open("../models/rf_pipeline.pkl", "wb") as f:
    pickle.dump(pipeline, f)

FileNotFoundError: [Errno 2] No such file or directory: '../models/random_forest.pkl'