## **UPLOADING THE DATASET FILE**


In [None]:
from google.colab import files
uploaded = files.upload()

Saving archive.zip to archive.zip


## **UNZIP THE DATASET**

In [None]:
import zipfile
import os

for filename in uploaded.keys():
  if filename.endswith(".zip"):
    with zipfile.ZipFile(filename, 'r') as zip_ref:
      zip_ref.extractall("top_spotify")

## **LOAD AND INSPECT THE DATA**

*Load CSV File*

In [None]:
import pandas as pd
import os

for root, dirs, files in os.walk("top_spotify"):
  for file in files:
    print(os.path.join(root, file))

top_spotify/universal_top_spotify_songs.csv


In [None]:
df = pd.read_csv("top_spotify/universal_top_spotify_songs.csv")
df.head()

Unnamed: 0,spotify_id,name,artists,daily_rank,daily_movement,weekly_movement,country,snapshot_date,popularity,is_explicit,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,2RkZ5LkEzeHGRsmDqKwmaJ,Ordinary,Alex Warren,1,1,0,,2025-06-11,95,False,...,2,-6.141,1,0.06,0.704,7e-06,0.055,0.391,168.115,3
1,42UBPzRMh5yyz0EDPr6fr1,Manchild,Sabrina Carpenter,2,-1,48,,2025-06-11,89,True,...,7,-5.087,1,0.0572,0.122,0.0,0.317,0.811,123.01,4
2,0FTmksd2dxiE5e3rWyJXs6,back to friends,sombr,3,0,1,,2025-06-11,98,False,...,1,-2.291,1,0.0301,9.4e-05,8.8e-05,0.0929,0.235,92.855,4
3,7so0lgd0zP2Sbgs2d7a1SZ,Die With A Smile,"Lady Gaga, Bruno Mars",4,0,-1,,2025-06-11,91,False,...,6,-7.727,0,0.0317,0.289,0.0,0.126,0.498,157.964,3
4,6dOtVTDdiauQNBQEDOtlAB,BIRDS OF A FEATHER,Billie Eilish,5,1,0,,2025-06-11,100,False,...,2,-10.171,1,0.0358,0.2,0.0608,0.117,0.438,104.978,4


*Inspect the Dataset*

In [None]:
df.info()
df.describe()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2110316 entries, 0 to 2110315
Data columns (total 25 columns):
 #   Column              Dtype  
---  ------              -----  
 0   spotify_id          object 
 1   name                object 
 2   artists             object 
 3   daily_rank          int64  
 4   daily_movement      int64  
 5   weekly_movement     int64  
 6   country             object 
 7   snapshot_date       object 
 8   popularity          int64  
 9   is_explicit         bool   
 10  duration_ms         int64  
 11  album_name          object 
 12  album_release_date  object 
 13  danceability        float64
 14  energy              float64
 15  key                 int64  
 16  loudness            float64
 17  mode                int64  
 18  speechiness         float64
 19  acousticness        float64
 20  instrumentalness    float64
 21  liveness            float64
 22  valence             float64
 23  tempo               float64
 24  time_signature      int6

Unnamed: 0,0
spotify_id,0
name,30
artists,29
daily_rank,0
daily_movement,0
weekly_movement,0
country,28908
snapshot_date,0
popularity,0
is_explicit,0


In [None]:
print(df.columns.tolist())

['spotify_id', 'name', 'artists', 'daily_rank', 'daily_movement', 'weekly_movement', 'country', 'snapshot_date', 'popularity', 'is_explicit', 'duration_ms', 'album_name', 'album_release_date', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature']


## **PREPROCESS THE DATA**

*Drop or Fill Missing Values*

In [None]:
df = df.dropna()

*Encode Categorical Features*

In [None]:
from sklearn.preprocessing import LabelEncoder

df['name'] = LabelEncoder().fit_transform(df['name'])
df['artists'] = LabelEncoder().fit_transform(df['artists'])
df['country'] = LabelEncoder().fit_transform(df['country'])

*Select Features and Target*

In [None]:
X = df[[
    'name', 'artists', 'country', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature'
]]

y = df['popularity']

## **TRAIN-TEST SPLIT**

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## **TRAIN BASELINE MODELS**

*Import ML Models*

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

*Train and Evaluate*

In [None]:
models = {

          "Linear Regression": LinearRegression(),
          "Random Forest": RandomForestRegressor(),
          "Decision Tree": DecisionTreeRegressor()
}

for name, model in models.items():
  model.fit(X_train, y_train)
  preds = model.predict(X_test)
  mse = mean_squared_error(y_test, preds)
  r2 = r2_score(y_test, preds)
  print(f"{name} -> MSE: {mse:.2f}, R^2: {r2:.2f}")

Linear Regression -> MSE: 228.18, R^2: 0.08
Random Forest -> MSE: 45.21, R^2: 0.82
Decision Tree -> MSE: 47.92, R^2: 0.81


## **HYPERPARAMETER TUNING**

*Grid Search with Random Forest*

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {

              'n_estimators': [50, 100, 150],
              'max_depth': [None, 10, 20],
              'min_samples_split': [2, 5, 10]
}

grid = GridSearchCV(RandomForestRegressor(), param_grid, cv = 3, scoring = 'neg_mean_squared_error')
grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)
print("Best Score:", -grid.best_score_)


KeyboardInterrupt: 

*Randomised Search*

In [None]:
from sklearn.model_selection import RandomizedSearchCV

from scipy.stats import randint
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': [None] + list(range(5, 25, 5)),
    'min_samples_split': randint(2, 11)
}

random_search = RandomizedSearchCV(RandomForestRegressor(), param_dist, n_iter=10, cv=3, scoring = 'neg_mean_squared_error', random_state=42)
random_search.fit(X_train, y_train)

print("Best Params:", random_search.best_params_)
print("Best Score:", -random_search.best_score_)


## **FINAL EVALUATION**

In [None]:
best_model = random_search.best_estimator_
final_preds = best_model.predict(X_test)

print("Final MSE:", mean_squared_error(y_test, final_preds))
print("Final R2 score:", r2_score(y_test, final_preds))