In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

df=pd.read_csv("Spotify -Raw Data.csv",encoding='ISO-8859-1')
df["streams"] = pd.to_numeric(df["streams"], errors="coerce")
df = df.dropna()

threshold = df["streams"].quantile(0.80)  # Top 10% most streamed songs
df["popular"] = (df["streams"] >= threshold).astype(int) 

features = ["bpm", "danceability_%", "valence_%", "energy_%", "acousticness_%", "instrumentalness_%"]
X = df[features]
y = df["popular"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

print("Full dataset distribution:\n", df["popular"].value_counts())
print("Train set distribution:\n", y_train.value_counts())
print("Test set distribution:\n", y_test.value_counts())
print(df["popular"].value_counts())

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred,zero_division=1))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Full dataset distribution:
 popular
0    652
1    164
Name: count, dtype: int64
Train set distribution:
 popular
0    521
1    131
Name: count, dtype: int64
Test set distribution:
 popular
0    131
1     33
Name: count, dtype: int64
popular
0    652
1    164
Name: count, dtype: int64
Accuracy: 0.80

Classification Report:
               precision    recall  f1-score   support

           0       0.80      1.00      0.89       131
           1       1.00      0.00      0.00        33

    accuracy                           0.80       164
   macro avg       0.90      0.50      0.44       164
weighted avg       0.84      0.80      0.71       164


Confusion Matrix:
 [[131   0]
 [ 33   0]]


In [6]:
df.head(15)

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%,popular
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381700.0,43,...,B,Major,80,89,83,31,0,8,4,0
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716300.0,48,...,C#,Major,71,61,74,7,0,10,4,0
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140004000.0,94,...,F,Major,51,32,53,17,0,31,6,0
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840800.0,116,...,A,Major,55,58,72,11,0,11,15,0
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236300.0,84,...,A,Minor,65,23,80,14,63,11,6,0
5,Sprinter,"Dave, Central Cee",2,2023,6,1,2186,91,183706200.0,67,...,C#,Major,92,66,58,19,0,8,24,0
6,Ella Baila Sola,"Eslabon Armado, Peso Pluma",2,2023,3,16,3090,50,725980100.0,34,...,F,Minor,67,83,76,48,0,8,3,0
7,Columbia,Quevedo,1,2023,7,7,714,43,58149380.0,25,...,F,Major,67,26,71,37,0,11,4,0
8,fukumean,Gunna,1,2023,5,15,1096,83,95217320.0,60,...,C#,Minor,85,22,62,12,0,28,9,0
9,La Bebe - Remix,"Peso Pluma, Yng Lvcas",2,2023,3,17,2953,44,553634100.0,49,...,D,Minor,81,56,48,21,0,8,33,0
