In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv("Spotify -Raw Data.csv", encoding='ISO-8859-1')

In [2]:
# Convert 'streams' to numeric (handling non-numeric values)
df["streams"] = pd.to_numeric(df["streams"], errors="coerce")

# Define a hit song as one in the top 25% of streams
threshold = df["streams"].quantile(0.75)
df["hit"] = (df["streams"] >= threshold).astype(int)

In [3]:
# Select relevant numerical features
features = [
    "bpm", "danceability_%", "valence_%", "energy_%", "acousticness_%",
    "instrumentalness_%", "liveness_%", "speechiness_%", "in_spotify_playlists",
    "in_spotify_charts", "in_apple_playlists", "in_apple_charts", "in_deezer_playlists",
    "in_deezer_charts", "in_shazam_charts"
]

# Convert specific columns to numeric and handle missing values
df["in_shazam_charts"] = pd.to_numeric(df["in_shazam_charts"], errors="coerce").fillna(0)
df["in_deezer_playlists"] = pd.to_numeric(df["in_deezer_playlists"], errors="coerce").fillna(0)

# Drop rows with missing values in selected features
df = df.dropna(subset=features)

In [None]:
# We used Random Forest Classification to predict if a song would be a hit.
# Defined a hit song as one in the top 25% of streams.
# Trained the model on features like BPM, Danceability, Energy, and Playlist Count.

In [8]:
# Define X (features) and y (target)
X = df[features]
y = df["hit"]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Feature Importance
importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({"Feature": features, "Importance": importances}).sort_values(by="Importance", ascending=False)

# Display results
print(f"Accuracy:{accuracy}")
print(report)
print(feature_importance_df)


Accuracy:0.8952879581151832
              precision    recall  f1-score   support

           0       0.92      0.95      0.93       148
           1       0.79      0.72      0.76        43

    accuracy                           0.90       191
   macro avg       0.86      0.83      0.84       191
weighted avg       0.89      0.90      0.89       191

                 Feature  Importance
8   in_spotify_playlists    0.359961
12   in_deezer_playlists    0.166207
10    in_apple_playlists    0.165453
11       in_apple_charts    0.042597
9      in_spotify_charts    0.031583
1         danceability_%    0.031240
4         acousticness_%    0.029487
3               energy_%    0.028153
0                    bpm    0.027940
13      in_deezer_charts    0.026694
2              valence_%    0.025386
6             liveness_%    0.022472
14      in_shazam_charts    0.020443
7          speechiness_%    0.018095
5     instrumentalness_%    0.004291


In [7]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-validated Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())


Cross-validated Accuracy Scores: [0.93464052 0.90196078 0.95394737 0.91447368 0.86842105]
Mean CV Accuracy: 0.9146886824905401
