# Spotify Most Streamed Songs Analysis

## Overview

This project analyzes and predicts the number of Spotify charts (`in_spotify_charts`) a song appears in, using various song attributes. It leverages machine learning regression models to identify key factors influencing a song's chart presence.



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset (path is relative to the current working directory)
data = pd.read_csv('spotify_streams.csv')

# Clean data
# Columns excluded from the analysis. 'key' is intentionally NOT dropped:
# it is one-hot encoded further down.
columns_to_drop = [
    'track_name',
    'artist(s)_name',
    'in_apple_charts',
    'in_apple_playlists',
    'in_deezer_playlists',
    'in_deezer_charts',
    'in_shazam_charts',
    'cover_url',
    'streams',
    'in_spotify_playlists',
    'released_year',
    'released_month',
    'released_day',  # Optionally drop if not useful
]
data = data.drop(columns=columns_to_drop)

# Encode categorical variables
# 'mode' becomes a binary flag; any value other than 'Major'/'Minor' maps to NaN.
data['mode'] = data['mode'].map({'Major': 1, 'Minor': 0})
# One-hot encode 'key'; drop_first=True omits the first category's column.
data = pd.get_dummies(data, columns=['key'], drop_first=True)

# Convert boolean to integer for 'key' dummy variables
key_columns = [col for col in data.columns if col.startswith('key_')]
data[key_columns] = data[key_columns].astype(int)

# Data types and non-null counts. DataFrame.info() prints its report itself
# and returns None, so it must not be wrapped in print() (that would emit a
# stray "None" line after the report).
data.info()

# View the first few rows (last expression, so the notebook displays it)
data.head()

# # Summary statistics
# print(data.describe())

# # # Data types and missing values



In [None]:
### Numerical Features Distribution
# Columns treated as continuous numeric inputs; later cells reuse this list.
numeric_features = [
    'artist_count',
    'bpm',
    'danceability_%',
    'valence_%',
    'energy_%',
    'acousticness_%',
    'instrumentalness_%',
    'liveness_%',
    'speechiness_%',
]

# One histogram (with KDE overlay) per feature, laid out on a 5x2 grid.
hist_fig = plt.figure(figsize=(15, 20))
for position, column in enumerate(numeric_features, start=1):
    hist_ax = hist_fig.add_subplot(5, 2, position)
    sns.histplot(data[column], kde=True, bins=30, ax=hist_ax)
    hist_ax.set_title(f'Distribution of {column}')
hist_fig.tight_layout()
plt.show()


In [None]:
### Categorical Features Distribution
# 'mode' was already mapped to 1/0 during cleaning, so it can be counted directly.
plt.figure(figsize=(6, 4))
mode_ax = sns.countplot(data=data, x='mode')
mode_ax.set_title('Distribution of Mode (Major=1, Minor=0)')
mode_ax.set_xlabel('Mode')
mode_ax.set_ylabel('Count')
plt.show()

In [None]:
## 2. Correlation Analysis

# Pairwise correlation matrix over every remaining column.
corr = data.corr()

# Annotated heatmap; each cell shows the coefficient rounded to two decimals.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", ax=heatmap_ax)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Rank every column by its correlation with the target, strongest positive first.
target = 'in_spotify_charts'
target_column = data.corr()[target]
correlations = target_column.sort_values(ascending=False)
print("\nCorrelations with Target Variable:", correlations, sep="\n")

In [None]:
## 3. Determine Data Transformations

# Report how asymmetric the target distribution is.
print(f"\nSkewness of '{target}': {data[target].skew()}")

# An absolute skewness above 1 is treated as "high"; in that case the target is
# replaced in place by log(1 + x), and the skewness is reported again.
needs_log_transform = abs(data[target].skew()) > 1
if needs_log_transform:
    data[target] = np.log1p(data[target])
    print(f"Applied log transformation to '{target}'. New skewness: {data[target].skew()}")

In [None]:
## 4. Handle Outliers

# Boxplot to identify outliers in numerical features (one panel per feature
# on a 5x2 grid).
plt.figure(figsize=(15, 20))
for i, feature in enumerate(numeric_features, 1):
    plt.subplot(5, 2, i)
    sns.boxplot(x=data[feature])
    plt.title(f'Boxplot of {feature}')
plt.tight_layout()
plt.show()

# Outlier treatment: remove rows rather than capping values.
from scipy import stats

# Keep only the rows where every numeric feature has |z-score| < 3.
z_scores = np.abs(stats.zscore(data[numeric_features]))
# .copy() makes the filtered frame independent of the original, so assigning
# to its columns afterwards does not operate on a slice of another DataFrame.
data = data[(z_scores < 3).all(axis=1)].copy()
print(f"\nData shape after removing outliers: {data.shape}")

In [None]:
## 5. Feature Scaling

from sklearn.preprocessing import StandardScaler

# Columns to standardize.
numeric_features_scaled = [
    'artist_count',
    'bpm',
    'danceability_%',
    'valence_%',
    'energy_%',
    'acousticness_%',
    'instrumentalness_%',
    'liveness_%',
    'speechiness_%',
]

# NOTE(review): the scaler is fit on the full dataset, i.e. before the
# train/test split performed in a later cell — confirm this is intended.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(data[numeric_features_scaled])
data[numeric_features_scaled] = standardized_values

# Display scaled features
print("\nScaled Numerical Features:", data[numeric_features_scaled].head(), sep="\n")

In [None]:
## 7. Final Feature Set
from sklearn.model_selection import train_test_split

# Target: the 'in_spotify_charts' column.
y = data['in_spotify_charts']

# Features: every remaining column once the target is removed.
X = data.drop(columns=['in_spotify_charts'])

# Display feature set
print("\nFeature Set (X):", X.head(), sep="\n")

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Modeling

## 1. Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Fit on the training split, then predict the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Evaluate all three metrics against the same predictions.
mse_lr, mae_lr, r2_lr = (
    metric(y_test, y_pred_lr)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("\nLinear Regression:")
print(f"MSE: {mse_lr:.2f}")
print(f"MAE: {mae_lr:.2f}")
print(f"R2 Score: {r2_lr:.2f}")

In [None]:
## 2. Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# 100-tree forest with a fixed seed, trained on the training split.
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Evaluate all three metrics against the same predictions.
mse_rf, mae_rf, r2_rf = (
    metric(y_test, y_pred_rf)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("\nRandom Forest Regressor:")
print(f"MSE: {mse_rf:.2f}")
print(f"MAE: {mae_rf:.2f}")
print(f"R2 Score: {r2_rf:.2f}")

In [None]:
## 3. Hyperparameter Tuning with GridSearchCV
from sklearn.model_selection import GridSearchCV

# Search space: 3 * 4 * 3 * 3 = 108 parameter combinations, each
# cross-validated over 5 folds.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

base_forest = RandomForestRegressor(random_state=42)
grid_rf = GridSearchCV(
    estimator=base_forest,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

print("\nBest Parameters from GridSearchCV:", grid_rf.best_params_)
# The scorer is *negative* MSE, so flip the sign to report a plain MSE.
print("Best Cross-Validation MSE:", -grid_rf.best_score_)

# Predict the held-out rows with the best estimator found by the search.
best_rf = grid_rf.best_estimator_
y_pred_best_rf = best_rf.predict(X_test)

# Evaluate all three metrics against the same predictions.
mse_best_rf, mae_best_rf, r2_best_rf = (
    metric(y_test, y_pred_best_rf)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("\nTuned Random Forest Regressor:")
print(f"MSE: {mse_best_rf:.2f}")
print(f"MAE: {mae_best_rf:.2f}")
print(f"R2 Score: {r2_best_rf:.2f}")

In [None]:
## 5. Compare All Models
# One row per model: (name, MSE, MAE, R2) measured on the held-out test split.
performance_rows = [
    ('Linear Regression', mse_lr, mae_lr, r2_lr),
    ('Random Forest', mse_rf, mae_rf, r2_rf),
    ('Tuned Random Forest', mse_best_rf, mae_best_rf, r2_best_rf),
]
model_performance = pd.DataFrame(
    performance_rows,
    columns=['Model', 'MSE', 'MAE', 'R2 Score'],
)

print("\nModel Performance Comparison:")
print(model_performance)


In [None]:
## 6. Feature Importance Analysis

### 6.1 Tuned Random Forest
# Label the tuned forest's importances with the feature names and order them
# from most to least important.
importances_best = best_rf.feature_importances_
feature_importances_best = pd.Series(importances_best, index=X.columns)
feature_importances_best = feature_importances_best.sort_values(ascending=False)

# Horizontal bars: importance on the x-axis, feature names on the y-axis.
importance_fig, importance_ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    x=feature_importances_best,
    y=feature_importances_best.index,
    ax=importance_ax,
)
importance_ax.set_title('Tuned Random Forest Feature Importances')
importance_ax.set_xlabel('Importance Score')
importance_ax.set_ylabel('Features')
plt.show()

## Conclusions

In this project, we aimed to predict the number of Spotify charts a song appears in using three regression models: Linear Regression, Random Forest Regressor, and Tuned Random Forest Regressor. The performance of each model is summarized below:

| Model                | MSE      | MAE      | R² Score |
|----------------------|----------|----------|----------|
| Linear Regression    | 2.1389   | 1.2992   | 0.0304   |
| Random Forest        | 2.3009   | 1.3524   | -0.0430  |
| Tuned Random Forest  | 2.1953   | 1.3249   | 0.0049   |

### Key Findings

- **Linear Regression** achieved an R² score of **0.0304**, indicating it explains approximately **3%** of the variance in the target variable. While slightly better than a baseline that always predicts the mean (R² = 0), the model's predictive power is minimal.

- **Random Forest Regressor** resulted in a negative R² score of **-0.0430**, suggesting that it performs worse than a simple mean predictor. This may be due to overfitting or irrelevant feature inclusion.

- **Tuned Random Forest Regressor** showed a marginal improvement over the untuned Random Forest, with an R² score of **0.0049**; this is still below Linear Regression and indicates poor performance and negligible explanatory power.

### Conclusion

The regression models developed in this study exhibit limited effectiveness in predicting the number of Spotify charts a song appears in. The low and negative R² scores across models suggest that the current feature set and model configurations are insufficient to capture the underlying factors influencing chart presence.
