In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
data = pd.read_csv('spotify_streams.csv')

# Clean data
# Columns removed before modelling. 'key' is deliberately kept: it is
# one-hot encoded further down.
columns_to_drop = [
    'track_name',
    'artist(s)_name',
    'in_apple_charts',
    'in_apple_playlists',
    'in_deezer_playlists',
    'in_deezer_charts',
    'in_shazam_charts',
    'cover_url',
    'streams',
    'in_spotify_playlists',
    'released_year',
    'released_month',
    'released_day',  # Optionally drop if not useful
]
data = data.drop(columns=columns_to_drop)

# 'streams' is dropped above, so it needs no comma-stripping / numeric conversion.

# Give missing 'key' values their own explicit category. Without this,
# get_dummies(drop_first=True) encodes a missing key as all-zero dummies,
# which is indistinguishable from the dropped reference key.
data['key'] = data['key'].fillna('Unknown')

# Encode categorical variables
# Any 'mode' value other than 'Major'/'Minor' maps to NaN and will show up
# in the missing-value report printed below.
data['mode'] = data['mode'].map({'Major': 1, 'Minor': 0})

# dtype=int yields 0/1 integer dummies directly (no separate bool -> int cast).
data = pd.get_dummies(data, columns=['key'], drop_first=True, dtype=int)

# Names of the generated one-hot columns for 'key'
key_columns = [col for col in data.columns if col.startswith('key_')]

# Check for missing values
print(data.isnull().sum())

# View the first few rows
print(data.head())

# Data types and non-null counts.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
data.info()



In [None]:
# Histograms with a KDE overlay for 'in_spotify_charts' and 'bpm'.
# Each spec is (column, title, x-axis label, y-axis label).
histogram_specs = [
    (
        'in_spotify_charts',
        'Distribution of Songs in Spotify Charts',
        'Number of Spotify Charts',
        'Frequency (Number of Songs)',
    ),
    (
        'bpm',
        'Distribution of BPM',
        'Beats Per Minute (BPM)',
        'Frequency',
    ),
]

for column, title, x_label, y_label in histogram_specs:
    # One standalone 8x6 figure per column
    plt.figure(figsize=(8,6))
    ax = sns.histplot(data[column], kde=True)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    plt.show()


In [None]:
# Correlation heatmap of the numeric columns.

# Restrict to numeric dtypes so the correlation matrix is computed only over
# columns for which it is defined.
numeric_data = data.select_dtypes(include=[np.number])

# Pairwise correlation matrix (pandas default method)
corr = numeric_data.corr()

# Plot the heatmap, annotating each cell with the coefficient to 2 decimals
plt.figure(figsize=(12,10))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
# Must be *called*: a bare `plt.show` only evaluates to the function object
# and leaves its repr as the cell output.
plt.show()


In [None]:
# Target definition, train/test split, and feature scaling.
#
# The split is done BEFORE scaling so that the scaler's mean/std are learned
# from the training rows only; fitting it on the full dataset would leak
# test-set statistics into preprocessing.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Define target variable
y = data['in_spotify_charts']

# Check skewness and apply log transformation if needed
target_skew = y.skew()
print(f"Skewness of target: {target_skew}")

if abs(target_skew) > 1:
    # NOTE: once this branch runs, y (and therefore every MSE/MAE/R2 computed
    # from y_test) is on the log1p scale, not in raw chart counts.
    y = np.log1p(y)  # Log transform

# Define feature set (drop target)
X = data.drop(['in_spotify_charts'], axis=1)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature Scaling
numeric_features = ['artist_count', 'bpm', 'danceability_%',
                    'valence_%', 'energy_%', 'acousticness_%',
                    'instrumentalness_%', 'liveness_%', 'speechiness_%']

scaler = StandardScaler()

# Work on explicit copies: `data` and `X` stay unscaled, which keeps this cell
# safe to re-run and avoids chained-assignment warnings on the split frames.
X_train = X_train.copy()
X_test = X_test.copy()

# Learn mean/std on the training rows, then apply the same transform to both.
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])


In [None]:
# Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Fit on the training split, then predict the held-out split.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Held-out metrics, computed in the order MSE, MAE, R2.
mse_lr, mae_lr, r2_lr = (
    metric(y_test, y_pred_lr)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# One line per metric, rounded to two decimals.
print(
    "Linear Regression:",
    f"MSE: {mse_lr:.2f}",
    f"MAE: {mae_lr:.2f}",
    f"R2 Score: {r2_lr:.2f}",
    sep="\n",
)

In [None]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# 100-tree forest with a fixed seed; fit on train, predict on test.
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Held-out metrics, computed in the order MSE, MAE, R2.
mse_rf, mae_rf, r2_rf = (
    metric(y_test, y_pred_rf)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# One line per metric, rounded to two decimals.
print(
    "Random Forest Regressor:",
    f"MSE: {mse_rf:.2f}",
    f"MAE: {mae_rf:.2f}",
    f"R2 Score: {r2_rf:.2f}",
    sep="\n",
)

In [None]:
# Hyperparameter Tuning with GridSearchCV
from sklearn.model_selection import GridSearchCV

# Exhaustive grid: 3 * 4 * 3 * 3 = 108 parameter combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold CV scored by negated MSE (higher is better), using all CPU cores.
grid_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

print("Best Parameters:", grid_rf.best_params_)
# best_score_ is the negated MSE, so flip the sign back for reporting.
print("Best Cross-Validation MSE:", -grid_rf.best_score_)

# Evaluate the best estimator found by the search on the held-out split.
best_rf = grid_rf.best_estimator_
y_pred_best_rf = best_rf.predict(X_test)

mse_best_rf, mae_best_rf, r2_best_rf = (
    metric(y_test, y_pred_best_rf)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    "Tuned Random Forest Regressor:",
    f"MSE: {mse_best_rf:.2f}",
    f"MAE: {mae_best_rf:.2f}",
    f"R2 Score: {r2_best_rf:.2f}",
    sep="\n",
)

In [None]:
# XGBoost Regressor
from xgboost import XGBRegressor

# Squared-error objective with a fixed seed; fit on train, predict on test.
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# Held-out metrics, computed in the order MSE, MAE, R2.
mse_xgb, mae_xgb, r2_xgb = (
    metric(y_test, y_pred_xgb)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# One line per metric, rounded to two decimals.
print(
    "XGBoost Regressor:",
    f"MSE: {mse_xgb:.2f}",
    f"MAE: {mae_xgb:.2f}",
    f"R2 Score: {r2_xgb:.2f}",
    sep="\n",
)

In [None]:
# Compare Models
# One row per fitted model: (name, MSE, MAE, R2) on the held-out split.
model_performance = pd.DataFrame(
    [
        ('Linear Regression', mse_lr, mae_lr, r2_lr),
        ('Random Forest', mse_rf, mae_rf, r2_rf),
        ('Tuned Random Forest', mse_best_rf, mae_best_rf, r2_best_rf),
        ('XGBoost Regressor', mse_xgb, mae_xgb, r2_xgb),
    ],
    columns=['Model', 'MSE', 'MAE', 'R2 Score'],
)

print(model_performance)

In [None]:
# Feature Importances for Tuned Random Forest
importances_best = best_rf.feature_importances_

# Label each importance with its feature name and rank from most to least important.
feature_importances_best = (
    pd.Series(importances_best, index=X.columns)
    .sort_values(ascending=False)
)

# Horizontal bar chart: importance on x, feature name on y.
plt.figure(figsize=(12,8))
ax = sns.barplot(x=feature_importances_best, y=feature_importances_best.index)
ax.set_title('Tuned Random Forest Feature Importances')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Features')
plt.show()

In [None]:
# Feature Importances for XGBoost
importances_xgb = xgb.feature_importances_

# Label each importance with its feature name and rank from most to least important.
feature_importances_xgb = (
    pd.Series(importances_xgb, index=X.columns)
    .sort_values(ascending=False)
)

# Horizontal bar chart: importance on x, feature name on y.
plt.figure(figsize=(12,8))
ax = sns.barplot(x=feature_importances_xgb, y=feature_importances_xgb.index)
ax.set_title('XGBoost Feature Importances')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Features')
plt.show()