# Creating Cohorts of Songs – Course-end Project 2
**Simplilearn AI & ML PGP | Spotify Rolling Stones Dataset**

**Objective:** Perform exploratory data analysis and cluster analysis to create cohorts of songs and understand the factors that define each cohort for better song recommendations.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import warnings
# Suppress every warning for the rest of the session so cell output stays compact.
# NOTE(review): this also hides any deprecation or numerical warnings emitted by
# the libraries imported above - remove this line when debugging.
warnings.filterwarnings('ignore')
# IPython magic: draw matplotlib figures directly beneath the cell that creates them.
%matplotlib inline
# Apply seaborn's white-grid theme to all plots created from here on.
sns.set_style('whitegrid')
print("Libraries loaded successfully.")

## 1. Load Data & Data Dictionary
Load the Rolling Stones Spotify dataset and (optionally) the data dictionary for column definitions.

In [None]:
# Read the track table from the CSV file in the working directory.
df = pd.read_csv('rolling_stones_spotify.csv')

# When the first column has an empty or 'Unnamed...' header (typically an index
# that was written out with the data), discard it so only real fields remain.
leading_col = df.columns[0]
has_index_artifact = leading_col == '' or leading_col.startswith('Unnamed')
if has_index_artifact:
    df = df.iloc[:, 1:]

print("Dataset shape:", df.shape)
print("\nColumns:", list(df.columns))
# Last expression of the cell: the notebook renders the first rows as a table.
df.head()

In [None]:
# Show the data dictionary (column definitions) when the workbook can be read.
# Nothing later in this notebook section needs it, so any failure is reported
# and replaced by a short reminder of the feature names instead of stopping.
try:
    data_dict = pd.read_excel('Data Dictionary - Creating cohorts of songs.xlsx')
    print("Data Dictionary - Creating cohorts of songs:")
    display(data_dict)
except Exception as load_error:
    print("Data dictionary not loaded:", load_error)
    # Fallback: list the feature names on two lines.
    print(
        "Spotify audio features: acousticness, danceability, energy, instrumentalness,",
        "liveness, loudness, speechiness, tempo, valence, popularity, duration_ms",
        sep="\n",
    )

## 2. Data Quality Checks
Check for missing values and basic statistics of numeric features used for clustering.

In [None]:
# Missing-value audit: count nulls per column once and reuse the result for
# both the per-column listing and the grand total.
missing_counts = df.isnull().sum()
print("Missing values per column:")
print(missing_counts)
print("\nTotal missing:", missing_counts.sum())
print("\nData types:\n", df.dtypes)

# Columns used as clustering inputs (audio features plus popularity and
# track duration).
feature_cols = [
    'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness',
    'loudness', 'speechiness', 'tempo', 'valence', 'popularity', 'duration_ms',
]
# Work on an independent copy so later steps never touch the raw frame.
df_numeric = df.loc[:, feature_cols].copy()

print("\nBasic statistics of clustering features:")
# Last expression of the cell: the notebook renders the summary table.
df_numeric.describe()

## 3. Exploratory Data Analysis
Understand distributions and relationships among audio features that will define song cohorts.

In [None]:
# Pairwise correlation between all clustering features, drawn as an annotated
# heatmap on a diverging palette centred at zero.
corr = df_numeric.corr()
heatmap_ax = plt.figure(figsize=(10, 8)).gca()
sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation between song audio features')
plt.tight_layout()
plt.show()

In [None]:
# Distributions of key features: one histogram panel per clustering feature.
# The grid is sized from len(feature_cols) rather than a fixed 3x4 layout, so
# adding or removing features neither runs out of panels nor leaves empty
# framed axes. With the current 11 features this is still 3 rows x 4 columns
# at the original 14x10 figure size.
n_cols = 4
n_rows = max(1, -(-len(feature_cols) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(
    n_rows,
    n_cols,
    figsize=(14, 10 * n_rows / 3),  # keep the original height-per-row ratio
    squeeze=False,  # always a 2-D array, even for a single row
)
axes = axes.flatten()
for ax, col in zip(axes, feature_cols):
    ax.hist(df_numeric[col].dropna(), bins=40, edgecolor='black', alpha=0.7)
    ax.set_title(col)
    ax.set_ylabel('Count')
# Hide every panel that did not receive a feature (not just the last one).
for ax in axes[len(feature_cols):]:
    ax.axis('off')
plt.suptitle('Distribution of song features', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise scatter/KDE grid for a handful of features. At most 400 rows are
# drawn (fixed seed) to keep rendering fast.
pair_features = ['energy', 'danceability', 'acousticness', 'valence', 'popularity']
sample_size = min(400, len(df_numeric))
sample = df_numeric.sample(sample_size, random_state=42)
sns.pairplot(sample[pair_features], diag_kind='kde')
plt.suptitle('Pairwise relationships (sample)', y=1.02)
plt.show()

## 4. Cluster Analysis – Creating Cohorts of Songs
Scale the features and apply K-Means. Use the elbow method and the silhouette score to choose the number of cohorts (k).

In [None]:
# Keep only complete rows, report how many were discarded, then standardise
# each feature with StandardScaler before clustering.
X = df_numeric.dropna()
n_dropped = len(df_numeric) - len(X)
if n_dropped > 0:
    print("Dropped rows with missing values:", n_dropped)

scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)
print("Scaled feature matrix shape:", X_scaled.shape)

In [None]:
# Sweep k = 2..11: fit K-Means for each k and record the inertia (for the
# elbow plot) and the silhouette score.
K_range = range(2, 12)
inertias = []
silhouettes = []
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouettes.append(silhouette_score(X_scaled, kmeans.labels_))

# Two side-by-side panels sharing the same x-axis meaning; each tuple is
# (values, line format, y-axis label, panel title).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = [
    (inertias, 'bo-', 'Inertia', 'Elbow method'),
    (silhouettes, 'go-', 'Silhouette score', 'Silhouette score'),
]
for ax, (values, line_fmt, y_label, panel_title) in zip(axes, panels):
    ax.plot(K_range, values, line_fmt)
    ax.set_xlabel('Number of clusters (k)')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
plt.tight_layout()
plt.show()

# Text summary of the silhouette curve, rounded to three decimals.
rounded_scores = {n: round(s, 3) for n, s in zip(K_range, silhouettes)}
print("Silhouette scores:", rounded_scores)

In [None]:
# Final model. k is chosen by hand (5 gives interpretable cohorts); adjust it
# after inspecting the elbow and silhouette plots.
n_clusters = 5
kmeans_final = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
kmeans_final.fit(X_scaled)
cluster_labels = kmeans_final.labels_

# Copy of the unscaled feature rows with the assigned cohort appended, so the
# cohorts can be interpreted in original units.
df_clustered = X.assign(cohort=cluster_labels)

cohort_sizes = df_clustered['cohort'].value_counts().sort_index()
final_silhouette = round(silhouette_score(X_scaled, cluster_labels), 3)
print("Cohort (cluster) sizes:")
print(cohort_sizes)
print("\nSilhouette score for k=%d:" % n_clusters, final_silhouette)

## 5. Cohort Profiles – Factors That Define Each Cohort
Compare mean feature values across cohorts to interpret what type of songs each cohort represents.

In [None]:
# Per-cohort profile: the mean of every clustering feature within each cohort.
cohort_groups = df_clustered.groupby('cohort')
cohort_profiles = cohort_groups[feature_cols].mean()
# Last expression of the cell: the notebook renders the rounded table.
cohort_profiles.round(3)

In [None]:
# Bar chart of cohort profiles. Each feature is min-max rescaled across cohorts
# (0 = lowest cohort mean, 1 = highest) so features with different units can
# share one axis. A feature whose cohort means are all equal is left as-is to
# avoid dividing by zero.
cohort_profiles_plot = cohort_profiles.copy()
for feature, means in cohort_profiles.items():
    lowest, highest = means.min(), means.max()
    if highest > lowest:
        cohort_profiles_plot[feature] = (means - lowest) / (highest - lowest)

profile_ax = cohort_profiles_plot.plot.bar(figsize=(12, 5), rot=0)
profile_ax.set_title('Cohort profiles (normalized feature means)')
profile_ax.set_xlabel('Cohort')
profile_ax.set_ylabel('Normalized value')
# Place the legend outside the plotting area so it does not cover any bars.
profile_ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# 2-D view of the cohorts on two features (energy vs acousticness), with one
# scatter series per cohort so each gets its own colour and legend entry.
df_clustered['cohort'] = cluster_labels  # re-attach labels in case the column was changed
scatter_ax = plt.figure(figsize=(8, 6)).gca()
for i in range(n_clusters):
    in_cohort = df_clustered['cohort'].eq(i)
    scatter_ax.scatter(
        df_clustered.loc[in_cohort, 'energy'],
        df_clustered.loc[in_cohort, 'acousticness'],
        label=f'Cohort {i}',
        alpha=0.6,
    )
scatter_ax.set_xlabel('Energy')
scatter_ax.set_ylabel('Acousticness')
scatter_ax.set_title('Song cohorts in Energy–Acousticness space')
scatter_ax.legend()
plt.tight_layout()
plt.show()

## 6. Summary – Factors That Create Cohorts of Songs
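Based on the EDA and cluster analysis, the cell below lists, for each cohort, the three features that deviate most from the overall mean: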

In [None]:
# Summary: for each cohort, the three features whose cohort mean deviates most
# from the overall mean.
#
# Features live on very different scales (duration_ms is in the hundreds of
# thousands, acousticness is in [0, 1]), so ranking by the raw difference would
# always pick the large-unit columns. The ranking therefore uses the difference
# expressed in overall standard deviations; the raw difference is still printed
# for reference, followed by the standardised value.
overall_mean = df_clustered[feature_cols].mean()
overall_std = df_clustered[feature_cols].std()
print("Factors that create distinct song cohorts (vs overall mean):\n")
for cohort_id in range(n_clusters):
    raw_diff = cohort_profiles.loc[cohort_id] - overall_mean
    # A constant feature has std 0 and yields NaN here; sort_values places NaN
    # last, so such a feature is never reported as a differentiator.
    std_diff = raw_diff / overall_std
    top_features = std_diff.abs().sort_values(ascending=False).index[:3]
    described = [
        "%s=%.2f (%+.2f sd)" % (name, raw_diff[name], std_diff[name])
        for name in top_features
    ]
    print("Cohort %d: " % cohort_id, " | ".join(described))
print("\nInterpretation: Cohorts are driven by energy, acousticness, danceability, valence,")
print("instrumentalness, tempo, and popularity. Use these profiles to recommend similar songs within each cohort.")