In [29]:
# Update clustering to generate a wider range of genres (8 clusters) and re-run analysis

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler


In [None]:
# Use previous clean dataframe for clustering
features = ['bpm', 'danceability_%', 'valence_%', 'energy_%', 'acousticness_%', 
            'instrumentalness_%', 'liveness_%', 'speechiness_%']


In [None]:
# Ensure features are numeric
for col in features:
    df[col] = pd.to_numeric(df[col], errors='coerce')


In [None]:
# Drop rows with missing values in features
df_clean = df.dropna(subset=features).copy()


In [None]:
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_clean[features])


In [None]:
# Set number of clusters to 8
n_clusters = 8
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
df_clean['genre_cluster'] = kmeans.fit_predict(X_scaled)


In [None]:
# Map cluster numbers to genre names for 8 clusters
genre_names = {
    0: 'Electronic/Dance',
    1: 'Pop/Mainstream',
    2: 'Hip-Hop/Rap',
    3: 'Acoustic/Indie',
    4: 'R&B/Soul',
    5: 'Rock',
    6: 'Country',
    7: 'Alternative/Experimental'
}
df_clean['genre'] = df_clean['genre_cluster'].map(genre_names)


In [None]:
# Create popularity column based on streams (binary, above median)
df_clean['streams'] = pd.to_numeric(df_clean['streams'], errors='coerce')
median_streams = df_clean['streams'].median()
df_clean['popular'] = (df_clean['streams'] > median_streams).astype(int)


In [None]:
# Print distribution of genres
genre_distribution = df_clean['genre'].value_counts().reset_index()
genre_distribution.columns = ['genre', 'count']
print('Genre Distribution:')
print(genre_distribution.head(10))


In [None]:
# Calculate popularity by genre (rate of songs above stream median)
popularity_by_genre = df_clean.groupby('genre')['popular'].mean().reset_index()
popularity_by_genre.rename(columns={'popular': 'popularity_rate'}, inplace=True)
popularity_by_genre = popularity_by_genre.sort_values('popularity_rate', ascending=False)
print('\
Popularity by Genre (rate of songs above stream median):')
print(popularity_by_genre)


In [None]:
# Calculate average streams by genre
avg_streams_by_genre = df_clean.groupby('genre')['streams'].mean().reset_index()
avg_streams_by_genre = avg_streams_by_genre.sort_values('streams', ascending=False)
print('\
Average Streams by Genre:')
print(avg_streams_by_genre)


In [31]:
# Train multinomial logistic regression to predict genre labels
X_genre = df_clean[features]
y_genre = df_clean['genre']

X_genre_train, X_genre_test, y_genre_train, y_genre_test = train_test_split(X_genre, y_genre, test_size=0.2, random_state=42)

model_genre = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model_genre.fit(X_genre_train, y_genre_train)

y_genre_pred = model_genre.predict(X_genre_test)
accuracy_genre = accuracy_score(y_genre_test, y_genre_pred)
report_genre = classification_report(y_genre_test, y_genre_pred)

print('\
Genre Prediction using Multinomial Logistic Regression')
print('Accuracy:')
print(accuracy_genre)
print('Classification Report:')
print(report_genre)

print('done')

Genre Prediction using Multinomial Logistic Regression
Accuracy:
0.9162303664921466
Classification Report:
                          precision    recall  f1-score   support

          Acoustic/Indie       1.00      1.00      1.00         2
Alternative/Experimental       0.95      0.95      0.95        20
                 Country       0.92      0.88      0.90        25
        Electronic/Dance       0.86      1.00      0.92        18
             Hip-Hop/Rap       0.92      0.92      0.92        26
          Pop/Mainstream       0.91      0.83      0.87        35
                R&B/Soul       0.96      0.92      0.94        24
                    Rock       0.91      0.95      0.93        41

                accuracy                           0.92       191
               macro avg       0.93      0.93      0.93       191
            weighted avg       0.92      0.92      0.92       191

done


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
