In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, davies_bouldin_score


In [2]:
import os

# Request 7 OpenMP threads through the process environment.
# NOTE(review): numpy and scikit-learn are imported in an earlier cell; the
# variable may need to be set before those imports to take effect — confirm.
os.environ["OMP_NUM_THREADS"] = "7"

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:



# Read the pre-processed CPA table from disk
data = pd.read_csv(filepath_or_buffer='Input_data/cpa_preprocessed.csv')

# Every column except the target ('NumStorePurchases') is a clustering input
features = data.drop('NumStorePurchases', axis=1)



In [5]:
# # Set default font size for all plots
# plt.rcParams.update({
#     'font.size': 18,
#     'axes.titlesize': 18,
#     'axes.labelsize': 18,
#     'xtick.labelsize': 16,
#     'ytick.labelsize': 16,
#     'legend.fontsize': 18,
#     'figure.titlesize': 22
# })

In [None]:

# Split BEFORE scaling so the scaler's mean/variance come from the training
# rows only. Fitting StandardScaler on the full table and splitting afterwards
# leaks statistics of the held-out rows into the training features.
# The split depends only on the row count and random_state, so the same rows
# land in each partition as before.
X_train_raw, X_test_raw = train_test_split(features, test_size=0.2, random_state=30)

# Standardise: fit on the training split, reuse the fitted transform elsewhere
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full scaled matrix, kept so code that still refers to this name keeps working
normalized_features = scaler.transform(features)


In [7]:

# Sweep candidate cluster counts. For each fitted KMeans model record its
# inertia (for the elbow plot) and the silhouette score of its training labels.
k_values = range(2, 15)
inertia, silhouette_scores = [], []

for k in k_values:
    kmeans = KMeans(n_clusters=k, n_init=15, random_state=30).fit(X_train)
    inertia += [kmeans.inertia_]
    silhouette_scores += [silhouette_score(X_train, kmeans.labels_)]


In [8]:

# Elbow and silhouette curves side by side.
# The figure is created by a single plt.subplots call that carries the figsize.
# Previously a separate plt.figure(figsize=(10, 4)) opened an extra, empty
# figure, and the axes actually drawn on came from plt.subplots at default size.
fig1, axs = plt.subplots(1, 2, figsize=(10, 4))
fig1.suptitle('KMeans Results for CPA Dataset (No Dimensionality Reduction)')

# Left panel: inertia per cluster count (elbow method)
axs[0].plot(k_values, inertia, marker='o')
axs[0].set_xlabel('Number of Clusters')
axs[0].set_ylabel('Inertia')
axs[0].set_title('Elbow Method')

# Right panel: silhouette score per cluster count
axs[1].plot(k_values, silhouette_scores, marker='o', color='red')
axs[1].set_xlabel('Number of Clusters')
axs[1].set_ylabel('Silhouette Score')
axs[1].set_title('Silhouette Score')

# Keep the two panels' labels and the suptitle from overlapping
fig1.tight_layout()
plt.show()






In [9]:


# Choose the cluster count with the highest silhouette score from the sweep.
# (Only the silhouette curve drives this choice; the elbow plot is visual.)
optimal_k = k_values[silhouette_scores.index(max(silhouette_scores))]
print(f'Optimal number of clusters: {optimal_k}')

# Final KMeans fit. n_init matches the sweep (15) so that, with the same
# random_state, this model is the one whose silhouette score selected optimal_k.
# Previously n_init=10 was used here, so the refit could settle on a different,
# possibly worse, solution than the one evaluated during selection.
kmeans = KMeans(n_clusters=optimal_k, random_state=30, n_init=15)
kmeans_labels = kmeans.fit_predict(X_train)

# Expectation Maximization (Gaussian Mixture Model) with the same component count
gmm = GaussianMixture(n_components=optimal_k, random_state=30)
gmm_labels = gmm.fit_predict(X_train)




In [10]:


# Internal validity metrics for the KMeans partition of the training data
kmeans_silhouette, kmeans_db = (
    metric(X_train, kmeans_labels)
    for metric in (silhouette_score, davies_bouldin_score)
)

# Same metrics for the GMM partition, plus the model's log-likelihood score
gmm_silhouette, gmm_db = (
    metric(X_train, gmm_labels)
    for metric in (silhouette_score, davies_bouldin_score)
)
gmm_log_likelihood = gmm.score(X_train)

print(f'KMeans - Silhouette Score: {kmeans_silhouette}, Davies-Bouldin Index: {kmeans_db}')
print(f'GMM - Silhouette Score: {gmm_silhouette}, Davies-Bouldin Index: {gmm_db}, Log-Likelihood: {gmm_log_likelihood}')





In [11]:

# Project the training data onto its first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)

# One panel per clustering method, coloured by the assigned cluster label
fig_2d, (ax_kmeans_2d, ax_gmm_2d) = plt.subplots(1, 2, figsize=(12, 5))

sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=kmeans_labels, palette='viridis', ax=ax_kmeans_2d)
ax_kmeans_2d.set_title('KMeans Clusters (PCA Reduced)')

sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=gmm_labels, palette='coolwarm', ax=ax_gmm_2d)
ax_gmm_2d.set_title('GMM Clusters (PCA Reduced)')

plt.show()




In [12]:

# Project the training data onto its first three principal components
pca_3d = PCA(n_components=3)
X_pca_3d = pca_3d.fit_transform(X_train)

fig = plt.figure(figsize=(12, 5))

# (subplot position, cluster labels, colormap, panel title)
panel_specs = (
    (121, kmeans_labels, 'viridis', 'KMeans Clusters (PCA Reduced)'),
    (122, gmm_labels, 'coolwarm', 'GMM Clusters (PCA Reduced)'),
)
for panel_pos, panel_labels, panel_cmap, panel_title in panel_specs:
    ax = fig.add_subplot(panel_pos, projection='3d')
    # The transposed array unpacks into the x, y and z coordinate columns
    ax.scatter(*X_pca_3d.T, c=panel_labels, cmap=panel_cmap)
    ax.set_title(panel_title)

plt.show()




#### Finding ideal number of clusters using GMM for CPA


In [None]:
# Sweep GMM component counts on the training data. For every fit record the
# BIC, the AIC and the value returned by gmm.score (log-likelihood).
k_values = range(2, 15)
bic_scores, aic_scores, log_likelihoods = [], [], []

for k in k_values:
    gmm = GaussianMixture(n_components=k, random_state=30).fit(X_train)
    bic_scores += [gmm.bic(X_train)]
    aic_scores += [gmm.aic(X_train)]
    log_likelihoods += [gmm.score(X_train)]


## GaussianMixture has no inertia_ attribute, and there is no straightforward silhouette-based criterion for it here,
## so we use the BIC and AIC scores to determine the optimal number of clusters.


    




In [14]:
# Tabulate the GMM sweep: one row per candidate component count
scores_df = pd.DataFrame(
    {
        'Clusters': k_values,
        'BIC': bic_scores,
        'AIC': aic_scores,
        'Log-Likelihood': log_likelihoods,
    }
)
print(scores_df)



In [15]:

# BIC (left) and AIC (right) as a function of the number of GMM components
fig_ic, (ax_bic, ax_aic) = plt.subplots(1, 2, figsize=(10, 4))

ax_bic.plot(k_values, bic_scores, marker='o', color='blue')
ax_bic.set(xlabel='Number of Clusters', ylabel='BIC Score', title='BIC Score for GMM')

ax_aic.plot(k_values, aic_scores, marker='o', color='red')
ax_aic.set(xlabel='Number of Clusters', ylabel='AIC Score', title='AIC Score for GMM')

plt.show()




In [16]:


# Log-likelihood score of each fitted GMM against its component count
fig_ll, ax_ll = plt.subplots(figsize=(6, 4))
ax_ll.plot(k_values, log_likelihoods, marker='o', color='green')
ax_ll.set(xlabel='Number of Clusters', ylabel='Log-Likelihood', title='Log-Likelihood for GMM')
plt.show()



In [17]:

# The component count with the lowest BIC in the sweep is taken as optimal
lowest_bic_idx = int(np.argmin(bic_scores))
optimal_k = k_values[lowest_bic_idx]
print(f'Optimal number of clusters (GMM - BIC): {optimal_k}')

# Refit the Gaussian Mixture Model (Expectation Maximization) with that count
gmm = GaussianMixture(n_components=optimal_k, random_state=30)
gmm_labels = gmm.fit_predict(X_train)

# Score the resulting partition on the training data
gmm_silhouette, gmm_db = (
    metric(X_train, gmm_labels)
    for metric in (silhouette_score, davies_bouldin_score)
)
gmm_log_likelihood = gmm.score(X_train)

print(f'GMM - Silhouette Score: {gmm_silhouette}, Davies-Bouldin Index: {gmm_db}, Log-Likelihood: {gmm_log_likelihood}')




In [34]:

# Two-component PCA projection of the training data, coloured by GMM label
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)

fig_gmm_2d, ax_gmm_only_2d = plt.subplots(figsize=(6, 5))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=gmm_labels, palette='tab20', ax=ax_gmm_only_2d)
ax_gmm_only_2d.set_title('GMM Clusters (PCA Reduced)')
plt.show()




In [35]:

# Three-component PCA projection of the training data, coloured by GMM label
pca_3d = PCA(n_components=3)
X_pca_3d = pca_3d.fit_transform(X_train)

fig = plt.figure(figsize=(6, 5))
ax = fig.add_subplot(1, 1, 1, projection='3d')
# The transposed array unpacks into the x, y and z coordinate columns
ax.scatter(*X_pca_3d.T, c=gmm_labels, cmap='tab20')
ax.set_title('GMM Clusters (PCA Reduced)')

plt.show()




#### Spotify Data


In [20]:
# Read the pre-processed Spotify table from disk
spotify_data = pd.read_csv(filepath_or_buffer='Input_data/spotify_processed.csv')


In [21]:
# Preview the first rows of the loaded Spotify table
spotify_data.head()



In [22]:
# Fill missing 'in_shazam_charts' entries with the column median
shazam_median = spotify_data['in_shazam_charts'].median()
spotify_data['in_shazam_charts'] = spotify_data['in_shazam_charts'].fillna(shazam_median)

# Sanity check: count the missing values that remain in the column
nan_counts_after = spotify_data['in_shazam_charts'].isna().sum()
print(f"NaN values in 'in_shazam_charts' after replacement: {nan_counts_after}")




In [23]:
# Report how many values are missing in 'key' and 'mode' before imputing
print("NaN values before imputation:")
print(spotify_data[['key', 'mode']].isna().sum())

# Replace missing entries with each column's most frequent value
for categorical_col in ('key', 'mode'):
    most_frequent = spotify_data[categorical_col].mode()[0]
    spotify_data[categorical_col] = spotify_data[categorical_col].fillna(most_frequent)

# Report the counts again to confirm nothing is left missing
print("\nNaN values after imputation:")
print(spotify_data[['key', 'mode']].isna().sum())



In [24]:
# Clustering inputs: every column of the Spotify table except 'popularity'
spotify_features = spotify_data.drop('popularity', axis=1)

In [25]:
# One-hot encode 'key' and 'mode'; the first level of each is dropped
spotify_features_encoded = pd.get_dummies(
    spotify_features,
    columns=['key', 'mode'],
    drop_first=True,
)

In [26]:
# Preview the first rows of the one-hot encoded feature table
spotify_features_encoded.head()



In [27]:
# Summary statistics of the encoded feature table
spotify_features_encoded.describe()



In [28]:

# Split BEFORE scaling so the scaler's mean/variance come from the training
# rows only. Fitting StandardScaler on the full table and splitting afterwards
# leaks statistics of the held-out rows into the training features.
# The split depends only on the row count and random_state, so the same rows
# land in each partition as before.
X_train_spotify_raw, X_test_spotify_raw = train_test_split(
    spotify_features_encoded, test_size=0.2, random_state=30
)

# Standardise: fit on the training split, reuse the fitted transform elsewhere.
# The variable name (sic) is kept so existing references keep working.
spotitfy_scalar = StandardScaler()
X_train_spotify = spotitfy_scalar.fit_transform(X_train_spotify_raw)
X_test_spotify = spotitfy_scalar.transform(X_test_spotify_raw)

# Full scaled matrix, kept so code that still refers to this name keeps working.
# NOTE(review): this name is also used for the CPA data earlier in the notebook.
normalized_features = spotitfy_scalar.transform(spotify_features_encoded)


In [29]:

# Empty accumulators for the Spotify KMeans sweep (elbow method and silhouette score)
inertia_spotify, silhouette_scores_spotify = [], []



In [30]:

k_values = range(2, 14)

# Start from empty accumulators every time this cell runs. They used to be
# created only in a separate cell, so re-running this loop appended a second
# set of results, leaving the lists longer than k_values (which breaks the
# plots and the index-to-k lookup that follow).
inertia_spotify = []
silhouette_scores_spotify = []

# For each candidate cluster count record the fitted model's inertia and the
# silhouette score of its training labels.
for k in k_values:
    kmeans_spotify = KMeans(n_clusters=k, random_state=30, n_init=10)
    kmeans_spotify.fit(X_train_spotify)
    inertia_spotify.append(kmeans_spotify.inertia_)
    silhouette_scores_spotify.append(silhouette_score(X_train_spotify, kmeans_spotify.labels_))


In [31]:

# Elbow curve (left) and silhouette curve (right) for the Spotify sweep
fig_spotify_sweep, (ax_elbow, ax_silhouette) = plt.subplots(1, 2, figsize=(10, 4))

ax_elbow.plot(k_values, inertia_spotify, marker='o')
ax_elbow.set(xlabel='Number of Clusters', ylabel='Inertia', title='Elbow Method')

ax_silhouette.plot(k_values, silhouette_scores_spotify, marker='o', color='red')
ax_silhouette.set(xlabel='Number of Clusters', ylabel='Silhouette Score', title='Silhouette Score')

plt.show()

# Choose the cluster count with the highest silhouette score from the sweep
best_silhouette_idx = int(np.argmax(silhouette_scores_spotify))
optimal_k = k_values[best_silhouette_idx]
print(f'Optimal number of clusters: {optimal_k}')

# KMeans with the selected cluster count
kmeans = KMeans(n_clusters=optimal_k, random_state=30, n_init=10)
spotify_kmeans_labels = kmeans.fit_predict(X_train_spotify)

# Gaussian Mixture Model (Expectation Maximization) with the same count
gmm = GaussianMixture(n_components=optimal_k, random_state=30)
spotify_gmm_labels = gmm.fit_predict(X_train_spotify)

# Internal validity metrics for both partitions of the training data
kmeans_silhouette, kmeans_db = (
    metric(X_train_spotify, spotify_kmeans_labels)
    for metric in (silhouette_score, davies_bouldin_score)
)
gmm_silhouette, gmm_db = (
    metric(X_train_spotify, spotify_gmm_labels)
    for metric in (silhouette_score, davies_bouldin_score)
)
gmm_log_likelihood = gmm.score(X_train_spotify)

print(f'KMeans - Silhouette Score: {kmeans_silhouette}, Davies-Bouldin Index: {kmeans_db}')
print(f'GMM - Silhouette Score: {gmm_silhouette}, Davies-Bouldin Index: {gmm_db}, Log-Likelihood: {gmm_log_likelihood}')






In [32]:

# Project the Spotify training data onto its first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train_spotify)

# One panel per clustering method, coloured by the assigned cluster label
fig_spotify_2d, (ax_spotify_kmeans, ax_spotify_gmm) = plt.subplots(1, 2, figsize=(12, 5))

sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=spotify_kmeans_labels, palette='viridis', ax=ax_spotify_kmeans)
ax_spotify_kmeans.set_title('KMeans Clusters (PCA Reduced)')

sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=spotify_gmm_labels, palette='coolwarm', ax=ax_spotify_gmm)
ax_spotify_gmm.set_title('GMM Clusters (PCA Reduced)')

plt.show()

# Project the Spotify training data onto its first three principal components
pca_3d = PCA(n_components=3)
X_pca_3d = pca_3d.fit_transform(X_train_spotify)

fig = plt.figure(figsize=(12, 5))

# (subplot position, cluster labels, colormap, panel title)
spotify_panel_specs = (
    (121, spotify_kmeans_labels, 'viridis', 'KMeans Clusters (PCA Reduced)'),
    (122, spotify_gmm_labels, 'coolwarm', 'GMM Clusters (PCA Reduced)'),
)
for panel_pos, panel_labels, panel_cmap, panel_title in spotify_panel_specs:
    ax = fig.add_subplot(panel_pos, projection='3d')
    # The transposed array unpacks into the x, y and z coordinate columns
    ax.scatter(*X_pca_3d.T, c=panel_labels, cmap=panel_cmap)
    ax.set_title(panel_title)

plt.show()




