In [None]:
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:

# Notebook-wide configuration.
# RSEED: seed passed as `random_state` to make_blobs and KMeans so results are repeatable.
RSEED = 42
# C: number of blob centers to generate, also used as KMeans `n_clusters`.
C = 3

In [None]:
# Generate the synthetic clustering dataset.
# make_blobs draws 500 samples with 10 features around C randomly placed
# centers (each coordinate inside center_box); cluster_std controls the
# spread of the points around each center.
# return_centers=True additionally hands back the generated centers so
# they can be reused later in the notebook.
X, y, centroids = make_blobs(
    n_samples=500,
    n_features=10,
    centers=C,
    cluster_std=2,
    center_box=(-10.0, 10.0),
    random_state=RSEED,
    return_centers=True,
)

In [None]:
# Display the cluster centers that make_blobs returned (return_centers=True).
centroids

In [None]:
# Second ("test") dataset drawn around the SAME centers as the first one,
# but with a wider spread (cluster_std=3.5).
# NOTE(review): with an explicit array passed as `centers`, n_features and
# center_box presumably have no effect -- verify against the make_blobs docs.
X1, y1 = make_blobs(
    n_samples=500,
    n_features=10,
    centers=centroids,
    cluster_std=3.5,
    center_box=(-10.0, 10.0),
    random_state=RSEED,
)

In [None]:
# Wrap the test feature matrix in a DataFrame with columns x1 ... x10.
df2 = pd.DataFrame(X1, columns=[f'x{i}' for i in range(1, 11)])

In [None]:
# Write the test features to CSV without the index column.
# NOTE(review): this lands in the current working directory, whereas the
# other artifacts in this notebook go to ../data/ -- confirm this is intended.
df2.to_csv("test_data.csv", index=False)

In [None]:
# Peek at the first two rows of the generated feature matrix.
X[:2]

In [None]:
# Peek at the first 20 blob labels returned by make_blobs.
y[:20]

In [None]:
# Assemble the features (x1 ... x10) and the make_blobs label into one frame.
df = pd.DataFrame(X, columns=[f'x{i}' for i in range(1, 11)])
df['label'] = y

# 2D view of the first two features, coloured by the generated label.
# seaborn returns the Axes it drew on, so the title is set on that object.
true_label_ax = sns.scatterplot(
    data=df,
    x='x1',
    y='x2',
    hue='label',
    palette='winter',
)
true_label_ax.set_title('Customer Clusters')
plt.show()

In [None]:
# Imported but not referenced by name below; presumably kept so that older
# matplotlib versions register the '3d' projection -- confirm before removing.
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter of the first three features, coloured by the generated label.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(1, 1, 1, projection='3d')
scatter = ax.scatter(
    df['x1'],
    df['x2'],
    df['x3'],
    c=df['label'],
    cmap='winter',
    alpha=0.6,
)
ax.set(
    xlabel='x1',
    ylabel='x2',
    zlabel='x3',
    title='first three segmentation features (x1, x2, x3)',
)
# Optional legend, left disabled as in the original cell:
# plt.legend(*scatter.legend_elements(), title="Label")
plt.show()

In [None]:
# Preview the labelled feature frame.
df.head()

In [None]:
# Persist features plus label to CSV (index column omitted).
# Assumes the ../data directory already exists -- TODO confirm.
df.to_csv('../data/customer_clusters.csv', index=False)

In [None]:
# Reload the dataset from disk; `df` now refers to the CSV contents rather
# than the in-memory frame built earlier.
df = pd.read_csv('../data/customer_clusters.csv')
df.head()

In [None]:
# Feature-only copy of the data: everything except the 'label' column.
df_features = df.drop(columns=['label'])

In [None]:
# Persist the unlabelled features as Parquet (index omitted).
# Needs a parquet engine (e.g. pyarrow) installed; assumes ../data exists -- TODO confirm.
df_features.to_parquet('../data/customer_features.parquet', index=False)

In [None]:
# Preview the feature-only frame (no 'label' column).
df_features.head()

In [None]:
# Build the KMeans estimator (not fitted yet): C clusters, seeded with RSEED
# so centroid initialisation is repeatable.
model = KMeans(
    n_clusters=C,
    random_state=RSEED,
)

In [None]:
import pickle

# Serialise the KMeans estimator to disk.
# NOTE(review): in notebook order `model` has only been constructed at this
# point -- fit() runs in a later cell -- so the pickle holds an UNFITTED
# estimator. Confirm that is intended (e.g. a downstream job fits it);
# otherwise move this cell after the fit.
model_path = '../data/kmeans_model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Fit KMeans on the feature columns so it learns C cluster centroids; the
# fitted model can then assign cluster labels to new data.
#
# Later cells add 'new_label' and 'date' to df_features in place. Dropping
# them here keeps this cell safe to re-run: without it, a second run would
# train on the previous cluster assignment and a date column.
# errors='ignore' makes the drop a no-op on a fresh kernel, where those
# columns do not exist yet.
model.fit(df_features.drop(columns=['new_label', 'date'], errors='ignore'))

In [None]:
# Cluster index assigned by the fitted model to each training row.
model.labels_

In [None]:
# Attach the KMeans cluster assignment to the feature frame.
# Note: this mutates df_features in place, so from here on it is no longer
# a pure feature matrix.
df_features['new_label'] = model.labels_

In [None]:
# Rebuild the labelled frame from the in-memory arrays (x1 ... x10 + label).
df = pd.DataFrame(X, columns=[f'x{i}' for i in range(1, 11)])
df['label'] = y

# Same 2D view as before, but coloured by the KMeans assignment stored in
# df_features['new_label'] instead of the generated label.
predicted_label_ax = sns.scatterplot(
    data=df,
    x='x1',
    y='x2',
    hue=df_features['new_label'],
    palette='winter',
)
predicted_label_ax.set_title('Customer Clusters')
plt.show()

In [None]:
# Preview the feature frame, now including the 'new_label' column.
df_features.head()

In [None]:
# Add a 'date' column holding the same fixed timestamp on every row (the
# scalar is broadcast). Presumably a snapshot/event date for downstream
# consumers -- confirm.
df_features['date'] = pd.to_datetime('2025-06-01')

In [None]:
# Preview the feature frame after adding the 'date' column.
df_features.head()

In [None]:
# Build a column order with the last column rotated to the front; 'date' was
# the most recently added column, so it ends up first.
# This only computes and displays the list -- the frame itself is not
# reordered in this cell.
cols = df_features.columns.tolist()
cols = [*cols[-1:], *cols[:-1]]
cols