In [3]:
import os

# OMP_NUM_THREADS has to be in the environment BEFORE numpy / scikit-learn are
# imported: the OpenMP/BLAS runtimes pick it up when they are loaded, so setting
# it after the imports (as this cell used to do) does not limit the threads.
# NOTE: if the kernel has already imported numpy or sklearn, restart it first.
os.environ["OMP_NUM_THREADS"] = "1"

import warnings
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from tabulate import tabulate


# Suppress only the KMeans memory-leak UserWarning (matched by the start of its
# message) instead of every UserWarning raised anywhere in the kernel.
warnings.filterwarnings(
    "ignore",
    message="KMeans is known to have a memory leak",
    category=UserWarning,
)

# Generate a sample dataset with two features and three clusters
np.random.seed(42)
data = np.concatenate([
    np.random.normal(loc=[3, 3], scale=1, size=(100, 2)),
    np.random.normal(loc=[8, 8], scale=1, size=(100, 2)),
    np.random.normal(loc=[6, 1], scale=1, size=(100, 2))
])

# Create a DataFrame for Plotly
df = pd.DataFrame(data, columns=['Feature_1', 'Feature_2'])

# Step 1: Choosing the number of clusters (K)
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, n_init=10, init='random', random_state=42)

# Step 2: Fitting the model.
# fit() runs the algorithm to convergence, so `cluster_centers_` holds the
# FINAL (fitted) centroids, not the randomly drawn starting positions.
fitted_centroids = kmeans.fit(df).cluster_centers_

# Kept under the original name because a later cell displays `initial_centroids`;
# the values are the fitted centroids described above.
initial_centroids = fitted_centroids

# Create a DataFrame for the fitted centroids
centroids_df = pd.DataFrame(fitted_centroids, columns=['Feature_1', 'Feature_2'])

# Plot the 2D scatter plot of the raw points
fig_2d = px.scatter(df, x='Feature_1', y='Feature_2', opacity=0.8, title='K-Means Clustering (2D)', size_max=10)
fig_2d.update_traces(marker=dict(symbol='circle'))

# Overlay the fitted centroids on the 2D scatter plot
fig_2d.add_trace(go.Scatter(
    x=centroids_df['Feature_1'],
    y=centroids_df['Feature_2'],
    mode='markers',
    marker=dict(color='red', symbol='diamond', size=10),
    name='Fitted Centroids'
))

# Show the 2D plot
fig_2d.show()


In [4]:
# Display the centroid array computed in the previous cell. Despite its name, it
# was read from `kmeans.fit(df).cluster_centers_`, i.e. from the fitted model.
initial_centroids

array([[8.12824872, 8.04348765],
       [5.98394844, 0.83859771],
       [2.91679303, 3.02541557]])

In [12]:
import os

# OMP_NUM_THREADS has to be in the environment BEFORE numpy / scikit-learn are
# imported: the OpenMP/BLAS runtimes pick it up when they are loaded, so setting
# it after the imports (as this cell used to do) does not limit the threads.
# NOTE: if the kernel has already imported numpy or sklearn, restart it first.
os.environ["OMP_NUM_THREADS"] = "1"

import warnings
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from tabulate import tabulate

# Suppress only the KMeans memory-leak UserWarning (matched by the start of its
# message) instead of every UserWarning raised anywhere in the kernel.
warnings.filterwarnings(
    "ignore",
    message="KMeans is known to have a memory leak",
    category=UserWarning,
)

# Generating synthetic customer data: 100 rows x 5 features, uniform in [0, 100)
np.random.seed(42)
customer_features = np.random.rand(100, 5) * 100

# Choosing the number of clusters (K=3)
k = 3

# Applying K-means clustering on the raw feature matrix
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(customer_features)

# Cluster labels as a single-column DataFrame
clusters_df = pd.DataFrame({'Cluster': cluster_labels})

# Feature columns (labelled 0..4) followed by the assigned cluster.
# The raw ndarray stays available as `customer_features`; `customer_data` is
# only ever the labelled DataFrame, so the name no longer changes type mid-cell.
customer_data = pd.concat([pd.DataFrame(customer_features), clusters_df], axis=1)

# Displaying the first few rows with assigned clusters
print(tabulate(customer_data.head(), headers='keys', tablefmt='psql'))


+----+----------+----------+---------+---------+---------+-----------+
|    |        0 |        1 |       2 |       3 |       4 |   Cluster |
|----+----------+----------+---------+---------+---------+-----------|
|  0 | 37.454   | 95.0714  | 73.1994 | 59.8658 | 15.6019 |         2 |
|  1 | 15.5995  |  5.80836 | 86.6176 | 60.1115 | 70.8073 |         0 |
|  2 |  2.05845 | 96.991   | 83.2443 | 21.2339 | 18.1825 |         2 |
|  3 | 18.3405  | 30.4242  | 52.4756 | 43.1945 | 29.1229 |         2 |
|  4 | 61.1853  | 13.9494  | 29.2145 | 36.6362 | 45.607  |         1 |
+----+----------+----------+---------+---------+---------+-----------+
