In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from tabulate import tabulate
import csv

In [28]:
data_link_path = 'https://media.githubusercontent.com/media/npedamalla/sample-csv-files/main/files/customers/customers-100.csv'

In [29]:
df = pd.read_csv(data_link_path, delimiter=',', quoting=csv.QUOTE_MINIMAL)

# Now you can work with the DataFrame 'df'
#print(tabulate(df, headers='keys', tablefmt='psql'))

In [31]:
#print(tabulate(df.tail(), headers='keys', tablefmt='psql'))

In [34]:
print(tabulate(df.info(), headers='keys', tablefmt='psql'))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Index              100 non-null    int64 
 1   Customer Id        100 non-null    object
 2   First Name         100 non-null    object
 3   Last Name          100 non-null    object
 4   Company            100 non-null    object
 5   City               100 non-null    object
 6   Country            100 non-null    object
 7   Phone 1            100 non-null    object
 8   Phone 2            100 non-null    object
 9   Email              100 non-null    object
 10  Subscription Date  100 non-null    object
 11  Website            100 non-null    object
dtypes: int64(1), object(11)
memory usage: 9.5+ KB



In [94]:
import os
import warnings
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from tabulate import tabulate




In [103]:
# Set the environment variable OMP_NUM_THREADS to 1
os.environ["OMP_NUM_THREADS"] = "1"

# Suppress UserWarning related to KMeans memory leak
warnings.filterwarnings("ignore", category=UserWarning)

# Generate a sample dataset with six features and three clusters
data, _ = make_blobs(n_samples=300, n_features=6, centers=3, random_state=42)

# Ensure all values are positive
data[data < 0] = 0

# Set the desired range for positive whole numbers
low_range = 5
high_range = 99
data = np.random.randint(low_range, high_range + 1, size=data.shape)


In [115]:
# Apply KMeans clustering with k=3 (number of centers)
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(data)

centroids = kmeans.cluster_centers_

# Add cluster labels to the original dataset
data_with_labels = np.column_stack((data, kmeans.labels_))

# Create a DataFrame for Plotly
column_names = [f'Feature_{i}' for i in range(1, 7)] + ['Cluster']
df = pd.DataFrame(data_with_labels, columns=column_names)

# Create a DataFrame for centroids
centroids_df = pd.DataFrame(centroids, columns=column_names[:-1])

df_sorted = df.sort_values(by='Cluster')


print(tabulate(centroids_df, headers='keys', tablefmt='psql'))

print(tabulate(df_sorted.head(), headers='keys', tablefmt='psql'))

+----+-------------+-------------+-------------+-------------+-------------+-------------+
|    |   Feature_1 |   Feature_2 |   Feature_3 |   Feature_4 |   Feature_5 |   Feature_6 |
|----+-------------+-------------+-------------+-------------+-------------+-------------|
|  0 |     30.9423 |     55.2308 |     62.8654 |     43.0288 |     72.5481 |     45.5769 |
|  1 |     58.4688 |     55.5833 |     35.5313 |     53.2812 |     40.3021 |     27.3021 |
|  2 |     62.33   |     43.95   |     51.31   |     51.66   |     42.5    |     78.11   |
+----+-------------+-------------+-------------+-------------+-------------+-------------+
+-----+-------------+-------------+-------------+-------------+-------------+-------------+-----------+
|     |   Feature_1 |   Feature_2 |   Feature_3 |   Feature_4 |   Feature_5 |   Feature_6 |   Cluster |
|-----+-------------+-------------+-------------+-------------+-------------+-------------+-----------|
| 149 |          24 |          20 |          89 |  

In [109]:
# Plot the 3D scatter plot with centroids
fig_3d = px.scatter_3d(df, x='Feature_1', y='Feature_2', z='Feature_3',
                       color='Cluster', opacity=0.8,
                       title='K-Means Clustering (3D)', size_max=10)

# Set the symbol for all data points to 'circle'
fig_3d.update_traces(marker=dict(symbol='circle'))

# Add centroids to the 3D scatter plot
fig_3d.add_trace(go.Scatter3d(
    x=centroids_df['Feature_1'],
    y=centroids_df['Feature_2'],
    z=centroids_df['Feature_3'],
    mode='markers',
    marker=dict(color='red', symbol='diamond', size=10),
    name='Centroids'
))

# Show the 3D plot
fig_3d.show()

In [110]:
# Plot the 2D scatter plot with centroids
fig_2d = px.scatter(df, x='Feature_1', y='Feature_2',
                    color='Cluster', opacity=0.8,
                    title='K-Means Clustering (2D)', size_max=10)

# Set the symbol for all data points to 'circle'
fig_2d.update_traces(marker=dict(symbol='circle'))

# Add centroids to the 2D scatter plot
fig_2d.add_trace(go.Scatter(
    x=centroids_df['Feature_1'],
    y=centroids_df['Feature_2'],
    mode='markers',
    marker=dict(color='red', symbol='diamond', size=10),
    name='Centroids'
))

# Show the 2D plot
fig_2d.show()

In [111]:
# Create a 3D scatter plot with only dots
fig = go.Figure()

for cluster in df['Cluster'].unique():
    cluster_data = df[df['Cluster'] == cluster]
    fig.add_trace(go.Scatter3d(
        x=cluster_data['Feature_1'],
        y=cluster_data['Feature_2'],
        z=cluster_data['Feature_3'],
        mode='markers',
        marker=dict(size=5),
        name=f'Cluster {cluster}'
    ))

# Set layout and show the plot
fig.update_layout(title='K-Means Clustering (3D)',
                  scene=dict(xaxis_title='Feature 1', yaxis_title='Feature 2', zaxis_title='Feature 3'))
#fig.show()

In [114]:
# Plot the initial 2D scatter plot with centroids
fig_2d = px.scatter(df, x='Feature_1', y='Feature_2', color='Cluster', opacity=0.8, title='K-Means Clustering (2D)', size_max=10)
fig_2d.update_traces(marker=dict(symbol='circle'))

# Add initial centroids to the 2D scatter plot
fig_2d.add_trace(go.Scatter(
    x=centroids_df['Feature_1'],
    y=centroids_df['Feature_2'],
    mode='markers',
    marker=dict(color='red', symbol='diamond', size=10),
    name='Centroids'
))

# Show the initial 2D plot
fig_2d.show()

# Perform iterations
num_iterations = 5
for iteration in range(1, num_iterations + 1):
    # Assign data points to the nearest cluster
    labels = kmeans.predict(data)
    
    # Update centroids
    kmeans.cluster_centers_ = centroids = np.array([data[labels == i].mean(axis=0) for i in range(kmeans.n_clusters)])
    
    # Update DataFrame with new labels
    df['Cluster'] = labels
    
    # Plot the updated 2D scatter plot with centroids
    fig_2d = px.scatter(df, x='Feature_1', y='Feature_2', color='Cluster', opacity=0.8, title=f'K-Means Clustering (2D) - Iteration {iteration}', size_max=10)
    fig_2d.update_traces(marker=dict(symbol='circle'))
    
    # Add updated centroids to the 2D scatter plot
    fig_2d.add_trace(go.Scatter(
        x=centroids[:, 0],
        y=centroids[:, 1],
        mode='markers',
        marker=dict(color='red', symbol='diamond', size=10),
        name='Centroids'
    ))
    
    # Show the updated 2D plot
    fig_2d.show()