# K-Means Clustering in Python

This is a simple example of how to use the K-Means clustering algorithm in Python. It uses the `sklearn` library to perform the clustering. The dataset is fake customer purchase information generated with the `faker` library.

## Install Libraries

In [None]:
%pip install Faker
%pip install pandas
%pip install numpy
%pip install matplotlib
%pip install scikit-learn

## Generate Fake Data

In [4]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta

# Fixed seed so the synthetic dataset is reproducible across Restart & Run All.
# Note: purchase dates are drawn from a window anchored at the current time,
# so the 'Purchase Date' column still shifts with the day the notebook is run.
SEED = 42
rng = np.random.default_rng(SEED)

fake = Faker()
Faker.seed(SEED)

# Number of simulated customers. Each customer makes 1-5 purchases, so the
# resulting frame has between n and 5 * n rows (one row per purchase).
n = 1000

categories = ['Electronics', 'Fashion', 'Home & Living', 'Beauty & Health', 'Books', 'Sports & Outdoors']

# Products available within each category; keys must match `categories`.
products = {
    'Electronics': ['Smartphone', 'Laptop', 'Headphones', 'Camera', 'Smartwatch'],
    'Fashion': ['Shirt', 'Dress', 'Shoes', 'Handbag', 'Sunglasses'],
    'Home & Living': ['Sofa', 'Dining Table', 'Lamp', 'Cookware', 'Bed'],
    'Beauty & Health': ['Lipstick', 'Perfume', 'Shampoo', 'Body Lotion', 'Nail Polish'],
    'Books': ['Mystery Novel', 'Science Fiction', 'Romance Novel', 'History', 'Biography'],
    'Sports & Outdoors': ['Basketball', 'Tennis Racket', 'Yoga Mat', 'Camping Tent', 'Cycling Helmet']
}

# Column-oriented accumulator: one list per output column, appended in lockstep
# (exactly one value per column for every purchase row).
data = {
    'Product Name': [],
    'Category': [],
    'Purchase Date': [],
    'Quantity': [],
    'Price': [],
    'CustomerID': [],
    'zipcode': [],
    'discountApplied': [],
}

# Draw the four-digit customer IDs without replacement so that two simulated
# customers never share an ID (independent draws from 9000 values collide
# frequently at n = 1000, merging customers that have different zip codes).
# Raises ValueError if n exceeds the 9000 available IDs.
customer_ids = rng.choice(np.arange(1000, 10000), size=n, replace=False)

# One-year purchase window, computed once rather than twice per row.
window_end = datetime.now()
window_start = window_end - timedelta(days=365)

for customer_id in customer_ids:
    # Customer-level attributes are shared by all of that customer's purchases.
    zipcode = fake.zipcode()
    num_purchases = rng.integers(1, 6)  # upper bound exclusive -> 1..5

    for _purchase in range(num_purchases):
        quantity = rng.integers(1, 6)  # 1..5 units
        price = np.round(rng.uniform(10, 1000), 2)
        category = rng.choice(categories)
        product_name = rng.choice(products[category])
        purchase_date = fake.date_between_dates(date_start=window_start, date_end=window_end)
        discount_applied = rng.choice([True, False])

        data['Product Name'].append(product_name)
        data['Category'].append(category)
        data['Purchase Date'].append(purchase_date)
        data['Quantity'].append(quantity)
        data['Price'].append(price)
        data['CustomerID'].append(int(customer_id))
        data['zipcode'].append(zipcode)
        data['discountApplied'].append(discount_applied)

# All columns were appended together, so they already have equal length;
# pd.DataFrame would raise ValueError if they ever did not.
df = pd.DataFrame(data)

# Save the data to a CSV file
# df.to_csv('fake_purchase_history.csv', index=False)

print("Fake purchase history data generated")

Fake purchase history data generated


## Perform Clustering

In [5]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import pandas as pd

# df = pd.read_csv('fake_purchase_history.csv')

# Standardize the two features so that neither dominates the Euclidean
# distance used by K-Means purely because of its numeric scale.
scaler = StandardScaler()
X = scaler.fit_transform(df[['Quantity', 'Price']])

# n_init is pinned explicitly: relying on the implicit default triggers a
# warning about the default changing between scikit-learn versions, and would
# let the clustering result change silently on upgrade. 10 restarts matches
# the value the warning reports as the current default.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

kmeans.fit(X)

# One integer label (0..2) per row of X, in the same order as df's rows.
labels = kmeans.labels_

# Attach the labels to the purchase rows; later cells read df['Cluster'].
df['Cluster'] = labels

# Save the post clustering data to a CSV file
# df.to_csv('fake_purchase_history_clusters.csv', index=False)

print("K-means clustering complete. Results included in df under column 'Cluster'")

  super()._check_params_vs_input(X, default_n_init=10)


K-means clustering complete. Results included in df under column 'Cluster'


## Visualize Clusters

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Scatter of the original (unscaled) features, coloured by assigned cluster label.
fig, ax = plt.subplots()
ax.scatter(df['Quantity'], df['Price'], c=df['Cluster'])

# Label the axes and title in one call so the figure stands on its own.
ax.set(xlabel='Quantity', ylabel='Price', title='K-Means Clustering Results')

plt.show()

## Post Clustering Analysis

### Cluster Means

In [7]:
import pandas as pd

# The two numeric features the clustering was fitted on.
feature_cols = ['Quantity', 'Price']

# Average of each feature within every cluster, and across the whole dataset
# as a baseline to compare the clusters against.
cluster_means = df.groupby('Cluster')[feature_cols].mean()
overall_means = df[feature_cols].mean()

# Show the per-cluster table first, then the overall baseline.
print("Cluster means:", cluster_means, "\nOverall means:", overall_means, sep="\n")

Cluster means:
         Quantity       Price
Cluster                      
0        3.695148  777.401582
1        1.462825  459.729052
2        4.069038  268.905941

Overall means:
Quantity      3.009060
Price       499.570215
dtype: float64


### Cluster Sizes

In [8]:
import pandas as pd

# Number of purchase rows assigned to each cluster label.
cluster_sizes = df.groupby('Cluster').size()

# Share of all rows falling in each cluster, expressed as a fraction in [0, 1]
# (the values are not multiplied by 100).
total_rows = cluster_sizes.sum()
cluster_percentages = cluster_sizes.div(total_rows)

# Print each table under its heading.
for heading, table in (
    ("Cluster sizes:", cluster_sizes),
    ("\nCluster percentages:", cluster_percentages),
):
    print(heading)
    print(table)

Cluster sizes:
Cluster
0     948
1    1076
2     956
dtype: int64

Cluster percentages:
Cluster
0    0.318121
1    0.361074
2    0.320805
dtype: float64
