In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response on each read of the download loop.
CHUNK_SIZE = 40960
# Comma-separated list of '<directory>:<percent-encoded download URL>' entries.
# Each entry is split on ':' and the URL part is decoded with unquote() further down.
# NOTE(review): the URL carries a time-limited signature (X-Goog-Date / X-Goog-Expires),
# so it presumably stops working once that window has passed — confirm before relying on it.
DATA_SOURCE_MAPPING = 'customer-segmentation-tutorial-in-python:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F42674%2F74935%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240723%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240723T151135Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D1531b0577452206712caf812e23c4bc2efd3f56044fce136578580b9617dc4d9340c895c1e2906e4825ac9892885676bee5722a06ab1802d3d989acb72359b6be8b4e8fa25504778b067c9c0ccce9104d423108dc5c69a50f8e6c1e628f6c8b0019ad8f535aade86bcf6758850668c3ee17fbdf2f8e04cb4da2acbb7bca299fb264486c0760f130905287b64c3161a7940bcf209d1d68971374eb18c0b097f6e611a42c8c94c1660d9be58afa43e51de51d1e608a0d9041012194a4ab703e9c9ad6f906912dc24a9e36624e3c70cac159965e7779cb9fcb955755c8e30f9ddedc1b941cf7dd6d432235929e00b7d2213fbed6307970d23560bb9f835ab1ccc0d'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every mapped data source into a temporary file and unpack it under
# KAGGLE_INPUT_PATH/<directory>. A source that fails is reported and skipped.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so an entry whose URL part contains a colon still parses.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; when it is absent (or zero) the progress bar
            # is skipped and only the running byte count is shown.
            total_length = fileres.headers['content-length']
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_bytes > 0:
                    done = int(50 * dl / total_bytes)
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the temp file by name, so buffered bytes must be on disk.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would replace
                # the module and break the next non-zip source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Listed before OSError because HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


**Dataset:** `/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv`

> # **Importing the Modules**

In [None]:
# Importing the modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import warnings
# Suppress every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides any deprecation or pandas copy warnings that later cells
# may raise — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

> # **Loading and understanding the dataset**

In [None]:
# Loading and understanding the dataset:
# read the customer CSV from the Kaggle input directory into `df` and preview its first rows.

df = pd.read_csv('/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv')
df.head()

In [None]:
# Preview the last rows of the dataset (DataFrame.tail() shows 5 rows by default).
df.tail()

In [None]:
# Show 10 randomly chosen rows of the dataset.
# The fixed seed makes the same rows appear on every Restart & Run All.
df.sample(10, random_state=42)

In [None]:
# Display basic information about the DataFrame (columns, non-null counts, dtypes).
# DataFrame.info() writes its report to stdout itself and returns None, so it must not
# be wrapped in print() — that would append a stray "None" line to the output.
print("DataFrame Info:")
df.info()

In [None]:
# Generate summary statistics of the numerical columns in the DataFrame.
# Left as the cell's last expression so the notebook renders it as a formatted table
# instead of plain text.
df.describe()

In [None]:
# Build the clustering inputs. K-means is unsupervised, so there is no target column to
# separate out: 'Age' is one of the segmentation features and must stay in.
# Only the row identifier is excluded, because an arbitrary ID would otherwise be scaled
# and clustered like a real customer attribute.
# NOTE(review): 'CustomerID' is assumed to be the identifier column of Mall_Customers.csv —
# confirm against df.head(). errors='ignore' keeps the cell working if it is absent.
features = df.drop(columns=['CustomerID'], errors='ignore')

# Hold out 20% of the rows; the fixed seed makes the split reproducible.
X_train, X_test = train_test_split(features, test_size=0.2, random_state=42)

> # **Data Preprocessing and Feature Selection**

**Selects relevant features for clustering and standardizes the selected features using StandardScaler.**

In [None]:
# Selecting the features for clustering
# NOTE: this rebinds `features`, which earlier named the frame passed to train_test_split.
# X_train / X_test were already created from that earlier frame and are not changed here.
# The selected columns keep df's row order and index.
features = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# train_test_split returns row subsets of the input frame. Take explicit copies so the
# column assignment below writes to independent frames instead of triggering pandas'
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Encode the categorical 'Gender' column as integers. The encoder is fitted on the
# training rows only and reused for the test rows so both share one mapping.
le = LabelEncoder()
X_train['Gender'] = le.fit_transform(X_train['Gender'])
X_test['Gender'] = le.transform(X_test['Gender'])

# Standardise every column of the split frames; the scaling statistics are learned from
# the training rows and then applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

> # **Determine Optimal Number of Clusters and Apply K-means Clustering**

**Determines the optimal number of clusters using the Elbow method and applies K-means clustering with the chosen number of clusters to the standardized features.**

In [None]:
# Stack the scaled training rows and test rows into one matrix (training rows first,
# then test rows) so the elbow curve is computed on every customer.
scaled_features = np.vstack((X_train_scaled, X_test_scaled))

# Within-cluster sum of squares (KMeans inertia) for k = 1..10.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(scaled_features)
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow curve: within-cluster sum of squares against the number of clusters tried.
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
elbow_ax.plot(range(1, 11), wcss, marker='o', linestyle='--')
elbow_ax.set_title('Elbow Method')
elbow_ax.set_xlabel('Number of clusters')
elbow_ax.set_ylabel('WCSS')  # Within-cluster sum of squares
plt.show()

In [None]:
# Based on the elbow method, choose the optimal number of clusters.
# The value is hard-coded and read by the later K-means fits that pass
# n_clusters=optimal_clusters.
# NOTE(review): 5 was presumably read off the elbow plot — re-check it if the inputs change.
optimal_clusters = 5

In [None]:
# Apply K-means clustering with the optimal number of clusters to `scaled_features`.
# `cluster_labels` holds one label per row of `scaled_features`, in that matrix's row order.
kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
cluster_labels = kmeans.fit_predict(scaled_features)

In [None]:
# Apply K-means clustering to the training data with the optimal number of clusters.
# This rebinds `kmeans`: the model fitted on `scaled_features` is replaced by a new one
# fitted on the scaled training rows only. `cluster_labels` is left untouched.
kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
train_cluster_labels = kmeans.fit_predict(X_train_scaled)

In [None]:
# Fit K-means on the scaled training rows only (fit() returns the fitted estimator).
kmeans = KMeans(
    n_clusters=optimal_clusters,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=0,
).fit(X_train_scaled)

# Labels given to the training rows during fitting.
train_cluster_labels = kmeans.labels_

# Assign every scaled test row to one of the fitted clusters.
test_cluster_labels = kmeans.predict(X_test_scaled)

In [None]:
# Attach each row's cluster label to the train/test frames as a new 'Cluster' column.
# Both assignments modify X_train / X_test in place.
# NOTE(review): X_train_scaled / X_test_scaled were computed before this column existed;
# re-running the scaling cell after this one would presumably scale 'Cluster' too — confirm.
X_train['Cluster'] = train_cluster_labels
X_test['Cluster'] = test_cluster_labels

In [None]:
# Print the count of customers in each cluster, separately for training and test rows.
# `train_cluster_labels` / `test_cluster_labels` come from the K-means model already fitted
# on X_train_scaled in an earlier cell. Refitting an identical model here (same data, same
# parameters, same random_state) would only repeat that work and reproduce the same labels.
print("Training Data:")
print(pd.Series(train_cluster_labels).value_counts())

print("\nTest Data:")
print(pd.Series(test_cluster_labels).value_counts())

In [None]:
# Visualize the clusters (assuming 3D plot for Age, Annual Income, and Spending Score)
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# 3D scatter of the customers, coloured by K-means cluster.
# `cluster_labels` follows the row order of `scaled_features` (training rows, then test
# rows), while `features` is still in df's original order. Reorder the plotted rows by the
# train/test index so every point is coloured with its own label.
row_order = X_train.index.append(X_test.index)
plot_points = features.loc[row_order]

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
# Title the 3D axes directly: calling plt.title() before any axes exist creates an extra
# empty 2D axes behind the plot.
ax.set_title("Customer Segmentation", fontsize=15)
ax.scatter(
    plot_points['Age'],
    plot_points['Annual Income (k$)'],
    plot_points['Spending Score (1-100)'],
    c=cluster_labels,
    cmap='viridis',
    s=60,
)
ax.set_xlabel('Age')
ax.set_ylabel('Annual Income (k$)')
ax.set_zlabel('Spending Score (1-100)')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Columns used for this clustering on the raw (unscaled) values and for the pair plot.
segmentation_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

# Same settings as the other K-means fits in this notebook. The fixed random_state and
# explicit n_init keep the cluster ids, and therefore the plot colours, identical on every run.
kmeans = KMeans(n_clusters=optimal_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(df[segmentation_columns])

# KMeans produces exactly one label per fitted row, so no length reconciliation is needed.
# The assert documents that invariant in place of printing both lengths.
full_data_labels = kmeans.labels_
assert len(full_data_labels) == len(df)

# Store the labels on df under 'Cluster'. A separate name is used for the array so that
# `train_cluster_labels`, which holds the labels of the training rows only, is not overwritten.
df['Cluster'] = full_data_labels

# Pair plot of the three segmentation columns, coloured by cluster.
sns.pairplot(df, vars=segmentation_columns, hue='Cluster', palette='viridis', diag_kind='kde')
# Lift the title slightly so it does not overlap the top row of panels.
plt.suptitle('Pairplot for Customer Segmentation', y=1.02)
plt.show()

In [None]:
# One box plot per clustering feature, grouped by the 'Cluster' column of df.
for feature in features.columns:
    box_fig, box_ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(x='Cluster', y=feature, data=df, palette='viridis', ax=box_ax)
    box_ax.set_title(f'Cluster-wise Distribution of {feature}')
    plt.show()

In [None]:
from sklearn.metrics import silhouette_samples  # Import silhouette_samples function
from mpl_toolkits.mplot3d import Axes3D

# Silhouette plot for evaluating cluster quality.
# The per-sample values are computed from `scaled_features` and `cluster_labels`, so the
# bars must be grouped with those same labels. Grouping by df['Cluster'] would mix in a
# different K-means fit whose rows are in a different order.
silhouette_vals = silhouette_samples(scaled_features, cluster_labels)
cluster_ids = np.unique(cluster_labels)  # sorted, so bars and tick labels share one order

plt.figure(figsize=(10, 6))
y_ticks = []
y_lower = 0
for i, cluster in enumerate(cluster_ids):
    # Sorted values give each cluster the usual knife-shaped band.
    cluster_silhouette_vals = np.sort(silhouette_vals[cluster_labels == cluster])
    y_upper = y_lower + len(cluster_silhouette_vals)
    color = plt.cm.viridis(float(i) / len(cluster_ids))
    plt.barh(range(y_lower, y_upper), cluster_silhouette_vals, color=color)
    # Put the tick in the middle of this cluster's band.
    y_ticks.append((y_lower + y_upper) / 2)
    y_lower = y_upper

# Dashed red line marks the mean silhouette value over all samples.
silhouette_avg = np.mean(silhouette_vals)
plt.axvline(silhouette_avg, color="red", linestyle="--")
plt.yticks(y_ticks, cluster_ids)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient values')
plt.title('Silhouette Plot for Clusters')
plt.show()

In [None]:
from sklearn.metrics import silhouette_score, silhouette_samples

# Overall score and per-sample values for the clustering of `scaled_features`.
silhouette_vals = silhouette_samples(scaled_features, cluster_labels)
silhouette_avg = silhouette_score(scaled_features, cluster_labels)

# Print the average silhouette score
print(f'Average Silhouette Score: {silhouette_avg:.2f}')

# Print the mean silhouette value of the samples assigned to each cluster
for cluster_id in range(optimal_clusters):
    in_cluster = cluster_labels == cluster_id
    cluster_mean = silhouette_vals[in_cluster].mean()
    print(f'Silhouette Score for Cluster {cluster_id}: {cluster_mean:.2f}')

> # **Conclusion:**

**This project successfully applied K-means clustering to group retail store customers based on their age, annual income, and spending score. The Elbow Method helped determine the optimal number of clusters, and the resulting clusters were visualized in a 3D graph. The silhouette score provided a quantitative evaluation of the clustering quality, ensuring meaningful customer segmentation. This analysis equips businesses with valuable insights into their customer base, allowing for targeted marketing strategies and personalized customer experiences. By understanding customer segments, businesses can enhance customer satisfaction and drive strategic decision-making.**