In [1]:
## Importing the libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


In [2]:
# Load the mall-customer table from the Kaggle input directory.
csv_path = "../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv"
data = pd.read_csv(csv_path)

# Bare last expression so the notebook renders the loaded frame.
data

## Data Preprocessing

In [3]:
# Remove the CustomerID column before the remaining columns are scaled and clustered.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second execution no longer raises KeyError on the missing column.
data = data.drop(columns='CustomerID', errors='ignore')

In [4]:
# Replace the Gender labels with integer codes.
encoder = LabelEncoder()
gender_codes = encoder.fit_transform(data['Gender'])
data['Gender'] = gender_codes

# Integer code -> original label, in the order the encoder stored its classes.
gender_mappings = dict(enumerate(encoder.classes_))
gender_mappings

In [5]:
# Standardize every column and keep the original column names on the result.
scaler = StandardScaler()

scaled_values = scaler.fit_transform(data)
scaled_data = pd.DataFrame(scaled_values, columns=data.columns)

In [6]:
# Display the StandardScaler output (same column names as `data`).
scaled_data

## Clustering

In [7]:
# Exclusive upper bound for the elbow search: the cells below iterate
# range(1, max_clusters), i.e. they try k = 1 .. max_clusters - 1.
max_clusters = 50

In [9]:
# Build one K-Means model per candidate k (1 .. max_clusters - 1) for the elbow plot.
# A fixed random_state makes the centroid initialisation, and therefore the inertia
# curve, reproducible across kernel restarts.
kmeans_tests = [KMeans(n_clusters=k, n_init=10, random_state=42) for k in range(1, max_clusters)]

# fit() returns the fitted estimator, so inertia_ can be read straight off the call.
inertias = [model.fit(scaled_data).inertia_ for model in kmeans_tests]

In [10]:
# Elbow plot: inertia against the number of clusters that was tried.
fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(range(1, max_clusters), inertias)
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Inertia')
ax.set_title("Choosing the number of Clusters")
plt.show()

In [11]:
# Final model with 10 clusters. random_state is fixed so the cluster ids stay the same
# from run to run; cells further down select clusters by literal id (e.g. 7), which
# would otherwise point at a different group after every refit.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)
kmeans.fit(scaled_data)

In [12]:
# Assign every row of the scaled data to its cluster under the fitted model,
# then display the resulting label array.
clusters = kmeans.predict(scaled_data)
clusters

## PCA

In [14]:
# Reduce the scaled features to two principal components for plotting.
pca = PCA(n_components=2)

principal_components = pca.fit_transform(scaled_data)
reduced_data = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])

In [15]:
# Display the two-component PCA projection (columns PC1 and PC2).
reduced_data

In [16]:
# Display the fitted cluster centers, expressed in the scaled feature space the model was fit on.
kmeans.cluster_centers_

In [17]:
# Project the K-Means centers into the same 2-D PCA space as the data points.
# cluster_centers_ is a bare array while pca was fitted on a DataFrame; restoring the
# column names avoids scikit-learn's "X does not have valid feature names" warning.
centers_frame = pd.DataFrame(kmeans.cluster_centers_, columns=scaled_data.columns)
reduced_centers = pca.transform(centers_frame)

In [18]:
# Display the cluster centers after projection through the fitted PCA.
reduced_centers

In [20]:
# Attach each row's cluster label to the PCA frame; the plotting cell filters on this column.
reduced_data['cluster'] = clusters

In [21]:
# Display the PCA frame again, now including the 'cluster' column.
reduced_data

In [22]:
# Boolean mask marking the rows that belong to cluster 7.
reduced_data['cluster'].eq(7)

In [24]:
# PC1 values of the rows assigned to cluster 7 (row filter and column pick in one .loc).
reduced_data.loc[reduced_data['cluster'] == 7, 'PC1']

In [25]:
# PC2 values of the rows assigned to cluster 7 (row filter and column pick in one .loc).
reduced_data.loc[reduced_data['cluster'] == 7, 'PC2']

## Data Visualization

In [27]:
# Scatter every K-Means cluster in PCA space, then overlay the projected centers.
plt.figure(figsize=(14, 10))

# One scatter call per cluster id, in ascending order, so each cluster takes the next
# colour of matplotlib's cycle. The number of clusters is read from reduced_centers
# (one row per center) instead of being hard-coded, so the cell keeps working when
# n_clusters changes.
for cluster_id in range(len(reduced_centers)):
    members = reduced_data.loc[reduced_data['cluster'] == cluster_id]
    plt.scatter(members['PC1'], members['PC2'], label='Cluster {}'.format(cluster_id))

# Cluster centers drawn last so the markers sit on top of the points.
plt.scatter(reduced_centers[:, 0], reduced_centers[:, 1], color='black', marker='x', s=300, label='Centers')

plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('K-Means clusters in PCA space')
plt.legend()

plt.show()