# Customer Segmentation with K-Means Clustering

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px 
import plotly.graph_objects as go

## Download the Dataset

In [2]:
!pip install opendatasets -q

In [3]:
import opendatasets as od

In [4]:
# Kaggle page of the data set that is downloaded in the next cell.
url = (
    'https://www.kaggle.com/datasets/vjchoudhary7/customer-segmentation-tutorial-in-python'
)

In [5]:
# Fetch the data set into the current directory; per the logged output, an
# existing download is skipped unless force=True is passed.
od.download(
    url,
    data_dir='./',
)

Skipping, found downloaded files in "./customer-segmentation-tutorial-in-python" (use force=True to force download)


## Load the Dataset

In [6]:
# Read the downloaded CSV file into a DataFrame.
df = pd.read_csv(
    './customer-segmentation-tutorial-in-python/Mall_Customers.csv'
)

In [7]:
# Show the loaded table as the cell output.
df

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
...,...,...,...,...,...
195,196,Female,35,120,79
196,197,Female,45,126,28
197,198,Male,32,126,74
198,199,Male,32,137,18


## Null Values in dataset

In [8]:
# Count missing values per column.
df.isna().sum()

CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64

## Dtypes

In [9]:
# Print column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


## Count of Gender

In [10]:
# Count the rows per Gender value, with one color per gender.
fig = px.histogram(
    df,
    x='Gender',
    color='Gender',
    title='Male and Female Count',
)
fig.show()

## Plot of Relation between Age, Gender and Annual Income

In [11]:
# Age on the x-axis against annual income, colored by gender.
px.scatter(
    df,
    x='Age',
    y='Annual Income (k$)',
    color='Gender',
)

## Plot of Relation between Age, Gender and Spending Score

In [12]:
# Age on the x-axis against spending score, colored by gender.
px.scatter(
    df,
    x='Age',
    y='Spending Score (1-100)',
    color='Gender',
)

## Variable Distribution

In [13]:
# One box trace per numeric variable, added in the order they should appear.
fig = go.Figure()
for column, label in [
    ("Spending Score (1-100)", 'Spending Score'),
    ("Age", "Age"),
    ("Annual Income (k$)", 'Annual Income'),
]:
    fig.add_trace(go.Box(y=df[column], name=label))

# boxmode='group' groups together boxes of the different traces
# for each value of x.
fig.update_layout(boxmode='group')
fig.show()

## Customer Segmentation using K-Means Clustering

In [14]:
from sklearn.cluster import KMeans

In [15]:
# Keep only the three numeric columns that are fed to K-Means.
data = df[
    [
        'Age',
        'Annual Income (k$)',
        'Spending Score (1-100)',
    ]
]

In [16]:
# Elbow method: fit K-Means for k = 1..10 and record the within-cluster sum of
# squares (WCSS) of each fit so the curve can be inspected for its "elbow".
k_values = list(range(1, 11))
WCSS = []
for i in k_values:
    clf = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    clf.fit(data)
    WCSS.append(clf.inertia_)  # inertia is another name for WCSS

# x and y hold one value per k (10 points), not one per customer, so they are
# passed directly instead of alongside the 200-row `df`. With array inputs
# plotly names the columns 'x' and 'y'; `labels` maps those to readable titles.
px.line(
    x=k_values,
    y=WCSS,
    title='Elbow Method: WCSS vs. Number of Clusters',
    labels={'x': 'Number of clusters (k)', 'y': 'WCSS'},
)

## Clustering n=5

In [17]:
# Fit the five-cluster model and get one cluster label per row of `data`.
clf = KMeans(
    n_clusters=5,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=0,
)
y_kmeans = clf.fit_predict(data)

In [18]:
# Build a new frame with the cluster labels appended as a 'Cluster' column;
# `data` itself is left untouched.
data_clustered = data.assign(Cluster=y_kmeans)

## Annual Income Vs Spending Score Clusters

In [19]:
# Spending score on the x-axis against annual income, colored by cluster label.
px.scatter(
    data_clustered,
    x='Spending Score (1-100)',
    y='Annual Income (k$)',
    color='Cluster',
)

* Cluster 3 and Cluster 1 are our target customers for promoting any product.