In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Import Libraries** 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

**Import Data Set**

In [None]:
# Load the mall-customers CSV from the Kaggle input directory and preview the first rows.
df = pd.read_csv("../input/mall-customers/Mall_Customers.csv")
df.head()

In [None]:
# Rename the "Genre" column to "Gender"; rebinding df keeps the cell free of in-place mutation.
df = df.rename(columns={"Genre": "Gender"})

In [None]:
# (rows, columns) of the loaded frame
df.shape

In [None]:
# Per-column dtype and non-null count summary
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
df.describe()

**Checking the null values**

In [None]:
# Number of missing values in each column
df.isnull().sum()

**Checking Outliers**

In [None]:
# Side-by-side box plots of the two features later used for clustering,
# drawn on explicitly created axes instead of the pyplot state machine.
fig, (income_ax, score_ax) = plt.subplots(1, 2, figsize=(15, 5))

sns.boxplot(data=df, y="Annual Income (k$)", ax=income_ax)
sns.boxplot(data=df, y="Spending Score (1-100)", ax=score_ax)

plt.show()

**Data doesn't contain any outliers**

# Exploratory Data Analysis

In [None]:
# Checking Age Distribution

sns.set_style('darkgrid')

# displot is a figure-level function: it always creates its own figure, so a
# preceding plt.figure() would only leave an empty extra figure in the output.
# The intended 10x6 size is passed through height/aspect instead.
sns.displot(df.Age, height=6, aspect=10/6)
plt.title("Distribution of AGE\n", fontsize=20, color="green")
plt.xlabel("Age Range", fontsize=15)
# displot draws a histogram of counts by default, not a density
plt.ylabel("Count", fontsize=15)

plt.show()

Conclusion - The customers span a wide variety of ages.

In [None]:
# Annual Income (k$) Distribution

sns.set_style('darkgrid')

# displot is a figure-level function: it always creates its own figure, so a
# preceding plt.figure() would only leave an empty extra figure in the output.
# The intended 10x6 size is passed through height/aspect instead.
sns.displot(df["Annual Income (k$)"], height=6, aspect=10/6)
plt.title("Distribution of Annual Income (k$)\n", fontsize=20, color="green")
plt.xlabel("Annual Income (k$)", fontsize=15)
# displot draws a histogram of counts by default, not a density
plt.ylabel("Count", fontsize=15)
plt.show()

Conclusion - Most annual incomes fall between 50K and 85K.

In [None]:
# Spending Score (1-100) Distribution

sns.set_style('darkgrid')

# displot is a figure-level function: it always creates its own figure, so a
# preceding plt.figure() would only leave an empty extra figure in the output.
# The intended 10x6 size is passed through height/aspect instead.
sns.displot(df["Spending Score (1-100)"], height=6, aspect=10/6)
plt.title("Distribution of Spending Score (1-100)\n", fontsize=20, color="green")
plt.xlabel("Spending Score (1-100)", fontsize=15)
# displot draws a histogram of counts by default, not a density
plt.ylabel("Count", fontsize=15)
plt.show()

Conclusion - Most customers have a spending score between 40 and 60.

# Rescaling

In [None]:
# Numeric columns to standardise. Despite its name, df_scaled holds the raw
# (unscaled) values; the standardised output is df_scaled_fit.
df_scaled = df.loc[:, ["Age", "Annual Income (k$)", "Spending Score (1-100)"]]

# Learn the scaling parameters from the selected columns, then apply them
scaler = StandardScaler().fit(df_scaled)
df_scaled_fit = scaler.transform(df_scaled)

In [None]:
# Wrap the scaler output in a DataFrame carrying the original column names
df_scaled_fit = pd.DataFrame(
    df_scaled_fit,
    columns=["Age", "Annual Income (k$)", "Spending Score (1-100)"],
)
df_scaled_fit.head()

In [None]:
# Clustering uses only the two scaled features income and spending score (Age is left out)
data = df_scaled_fit[["Annual Income (k$)","Spending Score (1-100)"]]

# Model Building

In [None]:
# Fit K-Means with 2 clusters on the scaled income / spending-score features.
# random_state pins the centroid initialisation so the labels, centres and
# silhouette score are identical on every run (same seed as the elbow-plot loop).
k_means = KMeans(n_clusters=2, n_init='auto', random_state=0)
k_means.fit(data)

In [None]:
# Cluster index assigned to each row of `data` by the fitted model
k_means.labels_

In [None]:
# Distinct cluster labels present in the assignment
np.unique(k_means.labels_)

In [None]:
# Cluster centres in scaled feature space: one row per cluster,
# columns in the same order as `data` (income, then spending score)
centers = k_means.cluster_centers_

centers

In [None]:
# Scatter of the scaled features coloured by cluster label, with the
# cluster centres overlaid as blue squares.
fig, ax = plt.subplots(figsize=(10, 8))

ax.scatter(
    data['Annual Income (k$)'],
    data['Spending Score (1-100)'],
    c=k_means.labels_,
    s=100,
)
ax.scatter(centers[:, 0], centers[:, 1], color='blue', marker='s', s=200)

ax.set_xlabel('Annual Income')
ax.set_ylabel('Spending Score')
ax.set_title('K-Means with 2 clusters')

plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score: how similar each point is to the other points in its own
# cluster compared with points in the other clusters.
score = silhouette_score(X=data, labels=k_means.labels_)
print("Score = ", score)

In [None]:
# Elbow method: fit K-Means for k = 1..14 and record each fit's inertia
# (sum of squared distances of samples to their closest cluster centre).
cluster_range = range(1, 15)

wscc = []
for i in cluster_range:
    kmeans = KMeans(n_clusters=i,
                    n_init='auto',
                    init="k-means++",
                    random_state=0)
    kmeans.fit(data)
    wscc.append(kmeans.inertia_)

plt.plot(cluster_range, wscc, marker="*", c="black")
plt.title("Elbow plot for optimal number of clusters")
plt.xlabel("Number of clusters (k)")
plt.ylabel("Inertia (within-cluster sum of squares)")
# Explicit show() so the cell renders the figure instead of a Text(...) repr
plt.show()

# KMeans clustering with 5 clusters 

In [None]:
# Fit K-Means with 5 clusters on the scaled income / spending-score features.
# random_state pins the centroid initialisation so the labels, centres and
# silhouette score are identical on every run (same seed as the elbow-plot loop).
k_means = KMeans(n_clusters=5,
                 n_init='auto',
                 random_state=0)
k_means.fit(data)

In [None]:
# Distinct cluster labels present in the assignment
np.unique(k_means.labels_)

In [None]:
# Cluster index assigned to each row of `data` by the fitted model
k_means.labels_

In [None]:
# Cluster centres in scaled feature space: one row per cluster,
# columns in the same order as `data` (income, then spending score)
centers = k_means.cluster_centers_

centers

In [None]:
# Scatter of the scaled features coloured by cluster label, with the
# cluster centres overlaid as blue squares.
fig, ax = plt.subplots(figsize=(10, 8))

ax.scatter(
    data['Annual Income (k$)'],
    data['Spending Score (1-100)'],
    c=k_means.labels_,
    s=100,
)
ax.scatter(centers[:, 0], centers[:, 1], color='blue', marker='s', s=200)

ax.set_xlabel('Annual Income')
ax.set_ylabel('Spending Score')
ax.set_title('5 Cluster K-Means')

plt.show()

In [None]:
from sklearn import metrics

# Silhouette score of the current clustering, computed on the same features it was fitted on
score = metrics.silhouette_score(X=data, labels=k_means.labels_)
print("Score = ", score)