In [None]:
import numpy as np
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns

import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

import statistics
from itertools import product
from scipy.cluster import hierarchy

from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Print the full path of every file found beneath the Kaggle input directory.
# The loop variables `dirname` and `filename` remain bound at notebook scope
# once the loop has finished.
INPUT_ROOT = '/kaggle/input'

for dirname, _subdirs, filenames in os.walk(INPUT_ROOT):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

In [None]:
# Locate the dataset explicitly rather than relying on loop variables left over
# from another cell: that approach raises a NameError when the input directory
# is empty and may try to parse a non-CSV file.
csv_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
    if name.lower().endswith('.csv')
]

if not csv_paths:
    raise FileNotFoundError("No CSV file found under '/kaggle/input'")

# Take the last CSV in walk order (the same file the leftover loop variables
# would have pointed at when the last file walked is a CSV).
df = pd.read_csv(csv_paths[-1])
df

In [None]:
# Remove the 'CustomerID' column, multiply 'Annual Income (k$)' by 1000, and
# shorten the income / spending-score column names. Built as one chain so the
# cell produces a fresh frame instead of mutating in place.
income_col = 'Annual Income (k$)'

df = (
    df.drop(columns = ['CustomerID'])
      .assign(**{income_col: lambda frame: 1000 * frame[income_col]})
      .rename(columns = {income_col: 'Annual Income', 'Spending Score (1-100)': 'Spending Score'})
)

# Total count of missing cells across the whole frame.
missing_total = df.isna().sum().sum()
print('There are {} missing values in the dataframe'.format(missing_total))
df.head()

In [None]:
# Styled view of df.describe(): blue background gradient, inline display,
# and a caption rendered at 16px.
caption_rule = {
    'selector': 'caption',
    'props': [
        ('font-size', '16px')
    ]
}

summary = df.describe().style
summary = summary.background_gradient(cmap = 'Blues')
summary = summary.set_table_attributes("style = 'display: inline'")
summary = summary.set_caption('Statistics of the Dataset')
summary = summary.set_table_styles([caption_rule])
summary

In [None]:
def generate_plotly_distributions_by_gender(feature_name):
    """Display a violin plot of ``df[feature_name]`` grouped by the 'Gender' column.

    Reads the notebook-level DataFrame ``df``. Each violin is drawn with an
    embedded box plot and with all individual points shown.

    Parameters
    ----------
    feature_name : str
        Name of the column of ``df`` plotted on the y-axis. It is also
        concatenated into the figure title and the y-axis label.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``; nothing is returned.
    """
    violin_options = dict(
        x = 'Gender',
        y = feature_name,
        color = 'Gender',
        box = True,
        points = 'all',
        template = 'plotly_dark',
    )
    fig = px.violin(df, **violin_options)

    # Centered, bold title near the top of the figure.
    heading = dict(
        text = '<b> Distribution of ' + feature_name + ' by Gender</b>',
        x = 0.5,
        y = 0.95,
        font = dict(size = 20),
    )
    y_label = '<b> ' + feature_name + ' </b>'

    fig.update_layout(
        autosize = False,
        width = 800,
        height = 800,
        title = heading,
        legend_title = '<b>Gender</b>',
        xaxis_title = '<b>Gender</b>',
        yaxis_title = y_label,
    )

    fig.show()
    
# Draw the by-gender distribution for the columns at positions 1, 2 and 3 of df.
for position in (1, 2, 3):
    generate_plotly_distributions_by_gender(df.columns[position])

Conclusion: The median age of the men in the dataset is 37 years, whereas the median age of the women is 35 years. The median Annual Income of the men is 62.5k, while that of the women is 60k. In terms of Spending Score, both men and women have the same median value of 50. The maximum values, however, show the highest Spending Score to be 97 for men and 99 for women.

In [None]:
# Histogram over 'Annual Income' with 'Spending Score' as the y value,
# colored by 'Gender', with a marginal box plot above the main panel.
gender_palette = ['#87BE7A', '#0D782A']

fig = px.histogram(
    df,
    x = 'Annual Income',
    y = 'Spending Score',
    color = 'Gender',
    color_discrete_sequence = gender_palette,
    marginal = 'box',
    hover_data = df.columns,
    template = 'plotly_dark',
)

# Centered, bold title near the top of the figure.
title_settings = dict(
    text = '<b>Sum of Spending Scores vs Income for Men and Women</b>',
    x = 0.5,
    y = 0.95,
    font = dict(size = 20),
)

layout_settings = dict(
    autosize = False,
    width = 800,
    height = 800,
    title = title_settings,
    legend_title = '<b>Gender</b>',
    xaxis_title = '<b>Annual Income</b>',
    yaxis_title = '<b>Sum of Spending Scores</b>',
)
fig.update_layout(**layout_settings)

fig.show()

Conclusion: Here we analyze the sum of spending scores for both men and women within each Annual Income group. For most Annual Income groups, women have a higher sum of spending scores than men in the same group. The most common Annual Income range is 70k - 79k, where the sum of spending scores is 1076 for women and 823 for men. An outlier also exists in the dataset: a male of Age = 30 with an Annual Income = 137k.

In [None]:
# Correlate the numeric columns only. A non-numeric column in df makes
# DataFrame.corr() raise on pandas >= 2.0 (where `numeric_only` defaults to
# False); older versions dropped such columns silently, so selecting the
# numeric columns up front gives the same matrix on every version.
correlations = df.select_dtypes(include = 'number').corr()
x = correlations.columns.tolist()
y = correlations.index.tolist()
z = correlations.values

# Annotated heatmap: each cell is labelled with its coefficient rounded to
# two decimals, and the hover text names the two features involved.
fig = ff.create_annotated_heatmap(x = x, y = y, z = z, 
                                  annotation_text = z.round(2), 
                                  colorscale = 'teal',
                                  hovertemplate = "Correlation of %{x} and %{y}= %{z:.2f}")

fig.update_layout(
    title = dict(
         text = '<b>Feature Correlation</b>',
         x = 0.5,
         y = 0.95,
         font = dict(
             size = 20
         )
    ),
    template = 'plotly_dark',
    width = 800,
    height = 800
)

fig.show()

Conclusion: Some of the features have an extremely weak positive correlation (for example, Spending Score and Annual Income have a correlation of 0.01 and can be assumed to be independent of each other). Apart from this, weak negative correlations also exist in the dataset: Spending Score and Age have a correlation of -0.33, implying that older people might spend less than their younger counterparts.

In [None]:
# Replace the 'Gender' labels with the integer codes produced by LabelEncoder.
le = LabelEncoder()
gender_codes = le.fit_transform(df['Gender'])
df['Gender'] = gender_codes

# X is bound to the same DataFrame object as df (no copy is made), so later
# in-place column changes on that object are visible through both names.
X = df
print(X)

**K-Means Elbow Method**

In [None]:
# Divide 'Annual Income' by 1000. Not idempotent: every execution of this
# cell divides the column again.
df['Annual Income'] = df['Annual Income'].div(1000)

In [None]:
# Fit K-Means for k = 1..15 on X and record the inertia of each fit, keyed by
# the number of clusters.
distortion = {}

for i in range(1, 16):
    km = KMeans(n_clusters = i, init = 'k-means++', random_state = 0).fit(X)
    distortion[i] = km.inertia_

number_of_clusters = list(distortion.keys())
inertia = list(distortion.values())

# Cluster count highlighted on the curve as the elbow.
elbow_k = 5

fig = px.line(x = number_of_clusters, y = inertia, markers = True)

fig.update_layout(template = 'plotly_dark',
                  width = 800, 
                  height = 800,
                  title = dict(
                  text = '<b>Elbow Curve</b>',
                     x = 0.5,
                     y = 0.95,
                     font = dict(
                         size = 20
                         )
                  ),
                  xaxis_title = '<b>Number of Clusters</b>',
                  yaxis_title = '<b>Inertia</b>'
)

fig.update_traces(marker = dict(
    size = 7, 
    color = '#B7E0F7')
)

# Anchor the arrow on the inertia actually computed for elbow_k instead of a
# hand-tuned constant, so it stays on the curve if the data or its scaling
# changes.
fig.add_annotation(x = elbow_k, y = distortion[elbow_k], 
                   showarrow = True, 
                   text = 'Optimal Clusters', 
                   arrowhead = 1, 
                   arrowsize = 2,
                   ax = 7,
                   ay = -100,
                   bordercolor = '#FFFF01',
                   borderpad = 3
                  )

fig.show()

In [None]:
# Cluster on the feature columns only. X can already carry a
# 'Cluster Segregation' column from an earlier execution of this cell; drop it
# if present so previous labels are never fed back in as a feature.
features = X.drop(columns = ['Cluster Segregation'], errors = 'ignore')

km = KMeans(n_clusters = 5, random_state = 0)
cluster_predictions = km.fit_predict(features)

# Attach the labels by index rather than by position, so they stay matched to
# the right rows even when df has been re-ordered by a previous run.
df['Cluster Segregation'] = pd.Series(cluster_predictions, index = features.index)
df = df.sort_values(by = 'Cluster Segregation')
# String labels make plotly treat the clusters as discrete categories.
df['Cluster Segregation'] = df['Cluster Segregation'].astype(str)

fig = px.scatter(df, x = 'Annual Income', 
                 y = 'Spending Score', 
                 color = 'Cluster Segregation', 
                 color_discrete_sequence = px.colors.qualitative.Set2)

fig.update_traces(marker = dict(size = 10, opacity = 0.80))

fig.update_layout(
    template = 'plotly_dark',
    width = 800,
    legend_title = 'Clusters',
    title = dict(
             text = '<b>K-Means Clustering</b>',
             x = 0.5,
             y = 0.95,
             font = dict(
                 size = 20
             )
         ),
         xaxis_title = '<b>Annual Income</b>',
         # Each point is one row's own score, not an aggregate.
         yaxis_title = '<b>Spending Score</b>'
)

fig.show()

K-Means Clusters: Cluster 0 represents the category of customers who have a low annual income (<40k) and thus have a low spending score (<40). Cluster 1 is for the customers who have a high annual income (>50k) and a high spending score (>70). Cluster 2 identifies the class of people who have an average annual income (40k - 70k) and an average spending score (40 - 60). Cluster 3 is the group of customers who, despite having a low annual income, have a high spending score. The final group, Cluster 4, represents the category of people who, despite having a high annual income, have a low spending score.