### Customer Segmentation - Unsupervised Learning

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Read the customer workbook (the path is relative to the notebook's working
# directory) into a DataFrame and preview its first rows.
df=pd.read_excel("Credit Card Customer Data.xlsx")
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Number of distinct 'Customer Key' values; a value below the row count means some keys repeat.
df['Customer Key'].nunique()

In [None]:
## Rows whose 'Customer Key' repeats a key seen in an earlier row
## (the first occurrence of each key is not flagged).
repeated_key_mask = df.duplicated(subset=['Customer Key'], keep='first')
df.loc[repeated_key_mask]

In [None]:
# Keep everything from the third column onward (drops the first two columns by position).
# NOTE: df is reassigned from itself, so re-running this cell removes two MORE columns;
# restart and run from the top instead of re-executing it.
df=df.iloc[:,2:]
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Missing-value count per column.
df.isnull().sum()

In [None]:
# Summary statistics, transposed so that each feature is one row.
df.describe().T

In [None]:
# Box plot of Avg_Credit_Limit before any transformation.
plt.boxplot(df['Avg_Credit_Limit'])

In [None]:
# Replace Avg_Credit_Limit with its natural log and re-draw the box plot.
# NOTE: the column is overwritten in place, so re-running this cell applies the
# log a second time; every later cell sees the log-scaled values, not the raw ones.
df['Avg_Credit_Limit']=np.log(df['Avg_Credit_Limit'])
plt.boxplot(df['Avg_Credit_Limit'])

In [None]:
# Names of the columns that remain after dropping the first two.
df.columns

In [None]:
# Distribution of Total_calls_made: density-scaled histogram with a KDE overlay.
# sns.distplot is deprecated in seaborn and scheduled for removal; histplot with
# kde=True and stat='density' is its documented replacement.
sns.histplot(df['Total_calls_made'], kde=True, stat='density')

In [None]:
## Square-root transform of Total_visits_online, followed by a box plot of the result.
## (An earlier draft first replaced zeros with a tiny constant; that step is disabled.)
## NOTE: the column is overwritten in place, so re-running this cell applies the
## square root a second time.
df['Total_visits_online']=np.sqrt(df['Total_visits_online'])
plt.boxplot(df['Total_visits_online'])


In [None]:
# Box plot of Total_calls_made (this column is not transformed in the cells above).
plt.boxplot(df['Total_calls_made'])

In [None]:
## Scale/normalise the data: apply scipy's zscore column by column so that every
## feature is on a comparable scale before distance-based clustering.
## df_z is the frame all clustering cells below are fitted on.
from scipy.stats import zscore
df_z=df.apply(zscore)
df_z.head()

Univariate and Bivariate Analysis

In [None]:
# Pairwise scatter plots of the standardized features, with a KDE of each feature on the diagonal.
sns.pairplot(df_z,diag_kind='kde')

From the above pairplot, it is intuitive to expect at least 4 or 5 clusters.


In [None]:
# Pairwise correlation matrix of the standardized features, drawn as an annotated heatmap.
corr_df=df_z.corr()
sns.heatmap(corr_df,annot=True)

Avg_Credit_Limit and Total_Credit_Cards are positively correlated, which is expected: a customer with more credit cards is likely to have a higher credit limit.
All other features seem to be reasonably independent of each other.

## Unsupervised Learning
## K Means Clustering 

In [None]:
from sklearn.cluster import KMeans
## Finding the optimal k
# Fit KMeans for k = 1..17 on the standardized data and record each model's
# inertia_ (within-cluster sum of squared distances) for the elbow plot.
n_clusters=range(1,18)
cluster_error=[]
for cluster in n_clusters:
    # n_init is pinned explicitly: the library default differs between
    # scikit-learn releases, which would otherwise change the curve (and emit a
    # FutureWarning on some versions).
    k_means=KMeans(n_clusters=cluster,random_state=30,n_init=10)
    k_means.fit(df_z)
    cluster_error.append(k_means.inertia_)
    

In [None]:
## Elbow plot
# cluster_error holds KMeans inertia_ values, i.e. a within-cluster SUM of squared
# distances, so the y-axis is labelled as a sum rather than an average.
plt.plot(n_clusters,cluster_error,marker='o')
plt.title('Elbow Plot')
plt.xlabel("Clusters")
plt.ylabel("Within-cluster sum of squares (inertia)")
plt.show()

In [None]:
## The elbow plot suggests k=4 or k=5 as reasonable choices; fit the k=4 model here.
k_means=KMeans(n_clusters=4,random_state=50,n_init=5).fit(df_z)
# Per-row cluster assignments and the fitted centroid coordinates.
label,centers=k_means.labels_,k_means.cluster_centers_


In [None]:
## Cluster Centroids
# One row per feature, one column per KMeans cluster. The values are in
# standardized (z-score) units because the model was fitted on df_z.
kmeans_center=pd.DataFrame(centers,columns=list(df_z))
kmeans_center.T

In [None]:
## Silhouette score for the KMeans model, computed on the same standardized
## data (df_z) the model was fitted on.
from sklearn.metrics import silhouette_score
s_kmeans=silhouette_score(df_z,label)
print(s_kmeans)

In [None]:
# Copy of the standardized features with the cluster labels attached as a
# 'Cluster' column; df_z itself is left untouched.
df_k=df_z.assign(Cluster=label)

In [None]:
# Number of customers assigned to each KMeans cluster.
# The labelled KMeans frame is df_k; the name `df_kmeans` used here before is
# not defined anywhere in the notebook and raised a NameError.
plt.figure(figsize=(8,5))
df_k['Cluster'].value_counts().plot.bar(color='red')
plt.xlabel("Clusters")
plt.ylabel("Count of Customers")
plt.title("KMeans-Number of Customers in Each Category")
plt.show()

In [None]:
# Mean Total_visits_online within each KMeans cluster (0-3). df_k holds the
# standardized features, so the bar heights are in z-score units.
feature = 'Total_visits_online'
cluster_ids = range(4)
zero, one, two, three = (
    df_k.loc[df_k['Cluster'] == cid, feature].mean() for cid in cluster_ids
)

indices = [str(cid) for cid in cluster_ids]
bar = pd.DataFrame([zero, one, two, three], index=indices)
ax = bar.plot.bar(color='green')
ax.set_xlabel('Label')
ax.set_ylabel('Total Visits Online')
ax.set_title("KMeans-Total Visits online Each Category")
plt.show()

In [None]:
# Mean Avg_Credit_Limit within each KMeans cluster (0-3). df_k holds the
# standardized features, so the bar heights are in z-score units.
feature = 'Avg_Credit_Limit'
cluster_ids = range(4)
zero, one, two, three = (
    df_k.loc[df_k['Cluster'] == cid, feature].mean() for cid in cluster_ids
)

indices = [str(cid) for cid in cluster_ids]
bar = pd.DataFrame([zero, one, two, three], index=indices)
ax = bar.plot.bar(color='green')
ax.set_xlabel('Label')
ax.set_ylabel('Avg Credit Limit')
ax.set_title("KMeans-Avg Credit Limit of Each Category")
plt.show()

In [None]:
# Mean Total_calls_made within each KMeans cluster (0-3). df_k holds the
# standardized features, so the bar heights are in z-score units.
feature = 'Total_calls_made'
cluster_ids = range(4)
zero, one, two, three = (
    df_k.loc[df_k['Cluster'] == cid, feature].mean() for cid in cluster_ids
)

indices = [str(cid) for cid in cluster_ids]
bar = pd.DataFrame([zero, one, two, three], index=indices)
ax = bar.plot.bar(color='green')
ax.set_xlabel('Label')
ax.set_ylabel('Average Total Calls made')
ax.set_title("KMeans-Avg Total calls of Each Category")
plt.show()

HIERARCHICAL CLUSTERING

In [None]:
# Hierarchical (agglomerative) clustering into 4 clusters with average linkage.
from sklearn.cluster import AgglomerativeClustering
# The distance is left at the estimator's default (euclidean). The former
# `affinity=` keyword was deprecated and later removed from scikit-learn, and
# its replacement `metric=` does not exist on older releases; omitting the
# argument gives the same euclidean model on every version.
model_hier=AgglomerativeClustering(n_clusters=4,linkage='average')
model_hier.fit(df_z)
# NOTE: this rebinds `label`, which previously held the KMeans assignments.
label=model_hier.labels_
# Copy of the standardized features with the hierarchical labels attached.
df_hier=df_z.copy()
df_hier['Cluster']=label
df_hier.head()


In [None]:
## Analysing the clusters with boxplots: one panel per standardized feature,
## grouped by hierarchical cluster label, arranged on a 2x3 grid.

df_hier.boxplot(by='Cluster',layout=(2,3),figsize=(10,5))

In [None]:
## Silhouette score for hierarchical clustering
from sklearn.metrics import silhouette_score
# `label` is expected to hold the agglomerative labels set in the hierarchical-clustering cell.
s_hier=silhouette_score(df_z,label,metric='euclidean',random_state=50)
print(s_hier)

## Cophenetic correlation coefficient
from scipy.cluster.hierarchy import cophenet,linkage,dendrogram
from scipy.spatial.distance import pdist
# Average-linkage tree on euclidean distances, mirroring the linkage='average'
# setting of the sklearn model fitted earlier.
Z=linkage(df_z,metric='euclidean',method='average')
# c: cophenetic correlation between the tree and the raw pairwise distances;
# c_dis: the cophenetic distances themselves (not used in the cells shown).
c,c_dis=cophenet(Z,pdist(df_z))
c

In [None]:
## Dendrogram of the average-linkage tree Z built in the previous cell
## (`dendrogram` is imported there as well).
plt.figure(figsize=(10, 5))
plt.title('Agglomerative Hierarchical Clustering Dendogram')
plt.xlabel('sample index')
plt.ylabel('Distance')
# NOTE(review): color_threshold=40 is a hard-coded distance cut-off for branch
# colouring - confirm it suits the scale of the standardized distances.
dendrogram(Z, leaf_rotation=90.,color_threshold = 40, leaf_font_size=8. )
plt.tight_layout()

In [None]:
# Bar chart of how many customers fall into each hierarchical cluster.
fig, ax = plt.subplots(figsize=(8,5))
cluster_sizes = df_hier['Cluster'].value_counts()
cluster_sizes.plot.bar(color='red', ax=ax)
ax.set_xlabel("Clusters")
ax.set_ylabel("Count of Customers")
ax.set_title("Number of Customers in Each Category")
plt.show()


In [None]:
## Cluster 0 has the highest number of customers and cluster 3 has the lowest number of customers


In [None]:
## Attach cluster labels to df. Note that df is no longer the raw data here: its
## Avg_Credit_Limit and Total_visits_online columns were overwritten with log and
## square-root values in earlier cells. `label` is whatever was assigned last;
## when the notebook runs top to bottom that is the hierarchical model's labels.
df['Cluster']=label

In [None]:
# Mean Avg_Credit_Limit within each cluster label (0-3) stored on df.
# df carries the log-transformed credit limit, so the bars are on a log scale.
feature = 'Avg_Credit_Limit'
cluster_ids = range(4)
zero, one, two, three = (
    df.loc[df['Cluster'] == cid, feature].mean() for cid in cluster_ids
)

indices = [str(cid) for cid in cluster_ids]
bar = pd.DataFrame([zero, one, two, three], index=indices)
ax = bar.plot.bar(color='green')
ax.set_xlabel('Label')
ax.set_ylabel('Avg Credit Limit')
ax.set_title("Avg Credit Limit of Each Category")
plt.show()

In [None]:
# Mean Total_visits_online within each cluster label (0-3) stored on df.
# df carries the square-root-transformed visit values, not the raw ones.
feature = 'Total_visits_online'
cluster_ids = range(4)
zero, one, two, three = (
    df.loc[df['Cluster'] == cid, feature].mean() for cid in cluster_ids
)

indices = [str(cid) for cid in cluster_ids]
bar = pd.DataFrame([zero, one, two, three], index=indices)
ax = bar.plot.bar(color='green')
ax.set_xlabel('Label')
ax.set_ylabel('Average Visits online')
ax.set_title("Hierarchical Clustering - Avg Online visits of Each Category")
plt.show()

In [None]:
# Mean Total_calls_made within each cluster label (0-3) stored on df.
feature = 'Total_calls_made'
cluster_ids = range(4)
zero, one, two, three = (
    df.loc[df['Cluster'] == cid, feature].mean() for cid in cluster_ids
)

indices = [str(cid) for cid in cluster_ids]
bar = pd.DataFrame([zero, one, two, three], index=indices)
ax = bar.plot.bar(color='green')
ax.set_xlabel('Label')
ax.set_ylabel('Average Total Calls made')
ax.set_title("Avg Total calls of Each Category")
plt.show()

### Comparison of KMeans Clusters and Hierarchical Clusters

### Hierarchical Clustering Analysis

Label 3 has the highest Avg Credit Limit and online visits, while its number of customers, total calls made, and bank visits are the lowest.

Label 0 has the highest total calls made and the lowest Avg Credit Limit, and it has the second-highest number of customers.

Label 1 has the lowest total calls made

Label 2 is characterized by lowest average online visits

### KMeans Clustering Analysis

Label 2 has the highest Avg Credit Limit and the highest online visits, while its number of customers and total calls made are the lowest.

Label 1 has the highest total calls made and the lowest Avg Credit Limit, and it has the highest number of customers.

Label 0 has lowest average total Online visits and high Average Credit Limits

Label 3 has lowest Online visits and highest bank visits



### Key Questions:

#### How many different segments of customers are there?
4 different segments of customers

#### How are these segments different from each other?
The 4 segments are distinct and their profile is below:

Label 0: Customers with the lowest Avg Credit Limit who make a high number of calls

Label 1: High credit limit but fewer credit cards, and the fewest calls

Label 2: Customers making least Average online visits

Label 3: Highest Avg Credit limit customers and highest total number of credit cards

#### What are your recommendations to the bank on how to better market to and service these customers?
I would recommend that the market research team focus its personalised campaigns on Label 1 customers, who have a high Avg Credit Limit but fewer credit cards. These customers can be targeted to take up more credit cards. I would also recommend focusing on Label 0 customers, to raise their credit limits and thereby sell more credit cards to them.

I recommend that the Operations and Service team focus on Label 0 customers, since they make the most calls.


