# Customer Segmentation

In [2]:
import pandas as pd 
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

### Load Data 

In [None]:
# Read the raw extract into df.
df = pd.read_csv("test.csv")

# Basic size of the table: row count and column count.
print("n: %s " %len(df))
print("n_columns: %s " %df.shape[1])

# Compare the row count with the number of distinct ids to surface duplicates.
print("n_unique: %s " %df['id'].nunique())
print("n_dups: %s " %(len(df) - df['id'].nunique()))

# Set id as row index (from here on `id` is the index, not a column)
df = df.set_index('id')

#df.head(10)

### Data Cleaning

In [None]:
# Check for missing values in a single column

print('\nMissing values:', df['var1'].isna().sum())

# Check levels for categorical variable
# `id` was moved into the row index when the data was loaded, so it can no
# longer be selected as a column after count(); size() returns the number of
# rows per level directly.

print('\n', df.groupby('var_cat').size())

# Check missing values for each column (shown as the cell output)
df.isna().sum()

In [None]:
# Feature generation

# One-hot encode the categorical variables: each listed column is replaced by
# integer dummy columns carrying the matching prefix.
df = pd.get_dummies(
    df,
    columns=['var1', 'var2', 'var3'],
    prefix=['dummy1', 'dummy2', 'dummy3'],
    dtype=int,
)

# Subset variables: list the columns to exclude inside drop(); with an empty
# list this is simply a copy of df.
df_subset = df.drop(columns=[])

# Only the last expression of a cell is displayed, so dtypes is evaluated but
# not shown; describe() is the cell output.
df_subset.dtypes
df_subset.describe()

## Dimensionality Reduction - PCA 

### Normalise selected fields so features are on a common scale before PCA

In [None]:
# Min-max scale the selected numeric fields of df to the [0, 1] range.
# NOTE(review): the names below are placeholders. They must be numeric columns
# that still exist in df -- columns passed to pd.get_dummies earlier in the
# notebook have been replaced by their dummy columns.

cols_to_norm = ['var1', 'var2', 'var3'] # numeric variables


def _min_max_scale(col):
    """Scale a Series to [0, 1].

    A constant column has a zero range; dividing by it would turn the whole
    column into NaN (0 / 0), so such a column is mapped to 0 instead.
    """
    col_min = col.min()
    col_range = col.max() - col_min
    if col_range == 0:
        return col - col_min
    return (col - col_min) / col_range


df[cols_to_norm] = df[cols_to_norm].apply(_min_max_scale)

# Sanity check: min should be 0 and max 1 (or both 0 for a constant column).
df['var1'].describe()

### Apply PCA 

In [10]:
from sklearn.decomposition import PCA

# Initial PCA with 25 components; the explained-variance cell further down is
# used to choose the final number of components.
# fit() returns the estimator itself, so fitting can be chained on construction.
pca = PCA(n_components=25).fit(df)

# Display the fitted estimator as the cell output.
pca

PCA(copy=True, iterated_power='auto', n_components=25, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [11]:
# Project df onto the principal components of the fitted `pca` model.
df_pca = pca.transform(X=df)

In [None]:
# Optimise number of principal components

# Cumulative share of variance (in %) explained by the first k components.
# Accumulate first and round afterwards so per-component rounding error does
# not add up in the running total.
exp_var = pca.explained_variance_ratio_
exp_var_cum = np.round(np.cumsum(exp_var) * 100, decimals=1)
print(exp_var_cum)

# Plot against 1..n so the x axis reads directly as "number of components";
# plotting the array on its own would start the axis at 0.
component_counts = np.arange(1, len(exp_var_cum) + 1)
plt.plot(component_counts, exp_var_cum)
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance (%)')
plt.show()

In [None]:
# Run PCA using optimal components

pca = PCA(n_components=20)

pca.fit(df)
df_reduced = pca.transform(df)

# Wrap the transformed array in a DataFrame that keeps the original row index.
df_reduced = pd.DataFrame(df_reduced, index = df.index)

# Name the columns PC1..PCn from the actual number of components, so the
# labels always match the width of the frame (a hard-coded list of 10 names
# does not fit 20 components and raises a length-mismatch error).
df_reduced.columns = ['PC{}'.format(i) for i in range(1, df_reduced.shape[1] + 1)]

# Only the last expression of a cell is displayed, so print the shape explicitly.
print(df_reduced.shape)
df_reduced.head()

In [3]:
# Save PCA output to pickle file in order to recreate clustering output 
# -- only save once the clustering has been optimised 
# NOTE(review): this cell only reads 'df_pca.pkl'; the statement that writes
# the pickle is not present here, so the file must already exist on disk.

# Load pca feature df to recreate clustering output
# (this replaces any df_reduced computed earlier in the session)
df_reduced = pd.read_pickle('df_pca.pkl')

In [None]:
# Visualise agents in 3d space using the first 3 dimensions

from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the '3d' projection on older matplotlib

fig = plt.figure(figsize=(16,8))
# Create the 3D axes through the figure. Instantiating Axes3D(fig) directly no
# longer attaches the axes to the figure on recent matplotlib, which leaves
# an empty plot.
ax = fig.add_subplot(111, projection='3d')

# The principal-component columns live in df_reduced, not in df.
ax.scatter(df_reduced['PC1'], df_reduced['PC2'], df_reduced['PC3'])
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')

# NOTE: the cluster visualisation later in the notebook also saves to '3d.png'
# and will overwrite this file.
plt.savefig('3d.png')
plt.show()


## K-means

In [8]:
from sklearn.cluster import KMeans

In [10]:
# Load pca feature df to recreate clustering output
# Reloading gives the K-means section a df_reduced read straight from
# 'df_pca.pkl', replacing whatever df_reduced currently holds.
df_reduced = pd.read_pickle('df_pca.pkl')

In [None]:
# Inertia (within cluster sum-of-squares) - how internally coherent clusters are

# Fit one K-means model per candidate k and record its inertia.
# The fits are seeded with the same random_state as the final model so the
# elbow curve is reproducible from run to run.
Sum_of_squared_distances = []
K = range(1,36)
for k in K:
    km = KMeans(n_clusters=k, random_state=100)
    km = km.fit(df_reduced)
    Sum_of_squared_distances.append(km.inertia_)

# Elbow plot: look for the k after which inertia stops dropping sharply.
fig, ax = plt.subplots(figsize=(16,8))
ax.plot(K, Sum_of_squared_distances, 'bx-')
ax.set_xlabel('k')
ax.set_ylabel('Sum_of_squared_distances')
ax.set_title('Elbow Method For Optimal k')
plt.show()

In [12]:
# Fit the final K-means model with the chosen number of clusters.
# fit() returns the estimator itself, so `clusters` and `kmeans` refer to the
# same fitted object.
kmeans = KMeans(random_state=100, n_clusters=20)
clusters = kmeans.fit(X=df_reduced)

# Display the fitted estimator as the cell output.
clusters

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=20, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=100, tol=0.0001, verbose=0)

In [13]:
# Cluster label of every row; evaluated but not displayed, because only the
# last expression of a cell is shown.
clusters.labels_
# Inertia (within-cluster sum of squares) of the fitted model; this is the cell output.
clusters.inertia_

9883.184703826757

In [14]:
# Attach each row's K-means label as a new 'cluster' column. The labels come
# back in the same row order as df_reduced, so they are assigned positionally.
df_reduced['cluster'] = clusters.labels_

# Shape after adding the column (cell output).
df_reduced.shape

(33983, 21)

In [None]:
# Number of rows assigned to each cluster.
print(df_reduced.groupby('cluster').count()[['PC1']])

# Preview the clustered frame. `agents_reduced` is not defined anywhere in
# this notebook; the frame carrying the 'cluster' column is df_reduced.
df_reduced.head()

## Evaluate clusters and assign labels

In [None]:
# Visualise clusters

from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the '3d' projection on older matplotlib

fig = plt.figure(figsize=(16,8))
# Create the 3D axes through the figure. Instantiating Axes3D(fig) directly no
# longer attaches the axes to the figure on recent matplotlib, which leaves
# an empty plot.
ax = fig.add_subplot(111, projection='3d')

# First three principal components, coloured by cluster label.
ax.scatter(df_reduced['PC1'], df_reduced['PC2'], df_reduced['PC3'], c= df_reduced['cluster'])
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')

# NOTE: the unclustered 3D plot earlier in the notebook saves to the same
# '3d.png' path; this call overwrites it.
plt.savefig('3d.png')
plt.show()

In [18]:
# Join new clusters to the original agents table

# NOTE(review): neither `agents` nor `agents_reduced` is defined in this
# notebook. df (indexed by id) and df_reduced are the tables that are actually
# built here, so they are used instead -- confirm df is the intended
# "original" table.
# Both frames are indexed by id, so a plain index-on-index left join lines the
# cluster label up with each row.
agent_review = df.join(df_reduced['cluster'], how = 'left')

# K-means labels start at 0; shift to 1-based cluster numbers for reporting.
agent_review['Cluster'] = agent_review['cluster'] + 1
agent_review.shape

(33983, 222)

In [None]:
# Investigate clusters: bar chart of rows per cluster (non-null var1 values)

counts = agent_review.groupby('Cluster')['var1'].count()

fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(x=counts.index, y=counts, ax=ax)

# Write each bar's height just above it (10 points above the bar top).
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        int(bar_height),
        (bar_centre, bar_height),
        ha='center',
        va='center',
        fontsize=11,
        color='gray',
        xytext=(0, 10),
        textcoords='offset points',
    )

# The y axis needs no label; the annotations carry the counts.
ax.set_ylabel('')

plt.show()

In [None]:
# Heatmap of cluster properties

# Subset numerical agent fields and summarise for each cluster.
# This uses agent_review, the K-means review table built in the join cell of
# this section; df_review is only created later, in the DBSCAN section, so
# referencing it here fails on a fresh top-to-bottom run.

df_numeric = agent_review[[
                'var1', 'var2','var3'
                ,'Cluster']].groupby('Cluster').sum()

# Plot heatmap

# Min-max normalise the per-cluster totals of each variable so that variables
# with different magnitudes are comparable in one colour scale.
cols_to_norm = df_numeric.columns
df_numeric[cols_to_norm] = df_numeric[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))

# Variables on the rows, clusters on the columns.
fig = plt.figure(figsize=(20,10))
sns.heatmap(df_numeric.transpose(), cmap="PuBu", linewidths=0.5)
plt.show()

In [None]:
# Allow up to 100 columns in the rendered table, then inspect the first rows
# of one K-means cluster. agent_review is the K-means review table; df_review
# does not exist until the DBSCAN section further down.
pd.set_option('display.max_columns', 100)
agent_review[(agent_review['Cluster'] == 13) & (agent_review['var1'] == 1) ].head(10)

## DBSCAN

In [28]:
from sklearn.cluster import DBSCAN

In [29]:
# Load pca feature df to recreate clustering output
# Reloading replaces the current df_reduced (which had a K-means 'cluster'
# column added earlier in the notebook) with the contents of 'df_pca.pkl'.
df_reduced = pd.read_pickle('df_pca.pkl')

In [31]:
# DBSCAN clustering

# eps was tuned by looking at the number and size of the clusters produced.
# Alternative setting noted by the author: eps=1, min_samples=10.
dbscan = DBSCAN(min_samples=10, eps=0.85)

# Fit on the PCA features; the per-row labels are then available as
# clusters.labels_.
clusters = dbscan.fit(X=df_reduced)

In [32]:
# Attach each row's DBSCAN label as a 'cluster' column. The labels come back
# in the same row order as df_reduced, so they are assigned positionally.
df_reduced['cluster'] = clusters.labels_

In [None]:
# Number of rows per DBSCAN label (counted on the PC1 column).
print(df_reduced.groupby('cluster')[['PC1']].count())

In [None]:
# Join new clusters to the original agents table

# NOTE(review): `agents` is not defined anywhere in this notebook. df (indexed
# by id) is the table that is actually built here, so it is used instead --
# confirm df is the intended "original" table.
# Both frames are indexed by id, so a plain index-on-index left join lines the
# cluster label up with each row.
df_review = df.join(df_reduced['cluster'], how = 'left')

# Shift labels by one for reporting. DBSCAN marks noise points with -1, so
# Cluster 0 is the noise group and real clusters start at 1.
df_review['Cluster'] = df_review['cluster'] + 1
df_review.shape

In [None]:
# Investigate clusters: bar chart of rows per cluster (non-null var1 values)

counts = df_review.groupby('Cluster')['var1'].count()

fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(x=counts.index, y=counts, ax=ax)

# Write each bar's height just above it (10 points above the bar top).
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        int(bar_height),
        (bar_centre, bar_height),
        ha='center',
        va='center',
        fontsize=11,
        color='gray',
        xytext=(0, 10),
        textcoords='offset points',
    )

# The y axis needs no label; the annotations carry the counts.
ax.set_ylabel('')

# Save before show() so the rendered figure is what gets written to disk.
plt.savefig('cluster_summary.png')
plt.show()


In [44]:
# Export the cluster number of every row (keyed by the frame's index) to the
# file 'cluster_output'.
df_review.loc[:, 'Cluster'].to_csv('cluster_output')