# Capstone Project: Create a Customer Segmentation Report for Arvato Financial Services

In [1]:
# Fixed seed for reproducibility: passed as `random_state` to the KModes
# estimators created further down in this notebook.
random_state = 22

# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import Normalizer

from kmodes.kmodes import KModes
# from yellowbrick.cluster import kelbow_visualizer

## Part 2: Customer Segmentation Report

The main bulk of the analysis is in this part of the project. Here, I'll use unsupervised learning techniques to describe the relationship between the demographics of the company's existing customers and the general population of Germany. By the end of this part, I'll be able to describe parts of the general population that are more likely to be part of the mail-order company's main customer base, and which parts of the general population are less so.

In [2]:
# Read the two cleaned CSV exports into dataframes.
azdias = pd.read_csv('data/clean_AZDIAS.csv')
customers = pd.read_csv('data/clean_CUSTOMERS.csv')

# Both files carry a leftover 'Unnamed: 0' column; remove it in place from each
# frame (in place so that no second copy of these frames is created).
for frame in (azdias, customers):
    del frame['Unnamed: 0']

In [3]:
# Preview the first rows of `azdias` to check that the load and column removal worked.
azdias.head()

Unnamed: 0,AGER_TYP,AKT_DAT_KL,ANZ_HAUSHALTE_AKTIV,ANZ_HH_TITEL,ANZ_KINDER,ANZ_PERSONEN,ANZ_TITEL,ARBEIT,BALLRAUM,CAMEO_DEUG_2015,...,VK_DHT4A,VK_DISTANZ,VK_ZG11,W_KEIT_KIND_HH,WOHNDAUER_2008,WOHNLAGE,ZABEOTYP,ANREDE_KZ,ALTERSKATEGORIE_GROB,PRAEGENDE_JUGENDJAHRE_MAINSTREAM
0,2,1.0,1.0,0.0,0.0,3.0,0.0,4.0,-1.0,-1.0,...,2.0,3.0,3.0,2.0,9.0,-1.0,3,0,3,1
1,-1,5.0,1.0,0.0,0.0,1.0,0.0,3.0,5.0,3.0,...,6.0,7.0,4.0,6.0,4.0,7.0,3,1,4,0
2,-1,1.0,1.0,0.0,0.0,1.0,0.0,4.0,-1.0,-1.0,...,10.0,10.0,10.0,-1.0,-1.0,-1.0,3,0,3,0
3,-1,1.0,2.0,0.0,0.0,3.0,0.0,4.0,6.0,4.0,...,4.0,7.0,4.0,3.0,9.0,3.0,1,1,1,1
4,-1,1.0,12.0,0.0,1.0,0.0,0.0,4.0,4.0,8.0,...,6.0,9.0,10.0,-1.0,5.0,1.0,5,0,1,1


### Data pre-processing: Scaling

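First, we normalize the data. Note that the scaling cell below is currently commented out, so the clustering that follows runs on the unscaled data.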

In [4]:
# # Scale datasets with Normalizer
# scaler = Normalizer()
# scaler.fit(azdias)

# azdias_scaled = scaler.transform(azdias)
# customers_scaled = scaler.transform(customers)

### Modeling

Now it's time to start the modeling phase!

Let's begin by choosing the number of clusters to use.

In [4]:
# Turn the full dataframe (every column, in its existing order) into a NumPy array
azdiasMatrix = azdias.to_numpy()
azdiasMatrix

array([[ 2.,  1.,  1., ...,  0.,  3.,  1.],
       [-1.,  5.,  1., ...,  1.,  4.,  0.],
       [-1.,  1.,  1., ...,  0.,  3.,  0.],
       ...,
       [-1.,  9., 23., ...,  1.,  1.,  1.],
       [-1.,  9.,  1., ...,  1.,  4.,  0.],
       [-1.,  9.,  2., ...,  1.,  3.,  1.]])

In [5]:
# Baseline K-Modes estimator: only the seed is set, every other option is left
# at the library default.
model_1 = KModes(random_state=random_state)

In [None]:
# Fit the baseline model on the full matrix; the value returned by fit_predict
# (the cluster assigned to each row) is shown as the cell output.
model_1.fit_predict(azdiasMatrix)

In [None]:
# Choosing optimal K: fit K-Modes once per candidate number of clusters and
# record the resulting clustering cost, to be inspected with the elbow method.
cluster_range = range(1, 10)
fitted_clusters = []  # cluster counts whose fit completed successfully
cost = []             # kmodes.cost_ for each entry of fitted_clusters
for cluster in cluster_range:
    try:
        kmodes = KModes(n_jobs=-1, n_clusters=cluster, init='Huang', random_state=random_state)
        # Only cost_ is read afterwards, so fit() is sufficient; the labels that
        # fit_predict() would return were never used.
        kmodes.fit(azdiasMatrix)
    except Exception as error:
        # Stop the sweep at the first failure, but report why instead of hiding
        # it. `Exception` (not a bare except) lets KeyboardInterrupt through so
        # this long-running cell can still be interrupted.
        print('Cluster initiation {} failed: {!r}'.format(cluster, error))
        break
    fitted_clusters.append(cluster)
    cost.append(kmodes.cost_)
    print('Cluster initiation: {}'.format(cluster))

# Converting the results into a dataframe and plotting them.
# The 'Cluster' column is built from the cluster counts that were actually
# fitted, so both columns always have the same length even if the sweep
# stopped early.
df_cost = pd.DataFrame({'Cluster': fitted_clusters, 'Cost': cost})
df_cost

In [None]:
# Elbow plot: clustering cost as a function of the number of clusters,
# drawn through the Axes object instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(df_cost['Cluster'], df_cost['Cost'], linestyle='--', marker='o')
ax.set_xlabel('number of clusters')
ax.set_ylabel('cost')
ax.set_title('Optimal number of cluster with Elbow Method')

In [None]:
# sse = {}
# for k in range(2, 10):
#     kmodes = KModes(random_state=random_state).fit(azdias)
#     sse[k] = kmodes.cost_ # Inertia: sum of the distances from each sample to its nearest cluster centre
# plt.figure()
# plt.plot(list(sse.keys()), list(sse.values()))
# plt.xlabel("Number of clusters")
# plt.ylabel("cost")
# plt.show()

In [7]:
# # Quick elbow method for K-means
# visualizer_1 = kelbow_visualizer(model_1, azdias, k=(2,10))