In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from project import run

# setting display options
%matplotlib inline
# Fully-qualified option names are used on purpose: pandas resolves option keys by
# pattern match, and short forms such as 'max_rows' / 'max_columns' match more than
# one option on recent pandas releases (e.g. the styler.render.* options), which
# raises an OptionError.
pd.set_option('display.width', 4000)           # characters available per output line
pd.set_option('display.max_colwidth', 4000)    # do not truncate long cell contents
pd.set_option('display.max_rows', 100)         # rows shown before the repr is truncated
pd.set_option('display.max_columns', 200)      # columns shown before the repr is truncated
pd.set_option('display.float_format', '{:.2f}'.format)  # render floats with two decimals

In [2]:
# Location of the insurance database file, relative to the current working directory.
my_path = r'./data/insurance.db'

# Build the working DataFrame through the project's `run` entry point.
# NOTE(review): `nb_exploration=True` presumably enables the notebook-oriented output
# recorded below (imputation notices and a short preview) — confirm in project.run.
df = run(my_path, nb_exploration=True)

NaN values of "Area" column were imputed.
NaN values of "Education" column were imputed.
NaN values of "Children" column were imputed.
    First_Policy  Birthday Education  Salary Area Children     CMV  Claims  Motor  Household  Health  Life  Work_Compensation
ID                                                                                                                           
1           1985      1982      2.00    2177 1.00     1.00  380.97    0.39   0.57      -0.55   -0.29  0.24              -0.54
2           1981      1995      2.00     677 4.00     1.00 -131.13    1.12  -1.59       0.96   -0.69  0.00               1.75


In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0_level_0,First_Policy,Birthday,Education,Salary,Area,Children,CMV,Claims,Motor,Household,Health,Life,Work_Compensation
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,1985,1982,2.0,2177,1.0,1.0,380.97,0.39,0.57,-0.55,-0.29,0.24,-0.54
2,1981,1995,2.0,677,4.0,1.0,-131.13,1.12,-1.59,0.96,-0.69,0.0,1.75
3,1991,1970,1.0,2277,3.0,0.0,504.67,0.28,-0.66,0.1,-0.58,1.27,1.57
4,1990,1981,3.0,1099,4.0,1.0,-16.99,0.99,-0.83,-0.71,1.92,-0.07,-0.24
5,1986,1973,3.0,1763,4.0,1.0,35.23,0.9,0.3,-0.69,0.2,-0.5,0.09


In [4]:
# Inspect the dtype of every column (note the int32 and category columns).
df.dtypes

First_Policy            int32
Birthday                int32
Education            category
Salary                  int32
Area                 category
Children             category
CMV                   float64
Claims                float64
Motor                 float64
Household             float64
Health                float64
Life                  float64
Work_Compensation     float64
dtype: object

In [5]:
# True if at least one cell anywhere in the frame is missing.
df.isna().any().any()

False

In [6]:
# Column groups reused by later cells when selecting features.
premiums_cols = ["Motor", "Household", "Health", "Life", "Work_Compensation"]
categorical_cols = ["Area", "Education", "Children"]  # the three category-dtype columns
# Display all column names for reference.
df.columns

Index(['First_Policy', 'Birthday', 'Education', 'Salary', 'Area', 'Children', 'CMV', 'Claims', 'Motor', 'Household', 'Health', 'Life', 'Work_Compensation'], dtype='object')

In [None]:
# Divide the variables into Value / Engage and Consumption / Affinity.
#
# `df` exposes 'Birthday' and 'First_Policy' (see the `df.columns` output above); it has
# no 'Age' or 'Customer_Years' columns, so selecting those names raised a KeyError.
# The raw year columns are selected instead.
# NOTE(review): if derived Age / Customer_Years features are wanted, compute them from
# 'Birthday' / 'First_Policy' with an agreed reference year before this cell.

# Customer value / engagement variables. `.copy()` makes the independence from `df` explicit.
ValueEngage = df[['Birthday',
               'Education',
               'Salary',
               'Area',
               'Children',
               'CMV',
               'Claims',
               'First_Policy']].copy()

# Consumption / affinity variables (the premium columns).
# The former trailing `.reindex()` had no arguments and therefore changed nothing.
ConsAff = df.loc[:, ['Motor',
               'Household',
               'Health',
               'Life',
               'Work_Compensation']].copy()

In [None]:
# cols_for_clustering = []
# cols_for_clustering.extend(premiums_cols)
# cols_for_clustering.extend(categorical_cols)

### Pearson correlation

In [None]:
import matplotlib

# Pairwise Pearson correlation between the columns of `df`.
corr = df.corr(method='pearson')

# Plot the correlation matrix as an annotated heatmap.
plt.figure(figsize=(16,6))

h_map = sns.heatmap(corr,
            xticklabels=corr.columns,
            yticklabels=corr.columns,
            cmap='PRGn', annot=True, linewidths=.5)

# matplotlib 3.1.1 cuts the top and bottom heatmap rows in half; widening the y-limits
# by half a cell restores them. The workaround is applied on that release only, because
# on any other version it would add an empty half-row above and below the matrix.
# According to: https://github.com/mwaskom/seaborn/issues/1773#issuecomment-546466986
if matplotlib.__version__ == '3.1.1':
    bottom, top = h_map.get_ylim()
    h_map.set_ylim(bottom + 0.5, top - 0.5)

plt.show()

In [None]:
# might be handy: https://github.com/joaolcorreia/RFM-analysis

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every column with StandardScaler and rebuild the frame from the result.
# The frame is rebound instead of written back with `df[:] = ...`: slice assignment
# writes into the existing columns, and `df` holds int32 and category columns (see
# `df.dtypes`) that are not meant to hold the scaled floats — depending on the pandas
# version that either raises or coerces silently. Index and column labels are kept.
# NOTE(review): this also scales the categorical codes (Area, Education, Children);
# confirm that treating them as numeric is intended.
df = pd.DataFrame(StandardScaler().fit_transform(df), index=df.index, columns=df.columns)

# Feature matrix without the categorical and premium columns.
# `Index.difference` returns the remaining column names in sorted order.
x = df[df.columns.difference(categorical_cols + premiums_cols)].values
x = x[:200] # slice array for faster cluster testing

In [None]:
from sklearn.decomposition import PCA

# Fit a full PCA (all components kept) on the data to inspect the variance profile.
pca = PCA().fit(df)

print(pca.explained_variance_ratio_)

# Cumulative share of variance explained by the first k components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Explicit 1-based x values: plotting the array alone would place k components at
# x = k - 1, contradicting the "Number of Components" axis label.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure()
# Ratios are fractions in [0, 1]; scale to percent so the y-axis matches its label.
plt.plot(component_counts, cumulative_variance * 100)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Variance (%)')
plt.title('Explained Variance')
plt.grid()
plt.show()

The first 9 principal components explain about 94.21% of the variance, so we keep 9 components.

In [None]:
# Reduce the data to its first nine principal components.
pca = PCA(n_components=9)
principalComponents = pca.fit_transform(df)

# Wrap the scores in a frame with one column per component: pc_1 ... pc_9.
principalDf = pd.DataFrame(
    principalComponents,
    columns=[f'pc_{k}' for k in range(1, principalComponents.shape[1] + 1)],
)

In [None]:
# Report the fitted PCA: component vectors, then absolute and relative explained variance.
# (Two spaces after each colon reproduce print()'s separator between its arguments.)
print(f'Components:  {pca.components_}')
print(f'Explained Variance:  {pca.explained_variance_}')
print(f'Explained Variance Ratio:  {pca.explained_variance_ratio_}')

In [None]:
# Preview the first five rows of the principal-component scores.
principalDf.head(5)

In [None]:
# # do we need to show the PCA some how? so I try to do with T-SNE
# from sklearn.manifold import TSNE

# tsne = TSNE(n_components=2, verbose=0, perplexity=40, n_iter=300)
# tsne_pca_results = tsne.fit_transform(principalDf)

# tsne_data = np.vstack((tsne_pca_results.T, principalDf.index)).T

# tsne_df = pd.DataFrame(data=tsne_data, columns = ('Dim_1', 'Dim_2', 'label'))

# sns.FacetGrid(tsne_df, hue='label', height=6).map(plt.scatter, 'Dim_1', 'Dim_2', alpha=.7)
# plt.show()

In [None]:
# df_subset = df.copy()
# df_subset = df_subset.T
# df_subset['tsne-pca-one'] = tsne_pca_results[:,0]
# df_subset['tsne-pca-two'] = tsne_pca_results[:,1]

# ax = plt.subplot(1,3,3)
# sb.scatterplot(
#     x='tsne-pca-one', y='tsne-pca-one',
#     hue=df.columns,
#     palette=sb.color_palette('hls', 10),
#     data = df_subset,
#     legend = 'full',
#     alpha = 0.3
# )

In [None]:
# Scatter-plot matrix of every pair of principal components (before outlier handling).
sns.pairplot(principalDf)
plt.show()

In [None]:
# only if PCA is 2 components:
# plt.scatter(principalDf.iloc[:, 0], principalDf.iloc[:, 1])
# # plt.scatter(principalComponents[:,0], principalComponents[:,1])
# plt.show()

In [None]:
# principalDf = StandardScaler().fit_transform(principalDf[:])

# x = principalDf.values

# x.shape

In [None]:
from utils.preprocessing import remove_outliers, handle_nans

# Upper bound on clean-up passes. Each pass re-standardises the data, which can push
# further points over the outlier threshold, so convergence is not guaranteed and the
# loop must not be allowed to spin forever.
max_outlier_passes = 100

# Initial scan over every component column.
# NOTE(review): `pca_outliers` appears to be a per-column outlier count (it is filtered
# with `> 0` and its index is used as column names) — confirm in utils.preprocessing.
_, pca_outliers = remove_outliers(principalDf, principalDf.columns)
print(pca_outliers, "\n")

i = 1

while pca_outliers.any(): # checking non-zero existence
    if i > max_outlier_passes:
        print(f"Stopped after {max_outlier_passes} iterations; outliers remain:")
        print(pca_outliers)
        break
    print(f"Iteration #{i}...")
    # Re-run the outlier removal only on the columns flagged by the previous scan.
    principalDf, pca_outliers = remove_outliers(principalDf, pca_outliers[pca_outliers > 0].index.tolist())
    # NOTE(review): presumably fills the values blanked by remove_outliers — confirm.
    principalDf = handle_nans(principalDf, pca_outliers[pca_outliers > 0].index.tolist())
    # Re-standardise so the next scan works on comparable scales.
    principalDf[:] = StandardScaler().fit_transform(principalDf[:])
    i += 1
else:
    # Reached only when the loop condition turned false, i.e. the last scan was clean.
    print("No outliers after standardization.")

In [None]:
# Scatter-plot matrix of the principal components after the outlier-handling loop.
sns.pairplot(principalDf)
plt.show()

In [None]:
# Fit a full PCA on the cleaned, re-standardised component scores.
pca = PCA().fit(principalDf)

print(pca.explained_variance_ratio_)

# Cumulative share of variance explained by the first k components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Explicit 1-based x values: plotting the array alone would place k components at
# x = k - 1, contradicting the "Number of Components" axis label.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure()
# Ratios are fractions in [0, 1]; scale to percent so the y-axis matches its label.
plt.plot(component_counts, cumulative_variance * 100)
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Variance (%)')
plt.title('Explained Variance')
plt.grid()
plt.show()

In [None]:
# Second reduction step: project the cleaned component scores onto two dimensions.
pca = PCA(n_components=2)
secondComponents = pca.fit_transform(principalDf)

# Frame with one column per retained component: pc_1, pc_2.
secondDf = pd.DataFrame(
    secondComponents,
    columns=[f'pc_{k}' for k in range(1, secondComponents.shape[1] + 1)],
)

In [None]:
# Clustering input: the two-component projection. This rebinds `x`, replacing the
# 200-row slice assigned in the scaling cell.
x = secondComponents

# Display (n_rows, n_components) as a sanity check.
x.shape

### K-Means Clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Elbow method: fit K-Means for k = 1..10 and record the within-cluster sum of squares.
k_values = range(1, 11)
wcss = []
for i in k_values:
    kmeans = KMeans(
        n_clusters=i,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=0,  # fixed seed keeps the curve reproducible
    )
    kmeans.fit(x)
    wcss.append(kmeans.inertia_)

# Draw WCSS against k; the bend in the curve suggests a cluster count.
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(k_values, wcss)
ax.set_title('Elbow Method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('WCSS')
plt.show()

In [None]:
# Cluster count used for the final K-Means fit in the next cell.
# NOTE(review): presumably read off the elbow plot above — confirm.
best_number_of_clusters = 3

In [None]:
# Final K-Means fit with the chosen number of clusters; `pred_y` holds one label per row of `x`.
kmeans = KMeans(n_clusters=best_number_of_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
pred_y = kmeans.fit_predict(x)

# Colour each point by its assigned cluster. Previously `pred_y` was never used, so all
# points were drawn in one colour and the cluster assignment could not be read off the plot.
# The colormap matches the agglomerative-clustering scatter further down.
plt.scatter(x[:,0], x[:,1], c=pred_y, cmap='rainbow')
# Overlay the fitted cluster centres as large red markers.
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='red')
plt.show()

In [None]:
# Awful results. However, agglomerative clustering seems to be better.

### Agglomerative Clustering

In [None]:
import scipy.cluster.hierarchy as shc

# Hierarchical clustering of `x` with Ward linkage, drawn as a dendrogram.
plt.figure(figsize=(10, 7))
plt.title("Customer Dendrograms")  # spelling fixed (was "Dendograms")
dend = shc.dendrogram(shc.linkage(x, method='ward'))
# Render explicitly, consistent with the other plotting cells.
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Number of clusters to cut the hierarchy into.
n_clusters = 4

# Ward linkage only supports Euclidean distance, which is also the estimator's default,
# so the distance argument is omitted. Passing it as `affinity=` fails on current
# scikit-learn (the keyword was deprecated in 1.2 in favour of `metric` and later removed).
cluster = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
# Fit and display the label assigned to each row of `x`; labels stay available as `cluster.labels_`.
cluster.fit_predict(x)

In [None]:
# Scatter of the two-component projection, coloured by agglomerative cluster label.
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(x[:, 0], x[:, 1], c=cluster.labels_, cmap='rainbow')

plt.show()