# Low Contribution 

In [1]:
import numpy as np
import pandas as pd 
import tensorflow as tf
import tensorflow.keras as ks

#unsupervised:
from sklearn.cluster import KMeans
#transformers:
from sklearn.preprocessing import OneHotEncoder, TargetEncoder
from sklearn.base import BaseEstimator, TransformerMixin
import scipy.stats as st

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

#Note: the train data has 13 feature columns and 1 binary label (Exited), for a total of 14 columns

In [2]:
# Read train.csv and test.csv (paths relative to the working directory) into DataFrames.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
# Preview the first four rows of the training frame.
train.head(4)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0


In [4]:
# (rows, columns) of the training frame.
train.shape

(165034, 14)

In [5]:
# List the distinct values taken by HasCrCard.
train.HasCrCard.unique()

array([1., 0.])

In [6]:
# Observed counts of HasCrCard (rows) against Exited (columns); this table is
# the input to the chi-square test of independence.
contingency = pd.crosstab(index = train["HasCrCard"], columns = train["Exited"])
contingency

Exited,0,1
HasCrCard,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,31371,9235
1.0,98742,25686


In [7]:
# Chi-square test of independence on the current `contingency` table;
# print the test statistic and its p-value.
result = st.chi2_contingency(contingency)
print("stat: {}, p_value = {}".format(result.statistic, result.pvalue))

stat: 80.78022958709316, p_value = 2.5226925668481934e-19


In [8]:
# Independent copy of the three columns that will be clustered together.
junk = train.loc[:, ['CreditScore','Tenure','HasCrCard']].copy()

In [9]:
# KMeans starts from random centroids: without a fixed seed the cluster labels
# (and every statistic computed from them) change on each run. The seed matches
# the one used inside comboOptimizer1.
kk = KMeans(n_clusters = 4, n_init = 10, random_state = 11)
kk.fit(junk)

In [10]:
# Cluster index assigned to each row the model was fitted on.
kk.labels_

array([0, 1, 0, ..., 3, 3, 2], dtype=int32)

In [11]:
# Put each row's cluster label next to its Exited value, matching on the row index.
kdf = pd.DataFrame({'combo': kk.labels_})
z = kdf.merge(train.Exited, left_index = True, right_index = True)

In [12]:
# Observed counts of cluster label (rows) against Exited (columns).
contingency = pd.crosstab(index = z["combo"], columns = z["Exited"])
contingency

Exited,0,1
combo,Unnamed: 1_level_1,Unnamed: 2_level_1
0,48880,11658
1,43571,12419
2,19304,5124
3,18358,5720


In [13]:
# Chi-square test of independence on the current `contingency` table;
# print the test statistic and its p-value.
result = st.chi2_contingency(contingency)
print("stat: {}, p_value = {}".format(result.statistic, result.pvalue))

stat: 264.11265813742415, p_value = 5.796085313174012e-57


In [14]:
#optimizer:
#optimize the binning
def comboOptimizer1(column_list, dataframe, max_n_clusters = 14, target = 'Exited'):
    """Score KMeans cluster counts by their association with a target column.

    For every cluster count ``k`` in ``range(2, max_n_clusters)`` a KMeans model
    is fitted on ``dataframe[column_list]``, the cluster labels are
    cross-tabulated against ``dataframe[target]``, and a chi-square test of
    independence is run on that table.

    Parameters
    ----------
    column_list : list of str
        Columns of ``dataframe`` to cluster on.
    dataframe : pandas.DataFrame
        Frame holding both the feature columns and the ``target`` column.
    max_n_clusters : int, default 14
        Exclusive upper bound of the search: the largest cluster count tried
        is ``max_n_clusters - 1``.
    target : str, default 'Exited'
        Name of the column the clusters are tested against.

    Returns
    -------
    pandas.DataFrame
        One row per cluster count with columns ``n_clusters``, ``stat`` and
        ``p_value``. The columns are present even when no cluster count was
        tried (``max_n_clusters <= 2``).
    """
    features = dataframe[column_list]
    # KMeans returns labels in row order, so pair them with the target by
    # position. Merging on the index would drop or misalign rows whenever
    # `dataframe` does not carry a default 0..n-1 index (e.g. after filtering).
    target_values = dataframe[target].to_numpy()

    output = []
    print("optimizing combination for features: {}".format(column_list))
    for i in range(2,max_n_clusters):
        kk = KMeans(n_clusters = i, n_init = 2, random_state = 11)
        kk.fit(features)
        contingency = pd.crosstab(kk.labels_, target_values)
        result = st.chi2_contingency(contingency)
        output.append({"n_clusters": i, "stat": result.statistic, "p_value": result.pvalue})
    # Naming the columns keeps the result sortable by 'stat' even when it is empty.
    return pd.DataFrame(output, columns = ["n_clusters", "stat", "p_value"])


In [15]:
# Search cluster counts 2..15 (the upper bound is exclusive) for CreditScore + Tenure.
feature_list = ['CreditScore','Tenure']
results = comboOptimizer1(column_list = feature_list, dataframe = train, 
                         max_n_clusters = 16)
results.sort_values(by = 'stat', ascending = False)
#Based on the table below (largest chi-square statistic), we will use n_clusters = 11

optimizing combination for features: ['CreditScore', 'Tenure']


Unnamed: 0,n_clusters,stat,p_value
9,11,363.198596,6.284792e-72
13,15,339.8057,5.6431259999999996e-64
10,12,333.029076,9.400645e-65
11,13,332.521425,6.789465e-64
12,14,326.132965,8.030059e-62
8,10,314.517832,2.165416e-62
6,8,309.867829,4.7191589999999996e-63
7,9,288.929991,9.329011e-58
2,4,259.561311,5.593725e-56
5,7,243.088205,1.22928e-49


In [20]:
# Search cluster counts 2..15 (the upper bound is exclusive) for CreditScore + HasCrCard.
feature_list = ['CreditScore','HasCrCard']
results = comboOptimizer1(column_list = feature_list, dataframe = train, 
                         max_n_clusters = 16)
results.sort_values(by = 'stat', ascending = False)
#Based on the table below (largest chi-square statistic), we will use n_clusters = 15

optimizing combination for features: ['CreditScore', 'HasCrCard']


Unnamed: 0,n_clusters,stat,p_value
13,15,361.036058,1.987938e-68
12,14,344.644261,1.037788e-65
8,10,336.854247,3.880541e-67
10,12,332.663473,1.123088e-64
6,8,327.36544,8.581671e-67
11,13,320.621588,2.173761e-61
7,9,311.45501,1.498793e-62
9,11,301.128659,8.974983e-59
5,7,297.163653,3.314971e-61
2,4,258.639436,8.853542e-56


In [32]:
# Search cluster counts 2..15 (the upper bound is exclusive) for Tenure + HasCrCard.
feature_list = ['Tenure','HasCrCard']
results = comboOptimizer1(column_list = feature_list, dataframe = train, 
                         max_n_clusters = 16)
results.sort_values(by = 'stat', ascending = False)
#The table below ranks n_clusters = 14 highest (largest chi-square statistic)

optimizing combination for features: ['Tenure', 'HasCrCard']


Unnamed: 0,n_clusters,stat,p_value
12,14,318.791298,2.7854860000000003e-60
13,15,309.84845,1.041563e-57
11,13,295.654115,3.836644e-56
10,12,293.643797,1.91067e-56
8,10,252.758698,2.608954e-49
9,11,251.939016,2.1226949999999997e-48
7,9,250.76418,1.186417e-49
5,7,240.347067,4.73274e-49
6,8,185.518118,1.3298489999999999e-36
3,5,181.572368,3.426405e-38


In [29]:
class ComboTransformer1(BaseEstimator, TransformerMixin):
    """Collapse several columns into one target-encoded KMeans cluster feature.

    ``fit`` clusters ``X[column_list]`` with KMeans and then fits a
    ``TargetEncoder`` on the resulting cluster ids against ``y``.
    ``transform`` assigns rows to the learned clusters and returns the encoded
    cluster ids as a single-column 2-D array.

    Parameters
    ----------
    column_list : list of str
        Columns of ``X`` that are clustered together.
    n_clusters : int, default 11
        Number of KMeans clusters.
    n_init : int, default 10
        Number of KMeans initialisations.
    random_state : int or None, default None
        Seed forwarded to KMeans and to the TargetEncoder; set it to get
        repeatable clusters and cross-fitting folds.
    """

    def __init__(self, column_list, n_clusters = 11, n_init = 10, random_state = None):
        self.column_list = column_list
        self.n_clusters = n_clusters
        self.n_init = n_init
        self.random_state = random_state
        # Kept so the attributes exist before fit(), as they always have;
        # fit() rebuilds both from the parameters current at that moment.
        self.kmeans = self._make_kmeans()
        self.targetEncoder = self._make_encoder()

    def _make_kmeans(self):
        # Build an unfitted KMeans from the current hyper-parameters.
        return KMeans(n_clusters = self.n_clusters, n_init = self.n_init,
                      random_state = self.random_state)

    def _make_encoder(self):
        # Build an unfitted TargetEncoder using the current seed.
        return TargetEncoder(random_state = self.random_state)

    def _fit_clusters(self, X):
        # Refit KMeans from the current hyper-parameters and return the
        # training rows' cluster ids as a column vector.
        # Rebuilding here (rather than reusing the __init__ instance) makes
        # set_params(n_clusters=...) take effect, e.g. inside a grid search.
        self.kmeans = self._make_kmeans()
        self.targetEncoder = self._make_encoder()
        self.kmeans.fit(X[self.column_list])
        return self._cluster_ids(X)

    def _cluster_ids(self, X):
        # Assign rows to the fitted clusters; shaped (n_rows, 1) for the encoder.
        return self.kmeans.predict(X[self.column_list]).reshape(-1,1)

    def fit(self, X, y):
        """Fit the clustering on ``X[column_list]`` and the encoder on (cluster id, ``y``).

        Returns ``self``.
        """
        cluster_ids = self._fit_clusters(X)
        self.targetEncoder.fit(cluster_ids, y)
        return self

    def fit_transform(self, X, y):
        """Fit on ``X``/``y`` and return the encoding of those same rows.

        Uses ``TargetEncoder.fit_transform``, which cross-fits the encodings so
        that a training row is not encoded with its own target value. The
        inherited ``fit(X, y).transform(X)`` would leak the target into the
        training features when this transformer is used inside a pipeline.
        """
        cluster_ids = self._fit_clusters(X)
        return self.targetEncoder.fit_transform(cluster_ids, y)

    def transform(self, X):
        """Return the target-encoded cluster id of each row as an ``(n_rows, 1)`` array."""
        return self.targetEncoder.transform(self._cluster_ids(X))
        
        
        
        

In [30]:
# Fit the combo transformer on HasCrCard + CreditScore with 15 clusters,
# using Exited as the target for the encoding.
s = ComboTransformer1(column_list = ['HasCrCard', 'CreditScore'], n_clusters = 15)
s.fit(train, train.Exited)

In [31]:
# Encode the same rows the transformer was fitted on; the result has one column.
s.transform(train)

array([[0.21719663],
       [0.22516242],
       [0.18746113],
       ...,
       [0.24042829],
       [0.24042829],
       [0.20222017]])