# High Contribution 

In [1]:
import numpy as np
import pandas as pd 
import tensorflow as tf
import tensorflow.keras as ks

#unsupervised:
from sklearn.cluster import KMeans
#transformers:
from sklearn.preprocessing import OneHotEncoder, TargetEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
import scipy.stats as st

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

from itertools import combinations


# Note: the training data has 13 feature columns and 1 binary label (Exited), for a total of 14 columns.

In [2]:
# Read the train and test CSV files from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
# Peek at the first three training rows.
train.iloc[:3]

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0


In [4]:
# (number of rows, number of columns) of the training frame.
(len(train.index), len(train.columns))

(165034, 14)

In [5]:
# Elbow / chi-square scan over the number of KMeans clusters for a set of columns.
def kmeansAnalyzer(column_names, n_clusters_max = 14):
    """Scan KMeans cluster counts on standard-scaled columns of ``train``.

    For every cluster count from 2 to ``n_clusters_max`` (inclusive) a KMeans
    model is fitted on the standardized ``train[column_names]``. For each
    model the inertia (SSE) is recorded together with a chi-square test of
    independence between the cluster id and ``train.Exited``. The SSE curve
    is plotted before returning.

    Parameters
    ----------
    column_names : list
        Columns of the notebook-level ``train`` frame to cluster on. They are
        passed to ``StandardScaler``, so they must be numeric.
    n_clusters_max : int, default 14
        Largest number of clusters to try.

    Returns
    -------
    pandas.DataFrame
        One row per cluster count with columns ``n_clusters``, ``sse``,
        ``stat`` (chi-square statistic) and ``pval``.
    """
    scaled = StandardScaler().fit_transform(train[column_names])
    # Pair labels with the target by position: an index-based merge would
    # silently drop or misalign rows if `train` does not have a default index.
    exited = train.Exited.to_numpy()

    scan_rows = []
    for k in range(2, n_clusters_max + 1):
        model = KMeans(n_clusters = k, n_init = 10, random_state = 11)
        # fit_predict is fit() followed by predict() on the same data.
        labels = model.fit_predict(scaled)
        contingency = pd.crosstab(labels, exited)
        result = st.chi2_contingency(contingency)
        scan_rows.append((k, model.inertia_, result.statistic, result.pvalue))

    for_plot = pd.DataFrame(scan_rows, columns = ['n_clusters', 'sse', 'stat', 'pval'])

    # Elbow plot of the within-cluster sum of squares.
    plt.plot(for_plot.n_clusters, for_plot.sse)
    plt.xlabel("Number of Clusters")
    plt.ylabel("SSE")
    plt.show()

    return for_plot
        


In [6]:
# combination of categorical and Numeric data:
class ComboCatNumeric(BaseEstimator, TransformerMixin):
    """Replace rows by the target encoding of their KMeans cluster.

    The categorical columns are one-hot encoded, the numeric columns are
    standard-scaled, both parts are stacked side by side and clustered with
    KMeans. The cluster id of each row is then target-encoded against the
    binary label given to ``fit``.

    Parameters
    ----------
    cat_columns : list, default []
        Column labels of ``X`` to one-hot encode. The default list is stored
        as given and never mutated by this class.
    num_columns : list, default []
        Column labels of ``X`` to standard-scale.
    n_clusters : int, default 6
        Number of KMeans clusters.
    random_state : int, default 11
        Seed passed to both KMeans and TargetEncoder.

    Notes
    -----
    ``fit(X, y).transform(X)`` uses TargetEncoder without its internal
    cross-fitting, so encodings of the training rows are computed from
    their own labels.
    """

    def __init__(self, cat_columns = [], num_columns = [], n_clusters =6, random_state=11):
        self.cat_columns = cat_columns
        self.num_columns = num_columns
        self.n_clusters = n_clusters
        self.random_state = random_state
        self._make_estimators()
        print("Creating transformer combos for {} and {}.".format(self.cat_columns, self.num_columns))

    def _make_estimators(self):
        """Create fresh sub-estimators from the current hyper-parameters."""
        self.standardScaler = StandardScaler()
        self.oneHotEncoder = OneHotEncoder(sparse_output =False)
        self.kmeans = KMeans(n_clusters = self.n_clusters, n_init = 10, random_state = self.random_state, max_iter = 50)
        self.targetEncoder = TargetEncoder(target_type = 'binary', random_state = self.random_state)

    def _design_matrix(self, X, fit = False):
        """Return the one-hot + scaled matrix for X, fitting the encoders when ``fit`` is True."""
        cat_columns = list(self.cat_columns)
        num_columns = list(self.num_columns)
        if not cat_columns and not num_columns:
            # Previously this surfaced later as an UnboundLocalError.
            raise ValueError("ComboCatNumeric needs at least one column in cat_columns or num_columns.")

        parts = []
        if cat_columns:
            if fit:
                self.oneHotEncoder.fit(X[cat_columns])
            parts.append(self.oneHotEncoder.transform(X[cat_columns]))
        if num_columns:
            numeric = np.array(X[num_columns])
            if fit:
                parts.append(self.standardScaler.fit_transform(numeric))
            else:
                parts.append(self.standardScaler.transform(numeric))
        return np.hstack(parts)

    def fit(self, X, y):
        """Fit the encoders, the KMeans model and the target encoder.

        ``X`` is indexed by column label; ``y`` is the binary target passed
        to TargetEncoder. Returns ``self``.
        """
        # Rebuild the sub-estimators so that hyper-parameters changed through
        # set_params() after construction are actually used.
        self._make_estimators()
        features = self._design_matrix(X, fit = True)
        cluster_ids = self.kmeans.fit_predict(features)
        self.targetEncoder.fit(cluster_ids.reshape(-1,1), y)
        return self

    def transform(self, X):
        """Return the target-encoded cluster id of every row as an (n_rows, 1) array."""
        features = self._design_matrix(X)
        cluster_ids = self.kmeans.predict(features)
        encoded = self.targetEncoder.transform(cluster_ids.reshape(-1,1))
        print("Completed the transformer combos for {} and {}.".format(self.cat_columns, self.num_columns))
        return encoded.reshape(-1,1)

        

In [13]:
# Transform into KMeans
class KMeansTransformer(BaseEstimator, TransformerMixin):
    """Map rows to the id of their KMeans cluster.

    The categorical columns are one-hot encoded, the numeric columns are
    standard-scaled, both parts are stacked side by side and clustered with
    KMeans. ``transform`` returns the predicted cluster id of each row.

    Parameters
    ----------
    cat_columns : list, default []
        Column labels of ``X`` to one-hot encode. The default list is stored
        as given and never mutated by this class.
    num_columns : list, default []
        Column labels of ``X`` to standard-scale.
    n_clusters : int, default 6
        Number of KMeans clusters.
    random_state : int, default 11
        Seed passed to KMeans.
    """

    def __init__(self, cat_columns = [], num_columns = [], n_clusters =6, random_state=11):
        self.cat_columns = cat_columns
        self.num_columns = num_columns
        self.n_clusters = n_clusters
        self.random_state = random_state
        self._make_estimators()
        # Not used by fit/transform; kept so existing attribute access keeps working.
        self.targetEncoder = TargetEncoder(target_type = 'binary', random_state = self.random_state)
        print("Creating transformer combos for {} and {}.".format(self.cat_columns, self.num_columns))

    def _make_estimators(self):
        """Create fresh sub-estimators from the current hyper-parameters."""
        self.standardScaler = StandardScaler()
        self.oneHotEncoder = OneHotEncoder(sparse_output =False)
        self.kmeans = KMeans(n_clusters = self.n_clusters, n_init = 10, random_state = self.random_state, max_iter = 50)

    def _design_matrix(self, X, fit = False):
        """Return the one-hot + scaled matrix for X, fitting the encoders when ``fit`` is True."""
        cat_columns = list(self.cat_columns)
        num_columns = list(self.num_columns)
        if not cat_columns and not num_columns:
            # Previously this surfaced later as an UnboundLocalError.
            raise ValueError("KMeansTransformer needs at least one column in cat_columns or num_columns.")

        parts = []
        if cat_columns:
            if fit:
                self.oneHotEncoder.fit(X[cat_columns])
            parts.append(self.oneHotEncoder.transform(X[cat_columns]))
        if num_columns:
            numeric = np.array(X[num_columns])
            if fit:
                parts.append(self.standardScaler.fit_transform(numeric))
            else:
                parts.append(self.standardScaler.transform(numeric))
        return np.hstack(parts)

    def fit(self, X, y = None):
        """Fit the encoders and the KMeans model on X; ``y`` is ignored. Returns ``self``."""
        # Rebuild the sub-estimators so that hyper-parameters changed through
        # set_params() after construction are actually used.
        self._make_estimators()
        self.kmeans.fit(self._design_matrix(X, fit = True))
        return self

    def transform(self, X):
        """Return the predicted cluster id of every row as an (n_rows, 1) array."""
        cluster_ids = self.kmeans.predict(self._design_matrix(X))
        return cluster_ids.reshape(-1,1)

        

In [34]:
# Score every (categorical column, pair of numeric columns) combination by the
# chi-square statistic between its 4-cluster KMeans assignment and Exited.
cat_list = ['Geography', 'Gender']
num_list = ['Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'CreditScore']

two = list(combinations(num_list, 2))
result_list = []
for cat_col in cat_list:
    for num_pair in two:
        print("checking for {} and {} combos.".format(cat_col, num_pair))
        kk = KMeansTransformer(cat_columns = [cat_col], num_columns = list(num_pair), n_clusters = 4)
        cluster_ids = kk.fit(train).transform(train)
        # Contingency table of cluster id (rows) against the label (columns).
        contingency = pd.crosstab(cluster_ids.ravel(), train.Exited.to_numpy())
        chi2_result = st.chi2_contingency(contingency)
        result_list.append((chi2_result.statistic, cat_col, num_pair))
        del cluster_ids, contingency, chi2_result
        

checking for Geography and ('Age', 'Tenure') combos.
Creating transformer combos for ['Geography'] and ['Age', 'Tenure'].
checking for Geography and ('Age', 'Balance') combos.
Creating transformer combos for ['Geography'] and ['Age', 'Balance'].
checking for Geography and ('Age', 'NumOfProducts') combos.
Creating transformer combos for ['Geography'] and ['Age', 'NumOfProducts'].
checking for Geography and ('Age', 'HasCrCard') combos.
Creating transformer combos for ['Geography'] and ['Age', 'HasCrCard'].
checking for Geography and ('Age', 'IsActiveMember') combos.
Creating transformer combos for ['Geography'] and ['Age', 'IsActiveMember'].
checking for Geography and ('Age', 'EstimatedSalary') combos.
Creating transformer combos for ['Geography'] and ['Age', 'EstimatedSalary'].
checking for Geography and ('Age', 'CreditScore') combos.
Creating transformer combos for ['Geography'] and ['Age', 'CreditScore'].
checking for Geography and ('Tenure', 'Balance') combos.
Creating transformer co

In [37]:
# Tabulate the pair scan; column 0 holds the chi-square statistic, so sorting
# on it in descending order lists the strongest associations first.
output = pd.DataFrame(result_list)
output.sort_values(0, ascending = False)

Unnamed: 0,0,1,2
4,33499.477589,Geography,"(Age, IsActiveMember)"
2,28211.838285,Geography,"(Age, NumOfProducts)"
30,27347.69995,Gender,"(Age, NumOfProducts)"
1,25069.028532,Geography,"(Age, Balance)"
47,23632.476321,Gender,"(NumOfProducts, IsActiveMember)"
19,23632.476321,Geography,"(NumOfProducts, IsActiveMember)"
6,20777.134027,Geography,"(Age, CreditScore)"
34,20108.860066,Gender,"(Age, CreditScore)"
41,19827.358373,Gender,"(Balance, NumOfProducts)"
13,19761.156169,Geography,"(Balance, NumOfProducts)"


In [33]:
# Score every (categorical column, triple of numeric columns) combination by
# the chi-square statistic between its 4-cluster KMeans assignment and Exited.
cat_list = ['Geography', 'Gender']
num_list = ['Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'CreditScore']

three = list(combinations(num_list, 3))
result_list_3 = []
for cat_col in cat_list:
    for num_triple in three:
        print("checking for {} and {} combos.".format(cat_col, num_triple))
        kk = KMeansTransformer(cat_columns = [cat_col], num_columns = list(num_triple), n_clusters = 4)
        cluster_ids = kk.fit(train).transform(train)
        # Contingency table of cluster id (rows) against the label (columns).
        contingency = pd.crosstab(cluster_ids.ravel(), train.Exited.to_numpy())
        chi2_result = st.chi2_contingency(contingency)
        result_list_3.append((chi2_result.statistic, cat_col, num_triple))
        del cluster_ids, contingency, chi2_result
        

checking for Geography and ('Age', 'Tenure', 'Balance') combos.
Creating transformer combos for ['Geography'] and ['Age', 'Tenure', 'Balance'].
checking for Geography and ('Age', 'Tenure', 'NumOfProducts') combos.
Creating transformer combos for ['Geography'] and ['Age', 'Tenure', 'NumOfProducts'].
checking for Geography and ('Age', 'Tenure', 'HasCrCard') combos.
Creating transformer combos for ['Geography'] and ['Age', 'Tenure', 'HasCrCard'].
checking for Geography and ('Age', 'Tenure', 'IsActiveMember') combos.
Creating transformer combos for ['Geography'] and ['Age', 'Tenure', 'IsActiveMember'].
checking for Geography and ('Age', 'Tenure', 'EstimatedSalary') combos.
Creating transformer combos for ['Geography'] and ['Age', 'Tenure', 'EstimatedSalary'].
checking for Geography and ('Age', 'Tenure', 'CreditScore') combos.
Creating transformer combos for ['Geography'] and ['Age', 'Tenure', 'CreditScore'].
checking for Geography and ('Age', 'Balance', 'NumOfProducts') combos.
Creating tr

In [36]:
# Tabulate the triple scan; column 0 holds the chi-square statistic, so sorting
# on it in descending order lists the strongest associations first.
output3 = pd.DataFrame(result_list_3)
output3.sort_values(0, ascending = False)

Unnamed: 0,0,1,2
14,27553.887934,Geography,"(Age, NumOfProducts, CreditScore)"
70,27520.413111,Gender,"(Age, NumOfProducts, CreditScore)"
1,27432.045800,Geography,"(Age, Tenure, NumOfProducts)"
69,26100.689370,Gender,"(Age, NumOfProducts, EstimatedSalary)"
13,26069.583601,Geography,"(Age, NumOfProducts, EstimatedSalary)"
...,...,...,...
54,164.694091,Geography,"(HasCrCard, EstimatedSalary, CreditScore)"
31,156.161866,Geography,"(Tenure, HasCrCard, EstimatedSalary)"
87,153.073705,Gender,"(Tenure, HasCrCard, EstimatedSalary)"
91,141.307135,Gender,"(Tenure, EstimatedSalary, CreditScore)"
