# 

In [70]:
import numpy as np
import pandas as pd 
import tensorflow as tf
import tensorflow.keras as ks

#unsupervised:
from sklearn.cluster import KMeans
#transformers:
from sklearn.preprocessing import OneHotEncoder, TargetEncoder
from sklearn.base import BaseEstimator, TransformerMixin
import scipy.stats as st

In [71]:
# Load the training data from the CSV file next to the notebook.
train = pd.read_csv(filepath_or_buffer="train.csv")

In [72]:
# Preview the first four rows of the training frame.
train.head(n=4)

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0


In [115]:
# Dimensions of the training frame as (rows, columns).
train.shape

(165034, 14)

In [73]:
# Class balance of the target label "Exited":
# retained customers (0) outnumber customers who exited (1).
exited_groups = train.groupby("Exited")
exited_groups["Exited"].count()

Exited
0    130113
1     34921
Name: Exited, dtype: int64

In [74]:
# Contingency table of Geography (rows) against Exited (columns).
# A plain crosstab already counts occurrences, so no `values`/`aggfunc`
# are needed; it also fills empty combinations with 0 instead of NaN,
# which keeps the table usable as input to a chi-square test.
geo_chi = pd.crosstab(index=train["Geography"], columns=train["Exited"])
geo_chi

Exited,0,1
Geography,Unnamed: 1_level_1,Unnamed: 2_level_1
France,78643,15572
Germany,21492,13114
Spain,29978,6235


In [75]:
# Chi-square test of independence on the Geography x Exited table.
# The reported p-value is 0.0, i.e. Geography is significantly
# associated with Exited.
result = st.chi2_contingency(geo_chi)
print(f"statistics : {result.statistic}, p-value : {result.pvalue}")

statistics : 7358.673765244894, p-value : 0.0


In [123]:
# Based on the above, we use the Transformer below:
class GeographyTransformer(BaseEstimator, TransformerMixin):
    """Encode the ``Geography`` column with one-hot and target encoding.

    ``transform`` returns a 2-D NumPy array: the one-hot indicator columns
    (one per category seen during ``fit``) followed by the target-encoding
    column(s) learned from the target labels.

    NOTE(review): the inherited ``fit_transform`` runs ``fit`` followed by
    ``transform``, so the target encoding of the training rows is computed
    without the cross fitting that ``TargetEncoder.fit_transform`` applies.
    Confirm this is acceptable before using the training output as model
    input, since it can leak the target.
    """

    def __init__(self):
        self.oneHotEncoder = OneHotEncoder(sparse_output = False)
        self.targetEncoder = TargetEncoder()

    @staticmethod
    def _geography_column(X):
        """Return ``X['Geography']`` as a column vector of shape (n_rows, 1)."""
        return np.asarray(X['Geography']).reshape(-1, 1)

    def fit(self, X, y = None):
        """Fit both encoders on the ``Geography`` column.

        Parameters
        ----------
        X : DataFrame-like
            Must provide a ``Geography`` column, and an ``Exited`` column
            when ``y`` is not given.
        y : array-like, optional
            Target labels for the target encoder. When ``None`` the labels
            are read from ``X['Exited']``, so ``fit(train)`` keeps working.

        Returns
        -------
        self

        Raises
        ------
        KeyError
            If ``Geography`` is missing, or ``y`` is ``None`` and ``Exited``
            is missing.
        """
        geography = self._geography_column(X)
        # Honour an explicitly supplied target (e.g. from a Pipeline);
        # only fall back to the label column when none was passed.
        target = np.asarray(X['Exited'] if y is None else y)
        self.oneHotEncoder.fit(geography)
        self.targetEncoder.fit(geography, target)
        return self

    def transform(self, X):
        """Encode ``X['Geography']`` with the fitted encoders.

        Only the ``Geography`` column is read, so ``X`` does not need to
        contain the target.

        Returns
        -------
        numpy.ndarray
            One-hot columns followed by the target-encoding column(s),
            one row per row of ``X``.
        """
        geography = self._geography_column(X)
        one_hot = self.oneHotEncoder.transform(geography)
        target_encoded = self.targetEncoder.transform(geography)
        return np.hstack([one_hot, target_encoded])

In [124]:
# Fit the geography encoder on the training frame, then encode that same frame.
gt = GeographyTransformer()
gt.fit(train)
output = gt.transform(train)

(165034, 3)
(165034, 1)


In [125]:
# Display the encoded array returned by the transformer.
output

array([[1.        , 0.        , 0.        , 0.16528194],
       [1.        , 0.        , 0.        , 0.16528194],
       [1.        , 0.        , 0.        , 0.16528194],
       ...,
       [1.        , 0.        , 0.        , 0.16528194],
       [0.        , 0.        , 1.        , 0.17217667],
       [1.        , 0.        , 0.        , 0.16528194]])

In [140]:
# Wrap the encoded array in a DataFrame (columns are labelled 0..n-1)
# so pandas utilities can be used on it.
z = pd.DataFrame(data=output)

In [142]:
# Pairwise Pearson correlation between the encoded columns.
z.corr(method="pearson")

Unnamed: 0,0,1,2,3
0,1.0,-0.594122,-0.611539,-0.619422
1,-0.594122,1.0,-0.273105,0.999493
2,-0.611539,-0.273105,1.0,-0.242348
3,-0.619422,0.999493,-0.242348,1.0


In [144]:
# Distinct values taken by the last encoded column (label 3).
z.loc[:, 3].unique()

array([0.16528194, 0.17217667, 0.3789448 ])