# Predicting Credit Card Customer Segmentation

In [1]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns",100)

from sklearn.model_selection import train_test_split

import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import Adam

from sklearn.metrics import r2_score, mean_squared_error,mean_absolute_error
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()

import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv("BankChurners.csv")#https://www.kaggle.com/datasets/thedevastator/predicting-credit-card-customer-attrition-with-m

### EDA

In [3]:
df

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,0.000093,0.999910
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,0.000057,0.999940
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.000,0.000021,0.999980
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.760,0.000134,0.999870
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.500,0.000,0.000022,0.999980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,772366833,Existing Customer,50,M,2,Graduate,Single,$40K - $60K,Blue,40,3,2,3,4003.0,1851,2152.0,0.703,15476,117,0.857,0.462,0.000191,0.999810
10123,710638233,Attrited Customer,41,M,2,Unknown,Divorced,$40K - $60K,Blue,25,4,2,3,4277.0,2186,2091.0,0.804,8764,69,0.683,0.511,0.995270,0.004729
10124,716506083,Attrited Customer,44,F,1,High School,Married,Less than $40K,Blue,36,5,3,4,5409.0,0,5409.0,0.819,10291,60,0.818,0.000,0.997880,0.002118
10125,717406983,Attrited Customer,30,M,2,Graduate,Unknown,$40K - $60K,Blue,36,4,3,3,5281.0,0,5281.0,0.535,8395,62,0.722,0.000,0.996710,0.003294


In [4]:
df.isnull().sum() #We examine the empty lines.

CLIENTNUM                                                                                                                             0
Attrition_Flag                                                                                                                        0
Customer_Age                                                                                                                          0
Gender                                                                                                                                0
Dependent_count                                                                                                                       0
Education_Level                                                                                                                       0
Marital_Status                                                                                                                        0
Income_Category                                 

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 23 columns):
 #   Column                                                                                                                              Non-Null Count  Dtype  
---  ------                                                                                                                              --------------  -----  
 0   CLIENTNUM                                                                                                                           10127 non-null  int64  
 1   Attrition_Flag                                                                                                                      10127 non-null  object 
 2   Customer_Age                                                                                                                        10127 non-null  int64  
 3   Gender                                                                           

### Feature Engineering

In [6]:
df["Attrition_Flag"].value_counts()

Existing Customer    8500
Attrited Customer    1627
Name: Attrition_Flag, dtype: int64

In [7]:
df["Attrition_Flag"]=df["Attrition_Flag"].map({"Existing Customer":0,"Attrited Customer":0})
df['Attrition_Flag']=df['Attrition_Flag'].astype(int)

In [8]:
df["Gender"].value_counts()

F    5358
M    4769
Name: Gender, dtype: int64

In [9]:
df["Gender"]=df["Gender"].map({"F":0,"M":1}) #We replace the data in Gender with 0 and 1.
df['Gender']=df['Gender'].astype(int) #We change the type of data in gender to integer.

In [10]:
df["Education_Level"].value_counts()

Graduate         3128
High School      2013
Unknown          1519
Uneducated       1487
College          1013
Post-Graduate     516
Doctorate         451
Name: Education_Level, dtype: int64

In [11]:
df=df[df['Education_Level']!="Unknown"] #We are deleting the Unknown data in Education_Level.
df["Education_Level"]=df["Education_Level"].map({"Graduate":0,"High School":1,"Uneducated":2,"College":3,"Post-Graduate":4,"Doctorate":5})
df['Education_Level']=df['Education_Level'].astype(int)

In [12]:
df["Marital_Status"].value_counts()

Married     3999
Single      3322
Divorced     652
Unknown      635
Name: Marital_Status, dtype: int64

In [13]:
df=df[df['Marital_Status']!="Unknown"]
df["Marital_Status"]=df["Marital_Status"].map({"Married":0,"Single":1,"Divorced":2})
df['Marital_Status']=df['Marital_Status'].astype(int)

In [14]:
df["Income_Category"].value_counts()

Less than $40K    2792
$40K - $60K       1412
$80K - $120K      1202
$60K - $80K       1103
Unknown            892
$120K +            572
Name: Income_Category, dtype: int64

In [15]:
df=df[df['Income_Category']!="Unknown"]
df["Income_Category"]=df["Income_Category"].map({"Less than $40K":0,"$40K - $60K":1,"$80K - $120K":2,"$60K - $80K":3,"$120K +":4})
df['Income_Category']=df['Income_Category'].astype(int)

In [16]:
df["Card_Category"].value_counts()

Blue        6598
Silver       391
Gold          81
Platinum      11
Name: Card_Category, dtype: int64

In [17]:
df["Card_Category"]=df["Card_Category"].map({"Blue":0,"Silver":1,"Gold":2,"Platinum":3})
df['Card_Category']=df['Card_Category'].astype(int)

In [18]:
abs(df.corr()["Credit_Limit"].sort_values(ascending=False)) #We look at their correlations.

Credit_Limit                                                                                                                          1.000000
Avg_Open_To_Buy                                                                                                                       0.996041
Income_Category                                                                                                                       0.530106
Gender                                                                                                                                0.476471
Card_Category                                                                                                                         0.475312
Total_Trans_Amt                                                                                                                       0.171591
Dependent_count                                                                                                                       0.081379

### Regression

In [19]:
x,y=df.drop(["Avg_Open_To_Buy","Credit_Limit"],axis=1),df[["Credit_Limit"]]
x=scaler.fit_transform(x)
x.shape

(7081, 21)

In [20]:
def algo_test(x,y):
    L = LinearRegression()
    E = ElasticNet()
    R = Ridge()
    Lass = Lasso()
    ETR=ExtraTreeRegressor()
    GBR=GradientBoostingRegressor()
    XGBC= XGBRegressor()
    x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.2,random_state=13)
    algos = [L,E,R,Lass,ETR,GBR,XGBC]
    algo_names = ['Linear','ElasticNet','Ridge','Lasso','Extra Tree','Gradient Boosting','XGradientBooting']
    r_squared = []
    rmse = []
    mae = []
    result = pd.DataFrame(columns = ['R_Squared','RMSE','MAE'],index = algo_names)
    for algo in algos:
        algo.fit(x_train,y_train)    
        r_squared.append(r2_score(y_test,algo.predict(x_test)))
        rmse.append(mean_squared_error(y_test, algo.predict(x_test))**.5)
        mae.append(mean_absolute_error(y_test, algo.predict(x_test)))
    result.R_Squared = r_squared
    result.RMSE = rmse
    result.MAE= mae
    return result.sort_values('R_Squared', ascending=False)

In [21]:
algo_test(x,y)

Unnamed: 0,R_Squared,RMSE,MAE
Gradient Boosting,0.878672,3187.004334,1643.471908
XGradientBooting,0.86654,3342.544141,1505.965715
Extra Tree,0.789897,4193.900315,1661.009739
Ridge,0.59655,5811.610735,4207.966102
Lasso,0.596334,5813.16876,4206.812666
Linear,0.596144,5814.530593,4209.805458
ElasticNet,0.232004,8018.279377,5922.250288


In [22]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=42)

In [23]:
model=Sequential()
model.add(Dense(252,activation="relu"))
model.add(Dense(252,activation="relu"))
model.add(Dense(252,activation="relu"))
model.add(Dense(252,activation="relu"))
model.add(Dense(252,activation="relu"))
model.add(Dense(252,activation="relu"))
model.add(Dense(252,activation="relu"))
model.add(Dense(252,activation="relu"))
model.add(Dense(1))
model.compile(optimizer="adam",loss="mse")

In [24]:
model.fit(x_train,y_train,validation_data=(x_test,y_test),batch_size=128,epochs=67)
model.summary()

Epoch 1/67
Epoch 2/67
Epoch 3/67
Epoch 4/67
Epoch 5/67
Epoch 6/67
Epoch 7/67
Epoch 8/67
Epoch 9/67
Epoch 10/67
Epoch 11/67
Epoch 12/67
Epoch 13/67
Epoch 14/67
Epoch 15/67
Epoch 16/67
Epoch 17/67
Epoch 18/67
Epoch 19/67
Epoch 20/67
Epoch 21/67
Epoch 22/67
Epoch 23/67
Epoch 24/67
Epoch 25/67
Epoch 26/67
Epoch 27/67
Epoch 28/67
Epoch 29/67
Epoch 30/67
Epoch 31/67
Epoch 32/67
Epoch 33/67
Epoch 34/67
Epoch 35/67
Epoch 36/67
Epoch 37/67
Epoch 38/67
Epoch 39/67
Epoch 40/67
Epoch 41/67
Epoch 42/67
Epoch 43/67
Epoch 44/67
Epoch 45/67
Epoch 46/67
Epoch 47/67
Epoch 48/67
Epoch 49/67
Epoch 50/67
Epoch 51/67
Epoch 52/67
Epoch 53/67
Epoch 54/67
Epoch 55/67
Epoch 56/67
Epoch 57/67
Epoch 58/67
Epoch 59/67
Epoch 60/67
Epoch 61/67
Epoch 62/67
Epoch 63/67
Epoch 64/67
Epoch 65/67
Epoch 66/67
Epoch 67/67
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 252)             

                                                                 
 dense_4 (Dense)             (None, 252)               63756     
                                                                 
 dense_5 (Dense)             (None, 252)               63756     
                                                                 
 dense_6 (Dense)             (None, 252)               63756     
                                                                 
 dense_7 (Dense)             (None, 252)               63756     
                                                                 
 dense_8 (Dense)             (None, 1)                 253       
                                                                 
Total params: 452,089
Trainable params: 452,089
Non-trainable params: 0
_________________________________________________________________


In [25]:
tahmin=model.predict(x_test)



In [26]:
r2_score(tahmin,y_test)

0.8681478798911424

In [27]:
(mean_squared_error(tahmin,y_test))**0.5

3231.43101733604