In [359]:
import pandas as pd
import numpy as np

In [360]:
df = pd.read_csv("Churn_Modelling.csv")

In [361]:
df.drop(columns=['RowNumber' , 'CustomerId' , 'Surname'],inplace=True)

In [362]:
categorical_features = df.select_dtypes(include='O').columns

In [363]:
df['Geography'].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

# Applying OHE in Geography and Ordinal Encoder on Gender Feature

Female = 0

Male = 1

In [364]:
df

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [365]:
from sklearn.preprocessing import StandardScaler , OneHotEncoder , OrdinalEncoder

In [366]:
from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer

In [367]:
cols = df.columns

In [368]:
X = df.drop(columns='Exited')
y = df['Exited']

In [369]:
preprocessing = ColumnTransformer(
    transformers=[
        ('Gender_Ordinal_Encoder' , OrdinalEncoder() , ['Gender']),
        ('Geography_OHE' , OneHotEncoder(handle_unknown='ignore' , sparse_output=False) , ['Geography'])
    ],remainder=StandardScaler()
)

In [370]:
df_data = preprocessing.fit_transform(X)

In [371]:
preprocessing

0,1,2
,transformers,"[('Gender_Ordinal_Encoder', ...), ('Geography_OHE', ...)]"
,remainder,StandardScaler()
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [372]:
preprocessing.get_feature_names_out()

array(['Gender_Ordinal_Encoder__Gender',
       'Geography_OHE__Geography_France',
       'Geography_OHE__Geography_Germany',
       'Geography_OHE__Geography_Spain', 'remainder__CreditScore',
       'remainder__Age', 'remainder__Tenure', 'remainder__Balance',
       'remainder__NumOfProducts', 'remainder__HasCrCard',
       'remainder__IsActiveMember', 'remainder__EstimatedSalary'],
      dtype=object)

In [373]:
X_preprocessed = pd.DataFrame(df_data , columns=preprocessing.get_feature_names_out())

In [374]:
df.shape

(10000, 11)

In [375]:
X_preprocessed

Unnamed: 0,Gender_Ordinal_Encoder__Gender,Geography_OHE__Geography_France,Geography_OHE__Geography_Germany,Geography_OHE__Geography_Spain,remainder__CreditScore,remainder__Age,remainder__Tenure,remainder__Balance,remainder__NumOfProducts,remainder__HasCrCard,remainder__IsActiveMember,remainder__EstimatedSalary
0,0.0,1.0,0.0,0.0,-0.326221,0.293517,-1.041760,-1.225848,-0.911583,0.646092,0.970243,0.021886
1,0.0,0.0,0.0,1.0,-0.440036,0.198164,-1.387538,0.117350,-0.911583,-1.547768,0.970243,0.216534
2,0.0,1.0,0.0,0.0,-1.536794,0.293517,1.032908,1.333053,2.527057,0.646092,-1.030670,0.240687
3,0.0,1.0,0.0,0.0,0.501521,0.007457,-1.387538,-1.225848,0.807737,-1.547768,-1.030670,-0.108918
4,0.0,0.0,0.0,1.0,2.063884,0.388871,-1.041760,0.785728,-0.911583,0.646092,0.970243,-0.365276
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1.0,1.0,0.0,0.0,1.246488,0.007457,-0.004426,-1.225848,0.807737,0.646092,-1.030670,-0.066419
9996,1.0,1.0,0.0,0.0,-1.391939,-0.373958,1.724464,-0.306379,-0.911583,0.646092,0.970243,0.027988
9997,0.0,1.0,0.0,0.0,0.604988,-0.278604,0.687130,-1.225848,-0.911583,-1.547768,0.970243,-1.008643
9998,1.0,0.0,1.0,0.0,1.256835,0.293517,-0.695982,-0.022608,0.807737,0.646092,-1.030670,-0.125231


In [376]:
import pickle

with open('preprocessed.pkl' , 'wb') as file:
    pickle.dump(preprocessing,file)

# Training The Model

In [377]:
cols = X_preprocessed.columns

In [378]:
cols

Index(['Gender_Ordinal_Encoder__Gender', 'Geography_OHE__Geography_France',
       'Geography_OHE__Geography_Germany', 'Geography_OHE__Geography_Spain',
       'remainder__CreditScore', 'remainder__Age', 'remainder__Tenure',
       'remainder__Balance', 'remainder__NumOfProducts',
       'remainder__HasCrCard', 'remainder__IsActiveMember',
       'remainder__EstimatedSalary'],
      dtype='object')

In [379]:
from sklearn.model_selection import train_test_split

X_train , X_test , y_train , y_test = train_test_split(X_preprocessed , y , test_size=0.2 , random_state=42)

In [380]:
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)

In [381]:
# with open('scalar.pkl' , 'wb') as file:
#     pickle.dump(scalar , file)

# Applying RandomForest on this dataset

In [382]:
from sklearn.ensemble import RandomForestClassifier

In [383]:
RFC = RandomForestClassifier(n_estimators=100 , oob_score=True)
RFC.fit(X_train , y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [384]:
y_pred = RFC.predict(X_test)

In [385]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test , y_pred)

0.8645

# Applying XGBoost ALgo

In [386]:
from xgboost import XGBClassifier
xgb = XGBClassifier()
xgb.fit(X_train , y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [387]:
y_pred = xgb.predict(X_test)

In [388]:
accuracy_score(y_test , y_pred)

0.858

# Applying ANN

In [389]:
import tensorflow
from tensorflow import keras 
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

In [390]:
X_train.shape

(8000, 12)

In [391]:
from tensorflow.keras.regularizers import l2

In [392]:
model = Sequential()

model.add(Dense(64 , activation='relu' , input_dim = X_train.shape[1] , kernel_regularizer=l2(0.001) , kernel_initializer= 'he_uniform'))

model.add(Dense(32 , activation='relu' , kernel_regularizer=l2(0.001) , kernel_initializer='he_uniform'))

model.add(Dense(32 , activation='relu' , kernel_regularizer=l2(0.001) , kernel_initializer='he_uniform'))

model.add(Dense(1 , activation='sigmoid'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [393]:
model.summary()

In [394]:
opt = tensorflow.keras.optimizers.Adam(learning_rate=0.01)
loss = tensorflow.keras.losses.BinaryCrossentropy()

In [395]:
model.compile(loss=loss , optimizer=opt , metrics=['accuracy'])

## Designing TensorBoard

In [396]:
import datetime

### Tensorboard is used to visualize all the logs that we have while training the model

In [397]:
from tensorflow.keras.callbacks import TensorBoard

log_dir = "log/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

tensorflow_callback = TensorBoard(log_dir = log_dir , histogram_freq = 1)

In [398]:
from tensorflow.keras.callbacks import EarlyStopping

callback = EarlyStopping(
    monitor = 'val_accuracy',
    min_delta = 0.0001,
    patience = 20 ,
    mode = 'auto',
    baseline = None
)

In [399]:
history = model.fit(X_train , y_train , epochs=100 ,callbacks=[callback , tensorflow_callback], validation_data=(X_test , y_test))

Epoch 1/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8265 - loss: 0.5508 - val_accuracy: 0.8460 - val_loss: 0.4380
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8503 - loss: 0.4130 - val_accuracy: 0.8495 - val_loss: 0.3896
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8499 - loss: 0.3920 - val_accuracy: 0.8505 - val_loss: 0.3882
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8495 - loss: 0.3889 - val_accuracy: 0.8560 - val_loss: 0.3992
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8554 - loss: 0.3781 - val_accuracy: 0.8560 - val_loss: 0.3693
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8536 - loss: 0.3745 - val_accuracy: 0.8475 - val_loss: 0.3812
Epoch 7/100
[1m250/25

In [400]:
model.save('model.keras')

In [401]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [402]:
%tensorboard --logdir log/fit

Reusing TensorBoard on port 6006 (pid 21468), started 1 day, 1:47:13 ago. (Use '!kill 21468' to kill it.)