In [73]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras import Sequential
from tensorflow.keras.losses import SparseCategoricalCrossentropy


In [74]:
# Importing Data
df = pd.read_csv("/home/prasun/GitDemo/Learning_Deep_Learning/Lesson11/Churn_Modelling.csv")
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [75]:
# Dropping irrelevant columns
df.drop(labels= ["RowNumber", "CustomerId", "Surname"],axis= "columns",inplace= True)
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


In [77]:
df.describe()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [78]:
# Feature Scaling required columns
scl_col = []
for i in df.columns:
    if df[i].dtype != "object" and df[i].max()-df[i].min()>= 3:
        scl_col.append(i)

In [79]:
# Mean Normalization
for i in scl_col:
    df[i] = (df[i]-df[i].mean())/(df[i].max()-df[i].min())

In [80]:
# Feture Labelling
df.Gender.replace({'Female':1, 'Male':0},inplace= True)
df1 = pd.concat([df,pd.get_dummies(df[["Geography"]],drop_first= True, dtype= int)],axis= 1)
df1.drop(["Geography"], axis= "columns", inplace= True)
df1.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.Gender.replace({'Female':1, 'Male':0},inplace= True)
  df.Gender.replace({'Female':1, 'Male':0},inplace= True)


Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain
0,-0.063058,1,0.041597,-0.30128,-0.304848,-0.176733,1,1,0.006294,1,0,0
1,-0.085058,1,0.028084,-0.40128,0.029183,-0.176733,0,1,0.062268,0,0,1
2,-0.297058,1,0.041597,0.29872,0.331509,0.489933,1,0,0.069213,1,0,0
3,0.096942,1,0.001057,-0.40128,-0.304848,0.1566,0,0,-0.031321,0,0,0
4,0.398942,1,0.055111,-0.30128,0.195398,-0.176733,1,1,-0.105041,0,0,1


In [81]:
# Balanced/imbalanced ??
df.Exited.value_counts()

Exited
0    7963
1    2037
Name: count, dtype: int64

In [82]:
# Segregating Features and Target
X = df1.drop(["Exited"],axis= "columns")
y = df1["Exited"]

In [83]:
# Creating Training-Dev-Test Set
from sklearn.model_selection import train_test_split
X_train0, X_test0, y_train0, y_test0 = train_test_split(X,y,test_size = 0.25, random_state= 0)

In [84]:
# Let's make a model retainging the imbalance
from sklearn.linear_model import LogisticRegression
log0 = LogisticRegression()
log0.fit(X_train0,y_train0)

In [85]:
#Evaluation
from sklearn.metrics import classification_report
y_pred0 = log0.predict(X_test0)
print(classification_report(y_test0,y_pred0))

              precision    recall  f1-score   support

           0       0.83      0.96      0.89      1991
           1       0.59      0.22      0.32       509

    accuracy                           0.81      2500
   macro avg       0.71      0.59      0.61      2500
weighted avg       0.78      0.81      0.77      2500



In [86]:
# Combine Dataframes
dff = pd.concat([X_train0,y_train0],axis=1)

### Method 1: Undersampling Majority Class

In [87]:
# Let's check class balance in training set
count0, count1 = y_train0.value_counts()
class0 = dff[dff["Exited"]== 0]
class1 = dff[dff["Exited"]== 1]

In [88]:
# New X_train and y_train
dff1 = pd.concat([class0.sample(count1),class1], axis= 0)
X_train1 = dff1.drop(["Exited"],axis= "columns")
y_train1 = dff1[["Exited"]]

In [89]:
# Let's make a model 
from sklearn.linear_model import LogisticRegression
log1 = LogisticRegression()
log1.fit(X_train1,y_train1)

#Evaluation
from sklearn.metrics import classification_report
y_pred1 = log1.predict(X_test0)
print(classification_report(y_test0,y_pred1))

              precision    recall  f1-score   support

           0       0.91      0.69      0.78      1991
           1       0.37      0.72      0.49       509

    accuracy                           0.70      2500
   macro avg       0.64      0.71      0.64      2500
weighted avg       0.80      0.70      0.72      2500



  y = column_or_1d(y, warn=True)


### Method 2: Oversapling Minority Class

In [90]:
# New X_train and y_train
dff2 = pd.concat([class1.sample(count0,replace= True),class0], axis= 0)
X_train2 = dff2.drop(["Exited"],axis= "columns")
y_train2 = dff2[["Exited"]]

# Let's make a model 
from sklearn.linear_model import LogisticRegression
log2 = LogisticRegression()
log2.fit(X_train2,y_train2)

#Evaluation
from sklearn.metrics import classification_report
y_pred2 = log2.predict(X_test0)
print(classification_report(y_test0,y_pred2))

              precision    recall  f1-score   support

           0       0.91      0.69      0.79      1991
           1       0.38      0.73      0.50       509

    accuracy                           0.70      2500
   macro avg       0.64      0.71      0.64      2500
weighted avg       0.80      0.70      0.73      2500



  y = column_or_1d(y, warn=True)


### Method 3: SMOTE

In [91]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(sampling_strategy='minority')
X_train3, y_train3 = sm.fit_resample(X_train0, y_train0)

In [92]:
# Let's make a model 
from sklearn.linear_model import LogisticRegression
log3 = LogisticRegression()
log3.fit(X_train3,y_train3)

#Evaluation
from sklearn.metrics import classification_report
y_pred3 = log3.predict(X_test0)
print(classification_report(y_test0,y_pred3))

              precision    recall  f1-score   support

           0       0.91      0.70      0.79      1991
           1       0.38      0.72      0.50       509

    accuracy                           0.71      2500
   macro avg       0.65      0.71      0.65      2500
weighted avg       0.80      0.71      0.73      2500



### Method 4: Ensemble 

In [93]:
# Segmentation
ens1 = pd.concat([class0[:count1],class1],axis= 0)
ens2 = pd.concat([class0[count1:2*count1],class1],axis= 0)
ens3 = pd.concat([class0[2*count1:3*count1],class1],axis= 0)
ens4 = pd.concat([class0[3*count1:],class1],axis= 0 )

In [94]:
def log_train(df, y_pred, X_test, y_test):
    log = LogisticRegression()
    log.fit(df.drop(["Exited"],axis="columns"), df[["Exited"]])
    y_pred = log.predict(X_test)
    print(classification_report(y_test,y_pred))

In [95]:
y1_pred4 = y_test0.copy()
y2_pred4 = y_test0.copy()
y3_pred4 = y_test0.copy()
y4_pred4 = y_test0.copy()
log_train(ens1,y1_pred4,X_test0,y_test0)
log_train(ens2,y2_pred4,X_test0,y_test0)
log_train(ens3,y3_pred4,X_test0,y_test0)
log_train(ens4,y4_pred4,X_test0,y_test0)

              precision    recall  f1-score   support

           0       0.91      0.69      0.78      1991
           1       0.37      0.73      0.49       509

    accuracy                           0.70      2500
   macro avg       0.64      0.71      0.64      2500
weighted avg       0.80      0.70      0.72      2500

              precision    recall  f1-score   support

           0       0.91      0.70      0.79      1991
           1       0.38      0.73      0.50       509

    accuracy                           0.70      2500
   macro avg       0.64      0.71      0.64      2500
weighted avg       0.80      0.70      0.73      2500



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

           0       0.91      0.69      0.79      1991
           1       0.38      0.73      0.50       509

    accuracy                           0.70      2500
   macro avg       0.64      0.71      0.64      2500
weighted avg       0.80      0.70      0.73      2500

              precision    recall  f1-score   support

           0       0.91      0.66      0.76      1991
           1       0.36      0.75      0.48       509

    accuracy                           0.68      2500
   macro avg       0.63      0.70      0.62      2500
weighted avg       0.80      0.68      0.71      2500



In [96]:
# Ensembling Predictions
y_pred4 = pd.Series(np.zeros(len(y_test0)))
for i in range(len(y_pred4)):
    if (y1_pred4.iloc[i] + y2_pred4.iloc[i] + y3_pred4.iloc[i] + y4_pred4.iloc[i]) == 0:
        y_pred4.iloc[i] = 0
    else:
        y_pred4.iloc[i] = 1


In [97]:
print( classification_report(y_pred4,y_test0))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1991
         1.0       1.00      1.00      1.00       509

    accuracy                           1.00      2500
   macro avg       1.00      1.00      1.00      2500
weighted avg       1.00      1.00      1.00      2500

