In [5]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

import warnings
# Install a blanket "ignore" filter so no warnings are shown by later cells.
# NOTE(review): this also hides deprecation warnings from pandas / scikit-learn;
# consider narrowing the filter to specific categories if that matters.
warnings.filterwarnings('ignore')

In [6]:
# Load the banking dataset (path is relative to the notebook's working
# directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='Banking Dataset_DT.csv')
df.head(n=5)

Unnamed: 0,Cust_ID,Target,Age,Gender,Balance,Occupation,No_OF_CR_TXNS,AGE_BKT,SCR,Holding_Period
0,C1,0,30,M,160378.6,SAL,2,26-30,826,9
1,C10,1,41,M,84370.59,SELF-EMP,14,41-45,843,9
2,C100,0,49,F,60849.26,PROF,49,46-50,328,26
3,C1000,0,49,M,10558.81,SAL,23,46-50,619,19
4,C10000,0,43,M,97100.48,SENP,3,41-45,397,8


In [7]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Cust_ID         20000 non-null  object 
 1   Target          20000 non-null  int64  
 2   Age             20000 non-null  int64  
 3   Gender          20000 non-null  object 
 4   Balance         20000 non-null  float64
 5   Occupation      20000 non-null  object 
 6   No_OF_CR_TXNS   20000 non-null  int64  
 7   AGE_BKT         20000 non-null  object 
 8   SCR             20000 non-null  int64  
 9   Holding_Period  20000 non-null  int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 1.5+ MB


In [9]:
# Remove the customer identifier column before modelling.
# The frame is rebound (no inplace mutation) and errors='ignore' makes the
# cell idempotent: re-running it after the column is already gone is a no-op
# instead of raising KeyError.
df = df.drop(columns='Cust_ID', errors='ignore')

In [10]:
# Show the relative frequency of every level in each object-typed column,
# with a row of asterisks after each column's table.
object_columns = [col for col in df.columns if df[col].dtypes == 'object']
separator = '*' * 50

for col in object_columns:
    level_shares = df[col].value_counts(normalize=True)
    print(level_shares)
    print(separator)

Gender
M    0.71395
F    0.27625
O    0.00980
Name: proportion, dtype: float64
**************************************************
Occupation
SAL         0.29195
PROF        0.27315
SENP        0.26660
SELF-EMP    0.16830
Name: proportion, dtype: float64
**************************************************
AGE_BKT
31-35    0.1744
26-30    0.1702
>50      0.1510
41-45    0.1508
36-40    0.1378
46-50    0.1266
<25      0.0892
Name: proportion, dtype: float64
**************************************************


In [11]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
df.describe()

Unnamed: 0,Target,Age,Balance,No_OF_CR_TXNS,SCR,Holding_Period
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,0.08665,38.3962,146181.3,16.65305,557.13605,15.3441
std,0.281329,9.600179,169812.5,12.977704,260.521453,8.952128
min,0.0,21.0,0.0,0.0,100.0,1.0
25%,0.0,30.0,23736.92,7.0,333.0,8.0
50%,0.0,38.0,79755.74,13.0,560.0,16.0
75%,0.0,47.0,217310.6,22.0,784.0,23.0
max,1.0,55.0,1246967.0,50.0,999.0,31.0


In [12]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every non-numeric column in place on `df`.
#
# One fitted encoder is kept per column in `encoders`, so the category -> code
# mapping can be inspected later (`encoders[col].classes_`) or reversed with
# `encoders[col].inverse_transform(...)`. Refitting a single shared encoder
# would only retain the mapping of the last column processed.
#
# Columns are selected by dtype rather than by a hand-maintained list of
# numeric columns to skip, so the cell keeps working if columns are added or
# removed upstream. Once the columns are encoded they are numeric, so a re-run
# finds nothing to encode and resets `encoders` to empty; re-run from the load
# cell to rebuild the mappings.
#
# NOTE(review): LabelEncoder numbers classes in sorted order, so AGE_BKT codes
# do not follow age order ('<25' and '>50' sort after the digit-led buckets).
# Consider an explicit ordered mapping if bucket order should be preserved.
encoders = {}
for col in df.select_dtypes(exclude='number').columns:
    encode = LabelEncoder()
    df[col] = encode.fit_transform(df[col])
    encoders[col] = encode

In [13]:
# Preview the first rows again to check the frame after the encoding step.
df.head()

Unnamed: 0,Target,Age,Gender,Balance,Occupation,No_OF_CR_TXNS,AGE_BKT,SCR,Holding_Period
0,0,30,1,160378.6,1,2,0,826,9
1,1,41,1,84370.59,2,14,3,843,9
2,0,49,0,60849.26,0,49,4,328,26
3,0,49,1,10558.81,1,23,4,619,19
4,0,43,1,97100.48,3,3,3,397,8


In [15]:
# Separate the label column from the predictors.
y = df['Target']
X = df.drop(columns=['Target'])

In [16]:
# Hold out 30% of the rows for evaluation. The split is stratified on the
# target so both partitions keep the same class ratio, and the seed is fixed
# so the partition is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [29]:
# Depth- and split-size-limited decision tree with a fixed seed.
# The class weights favour class 1 heavily; presumably they were chosen to
# offset the class imbalance in Target (TODO confirm how 0.08 / 0.91 were
# derived).
dt = DecisionTreeClassifier(
    random_state=42,
    max_depth=10,
    min_samples_split=10,
    class_weight={0: 0.08, 1: 0.91},
)

In [30]:
# Train the tree on the training partition only.
dt.fit(X=x_train, y=y_train)

In [31]:
# Hard class predictions for the held-out rows and for the training rows.
pred_test, pred_train = dt.predict(x_test), dt.predict(x_train)

In [32]:
# Report plain accuracy for the test split first, then the training split.
accuracy_inputs = (
    ("test accuracy :", y_test, pred_test),
    ("train accuracy :", y_train, pred_train),
)
for split_label, actual, predicted in accuracy_inputs:
    print(split_label, accuracy_score(actual, predicted))

test accuracy : 0.7036666666666667
train accuracy : 0.7458571428571429


In [33]:
# Per-class precision / recall / F1 for the test split, then the train split.
# classification_report is called by its imported name: the notebook never
# imports a `metrics` module, so `metrics.classification_report` raises
# NameError on a fresh kernel.
print('Test classification report')
print(classification_report(y_test, pred_test))
print('Train classification report')
print(classification_report(y_train, pred_train))

Test classification report


<IPython.core.display.Javascript object>

              precision    recall  f1-score   support

           0       0.95      0.71      0.81      5480
           1       0.17      0.63      0.27       520

    accuracy                           0.70      6000
   macro avg       0.56      0.67      0.54      6000
weighted avg       0.89      0.70      0.77      6000

Train classification report


<IPython.core.display.Javascript object>

              precision    recall  f1-score   support

           0       0.99      0.73      0.84     12787
           1       0.24      0.92      0.38      1213

    accuracy                           0.75     14000
   macro avg       0.62      0.82      0.61     14000
weighted avg       0.92      0.75      0.80     14000



In [34]:
# Rank the predictors by the fitted tree's feature importances, highest first.
# The column label is spelled "Feature Importance" (it was previously
# misspelled); the frame is only displayed, so no other code depends on it.
pd.DataFrame(
    dt.feature_importances_,
    index=X.columns,
    columns=['Feature Importance'],
).sort_values('Feature Importance', ascending=False)

Unnamed: 0,Featue Importance
Holding_Period,0.335074
No_OF_CR_TXNS,0.170882
Balance,0.170501
SCR,0.130752
Age,0.079441
Occupation,0.067696
Gender,0.028356
AGE_BKT,0.017297
