In [None]:
#Importing necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Load the semicolon-delimited CSV into the `bank` DataFrame.
# NOTE(review): absolute path (looks like a Colab working directory) — confirm before running elsewhere.
bank = pd.read_csv(filepath_or_buffer="/content/bank.csv", delimiter=";")

In [None]:
# Report the (rows, columns) shape and preview the first rows.
# A notebook cell only renders its LAST expression, so the shape is printed
# explicitly and the preview is left as the final expression; previously the
# preview was silently discarded.
print(bank.shape)
bank.head()

In [None]:
# Print the column names, non-null counts and dtypes of the frame
bank.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max).
# With default arguments on a mixed-dtype frame, describe() covers the numeric columns only.
bank.describe()

In [None]:
# Number of missing entries in each column
bank.isna().sum()

In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence itself is not counted)
bank.duplicated().sum()

In [None]:
# Display the rows flagged as repeats of an earlier row
duplicate_mask = bank.duplicated()
bank[duplicate_mask]

Exploratory Data Analysis
Age Distribution

In [None]:
# Age histogram with a KDE overlay, coloured by the target column `y`
ax = sns.histplot(data=bank, x="age", hue="y", kde=True)
ax.set_title("Age Distribution and Deposits\n")
plt.show()

Distribution of Occupation

In [None]:
# Count of records per job category, split by the target column `y`
fig, ax = plt.subplots(figsize=(15, 4))
sns.countplot(data=bank, x="job", hue="y", ax=ax)
ax.set_title("Occupation Distribution and Deposits\n")
plt.show()

Distribution of Marital Status

In [None]:
# Count of records per marital status, split by the target column `y`
fig, ax = plt.subplots(figsize=(7, 3))
sns.countplot(data=bank, x="marital", hue="y", ax=ax)
ax.set_title("Marital Status and Deposits\n")
plt.show()

Distribution of Education Status

In [None]:
# Count of records per education level, split by the target column `y`
fig, ax = plt.subplots(figsize=(12, 4))
sns.countplot(data=bank, x="education", hue="y", ax=ax)
ax.set_title("Education Status and Deposits\n")
plt.show()

In [None]:
# Frequency of each value in the `default` column
bank["default"].value_counts()

Distribution of Housing Loan

In [None]:
# Count of records with/without a housing loan, split by the target column `y`
fig, ax = plt.subplots(figsize=(6, 3.5))
sns.countplot(data=bank, x="housing", hue="y", ax=ax)
ax.set_title("Housing Loan Distribution and Deposits\n")
plt.show()

Distribution of Personal Loan

In [None]:
# Count of records with/without a personal loan, split by the target column `y`
fig, ax = plt.subplots(figsize=(6, 3.5))
sns.countplot(data=bank, x="loan", hue="y", ax=ax)
ax.set_title("Personal Loan Distribution and Deposits\n")
plt.show()

Distribution of Outcome (Term Deposits)

In [None]:
# Class balance of the target `y`, shown as counts and as a pie chart.
# The counts are computed once and reused. They are printed explicitly because
# a bare expression in the middle of a cell is never rendered.
deposit_counts = bank.y.value_counts()
print(deposit_counts)
keys = deposit_counts.index
data = deposit_counts.values
plt.figure(figsize=(6,3.5))
# value_counts() sorts descending, so the second slice is the less frequent
# class; it is pulled out for emphasis. Expects exactly two classes.
explode = [0,0.1]
plt.pie(data,labels=keys,explode=explode, autopct='%.0f%%')
plt.title("Distribution of Term Deposit Outcome\n")
plt.show()

Label Encoding Categorical Features

In [None]:
# Names of the string-typed (object dtype) columns that need encoding
cols = bank.select_dtypes(include="object").columns
cols

In [None]:
# Replace every categorical column in `cols` with integer codes.
# One LabelEncoder is fitted PER COLUMN and kept in `label_encoders`, so codes
# can be mapped back to category names with
# label_encoders[col].inverse_transform(...). Refitting a single shared
# encoder would keep only the last column's mapping.
# NOTE: this mutates `bank` in place; re-running the cell after encoding would
# refit the encoders on the integer codes rather than the original strings.
le = LabelEncoder()
label_encoders = {}
for col in cols:
    # `le` ends up bound to the encoder of the last column, as before.
    le = LabelEncoder().fit(bank[col])
    label_encoders[col] = le
    bank[col] = le.transform(bank[col])

In [None]:
# Show the first three rows (presumably to eyeball the encoded values)
bank.head(3)

In [None]:
# Pairwise column correlations drawn as an annotated heatmap
correlations = bank.corr()
plt.figure(figsize=(23, 10))
sns.heatmap(data=correlations, annot=True, cmap='bwr')
plt.show()

Standardization

In [None]:
# Separate the target column `y` from the predictor columns
y = bank["y"]
X = bank.drop(columns="y")

In [None]:
# Standardise every predictor column and wrap the result back into a
# DataFrame that carries the original column names (the index is a fresh
# RangeIndex because the scaler returns a plain array).
# NOTE(review): the scaler is fitted on all rows before the train/test split — confirm this is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X_scaled = pd.DataFrame(data=scaled_values, columns=X.columns)

Model building - Decision Tree Classifier

In [None]:
#Train-test split (70% train / 30% test)
# random_state pins the shuffle so the split, and every score computed from
# it, is reproducible across runs; stratify=y keeps the class proportions of
# the target identical in the train and test parts.
train_X, test_X, train_y, test_y = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Baseline tree with default hyperparameters.
# random_state is fixed because the tree permutes the features at every
# split, so equally good splits can otherwise resolve differently per run.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(train_X, train_y)

In [None]:
# Accuracy of the baseline tree on the data it was fitted on vs. the held-out data
baseline_train_score = decision_tree.score(train_X, train_y)
baseline_test_score = decision_tree.score(test_X, test_y)
print(f'Train Score: {baseline_train_score}')
print(f'Test Score: {baseline_test_score}')

In [None]:
# Mean score of the baseline tree over 5 cross-validation folds of the training data
baseline_cv_scores = cross_val_score(estimator=decision_tree, X=train_X, y=train_y, cv=5)
baseline_cv_scores.mean()

In [None]:
# Per-class precision / recall / F1 of the baseline tree on the held-out set
ypred = decision_tree.predict(test_X)
baseline_report = classification_report(y_true=test_y, y_pred=ypred)
print(baseline_report)

Hyperparameter tuning

In [None]:

# Search space for the decision tree grid search:
# tree depth limit, split-quality criterion and minimum samples per leaf.
param_grid = dict(
    max_depth=[3, 5, 7, 10, None],
    criterion=['gini', 'entropy'],
    min_samples_leaf=[3, 5, 7, 9, 10, 20],
)

In [None]:
# Exhaustive 5-fold search over `param_grid`, fitted on the training split
gscv = GridSearchCV(
    estimator=decision_tree,
    param_grid=param_grid,
    cv=5,
    verbose=1,
)
gscv.fit(train_X, train_y)

In [None]:
# Parameter combination that achieved the best mean cross-validated score in the search
gscv.best_params_

In [None]:
# Estimator refitted with the best parameter combination (GridSearchCV refits by default)
gscv.best_estimator_

In [None]:
# Mean score of the tuned estimator over 5 cross-validation folds of the training data
tuned_cv_scores = cross_val_score(estimator=gscv.best_estimator_, X=train_X, y=train_y, cv=5)
tuned_cv_scores.mean()

In [None]:
# Final tree built from the hyperparameters the grid search actually selected.
# Reading them from gscv.best_params_ keeps this cell in sync with the search;
# hand-copied values go stale as soon as the split or the grid changes.
# random_state is fixed so the fitted tree is reproducible.
clf = DecisionTreeClassifier(**gscv.best_params_, random_state=42)
clf.fit(train_X, train_y)

In [None]:
# Accuracy of the tuned tree on the training split vs. the held-out split
tuned_train_score = clf.score(train_X, train_y)
tuned_test_score = clf.score(test_X, test_y)
print(f'Train Score: {tuned_train_score}')
print(f'Test Score: {tuned_test_score}')

In [None]:
# Labels predicted by the tuned tree for the held-out test features
pred_y = clf.predict(test_X)

In [None]:
#Confusion Matrix
# confusion_matrix takes (y_true, y_pred): rows are the true labels and
# columns the predicted ones, which is what ConfusionMatrixDisplay's axis
# labels assume. Passing the predictions first transposes the matrix.
# labels=clf.classes_ keeps the row/column order aligned with display_labels.
cm = confusion_matrix(test_y, pred_y, labels=clf.classes_)
ConfusionMatrixDisplay(cm, display_labels=clf.classes_).plot()
plt.show()

In [None]:
#Classification Report
# classification_report takes (y_true, y_pred); with the arguments reversed
# the reported precision and recall are exchanged and the support column
# counts predictions instead of true labels.
print(classification_report(test_y, pred_y))

In [None]:
# Share of held-out samples the tuned tree labels correctly, shown as a percentage
accuracy = accuracy_score(y_true=test_y, y_pred=pred_y)
print(f"Test Accuracy of Decision Tree Classifier : {accuracy*100}")

In [None]:
#Cross Validation Score
# Cross-validate on the TRAINING split: cross_val_score refits a clone of the
# estimator on each fold, so running it on the test split would train on the
# held-out data and leave no untouched data for the final evaluation.
Cross_val = cross_val_score(clf, train_X, train_y, cv=5).mean()
print("Cross-Validation Accuracy Scores Decision Tree : ",Cross_val*100)

Visualizing the Tree

In [None]:
# Draw the fitted tuned tree, with nodes coloured by majority class.
from sklearn import tree
fig = plt.figure(figsize=(25,20))
# feature_names is passed as a plain list: some scikit-learn releases
# validate this parameter as list-or-None and reject a pandas Index.
t = tree.plot_tree(clf, filled=True, feature_names=list(X.columns))
plt.show()