In [2]:
#import library
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
#Configure libraries
# Silence every warning for the rest of the session (this also hides
# deprecation notices from the libraries used below).
warnings.filterwarnings('ignore')
# The seaborn style sheets bundled with matplotlib were renamed to
# 'seaborn-v0_8*' in matplotlib 3.6 and the old names were removed later,
# so pick whichever name this installation actually provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
# Set the figure size AFTER applying the style: the style sheet ships its own
# figure.figsize, which would otherwise overwrite the value chosen here.
plt.rcParams['figure.figsize'] = (10,10)

In [3]:
#Data pre-processing
# Read the semicolon-delimited CSV and discard the 'day' and 'month' columns
# in a single chained step, then show the resulting shape and first rows.
df = pd.read_csv('bank.csv', sep=';').drop(columns=['day', 'month'])
print(df.shape)
print(df.head())

(4521, 15)
   age          job  marital  education default  balance housing loan  \
0   30   unemployed  married    primary      no     1787      no   no   
1   33     services  married  secondary      no     4789     yes  yes   
2   35   management   single   tertiary      no     1350     yes   no   
3   30   management  married   tertiary      no     1476     yes  yes   
4   59  blue-collar  married  secondary      no        0     yes   no   

    contact  duration  campaign  pdays  previous poutcome   y  
0  cellular        79         1     -1         0  unknown  no  
1  cellular       220         1    339         4  failure  no  
2  cellular       185         1    330         1  failure  no  
3   unknown       199         4     -1         0  unknown  no  
4   unknown       226         1     -1         0  unknown  no  


In [4]:
# Class distribution: number of rows for each value of the target column 'y'
df.loc[:, 'y'].value_counts()

y
no     4000
yes     521
Name: count, dtype: int64

In [5]:
# Missing values: count of null entries per column
df.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [6]:
# Standardise the numeric columns
from sklearn.preprocessing import StandardScaler

num_cols = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on every row, including rows that a later
# cell places in the test split -- consider fitting on the training rows only.
scaled_values = scaler.fit_transform(df[num_cols])
# Work on a copy so the original frame `df` keeps its unscaled values
df_copy = df.copy()
df_copy[num_cols] = scaled_values
df_copy.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,campaign,pdays,previous,poutcome,y
0,-1.05627,unemployed,married,primary,no,0.121072,no,no,cellular,-0.711861,-0.576829,-0.407218,-0.320413,unknown,no
1,-0.772583,services,married,secondary,no,1.118644,yes,yes,cellular,-0.169194,-0.576829,2.989044,2.041734,failure,no
2,-0.583458,management,single,tertiary,no,-0.024144,yes,no,cellular,-0.303898,-0.576829,2.899143,0.270124,failure,no
3,-1.05627,management,married,tertiary,no,0.017726,yes,yes,unknown,-0.250017,0.387967,-0.407218,-0.320413,unknown,no
4,1.686036,blue-collar,married,secondary,no,-0.472753,yes,no,unknown,-0.146102,-0.576829,-0.407218,-0.320413,unknown,no


In [7]:
# Encode categorical values
# NOTE: this cell removes the categorical columns from df_copy, so it cannot be
# re-run on its own -- re-run the scaling cell (which rebuilds df_copy) first.
from sklearn.preprocessing import OneHotEncoder
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# the old name was removed in a later release. Try the current keyword first
# and fall back to the legacy one so the cell works on both old and new installs.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'poutcome']
# One-hot encode the categorical columns. The encoded frame reuses df_copy's
# index so that the concat below aligns row for row even if the index is not
# the default 0..n-1 range.
df_encoder = pd.DataFrame(
    encoder.fit_transform(df_copy[cat_cols]),
    columns=encoder.get_feature_names_out(cat_cols),
    index=df_copy.index,
)
# Replace the categorical columns with their encoded counterparts
# (encoded columns first, remaining columns after them)
df_copy = pd.concat([df_encoder, df_copy.drop(columns=cat_cols)], axis=1)
# Encode target value: 'yes' -> 1, anything else -> 0
df_copy['y'] = df_copy['y'].eq('yes').astype('int64')
print('Shape of dataframe:', df_copy.shape)
df_copy.head()

Shape of dataframe: (4521, 39)


Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_other,poutcome_success,poutcome_unknown,age,balance,duration,campaign,pdays,previous,y
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,-1.05627,0.121072,-0.711861,-0.576829,-0.407218,-0.320413,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,-0.772583,1.118644,-0.169194,-0.576829,2.989044,2.041734,0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.583458,-0.024144,-0.303898,-0.576829,2.899143,0.270124,0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,-1.05627,0.017726,-0.250017,0.387967,-0.407218,-0.320413,0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.686036,-0.472753,-0.146102,-0.576829,-0.407218,-0.320413,0


In [8]:
#Split dataset for training and test
from sklearn.model_selection import train_test_split

# Features are every column except the target 'y'
feature = df_copy.drop('y', axis=1)
# Target is the encoded 'y' column
target = df_copy['y']
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(feature, target, shuffle=True, test_size=0.2, random_state=1)

# Show the Training and Testing Data
print('Shape of training feature:', X_train.shape)
print('Shape of testing feature:', X_test.shape)
print('Shape of training label:', y_train.shape)
# This line reports y_test, so it is labelled "testing" (it previously said "training")
print('Shape of testing label:', y_test.shape)



Shape of training feature: (3616, 38)
Shape of testing feature: (905, 38)
Shape of training label: (3616,)
Shape of training label: (905,)


In [9]:
# Build the model: k-nearest-neighbours classifier using 3 neighbours
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
# Fit on the training split; left as the last expression so the fitted
# estimator is displayed as the cell output.
knn.fit(X=X_train, y=y_train)

In [11]:
# Evaluation on the held-out test split
from sklearn.metrics import classification_report, confusion_matrix

# Predict labels for the test features
y_pred = knn.predict(X_test)

# Confusion matrix first, then the classification report
# (per-class precision, recall, f1-score and support)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[755  35]
 [ 91  24]]
              precision    recall  f1-score   support

           0       0.89      0.96      0.92       790
           1       0.41      0.21      0.28       115

    accuracy                           0.86       905
   macro avg       0.65      0.58      0.60       905
weighted avg       0.83      0.86      0.84       905

