In [1]:
import pandas as pd

FULL = '/kaggle/input/bank-marketing/bank-full.csv'

df = pd.read_csv(filepath_or_buffer=FULL, sep=';')
for column in ['default', 'housing', 'loan', 'y']:
    if column != 'y':
        df[column] = df[column] == 'yes'
    else:
        df['target'] = df[column] == 'yes'
df = pd.get_dummies(data=df, columns=['job', 'marital', 'education', 'contact', 'month', 'poutcome'])
df.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,False,2143,True,False,5,261,1,-1,0,...,False,False,True,False,False,False,False,False,False,True
1,44,False,29,True,False,5,151,1,-1,0,...,False,False,True,False,False,False,False,False,False,True
2,33,False,2,True,True,5,76,1,-1,0,...,False,False,True,False,False,False,False,False,False,True
3,47,False,1506,True,False,5,92,1,-1,0,...,False,False,True,False,False,False,False,False,False,True
4,33,False,1,False,False,5,198,1,-1,0,...,False,False,True,False,False,False,False,False,False,True


Is our target class balanced? We would expect it not to be, as marketing campaigns tend to be low-yield affairs.

In [2]:
TARGET = 'target'
df[TARGET].value_counts(normalize=True).to_dict()

{False: 0.8830151954170445, True: 0.11698480458295547}

In [3]:
COLUMNS = ['age', 'default', 'balance', 'housing', 'loan', 'day', 'duration', 'campaign', 'pdays', 'previous', 'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired', 'job_self-employed', 'job_services', 'job_student', 'job_technician', 'job_unemployed', 'job_unknown', 'marital_divorced', 'marital_married', 'marital_single', 'education_primary', 'education_secondary', 'education_tertiary', 'education_unknown', 'contact_cellular', 'contact_telephone', 'contact_unknown', 'month_apr', 'month_aug', 'month_dec', 'month_feb', 'month_jan', 'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep', 'poutcome_failure', 'poutcome_other', 'poutcome_success', 'poutcome_unknown']

In [4]:
from plotly import express
import warnings

warnings.filterwarnings(action='ignore', category=FutureWarning)
express.histogram(data_frame=df, x='age', color=TARGET, facet_col=TARGET)

In [5]:
import arrow
from umap import UMAP

time_start = arrow.now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=500)
df[['x', 'y']] = umap.fit_transform(X=df[COLUMNS])
print('done with UMAP in {}'.format(arrow.now() - time_start))

2024-08-06 15:32:48.898278: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-06 15:32:48.898403: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-06 15:32:49.048002: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=500, n_jobs=1, random_state=2024, verbose=True)
Tue Aug  6 15:32:59 2024 Construct fuzzy simplicial set
Tue Aug  6 15:32:59 2024 Finding Nearest Neighbors
Tue Aug  6 15:32:59 2024 Building RP forest with 16 trees
Tue Aug  6 15:33:05 2024 NN descent for 15 iterations
	 1  /  15
	 2  /  15
	Stopping threshold met -- exiting after 2 iterations
Tue Aug  6 15:33:25 2024 Finished Nearest Neighbor Search
Tue Aug  6 15:33:29 2024 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	completed  250  /  500 epochs
	completed  300  /  500 epochs
	completed  350  /  500 epochs
	completed  400  /  500 epochs
	completed  450  /  500 epochs
Tue Aug  6 15:34:54 2024 Finished embedding
done with UMAP in 0:01:54.731508


In [6]:
import warnings
from plotly import express

warnings.filterwarnings(action='ignore', category=FutureWarning)
express.scatter(data_frame=df.sample(n=1000, random_state=2024), x='x', y='y', color=TARGET, )

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(df[COLUMNS], df[TARGET], test_size=0.2, random_state=2024, stratify=df[TARGET])
model = LogisticRegression(max_iter=5000, tol=1e-12).fit(X_train, y_train)
print('model fit in {} iterations'.format(model.n_iter_[0]))

print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=model.predict(X=X_test))))

model fit in 1099 iterations
accuracy: 0.9014


In [8]:
from plotly import express

express.histogram(x=COLUMNS, y=model.coef_[0])

In [9]:
from sklearn.metrics import classification_report

print(classification_report(zero_division=0 , y_true=y_test, y_pred=model.predict(X=X_test)))

              precision    recall  f1-score   support

       False       0.92      0.98      0.95      7985
        True       0.65      0.34      0.45      1058

    accuracy                           0.90      9043
   macro avg       0.78      0.66      0.70      9043
weighted avg       0.89      0.90      0.89      9043



In [10]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(max_depth=5, random_state=2024).fit(X=X_train, y=y_train)
print(classification_report(zero_division=0 , y_true=y_test, y_pred=tree.predict(X=X_test)))

              precision    recall  f1-score   support

       False       0.92      0.98      0.95      7985
        True       0.66      0.36      0.46      1058

    accuracy                           0.90      9043
   macro avg       0.79      0.67      0.70      9043
weighted avg       0.89      0.90      0.89      9043

