In [74]:
import os.path

import joblib
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np

from plotly.subplots import make_subplots
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    roc_curve,
    precision_score,
    recall_score,
    f1_score
)
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
)
from warnings import filterwarnings

# Silence all warnings: no category or module filter is passed, so every warning is ignored.
# NOTE(review): presumably done to keep cell output uncluttered — confirm nothing important is hidden.
filterwarnings("ignore")

### Business Objective

It comes as no surprise that large marketing campaigns carry negative sentiment among the general populace. Think about the last time you answered an unexpected phone call from an unknown number: if your experience is anything like mine, it was a scam caller, a telemarketer, or a survey taker. I find myself hanging up quickly on these types of calls, if I answer them at all. Every failed cold call costs the company commissioning the campaign time and money. The bank partner commissioning this study is seeking to increase campaign success and reduce costs by focusing on profiles that are more likely to accept its offerings. To that end, the bank partner would like a model that better predicts the type of person who would accept its offers.

### Understanding the Features

```
Input variables:
# bank client data:
1 - age (numeric)
2 - job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')
3 - marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)
4 - education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')
5 - default: has credit in default? (categorical: 'no','yes','unknown')
6 - housing: has housing loan? (categorical: 'no','yes','unknown')
7 - loan: has personal loan? (categorical: 'no','yes','unknown')
# related with the last contact of the current campaign:
8 - contact: contact communication type (categorical: 'cellular','telephone')
9 - month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')
10 - day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')
11 - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
# other attributes:
12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
14 - previous: number of contacts performed before this campaign and for this client (numeric)
15 - poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')
# social and economic context attributes
16 - emp.var.rate: employment variation rate - quarterly indicator (numeric)
17 - cons.price.idx: consumer price index - monthly indicator (numeric)
18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric)
19 - euribor3m: euribor 3 month rate - daily indicator (numeric)
20 - nr.employed: number of employees - quarterly indicator (numeric)

Output variable (desired target):
21 - y - has the client subscribed a term deposit? (binary: 'yes','no')
```



### Understanding the Data


In [2]:
# Load the raw dataset into `df`; the file is read with ';' as the field separator.
df = pd.read_csv('data/bank-additional-full.csv', sep=';')

In [3]:
# Peek at the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [4]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [5]:
# Summarise only the object-dtype columns (count / unique / top / freq).
df.describe(include="object")


Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome,y
count,41188,41188,41188,41188,41188,41188,41188,41188,41188,41188,41188
unique,12,4,8,3,3,3,2,10,5,3,2
top,admin.,married,university.degree,no,yes,no,cellular,may,thu,nonexistent,no
freq,10422,24928,12168,32588,21576,33950,26144,13769,8623,35563,36548


In [6]:
# Duplicates = total rows minus the rows left after dropping exact repeats (only 12 here)
total_rows = df.shape[0]
unique_rows = df.drop_duplicates().shape[0]
print(f"Row count: {total_rows}, Duplicate count: {total_rows - unique_rows}")

Row count: 41188, Duplicate count: 12


In [7]:
# Fraction of missing values per column, rounded to two decimals (all zero: nothing is missing)
df.isnull().mean().round(decimals=2)

age               0.0
job               0.0
marital           0.0
education         0.0
default           0.0
housing           0.0
loan              0.0
contact           0.0
month             0.0
day_of_week       0.0
duration          0.0
campaign          0.0
pdays             0.0
previous          0.0
poutcome          0.0
emp.var.rate      0.0
cons.price.idx    0.0
cons.conf.idx     0.0
euribor3m         0.0
nr.employed       0.0
y                 0.0
dtype: float64

In [8]:
# Show five random rows. The draw is seeded so that a fresh
# "Restart Kernel -> Run All" reproduces the same rows instead of a new sample each run.
df.sample(n=5, random_state=42)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
23781,51,admin.,divorced,university.degree,unknown,no,no,cellular,aug,fri,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,no
3894,53,technician,married,high.school,no,no,yes,telephone,may,mon,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.858,5191.0,no
4666,36,services,married,high.school,no,yes,yes,telephone,may,wed,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.858,5191.0,no
14860,37,blue-collar,married,basic.9y,no,no,no,cellular,jul,wed,...,1,999,0,nonexistent,1.4,93.918,-42.7,4.957,5228.1,no
20459,38,technician,married,high.school,no,no,no,cellular,aug,tue,...,2,999,0,nonexistent,1.4,93.444,-36.1,4.966,5228.1,no


In [24]:
# Class balance of the target column 'y', drawn as a bar chart and exported as a PNG.
y_counts = df['y'].value_counts()
fig = px.bar(
    data_frame=y_counts,
    y='count',
    labels={'y': 'Accepted Campaign', "count": "Count"},
    title="Those that say yes are in a minority class, suggesting an imbalanced dataset",
)
fig.show()
fig.write_image("images/acceptance_count.png")

In [85]:
# Box plot of the 'age' column, shown inline and exported as a PNG.
fig = px.box(
    data_frame=df,
    y='age',
    labels={"age": "Age"},
    title="Most potential calls are towards people aged 32-47",
)
fig.show()
fig.write_image("images/age_box.png")

In [19]:
# Group once by the target 'y'; the ratio plots further down reuse this grouping.
by_y_df = df.groupby(by='y')
by_y_df.describe(include="object")

Unnamed: 0_level_0,job,job,job,job,marital,marital,marital,marital,education,education,...,month,month,day_of_week,day_of_week,day_of_week,day_of_week,poutcome,poutcome,poutcome,poutcome
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,...,top,freq,count,unique,top,freq,count,unique,top,freq
y,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
no,36548,12,admin.,9070,36548,4,married,22396,36548,8,...,may,12883,36548,5,mon,7667,36548,3,nonexistent,32422
yes,4640,12,admin.,1352,4640,4,married,2532,4640,8,...,may,886,4640,5,thu,1045,4640,3,nonexistent,3141


In [10]:
# Ratios are a better metric to track than raw counts:
# per-outcome job counts divided by the overall count of each job.
job_counts_by_y = by_y_df['job'].value_counts()
job_counts_total = df['job'].value_counts()
job_ratio = (job_counts_by_y / job_counts_total).reset_index()

fig = px.bar(
    data_frame=job_ratio.sort_values(by='count'),
    x='job',
    y='count',
    color='y',
    labels={'y': 'Accepted Campaign', "job": "Job", "count": "Ratio accepting campaign"},
    title="Students and retirees are more likely to accept campaign",
)
fig.show()
fig.write_image("images/job_acceptance_ratio.png")



In [14]:
# Per-outcome education counts divided by the overall count of each education level.
education_counts_by_y = by_y_df['education'].value_counts()
education_counts_total = df['education'].value_counts()
education_ratio = (education_counts_by_y / education_counts_total).reset_index()

fig = px.bar(
    data_frame=education_ratio.sort_values(by='count'),
    x='education',
    y='count',
    color='y',
    labels={'y': 'Accepted Campaign', "education": "Education", "count": "Ratio accepting campaign"},
    title="People who are illiterate are more likely to accept campaign",
)
fig.show()
fig.write_image("images/education_acceptance_ratio.png")

In [75]:
# Per-outcome month counts divided by the overall count of each contact month.
month_counts_by_y = by_y_df['month'].value_counts()
month_counts_total = df['month'].value_counts()
month_ratio = (month_counts_by_y / month_counts_total).reset_index()

fig = px.bar(
    data_frame=month_ratio.sort_values(by='count'),
    x='month',
    y='count',
    color='y',
    labels={'y': 'Accepted Campaign', "month": "Month", "count": "Ratio accepting campaign"},
    title="Month has a strong effect on acceptance",
)
fig.show()
fig.write_image("images/month_acceptance_ratio.png")

In [28]:
# Distribution of call duration, coloured by campaign outcome.
fig = px.histogram(
    data_frame=df,
    x='duration',
    color='y',
    labels={'y': 'Accepted Campaign', "duration": "Duration"},
    title="The longer a call goes the higher the likelihood of acceptance",
)
fig.show()
fig.write_image("images/duration_acceptance_ratio.png")

In [32]:
# Distribution of the 'campaign' column (number of calls), coloured by outcome.
fig = px.histogram(
    data_frame=df,
    x='campaign',
    color='y',
    labels={'y': 'Accepted Campaign', "campaign": "Number of calls"},
    title="There are diminishing returns when making a call to the same person",
)
fig.show()
fig.write_image("images/calls_acceptance_ratio.png")

In [43]:
# Drop rows where pdays equals 999 before plotting, so only the remaining
# values of 'pdays' appear in the histogram.
fig = px.histogram(
    data_frame=df.loc[df['pdays'].ne(999)],
    x='pdays',
    color='y',
    labels={'y': 'Accepted Campaign', "pdays": "Days since last contact"},
    title="Following up within a week increases the chance of success",
)
fig.show()
fig.write_image("images/days_since_last_contact_acceptance_ratio.png")

In [36]:
# Pairwise correlation of the numeric columns, rounded to two decimals and drawn as a heatmap.
corr_matrix = df.corr(numeric_only=True).round(decimals=2)
fig = px.imshow(
    img=corr_matrix,
    aspect="auto",
    color_continuous_scale="RdBu_r",
    title="Strong positive correlation between employment statistics suggest PCA may be worthwhile",
)
fig.update_layout(width=1000, height=1000, showlegend=False)
fig.show()
fig.write_image("images/correlation.png")

In [61]:
# Per-outcome counts of each consumer price index value divided by the overall
# count of that value, i.e. the share of 'yes' / 'no' at every index value.
price_ratio = (by_y_df['cons.price.idx'].value_counts() / df['cons.price.idx'].value_counts()).reset_index()

# px.histogram aggregates `y` with histfunc='sum' by default whenever `y` is given.
# Summing ratios of several index values that land in one bin produces bars that
# are no longer ratios (they can exceed 1), so average them within each bin instead.
fig = px.histogram(
    price_ratio.sort_values(by='count'),
    x='cons.price.idx',
    y='count',
    color='y',
    histfunc="avg",
    title="Consumer price index may be noisy data",
    labels={"cons.price.idx": "Consumer price index", 'y': 'Accepted Campaign', "count": "Ratio accepting campaign"},
)
fig.show()
fig.write_image("images/consumer_price_index_acceptance_ratio.png")

In [77]:
# Per-outcome counts of each consumer confidence index value divided by the
# overall count of that value.
# NOTE(review): the variable is still called `price_ratio` although it now holds
# consumer *confidence* ratios; the name is kept so the notebook namespace is unchanged.
price_ratio = (by_y_df['cons.conf.idx'].value_counts() / df['cons.conf.idx'].value_counts()).reset_index()

# With nbins=10 several index values share a bin. The default histfunc='sum'
# would add their ratios together (bars above 1), so average them per bin instead.
fig = px.histogram(
    price_ratio.sort_values(by='count'),
    x='cons.conf.idx',
    y='count',
    color='y',
    histfunc="avg",
    title="Consumer confidence index may be noisy data",
    labels={"cons.conf.idx": "Consumer confidence index", 'y': 'Accepted Campaign',
            "count": "Ratio accepting campaign"},
    nbins=10
)
fig.update_layout(height=800, width=1000)
fig.show()
fig.write_image("images/consumer_confidence_index_acceptance_ratio.png")

In [78]:
# Per-outcome counts of each previous-campaign result divided by the overall count of that result.
poutcome_counts_by_y = by_y_df['poutcome'].value_counts()
poutcome_counts_total = df['poutcome'].value_counts()
poutcome_ratio = (poutcome_counts_by_y / poutcome_counts_total).reset_index()

fig = px.bar(
    data_frame=poutcome_ratio,
    x='poutcome',
    y='count',
    color='y',
    labels={"count": "Ratio accepting campaign", "poutcome": "Previous outcome", 'y': 'Accepted Campaign'},
    title="Those that accept the previous campaign are more likely to accept the next campaign",
)
fig.update_layout(width=1000, height=800)
fig.show()
fig.write_image("images/poutcome_acceptance_ratio.png")

In [82]:
# Scatter of call duration (x) against the 'campaign' call count (y), coloured by outcome.
fig = px.scatter(
    data_frame=df,
    x='duration',
    y='campaign',
    color='y',
    labels={'y': 'Accepted Campaign', 'duration': 'Duration', "campaign": "Number of calls"},
    title="Number of calls does not correlate with duration of the call",
)
fig.show()
fig.write_image("images/calls_vs_duration.png")


### Data Preparation

### Models

### Evaluation