In [1]:
import os
import pandas as pd
import yaml

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas and other
# libraries — confirm a blanket filter is really intended.
warnings.filterwarnings('ignore')

# Allow up to 100 rows and 100 columns when a DataFrame is displayed
# (set_option accepts several option/value pairs in one call).
pd.set_option('display.max_rows', 100, 'display.max_columns', 100)

In [2]:
# Load the feature configuration from the project's YAML file.
# The context manager closes the file handle as soon as parsing is done,
# and the file is decoded explicitly as UTF-8 instead of relying on the
# platform's default encoding.
feature_config_path = os.path.join('..', 'src', 'config', 'feature_config.yaml')
with open(feature_config_path, "r", encoding="utf-8") as feature_config_file:
    features_config = yaml.safe_load(feature_config_file)

In [5]:
# Read the processed dataset from ../data/processed/processed_dataset.csv.
processed_csv_path = os.path.join('..', 'data', 'processed', 'processed_dataset.csv')
df_processed = pd.read_csv(processed_csv_path)

# Print the (rows, columns) shape, then show the first five rows as the cell output.
print(df_processed.shape)
df_processed.head()

(41176, 21)


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0


In [6]:
# Map each lowercase three-letter month abbreviation to its quarter label.
# Months are listed in calendar order, so every consecutive group of three
# shares one label: positions 0-2 -> '1Q', 3-5 -> '2Q', 6-8 -> '3Q', 9-11 -> '4Q'.
month_to_quarter = {
    month: f'{position // 3 + 1}Q'
    for position, month in enumerate(
        ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
         'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    )
}

# Add a 'quarter' column by looking up each row's 'month' value in month_to_quarter.
# NOTE(review): Series.map with a dict yields NaN for any value that is not a key,
# so an unexpected month spelling would show up as a missing quarter — confirm none occur.
df_processed['quarter'] = df_processed['month'].map(month_to_quarter)


In [7]:
# Job labels that count as "not employed".
not_employed = [
    'retired',
    'student',
    'unemployed',
]


def categorize_employment(job):
    """Collapse a job label into a coarse employment status.

    Parameters
    ----------
    job
        Value to classify; it is checked for membership in ``not_employed``
        and then compared with the literal 'unknown'.

    Returns
    -------
    str
        'not_employed' when ``job`` is listed in ``not_employed``,
        'unknown' when it equals 'unknown', and 'employed' for any
        other value.
    """
    if job in not_employed:
        return 'not_employed'
    return 'unknown' if job == 'unknown' else 'employed'

# Classify every row's 'job' value element-wise and store the result as 'is_employed'.
df_processed['is_employed'] = df_processed['job'].map(categorize_employment)

In [8]:
# Element-wise sum of the 'campaign' and 'previous' columns.
df_processed['contacts_tendency'] = df_processed['campaign'].add(df_processed['previous'])

In [9]:
# Show the first five rows, restricted to the derived columns and the
# columns they were computed from.
df_processed.head()[
    [
        'contacts_tendency',
        'month',
        'quarter',
        'job',
        'is_employed',
        'campaign',
        'previous',
    ]
]

Unnamed: 0,contacts_tendency,month,quarter,job,is_employed,campaign,previous
0,1,may,2Q,housemaid,employed,1,0
1,1,may,2Q,services,employed,1,0
2,1,may,2Q,services,employed,1,0
3,1,may,2Q,admin.,employed,1,0
4,1,may,2Q,services,employed,1,0


In [10]:
# Within each 'job' group, compute the proportion of every 'y' value,
# then pivot the 'y' values into columns (one row per job).
(
    df_processed
    .groupby('job')['y']
    .value_counts(normalize=True)
    .unstack()
)


y,0,1
job,Unnamed: 1_level_1,Unnamed: 2_level_1
admin.,0.870333,0.129667
blue-collar,0.931049,0.068951
entrepreneur,0.914835,0.085165
housemaid,0.9,0.1
management,0.887825,0.112175
retired,0.747381,0.252619
self-employed,0.895144,0.104856
services,0.918578,0.081422
student,0.685714,0.314286
technician,0.891675,0.108325


In [11]:
# Within each 'is_employed' group, compute the proportion of every 'y' value,
# then pivot the 'y' values into columns (one row per employment status).
(
    df_processed
    .groupby('is_employed')['y']
    .value_counts(normalize=True)
    .unstack()
)


y,0,1
is_employed,Unnamed: 1_level_1,Unnamed: 2_level_1
employed,0.899326,0.100674
not_employed,0.763515,0.236485
unknown,0.887879,0.112121


In [12]:
# Cross-tabulate (loan, housing) combinations against 'y'.
# normalize='index' turns the counts in each row into proportions of that row.
pd.crosstab(
    [df_processed[column] for column in ('loan', 'housing')],
    df_processed['y'],
    normalize='index',
)


Unnamed: 0_level_0,y,0,1
loan,housing,Unnamed: 2_level_1,Unnamed: 3_level_1
no,no,0.890958,0.109042
no,yes,0.882662,0.117338
unknown,unknown,0.891919,0.108081
yes,no,0.892843,0.107157
yes,yes,0.88919,0.11081
