### Packages and dataset

In [121]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder
import category_encoders
from category_encoders.target_encoder import TargetEncoder

In [110]:
#Load the raw bank-marketing data
df = pd.read_csv("bank_marketing_weka_dataset.csv")

#Feature matrix: every column except the target 'y'
X = df.drop(columns=['y'])

#Target vector, recoded from 'yes'/'no' strings to 1/0
y = df['y'].map({'yes': 1, 'no': 0})

In [111]:
#Preview the first five feature rows before any encoding is applied
X.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79.0,1,-1.0,0,unknown
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220.0,1,339.0,4,failure
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185.0,1,330.0,1,failure
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199.0,4,-1.0,0,unknown
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226.0,1,-1.0,0,unknown


## Target variable balance

In [112]:
#Class counts of the target after recoding 'yes' -> 1 and 'no' -> 0
y.value_counts()

0    4000
1     521
Name: y, dtype: int64

# Features engineering

## Categorical variables

In [113]:
#Names of the feature columns stored with object dtype
X.select_dtypes(include=['object']).columns

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'poutcome'],
      dtype='object')

### Binary values
Variables: 'default', 'housing', 'loan'\
Values: 'yes', 'no'\
Method: map to yes=1, no=0

In [114]:
#Recode only the yes/no flag columns. A frame-wide replace would also rewrite
#any 'yes'/'no' string that happens to appear in another column.
binary_columns = ['default', 'housing', 'loan']
X[binary_columns] = (
    X[binary_columns]
    .replace({'yes': 1, 'no': 0})
    #Explicit integer dtype instead of relying on replace() downcasting;
    #this also fails loudly if a flag column holds anything but yes/no.
    .astype(int)
)

### Month variable
Values: string abbreviations\
Method: transform to numerical

In [115]:
#Build the abbreviation -> month-number lookup (jan=1 ... dec=12) and apply it
month_abbreviations = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                       'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
month_numbers = {abbr: number
                 for number, abbr in enumerate(month_abbreviations, start=1)}
X['month'] = X['month'].map(month_numbers)

### Variables: 'job', 'marital', 'education', 'poutcome', 'contact'

Thanks to EDA we know that:

- 'unknown' values are present
- 'job', 'education', 'poutcome' seem to have an impact on the target variable
- the value distributions show no dangerous disparities (with respect to using the target encoding method)

Hence, we decide to:

 - leave the 'unknown' values, as they best represent the idea of missing information in our case
 - apply target encoding to the variables related to the target variable
 - apply one-hot encoding to the other variables
 
We expect target encoding to reflect the relationship between these variables and the target variable - where EDA indicates one is present - and one-hot encoding to pass on the values of the other variables neutrally.

In [116]:
#Target encoding
#NOTE(review): the encoder is fitted on every row of X together with y. If a
#train/test split is introduced later, fit on the training part only so the
#encoded values do not leak target information into the evaluation data.
target_encoded_columns = ['job', 'education', 'poutcome']
te = TargetEncoder()
#Assign with X[cols] rather than X.loc[:, cols]: since pandas 2.0 the .loc form
#writes into the existing object-dtype columns, so the encoded floats would
#keep dtype 'object' instead of becoming proper float columns.
X[target_encoded_columns] = te.fit_transform(X[target_encoded_columns], y)

In [117]:
#One-hot encoding of the remaining nominal columns
one_hot_columns = ['marital', 'contact']
X = pd.get_dummies(data=X, columns=one_hot_columns)

In [120]:
#this is what we currently have
#I applied target encoding and one-hot encoding together, a bit "for show"
#I was wondering whether to add a second dataset with target encoding only,
#and then compare the results. Then maybe you will have the same situation
#with the continuous variables and we could summarise the combinations of these methods
#Because if I understood correctly, this checkpoint is mostly about
#experimenting with these methods, and the models are only preliminary so that there is
#something to evaluate on
X.head(10)

Unnamed: 0,age,job,education,default,balance,housing,loan,day,month,duration,campaign,pdays,previous,poutcome,marital_divorced,marital_married,marital_single,contact_cellular,contact_telephone,contact_unknown
0,30,0.101562,0.094395,0,1787,0,0,19,10,79.0,1,-1.0,0,0.090958,0,1,0,1,0,0
1,33,0.091127,0.106245,0,4789,1,1,11,5,220.0,1,339.0,4,0.128571,0,1,0,1,0,0
2,35,0.135191,0.142963,0,1350,1,0,16,4,185.0,1,330.0,1,0.128571,0,0,1,1,0,0
3,30,0.135191,0.142963,0,1476,1,1,3,6,199.0,4,-1.0,0,0.090958,0,1,0,0,0,1
4,59,0.072939,0.106245,0,0,1,0,5,5,226.0,1,-1.0,0,0.090958,0,1,0,0,0,1
5,35,0.135191,0.142963,0,747,0,0,23,2,141.0,2,176.0,3,0.128571,0,0,1,1,0,0
6,36,0.10929,0.142963,0,307,1,0,14,5,341.0,1,330.0,2,0.192893,0,1,0,1,0,0
7,39,0.108073,0.106245,0,147,1,0,6,5,151.0,2,-1.0,0,0.090958,0,1,0,1,0,0
8,41,0.089286,0.142963,0,221,1,0,14,5,57.0,2,-1.0,0,0.090958,0,1,0,0,0,1
9,43,0.091127,0.094395,0,-88,1,1,17,4,313.0,1,147.0,2,0.128571,0,1,0,1,0,0
