### Onehot Encoder in Pipeline

- Dataset: Bank marketing dataset
- Learning Date: 24-Aug-23
- Learning from: Prasert Kanawattanachai (CBS)
    - Github: https://github.com/prasertcbs/

In [1]:
# import libraries

import pandas as pd
import numpy as np


import requests
%config InlineBackend.figure_format = 'retina'

In [2]:
pd.Timestamp.now()

Timestamp('2023-08-25 05:46:51.445404')

In [3]:
r = requests.get('https://github.com/prasertcbs/basic-dataset/raw/master/bank/bank-names.txt')
print(r.text)

Citation Request:
  This dataset is public available for research. The details are described in [Moro et al., 2011]. 
  Please include this citation if you plan to use this database:

  [Moro et al., 2011] S. Moro, R. Laureano and P. Cortez. Using Data Mining for Bank Direct Marketing: An Application of the CRISP-DM Methodology. 
  In P. Novais et al. (Eds.), Proceedings of the European Simulation and Modelling Conference - ESM'2011, pp. 117-121, Guimarães, Portugal, October, 2011. EUROSIS.

  Available at: [pdf] http://hdl.handle.net/1822/14838
                [bib] http://www3.dsi.uminho.pt/pcortez/bib/2011-esm-1.txt

1. Title: Bank Marketing

2. Sources
   Created by: Paulo Cortez (Univ. Minho) and Sérgio Moro (ISCTE-IUL) @ 2012
   
3. Past Usage:

  The full dataset was described and analyzed in:

  S. Moro, R. Laureano and P. Cortez. Using Data Mining for Bank Direct Marketing: An Application of the CRISP-DM Methodology. 
  In P. Novais et al. (Eds.), Proceedings of the European S

In [4]:
# read text data into dataframe

url = 'https://github.com/prasertcbs/basic-dataset/raw/master/bank/bank.csv'

df = pd.read_csv(url, sep = ";")
df[:5]

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [5]:
len(df.columns)

17

In [6]:
import sklearn
from sklearn.preprocessing import OneHotEncoder

In [7]:
df['marital'].unique()

array(['married', 'single', 'divorced'], dtype=object)

In [8]:
df['marital']

0       married
1       married
2        single
3       married
4       married
         ...   
4516    married
4517    married
4518    married
4519    married
4520     single
Name: marital, Length: 4521, dtype: object

In [9]:
df[['marital']] # get dataframe

Unnamed: 0,marital
0,married
1,married
2,single
3,married
4,married
...,...
4516,married
4517,married
4518,married
4519,married


In [10]:
df[:3]

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no


In [11]:
ohenc1 = OneHotEncoder(sparse = False)

m1 = ohenc1.fit_transform(df[['marital']])
m1



array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [12]:
ohenc1.categories_

[array(['divorced', 'married', 'single'], dtype=object)]

In [13]:
df[:3]

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no


In [14]:
m1.shape

(4521, 3)

In [15]:
ohenc2 = OneHotEncoder(sparse = False, drop = 'first')

m2 = ohenc2.fit_transform(df[['marital']])
m2



array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [0., 1.]])

In [16]:
m2.shape

(4521, 2)

In [17]:
df[:3]

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no


In [18]:
ohenc2.categories_

[array(['divorced', 'married', 'single'], dtype=object)]

In [19]:
ohenc2.inverse_transform([[0, 0],
                         [1, 0],
                         [0, 1]])

array([['divorced'],
       ['married'],
       ['single']], dtype=object)

In [20]:
ohenc = OneHotEncoder(sparse = False, drop = 'first')
m3 = ohenc.fit_transform(df[['marital', 'default', 'education']])
m3



array([[1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 1., 0.],
       ...,
       [1., 0., 0., 1., 0., 0.],
       [1., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 1., 0.]])

In [21]:
m3.shape

(4521, 6)

In [22]:
ohenc.categories_

[array(['divorced', 'married', 'single'], dtype=object),
 array(['no', 'yes'], dtype=object),
 array(['primary', 'secondary', 'tertiary', 'unknown'], dtype=object)]

In [25]:
ohenc.inverse_transform([
    [1, 0, 1, 0, 0, 0],
    [0, 0, 1, 1, 0, 0],
    [1, 0, 1, 1, 0, 0],
    [1, 0, 1, 0, 1, 0]
])

array([['married', 'yes', 'primary'],
       ['divorced', 'yes', 'secondary'],
       ['married', 'yes', 'secondary'],
       ['married', 'yes', 'tertiary']], dtype=object)

### Make pipeline (ColumnTransformer -> Model)

In [26]:
# import libraries

# encoder & transform columns
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# models 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

# pipeline & cross validation
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import KFold, StratifiedGroupKFold
from sklearn.model_selection import cross_val_score

In [27]:
df.sample(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
1542,49,entrepreneur,married,secondary,no,273,no,no,cellular,23,jul,58,1,-1,0,unknown,no
2248,39,services,married,secondary,no,1438,yes,no,unknown,20,may,212,2,-1,0,unknown,no
3755,33,blue-collar,single,secondary,no,3975,yes,yes,cellular,17,apr,515,1,150,1,other,no
2225,31,self-employed,single,tertiary,no,96,no,no,cellular,5,feb,577,1,-1,0,unknown,no
3151,43,entrepreneur,married,secondary,no,59,no,no,unknown,13,may,437,2,-1,0,unknown,no


In [28]:
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [None]:
"""   Input variables:
   # bank client data:
   1 - age (numeric)
   2 - job : type of job (categorical: "admin.","unknown","unemployed","management","housemaid","entrepreneur","student",
                                       "blue-collar","self-employed","retired","technician","services") 
   3 - marital : marital status (categorical: "married","divorced","single"; note: "divorced" means divorced or widowed)
   4 - education (categorical: "unknown","secondary","primary","tertiary")
   5 - default: has credit in default? (binary: "yes","no")
   6 - balance: average yearly balance, in euros (numeric) 
   7 - housing: has housing loan? (binary: "yes","no")
   8 - loan: has personal loan? (binary: "yes","no")
   # related with the last contact of the current campaign:
   9 - contact: contact communication type (categorical: "unknown","telephone","cellular") 
  10 - day: last contact day of the month (numeric)
  11 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec")
  12 - duration: last contact duration, in seconds (numeric)
   # other attributes:
  13 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
  14 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric, -1 means client was not previously contacted)
  15 - previous: number of contacts performed before this campaign and for this client (numeric)
  16 - poutcome: outcome of the previous marketing campaign (categorical: "unknown","other","failure","success")

  Output variable (desired target):
  17 - y - has the client subscribed a term deposit? (binary: "yes","no")"""

In [29]:
col_trans = make_column_transformer(
    (OneHotEncoder(), ['marital', 'default', 'job']),
    (StandardScaler(), ['age'])
)

col_trans

In [30]:
model = LogisticRegression( solver = 'lbfgs') # create a model
model

In [31]:
pipe = make_pipeline(col_trans, model) # crate a pipeline
pipe

In [32]:
df.sample(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
454,44,management,married,tertiary,no,795,no,no,cellular,28,aug,99,24,-1,0,unknown,no
878,31,self-employed,single,secondary,no,203,no,yes,cellular,19,nov,177,1,-1,0,unknown,no
498,38,technician,single,secondary,no,258,no,yes,unknown,20,jun,587,2,-1,0,unknown,no
2062,29,blue-collar,married,unknown,no,486,yes,no,cellular,6,may,422,1,363,1,failure,no
3910,49,admin.,married,secondary,no,14440,yes,no,cellular,21,nov,60,1,-1,0,unknown,no


In [34]:
# get X and y

X = df[['age', 'marital', 'default', 'job']]
X.head(3)

Unnamed: 0,age,marital,default,job
0,30,married,no,unemployed
1,33,married,no,services
2,35,single,no,management


In [35]:
y = df['y']
y.head(3)

0    no
1    no
2    no
Name: y, dtype: object

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
# split data 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3616, 4), (905, 4), (3616,), (905,))

In [38]:
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.8784530386740331

In [39]:
accuracy = cross_val_score(pipe, X, y, cv = 5)
accuracy

array([0.8839779 , 0.88495575, 0.88495575, 0.88495575, 0.88495575])

In [40]:
accuracy.mean()

0.8847601818804087

In [41]:
ohenc = OneHotEncoder(sparse=False)
m = ohenc.fit_transform(df[['marital', 'default', 'job']])
m



array([[0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [42]:
m.shape

(4521, 17)

In [43]:
ohenc.inverse_transform([
    [0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
])

array([['married', 'yes', 'admin.']], dtype=object)

### Metrics

In [44]:
from sklearn import metrics

In [45]:
predicted = pipe.predict(X_test)
predicted[:5]

array(['no', 'no', 'no', 'no', 'no'], dtype=object)

In [46]:
pipe.predict_proba(X_test)

array([[0.94662609, 0.05337391],
       [0.82486762, 0.17513238],
       [0.85154613, 0.14845387],
       ...,
       [0.89171511, 0.10828489],
       [0.8928329 , 0.1071671 ],
       [0.90090694, 0.09909306]])

In [47]:
metrics.confusion_matrix(y_test, predicted)

array([[795,   0],
       [110,   0]])

### scikit-learn: confusion matrix

In [48]:
tn, fp, fn, tp = metrics.confusion_matrix(y_test, predicted).ravel()

In [49]:
tn, fp, fn, tp

(795, 0, 110, 0)

In [50]:
print(f'tn = {tn}')
print(f'fp = {fp}')
print(f'fn = {fn}')
print(f'tp = {tp}')

tn = 795
fp = 0
fn = 110
tp = 0


In [51]:
pipe.score(X_test, y_test)

0.8784530386740331

In [52]:
metrics.accuracy_score(y_test, predicted)

0.8784530386740331

In [53]:
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

          no       0.88      1.00      0.94       795
         yes       0.00      0.00      0.00       110

    accuracy                           0.88       905
   macro avg       0.44      0.50      0.47       905
weighted avg       0.77      0.88      0.82       905



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
