REFERENCIA
https://medium.com/mlearning-ai/neat-data-preprocessing-with-pipeline-and-columntransformer-2a0468865b6b

In [55]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler, MaxAbsScaler, LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, plot_confusion_matrix
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.tree import plot_tree
from sklearn.compose import ColumnTransformer

https://data.world/data-society/bank-marketing-data

In [2]:
source = 'https://raw.githubusercontent.com/nparis87/MachineLearning/main/bank-additional-full.csv'

data = pd.read_csv(source, sep = ';')
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


## TRANSFORMACIÓN DE DATOS 2.0

In [3]:
num_cols = ['duration', 'age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'] 
cat_cols = ['job', 'month'] #'codificado simple'
cat_cols_dummy = ['day_of_week', 'contact', 'education', 'marital', 'housing', 'loan', 'poutcome'] # One hot encoder


In [33]:
le = LabelEncoder().fit(data['y'])
data['y'] = le.transform(data['y'])

In [5]:
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale',MinMaxScaler())])

cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot',OrdinalEncoder())])

cat_dummy_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('categorical', OneHotEncoder(handle_unknown='ignore', sparse=False))])

In [68]:
# TRANSFORMACIÓN POR COLUMNAS

col_trans = ColumnTransformer(transformers=[
    ('num_pipeline',num_pipeline,num_cols),
    ('cat_pipeline',cat_pipeline,cat_cols)],    
    remainder='drop',
    n_jobs=-1)

#('cat_dummy', cat_dummy_pipeline, cat_cols_dummy)],

In [69]:
X = data.drop('y', axis = 1)
X2 = col_trans.fit_transform(X)
y = data['y']

X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.2)

## Hacemos una prueba simple con un conjunto de prueba

In [64]:
pred = RandomForestClassifier().fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95      7288
           1       0.66      0.46      0.55       950

    accuracy                           0.91      8238
   macro avg       0.80      0.72      0.75      8238
weighted avg       0.90      0.91      0.90      8238



In [67]:
X2.shape

(41188, 40)

In [72]:
model = RandomForestClassifier()
cv_model = cross_validate(model, X2, y, cv=5, scoring=('accuracy'))#, 'precision', 'recall', 'f1_macro'))

In [73]:
cv_model

{'fit_time': array([2.6361196 , 2.58949399, 2.74255133, 2.55075669, 2.53986335]),
 'score_time': array([0.06880736, 0.07078719, 0.09097457, 0.07089686, 0.05055642]),
 'test_score': array([0.88892935, 0.36622967, 0.63656227, 0.1382785 , 0.14216341])}

In [26]:
model = DecisionTreeClassifier()
vc_tree = cross_validate(model, X, y, cv=5, scoring=('accuracy'), return_train_score=True)

model_pipeline = Pipeline(steps=[
    ('col_trans', col_trans),
    ('model', model)
])

In [80]:
from sklearn import set_config
set_config(display='diagram')
display(model_pipeline)

In [27]:
model_pipeline.fit(X_train, y_train)

Pipeline(steps=[('col_trans',
                 ColumnTransformer(n_jobs=-1,
                                   transformers=[('num_pipeline',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer()),
                                                                  ('scale',
                                                                   MinMaxScaler())]),
                                                  ['duration', 'age',
                                                   'campaign', 'pdays',
                                                   'previous', 'emp.var.rate',
                                                   'cons.price.idx',
                                                   'cons.conf.idx', 'euribor3m',
                                                   'nr.employed']),
                                                 ('cat_pipeline',
                             

In [28]:
model_pipeline.predict(X_test)

array([0, 1, 0, ..., 0, 0, 0])

## Trabajo en clase, ver reporte de puntajes y matriz de confusión

In [29]:
print(classification_report(y_test, model_pipeline.predict(X_test)))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94      7318
           1       0.51      0.53      0.52       920

    accuracy                           0.89      8238
   macro avg       0.73      0.73      0.73      8238
weighted avg       0.89      0.89      0.89      8238

