# Template: Data Preparation

Template for data preparation with `scikit-learn`

Environment variables

#### Linux / MAC

```bash
export BUCKET_MODEL=belc-bigdata-models-dlk-qas
export FOLDER_MODEL=template-model
export ENV_DEPLOY=QAS
export MLFLOW_HOST=internal-alb-datalake-casetwo-qas-45252694.us-east-1.elb.amazonaws.com
```

In [1]:
# Notebook-side equivalent of the shell exports shown above: sets the same
# variables in the kernel's environment.
# NOTE(review): presumably source.utils.configvariables reads these values, so
# this cell should run before that module is imported -- confirm.
%env BUCKET_MODEL=belc-bigdata-models-dlk-qas
%env FOLDER_MODEL=template-model
%env ENV_DEPLOY=QAS
%env MLFLOW_HOST=internal-alb-datalake-casetwo-qas-45252694.us-east-1.elb.amazonaws.com

env: BUCKET_MODEL=belc-bigdata-models-dlk-qas
env: FOLDER_MODEL=template-model
env: ENV_DEPLOY=QAS
env: MLFLOW_HOST=internal-alb-datalake-casetwo-qas-45252694.us-east-1.elb.amazonaws.com


In [2]:
import os
import source.utils.mlflow as ml
from datetime import datetime
import source.utils.configvariables as uc



In [3]:
# Reload imported modules automatically before each cell runs, so edits to
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline, at high-DPI ("retina") resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

#libraries
######################################################################

import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

import pickle

# Configuration options
# ---------------------------------------------------------------------------

# Seed reused wherever a random_state is needed in this notebook.
seed = 12345

# Apply seaborn's default plot styling.
sns.set()

# Raise pandas' display limits so wide / long frames are shown in full.
pd.set_option('display.max_rows', 400)
pd.set_option('display.max_columns', 1000)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [5]:
pd.__version__

'1.2.4'

In [6]:
np.__version__

'1.21.0'

In [7]:
sklearn.__version__

'0.24.1'

## MLflow configuration

In [8]:
# Read the values exposed as `env_ml_flow_host` / `env_ml_flow_project` by the
# project's config-variables module (imported as `uc`).
ml_flow_host, ml_flow_project = uc.env_ml_flow_host, uc.env_ml_flow_project

In [9]:
# Instantiate the project's MLFlowTracking helper with the configured host.
# It is used further down to start/end the run and to log params and metrics.
Myflow = ml.MLFlowTracking(ml_flow_host)

In [10]:
# Run metadata: the name given to the tracked run, a folder label, and the
# seed passed as random_state to the train/test split.
run_name, folder, seed = 'DataPrep_ScikitLearn', 'Preparation', 12345

In [11]:
# Start a run named `run_name` under the configured project; the returned
# object is echoed as the cell output.
active_run = Myflow.start_experiment(ml_flow_project, run_name=run_name)
active_run

<ActiveRun: >

## dataset

In [12]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

In [13]:
# Fetch the Titanic dataset (OpenML, version 1): `X` is a DataFrame of
# features and `y` the `survived` target Series.
X, y = fetch_openml(
    name="titanic",
    version=1,
    return_X_y=True,
    as_frame=True,
)

In [14]:
X.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [15]:
y.head()

0    1
1    1
2    0
3    0
4    0
Name: survived, dtype: category
Categories (2, object): ['0', '1']

In [16]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

### exporting descriptives

In [17]:
# NOTE(review): the descriptives export is currently disabled. Uncomment the
# two lines below to write summary statistics of the training features to CSV
# and log that file as an artifact through the MLflow helper.
# X_train.describe().to_csv('descriptives.csv')
# Myflow.model_logging_artifact("descriptives.csv", "data")

### preprocessing steps

Numeric variables

1. imputation
2. capping (listed here as a step, but not implemented in the numeric pipeline below)
3. standardization

Categorical variables

1. imputation
2. encoding

numeric dataprep

In [18]:
# Numeric columns: fill missing values with the column median, then
# standardize the result.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

categorical dataprep

In [19]:
# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. `handle_unknown='ignore'` makes categories unseen at
# fit time encode as all zeros instead of raising at transform time.
categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [20]:
# Route each column group through its own sub-pipeline. Columns that are not
# listed are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

both together with `ColumnTransformer`

In [21]:
# Fit the imputers, scaler and encoder on the training split only.
preprocessor.fit(X_train)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['age', 'fare']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('ohe',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['embarked', 'sex', 'pclass'])])

In [22]:
from sklearn import set_config

# Display estimators as HTML diagrams instead of their text repr.
set_config(display='diagram')

In [23]:
preprocessor

In [24]:
preprocessor.named_transformers_

{'num': Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                 ('scaler', StandardScaler())]),
 'cat': Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                 ('ohe', OneHotEncoder(handle_unknown='ignore'))]),
 'remainder': 'drop'}

access individual elements with keys

In [25]:
preprocessor.named_transformers_['num']['imputer']

We can also use the fitted preprocessing pipeline to transform the test set in one line

In [26]:
# Apply the transformations fitted on the training split to the test split.
preprocessor.transform(X_test)

array([[-1.83373686, -0.22927215,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 1.0343065 ,  0.49264725,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [-0.12841378, -0.50918194,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.59350189, -0.3799928 ,  0.        , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.02661559,  2.51153315,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [-0.12841378, -0.52012206,  1.        , ...,  0.        ,
         0.        ,  1.        ]])

## pipelines with training

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
# Chain the preprocessing step and a logistic-regression model into a single
# estimator, so fit/score apply both in sequence.
classifier = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression()),
    ]
)

In [29]:
classifier

In [30]:
classifier.fit(X_train, y_train)

In [31]:
# Mean accuracy of the logistic-regression pipeline on the training split.
classifier.score(X_train, y_train)

0.7851002865329513

In [32]:
# Mean accuracy of the logistic-regression pipeline on the held-out test split.
classifier.score(X_test, y_test)

0.8053435114503816

## pipelines with grid search

In [33]:
# Capture the wall-clock time at which the grid-search section starts and
# echo it as HH:MM:SS.
start_model = datetime.now()
current_time = format(start_model, "%H:%M:%S")
print("Current Time is :", current_time)

# Log the training start time as a run parameter.
dict_params = {"train_time1_start": current_time}
Myflow.model_logging_params(dict_params)

Current Time is : 10:28:46


In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [35]:
# Decision-tree pipeline tuned by the grid search in the following cells.
# The tree is seeded because scikit-learn permutes features randomly at each
# split: an unseeded tree can break ties differently from run to run, which
# makes the selected hyper-parameters and the logged scores non-reproducible.
classifier = Pipeline(steps=[('preprocessor', preprocessor),
                             ('classifier', DecisionTreeClassifier(random_state=seed))])

In [36]:
# Search space for the grid search. Keys use scikit-learn's
# `<step>__<parameter>` syntax to reach parameters of nested pipeline steps.
param_grid = {
    # imputation strategy of the numeric sub-pipeline
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    # decision-tree complexity controls
    'classifier__max_depth': [2, 4, 6, 8, 10],
    'classifier__min_samples_split': [10, 15, 20],
}

In [37]:
# Exhaustive search over `param_grid`, ranking candidates by ROC AUC.
gs = GridSearchCV(
    estimator=classifier,
    param_grid=param_grid,
    scoring='roc_auc',
)

In [38]:
# Cross-validate every combination in `param_grid` (2 x 5 x 3 = 30 candidates)
# on the training split.
gs.fit(X_train, y_train)

In [39]:
gs.best_params_

{'classifier__max_depth': 6,
 'classifier__min_samples_split': 20,
 'preprocessor__num__imputer__strategy': 'median'}

In [40]:
# Score of the tuned pipeline on the training split. `gs.score` uses the
# `scoring` given to GridSearchCV, so this is ROC AUC rather than accuracy.
score_train = gs.score(X_train, y_train)

In [41]:
# Score of the tuned pipeline on the held-out test split. `gs.score` uses the
# `scoring` given to GridSearchCV, so this is ROC AUC rather than accuracy.
score_test = gs.score(X_test, y_test)

## logging metrics

In [42]:
# Capture the wall-clock time at which this cell runs, formatted as HH:MM:SS.
end_model = datetime.now()
current_time = format(end_model, "%H:%M:%S")

# Log the training end time as a run parameter.
dict_params = {"train_time2_end": current_time}
Myflow.model_logging_params(dict_params)

# Log the elapsed time since `start_model` as a run parameter.
dif_time = end_model - start_model
dict_params = {"train_time3_duration": dif_time}
Myflow.model_logging_params(dict_params)

In [43]:
# Log the train and test scores of the tuned pipeline as run metrics.
dict_metrics = {
    "score_train": score_train,
    "score_test": score_test,
}
Myflow.model_logging_metrics(dict_metrics)

In [44]:
# End the experiment run that was started with `Myflow.start_experiment`
# earlier in the notebook.
Myflow.end_experiment()