### Import libraries

In [71]:
from IPython.display import display, Markdown
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import pickle

### Loading data

In [2]:
def loading_data(path='../../data/processed/exercise.csv'):
    """Read the exercise dataset from a CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the processed exercise data,
        given relative to the notebook's working directory, so existing
        zero-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(path)

# Read the processed measurements, then the data dictionary that describes
# each variable (name, description, type, subtype).
df = loading_data()
dictionary = pd.read_csv(filepath_or_buffer='../../data/external/dictionary.csv')
df, dictionary

(    id     diet  pulse  time     kind
 0    1  low fat     85     1     rest
 1    1  low fat     85    15     rest
 2    1  low fat     88    30     rest
 3    2  low fat     90     1     rest
 4    2  low fat     92    15     rest
 ..  ..      ...    ...   ...      ...
 85  29   no fat    135    15  running
 86  29   no fat    130    30  running
 87  30   no fat     99     1  running
 88  30   no fat    111    15  running
 89  30   no fat    150    30  running
 
 [90 rows x 5 columns],
   variavel                                          descricao          tipo  \
 0       id        Um identificador único para cada voluntário  quantitativa   
 1     diet  O tipo de dieta que o voluntário segue, podend...   qualitativa   
 2    pulse  A pulsação do voluntário, medida em batimentos...  quantitativa   
 3     time  O tempo em minutos que o voluntário gastou rea...  quantitativa   
 4     kind  O tipo de atividade física realizada pelo volu...   qualitativa   
 
     subtipo  
 0  discr

### Transformation with one-hot encoding

In [63]:
# Column to predict; every other variable is a candidate feature.
target_column = 'diet'

# Group the feature names by the subtype recorded in the data dictionary.
nominal_columns = dictionary.loc[
    (dictionary['subtipo'] == "nominal") & (dictionary['variavel'] != target_column),
    'variavel',
].to_list()
continuous_columns = dictionary.loc[
    dictionary['subtipo'] == "contínua", 'variavel'
].to_list()

# Whatever the dictionary lists that is neither nominal, continuous nor the
# target falls into the "rest" group.
# NOTE(review): per the recorded output this group is ['id', 'time'], so the
# volunteer identifier is later scaled and used as a feature — confirm intended.
exclude_columns = [*nominal_columns, *continuous_columns, target_column]
rest_columns = dictionary.loc[
    ~dictionary['variavel'].isin(exclude_columns), 'variavel'
].to_list()

# Feature frame (all columns but the target) and target series.
X = df.drop(columns=target_column)
y = df[target_column]
X.shape, X, y, nominal_columns, continuous_columns, rest_columns

((90, 4),
     id  pulse  time     kind
 0    1     85     1     rest
 1    1     85    15     rest
 2    1     88    30     rest
 3    2     90     1     rest
 4    2     92    15     rest
 ..  ..    ...   ...      ...
 85  29    135    15  running
 86  29    130    30  running
 87  30     99     1  running
 88  30    111    15  running
 89  30    150    30  running
 
 [90 rows x 4 columns],
 0     low fat
 1     low fat
 2     low fat
 3     low fat
 4     low fat
        ...   
 85     no fat
 86     no fat
 87     no fat
 88     no fat
 89     no fat
 Name: diet, Length: 90, dtype: object,
 ['kind'],
 ['pulse'],
 ['id', 'time'])

In [43]:
# Number of distinct values in the two categorical columns: (kind, diet).
tuple(len(np.unique(df[column])) for column in ('kind', 'diet'))

(3, 2)

In [58]:
# Nominal features: fill gaps with the most frequent value, one-hot encode into
# a dense array, then standardise the resulting indicator columns.
# NOTE(review): scaling one-hot indicators is unusual — confirm it is wanted.
nominal_preprocessor = Pipeline(steps=[
    ('missing', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(sparse_output=False)),
    ('normalization', StandardScaler()),
])

# Continuous features: fill gaps with the column mean, then standardise.
continuous_preprocessor = Pipeline(steps=[
    ('missing', SimpleImputer(strategy='mean')),
    ('normalization', StandardScaler()),
])

# Remaining features: imputer with its default settings, then standardise.
rest_preprocessor = Pipeline(steps=[
    ('missing', SimpleImputer()),
    ('normalization', StandardScaler()),
])

# Route each column group through its own pipeline; any column not listed in
# one of the three groups is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('nominal', nominal_preprocessor, nominal_columns),
        ('continuous', continuous_preprocessor, continuous_columns),
        ('rest', rest_preprocessor, rest_columns),
    ],
    remainder='passthrough',
)
    

In [64]:
# Rebuild the raw feature frame from `df` rather than reading `X`, so this cell
# can be re-run safely: `X` is overwritten below with a NumPy array, and the
# ColumnTransformer selects columns by name, which requires a DataFrame input.
# NOTE(review): the preprocessor is fitted on every row before the train/test
# split further down, so test rows influence the imputation and scaling
# statistics — consider fitting on the training portion only.
X = preprocessor.fit_transform(df.drop(columns=[target_column]))
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5
0,1.414214,-0.707107,-0.707107,-0.994877,-1.675247,-1.210427
1,1.414214,-0.707107,-0.707107,-0.994877,-1.675247,-0.028149
2,1.414214,-0.707107,-0.707107,-0.791841,-1.675247,1.238577
3,1.414214,-0.707107,-0.707107,-0.656484,-1.559712,-1.210427
4,1.414214,-0.707107,-0.707107,-0.521126,-1.559712,-0.028149
...,...,...,...,...,...,...
85,-0.707107,1.414214,-0.707107,2.389059,1.559712,-0.028149
86,-0.707107,1.414214,-0.707107,2.050665,1.559712,1.238577
87,-0.707107,1.414214,-0.707107,-0.047375,1.675247,-1.210427
88,-0.707107,1.414214,-0.707107,0.764770,1.675247,-0.028149


In [68]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

In [69]:
# Dimensions of the training features and training target.
np.shape(X_train), np.shape(y_train)

((63, 6), (63,))

In [70]:
# Dimensions of the held-out features and held-out target.
np.shape(X_test), np.shape(y_test)

((27, 6), (27,))

In [74]:
# Persist the four splits as a single pickled list, in the order
# [X_train, y_train, X_test, y_test].
with open('features-exercise.pkl', 'wb') as output_file:
    pickle.dump([X_train, y_train, X_test, y_test], output_file)