### Import libraries

In [41]:
from IPython.display import display, Markdown
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

### Loading data

In [29]:
def loading_data(path: str = '../../data/processed/exercise.csv') -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file to read. Defaults to the processed
        exercise dataset, so calling ``loading_data()`` with no arguments
        behaves exactly as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(path)

# Data dictionary: one row per variable of the dataset (name, description,
# type and subtype), used further down to pick columns by subtype.
dictionary = pd.read_csv('../../data/external/dictionary.csv')

# The processed dataset itself.
df = loading_data()

# Show both frames together as the cell output.
(df, dictionary)

(    id     diet  pulse  time     kind
 0    1  low fat     85     1     rest
 1    1  low fat     85    15     rest
 2    1  low fat     88    30     rest
 3    2  low fat     90     1     rest
 4    2  low fat     92    15     rest
 ..  ..      ...    ...   ...      ...
 85  29   no fat    135    15  running
 86  29   no fat    130    30  running
 87  30   no fat     99     1  running
 88  30   no fat    111    15  running
 89  30   no fat    150    30  running
 
 [90 rows x 5 columns],
   variavel                                          descricao          tipo  \
 0       id        Um identificador único para cada voluntário  quantitativa   
 1     diet  O tipo de dieta que o voluntário segue, podend...   qualitativa   
 2    pulse  A taxa metabólica basal do voluntário, medida ...  quantitativa   
 3     time  O tempo em minutos que o voluntário gastou rea...  quantitativa   
 4     kind  O tipo de atividade física realizada pelo volu...   qualitativa   
 
     subtipo  
 0  discr

### Transformation with one-hot and dummy encoding

In [30]:
# Column holding the label; it is excluded from the feature lists below.
target_column = 'kind'

# Feature names are selected by the subtype recorded in the data dictionary.
# NOTE(review): only the "nominal" and "continua" subtypes are selected here;
# a variable with any other subtype ends up in neither list — confirm that
# this is intended.
nominal_rows = dictionary.query('subtipo == "nominal" and variavel != @target_column')
nominal_columns = nominal_rows.variavel.to_list()

continuous_rows = dictionary.query('subtipo == "continua"')
continuous_columns = continuous_rows.variavel.to_list()

# Split into feature frame and label series. `columns=` already names the
# axis, so no separate `axis` argument is needed.
y = df[target_column]
X = df.drop(columns=[target_column])
y

0        rest
1        rest
2        rest
3        rest
4        rest
       ...   
85    running
86    running
87    running
88    running
89    running
Name: kind, Length: 90, dtype: object

In [33]:
# Nominal features: fill missing values with the most frequent category,
# expand each category into its own indicator column (returned as a dense
# array), then standardize the indicator columns.
nominal_preprocessor = Pipeline(
    steps=[
        ('missing', SimpleImputer(strategy='most_frequent')),
        ('encoding', OneHotEncoder(sparse_output=False)),
        ('normalization', StandardScaler()),
    ]
)

# Continuous features: fill missing values with the column mean, then
# standardize.
continuous_preprocessor = Pipeline(
    steps=[
        ('missing', SimpleImputer(strategy='mean')),
        ('normalization', StandardScaler()),
    ]
)

# Route each group of columns through its own pipeline.
# NOTE(review): no `remainder` is given, so columns that appear in neither
# list are presumably left out of the result — confirm against the
# ColumnTransformer documentation and the intended feature set.
preprocessor = ColumnTransformer(
    transformers=[
        ('nominal', nominal_preprocessor, nominal_columns),
        ('continuous', continuous_preprocessor, continuous_columns),
    ]
)
    

In [38]:
# Fit the preprocessing steps on the features and transform them in one call.
X_transformed = preprocessor.fit_transform(X)

# Show the transformed matrix shape next to the raw frame's shape so the
# change in column count is visible.
(X_transformed.shape, df.shape)

((90, 2), (90, 5))

### Transformation with cyclical encoding

In [43]:
# Share of rows per target class: counts divided by the total number of rows.
y.value_counts().div(y.size)

rest       0.333333
walking    0.333333
running    0.333333
Name: kind, dtype: float64