### 02 - Pré-processamento dos Dados

Neste notebook aplicou-se o pré-processamento customizado baseado na análise exploratória.

# Imports

In [None]:
import sys
import tempfile
import warnings
from pathlib import Path

sys.path.append('/home/jovyan/work')

import numpy as np
import pandas as pd
from src import S3Client, DataPreprocessor

warnings.filterwarnings('ignore')

# Carregar Dados do MinIO

In [None]:
s3 = S3Client()

# Read the train, test and validation CSVs from the bucket, in that order.
train_df, test_df, validation_df = map(
    s3.read_csv,
    (
        'processed/train.csv',
        'processed/test.csv',
        'processed/validation.csv',
    ),
)

# Report the shape of each loaded split, one per line.
print(
    f"ðŸ“Š Train: {train_df.shape}",
    f"ðŸ“Š Test: {test_df.shape}",
    f"ðŸ“Š Validation: {validation_df.shape}",
    sep="\n",
)

# Verificar Dados Brutos

# Quick look at the training split before any transformation: first rows,
# the summary printed by .info(), and the table returned by .describe().
print("\n--- Primeiras linhas (Train) ---")
display(train_df.head())

# .info() prints its report itself, so it is not wrapped in display().
print("\n--- Info ---")
train_df.info()

print("\n--- EstatÃ­sticas ---")
display(train_df.describe())

# Inicializar Preprocessador

In [None]:
# Single DataPreprocessor instance shared by the cleaning, encoding, splitting
# and scaling steps that follow (the scaling step later reads preprocessor.scaler).
preprocessor = DataPreprocessor()

# Limpeza

# Run the same cleaning routine over each split (train, then test, then
# validation) and keep the results under stage-suffixed names.
train_clean, test_clean, validation_clean = map(
    preprocessor.clean_heart_disease_data,
    (train_df, test_df, validation_df),
)

# Encoding

# Categorical features to expand into one-hot indicator columns.
categorical_cols = ["chest pain type", "resting ecg", "ST slope"]

train_encoded = preprocessor.apply_onehot_encoding(train_clean, categorical_cols)
test_encoded = preprocessor.apply_onehot_encoding(test_clean, categorical_cols)
validation_encoded = preprocessor.apply_onehot_encoding(validation_clean, categorical_cols)

# Each split is encoded independently, so a category that is absent from
# test/validation would leave that split with fewer (or differently ordered)
# indicator columns than train, and a category unseen in train would add an
# extra one. Conform both splits to the train schema: missing indicators are
# filled with 0, columns unknown to train are dropped. This is a no-op when the
# schemas already match.
# Assumes apply_onehot_encoding returns a pandas DataFrame — TODO confirm.
test_encoded = test_encoded.reindex(columns=train_encoded.columns, fill_value=0)
validation_encoded = validation_encoded.reindex(columns=train_encoded.columns, fill_value=0)

# Guard the invariant the scaling step relies on: identical columns, same order.
assert list(test_encoded.columns) == list(train_encoded.columns), "test columns differ from train"
assert list(validation_encoded.columns) == list(train_encoded.columns), "validation columns differ from train"

print(f"\nâœ… Train encoded: {train_encoded.shape}")
print(f"âœ… Test encoded: {test_encoded.shape}")
print(f"âœ… Validation encoded: {validation_encoded.shape}")

# Splitting

# Separate each encoded split into its feature part and its 'target' part,
# processing train, test and validation in that order.
(X_train, y_train), (X_test, y_test), (X_val, y_val) = (
    preprocessor.load_data(encoded_split, target_col='target')
    for encoded_split in (train_encoded, test_encoded, validation_encoded)
)

# Scaling

In [None]:
numeric_cols = [
    "age",
    "resting bp s",
    "cholesterol",
    "fasting blood sugar",
    "max heart rate",
    "oldpeak",
]
# NOTE(review): numeric_cols is not referenced anywhere in this cell; the whole
# X_train frame is handed to normalize_features, so presumably every column
# (one-hot indicators included) is scaled — confirm this is intended.


def _transform_with_fitted_scaler(features):
    """Run ``preprocessor.scaler.transform`` and re-attach column/index labels.

    The scaler is presumably the one fitted by ``normalize_features`` on the
    training features just before this helper is called.
    """
    scaled_values = preprocessor.scaler.transform(features)
    return pd.DataFrame(
        scaled_values,
        columns=features.columns,
        index=features.index,
    )


# Fit on train only, then reuse the same scaler for test and validation.
X_train_scaled, _ = preprocessor.normalize_features(X_train, method='standard')
X_test_scaled = _transform_with_fitted_scaler(X_test)
X_val_scaled = _transform_with_fitted_scaler(X_val)

print(
    "\nâœ… NormalizaÃ§Ã£o concluÃ­da!",
    f"Train: {X_train_scaled.shape}",
    f"Test: {X_test_scaled.shape}",
    f"Validation: {X_val_scaled.shape}",
    sep="\n",
)

# Salvar os dados processados no MinIO

In [None]:
# Scaled feature matrices go to the bucket through the client's CSV writer.
s3.write_csv(X_train_scaled, 'processed/X_train_scaled.csv')
s3.write_csv(X_test_scaled, 'processed/X_test_scaled.csv')
s3.write_csv(X_val_scaled, 'processed/X_val_scaled.csv')

# Save targets. They are serialised without the index and uploaded as files.
# The intermediate CSVs are staged in a temporary directory so re-running the
# notebook does not leave y_*.csv artefacts in the working directory.
# Assumes upload_file has finished reading the file when it returns — TODO confirm.
with tempfile.TemporaryDirectory() as staging_dir:
    for target, file_name, object_key in (
        (y_train, 'y_train.csv', 'processed/y_train.csv'),
        (y_test, 'y_test.csv', 'processed/y_test.csv'),
        (y_val, 'y_val.csv', 'processed/y_val.csv'),
    ):
        local_path = Path(staging_dir) / file_name
        pd.DataFrame(target).to_csv(local_path, index=False)
        s3.upload_file(str(local_path), object_key)

print("\nâœ… Dados salvos no MinIO!")