## 1. Importar Bibliotecas

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
from pathlib import Path

from src.data.preprocessing import DataPreprocessor
from src.features.feature_engineering import FeatureEngineer

## 2. Carregar Dados

In [None]:
# Load the original (raw) dataset into a DataFrame.
# NOTE(review): 'seu_dataset.csv' looks like a placeholder file name — point
# data_path at the real CSV before running. The path is relative, so it is
# resolved against the current working directory.
data_path = Path('../data/raw/seu_dataset.csv')
df = pd.read_csv(data_path)

# Report the dimensions, then leave df.head() as the cell's last expression
# so the first rows are shown as the cell output.
print(f"Dataset shape: {df.shape}")
df.head()

## 3. Tratamento de Dados

### 3.1 Valores Faltantes

In [None]:
# Shared preprocessing helper; the encoding, splitting and scaling cells
# below reuse this same instance.
preprocessor = DataPreprocessor()

# Handle missing values, asking the helper for its 'mean' strategy.
df_clean = preprocessor.handle_missing_values(df, strategy='mean')

# Count the missing entries that remain across the whole frame
# (per-column counts summed into a single total).
print("Valores faltantes após tratamento:")
print(df_clean.isna().sum().sum())

### 3.2 Codificação de Variáveis Categóricas

In [None]:
# Columns with object dtype are the ones treated as categorical.
categorical_cols = df_clean.select_dtypes(include=['object']).columns.tolist()
print(f"Colunas categóricas: {categorical_cols}")

# Encode only when there is something to encode; otherwise carry the cleaned
# frame forward as an independent copy.
df_encoded = (
    preprocessor.encode_categorical(df_clean, categorical_cols)
    if categorical_cols
    else df_clean.copy()
)

## 4. Engenharia de Features

### 4.1 Features Polinomiais

In [None]:
# Feature-engineering helper; the feature-importance cell further down
# calls this same instance.
engineer = FeatureEngineer()

# TODO: choose the columns to expand into polynomial features, then uncomment:
# poly_cols = ['col1', 'col2']
# df_features = engineer.create_polynomial_features(df_encoded, poly_cols, degree=2)

# While the template above stays commented out, df_features is just a copy of
# df_encoded, so the shape printed below equals the encoded frame's shape.
df_features = df_encoded.copy()
print(f"Shape após features polinomiais: {df_features.shape}")

### 4.2 Features de Interação

In [None]:
# TODO: build interaction features from pairs of columns, then uncomment:
# interaction_pairs = [('col1', 'col2'), ('col3', 'col4')]
# df_features = engineer.create_interaction_features(df_features, interaction_pairs)

# While the template above stays commented out, this cell does not modify
# df_features; it only reports its current shape.
print(f"Shape após features de interação: {df_features.shape}")

## 5. Separar Features e Target

In [None]:
# TODO: set this to the name of the target (label) column of your dataset.
target_col = 'target'

# Fail early with an actionable message: 'target' is only a placeholder, and
# the KeyError pandas raises on its own does not list the available columns.
if target_col not in df_features.columns:
    raise KeyError(
        f"Coluna target '{target_col}' não encontrada. "
        f"Colunas disponíveis: {df_features.columns.tolist()}"
    )

# X keeps every column except the target; y is the target column itself.
X = df_features.drop(columns=[target_col])
y = df_features[target_col]

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

## 6. Seleção de Features

In [None]:
# Compute a feature-importance table for X with respect to y and print its
# first 15 rows.
# NOTE(review): the scoring method and the row order come from
# FeatureEngineer.get_feature_importance, which is not shown here; the
# "Top 15" label assumes the result is sorted most-important first — confirm.
# NOTE(review): this runs on the full X/y, before the train/test split below,
# so any selection based on it has seen the test rows — confirm this is intended.
importance = engineer.get_feature_importance(X, y)
print("\nTop 15 features mais importantes:")
print(importance.head(15))

In [None]:
# Optional: keep only the top-k features. Uncomment the three lines below to enable.
# k = 20
# X_selected = engineer.select_features(X, y, k=k, method='f_classif')
# print(f"Features selecionadas: {engineer.selected_features}")

# Default: no selection — every column of X is carried forward.
# NOTE: if the template above is enabled, remove this assignment; otherwise it
# overwrites the selected features with the full X again.
X_selected = X.copy()

## 7. Divisão Treino/Teste e Normalização dos Dados

In [None]:
# Split features and target into train and test parts, passing test_size=0.2
# to the project's split helper.
# NOTE(review): shuffling, stratification and seeding are decided inside
# DataPreprocessor.split_data (not visible here) — confirm the split is
# reproducible between runs.
X_train, X_test, y_train, y_test = preprocessor.split_data(X_selected, y, test_size=0.2)

print(f"Train set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

In [None]:
# Scale the features: the training set is passed with fit=True, the test set
# with fit=False. Keep this order — the training call must come first.
# NOTE(review): presumably fit=False reuses the scaler fitted on X_train, so no
# test-set statistics enter the transform — confirm in
# DataPreprocessor.scale_features.
X_train_scaled = preprocessor.scale_features(X_train, fit=True)
X_test_scaled = preprocessor.scale_features(X_test, fit=False)

print("Dados normalizados com sucesso!")

## 8. Salvar Dados Processados

In [None]:
# Directory that receives the processed arrays and the feature-name list.
processed_dir = Path('../data/processed')
# parents=True also creates a missing '../data' directory. Without it, mkdir
# raises FileNotFoundError whenever the parent does not exist yet (e.g. when
# the raw data was loaded from a different location).
processed_dir.mkdir(parents=True, exist_ok=True)

# Persist the scaled feature matrices and the targets as .npy files.
# NOTE(review): if y holds non-numeric labels, np.save writes a pickled object
# array, which np.load only reads back with allow_pickle=True — confirm the
# target dtype.
np.save(processed_dir / 'X_train.npy', X_train_scaled)
np.save(processed_dir / 'X_test.npy', X_test_scaled)
np.save(processed_dir / 'y_train.npy', y_train)
np.save(processed_dir / 'y_test.npy', y_test)

# Save the feature names separately, one name per line with no header or
# index column, so the saved arrays can be mapped back to column names.
feature_names = X_selected.columns.tolist()
pd.Series(feature_names).to_csv(processed_dir / 'feature_names.csv', index=False, header=False)

print("Dados processados salvos com sucesso!")

## 9. Resumo

TODO: Adicionar resumo:
- Features criadas
- Transformações aplicadas
- Shape final dos dados
- Próximos passos