In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# 1. missing values
from feature_engine.imputation import (
    MeanMedianImputer,
    ArbitraryNumberImputer,
    CategoricalImputer,
    RandomSampleImputer,
    AddMissingIndicator,
    DropMissingData,
)


In [None]:
# Load the training data; 'train.csv' is read relative to the working directory.
house_prices = pd.read_csv('train.csv')

target = 'SalePrice'

# Split the predictor columns by dtype. The target is excluded from both lists
# so it can never leak into X.
numerical_features = [
    col
    for col in house_prices.columns
    if house_prices[col].dtype in ['float64', 'int64'] and col != target
]
categorical_features = [
    col
    for col in house_prices.columns
    if house_prices[col].dtype == 'object' and col != target
]

X = house_prices[numerical_features + categorical_features]
y = house_prices[target]

# A fixed random_state makes the split (and every output that follows)
# reproducible under Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Keep only the columns of the training split that contain missing values,
# separated into numerical (float64/int64) and categorical (object) columns.

numerical_features_missing = [
    name
    for name, column in X_train.items()
    if column.isnull().any()
    and column.dtype in ['float64', 'int64']
    and name != target
]
categorical_features_missing = [
    name
    for name, column in X_train.items()
    if column.isnull().any()
    and column.dtype == 'object'
    and name != target
]

# Missing Value Imputation

### Mean and Median Imputation

In [None]:
# Median imputation for the numerical columns with missing values
# (imputation_method can be switched to "mean").
mmi = MeanMedianImputer(imputation_method='median')

# Fit on the training split only, then apply the fitted imputer to both splits.
mmi.fit(X_train[numerical_features_missing])
X_train_mmi = mmi.transform(X_train[numerical_features_missing])
X_test_mmi = mmi.transform(X_test[numerical_features_missing])

X_train_mmi.info()

### ArbitraryNumberImputer

In [None]:
# Impute the numerical columns with missing values using the arbitrary number 10000.
ani = ArbitraryNumberImputer(arbitrary_number=10000)

# Fit on the training split only, then apply the fitted imputer to both splits.
ani.fit(X_train[numerical_features_missing])
X_train_ani = ani.transform(X_train[numerical_features_missing])
X_test_ani = ani.transform(X_test[numerical_features_missing])

X_train_ani.info()

### Categorical Imputer

In [None]:
# Impute the categorical columns with missing values using the "frequent" method.
cti = CategoricalImputer(imputation_method="frequent")

# Fit on the training split only; the test split is transformed with the
# already-fitted imputer (previously X_train was transformed here by mistake,
# so X_test_cti was just a copy of the training result).
X_train_cti = cti.fit_transform(X_train[categorical_features_missing])
X_test_cti = cti.transform(X_test[categorical_features_missing])

X_train_cti.info()  # no missing values remain

In [None]:
# Random-sample imputation over both numerical and categorical columns.
# random_state is fixed so the sampled replacement values are reproducible.
rsi = RandomSampleImputer(random_state=42)

# Fit on the training split only; the test split is transformed with the
# already-fitted imputer (previously X_train was transformed here by mistake).
X_train_rsi = rsi.fit_transform(X_train[numerical_features_missing + categorical_features_missing])
X_test_rsi = rsi.transform(X_test[numerical_features_missing + categorical_features_missing])

X_train_rsi.info()

In [None]:
# Add missing-value indicator columns for the columns that contain missing values.
ami = AddMissingIndicator()

# Fit on the training split only; the test split is transformed with the
# already-fitted transformer (previously X_train was transformed here by mistake).
X_train_ami = ami.fit_transform(X_train[numerical_features_missing + categorical_features_missing])
X_test_ami = ami.transform(X_test[numerical_features_missing + categorical_features_missing])

X_train_ami.info()

# Podemos ver que ele adicionou colunas a mais no nosso dataset, com o sufixo _na; essas colunas são os indicadores de valores faltantes.

In [None]:
# Drop the rows that contain missing values in the selected columns.
dmd = DropMissingData()

# Fit on the training split only; the test split is transformed with the
# already-fitted transformer (previously X_train was transformed here by mistake).
X_train_dmd = dmd.fit_transform(X_train[numerical_features_missing + categorical_features_missing])
X_test_dmd = dmd.transform(X_test[numerical_features_missing + categorical_features_missing])

X_train_dmd.info()  # rows with missing values were removed

# Categorical Encoding