# How to handle missing values in the training data

### Load the data and split it into training and validation sets

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the train and test CSVs, using the 'Id' column as the row index.
X_full = pd.read_csv('./Datasets/train.csv', index_col='Id')
X_test_full = pd.read_csv('./Datasets/test.csv', index_col='Id')

# Rows whose target ('SalePrice') is missing are discarded first; the target
# is then pulled out into `y` and removed from the predictor frame.
X_full = X_full.dropna(subset=['SalePrice'])
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

# Keep every column that is not of object dtype, so that (to keep things
# simple) only numerical predictors are used.
X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

### Analyse missing data

In [5]:
# Number of missing entries in each column of the training split.
missing_val_count_by_column = X_train.isna().sum()
# Show only the columns that actually contain missing values.
print(missing_val_count_by_column.loc[lambda counts: counts > 0])

LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64


## Imputation

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=100, random_state=0):
    """Fit a random forest on the training data and return its validation MAE.

    Parameters
    ----------
    X_train, y_train
        Predictors and target used to fit the model.
    X_valid, y_valid
        Predictors and target used to evaluate the fitted model.
    n_estimators : int, default 100
        Number of trees, passed through to ``RandomForestRegressor``.
    random_state : int, default 0
        Seed passed through to ``RandomForestRegressor`` so repeated calls
        with the same data give the same score.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the model's predictions
    for ``X_valid`` (lower is better).
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)


### Mean Imputer

In [14]:
from sklearn.impute import SimpleImputer

# Learn the per-column means from the training split only, then apply those
# same means to the validation split.
mean_imputer = SimpleImputer(strategy='mean')

# By default the imputer hands back a plain array, so the DataFrame is rebuilt
# with the original column names AND the original row index; without the
# index the rows would be relabelled 0..n-1 and would no longer line up by
# label with y_train / y_valid.
imputed_X_train = pd.DataFrame(
    mean_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
imputed_X_valid = pd.DataFrame(
    mean_imputer.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index,
)

# NOTE(review): despite the "Training Score" label, score_dataset returns the
# mean absolute error measured on the validation split.
print("Training Score: ", score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

Training Score:  18062.894611872147


### Median Imputer

In [15]:
# Learn the per-column medians from the training split only, then apply those
# same medians to the validation split.
median_imputer = SimpleImputer(strategy='median')

# By default the imputer hands back a plain array, so the DataFrame is rebuilt
# with the original column names AND the original row index; without the
# index the rows would be relabelled 0..n-1 and would no longer line up by
# label with y_train / y_valid.
imputed_X_train = pd.DataFrame(
    median_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
imputed_X_valid = pd.DataFrame(
    median_imputer.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index,
)

# Validation mean absolute error for median imputation.
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

17791.59899543379


### Most Frequent Imputer

In [16]:
# Learn the most frequent value of each column from the training split only,
# then apply those same values to the validation split.
most_frequent_imputer = SimpleImputer(strategy='most_frequent')

# By default the imputer hands back a plain array, so the DataFrame is rebuilt
# with the original column names AND the original row index; without the
# index the rows would be relabelled 0..n-1 and would no longer line up by
# label with y_train / y_valid.
imputed_X_train = pd.DataFrame(
    most_frequent_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
imputed_X_valid = pd.DataFrame(
    most_frequent_imputer.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index,
)

# Validation mean absolute error for most-frequent imputation.
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

17956.065479452056
