# Intermediate Machine Learning - Missing Values

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [8]:
# Load the Melbourne housing snapshot and split off the prediction target (Price)
X = pd.read_csv('melbourne_housing_snapshot/melb_data.csv')
y = X['Price'].copy()

# Keep only the non-object predictors: remove the target column, then
# exclude every object-dtype column in a single chained expression
X = X.drop(columns=['Price']).select_dtypes(exclude=['object'])

print(X.shape, y.shape)

(13580, 12) (13580,)


In [9]:
# Per-column count of missing entries, to see which features need handling
X.isnull().sum()

Rooms               0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
Lattitude           0
Longtitude          0
Propertycount       0
dtype: int64

In [10]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split identical from run to run
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

print(f"train: X-> {X_train.shape}, y-> {y_train.shape}")
print(f"valid: X-> {X_valid.shape}, y-> {y_valid.shape}")

train: X-> (10864, 12), y-> (10864,)
valid: X-> (2716, 12), y-> (2716,)


In [11]:
def score_dataset(X_train, X_valid, y_train, y_valid, *, n_estimators=10, random_state=0):
    """Fit a random forest on the training split and score it on the validation split.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_valid, y_valid
        Features to predict on and the true targets to compare against.
    n_estimators : int, default 10
        Number of trees passed to ``RandomForestRegressor``.
    random_state : default 0
        Seed passed to ``RandomForestRegressor`` so repeated calls with the
        same data give the same score.

    Returns
    -------
    The result of ``mean_absolute_error(y_valid, preds)``; lower is better.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

## Three Approaches

1. **Drop columns with Missing Values:** model loses access to a lot of information
2. **Imputation:** fills in the missing values with some number (e.g. the column mean) - usually leads to more accurate models than dropping the column
3. **Extension to Imputation:** imputes the missing values as before, and also adds a new column for each imputed feature indicating which entries were originally missing

## 1 - Dropping columns

In [12]:
# Choose the columns to drop from the training split only, then remove the same
# columns from both splits. Calling dropna(axis=1) on each split separately
# could leave train and validation with different feature sets.
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]
first_X_train = X_train.drop(columns=cols_with_missing)
first_X_valid = X_valid.drop(columns=cols_with_missing)

print("MAE from Approach 1 (Drop columns with missing values):")
print(score_dataset(first_X_train, first_X_valid, y_train, y_valid))

MAE from Approach 1 (Drop columns with missing values):
183550.22137772635


## 2 - Imputation

In [13]:
from sklearn.impute import SimpleImputer

# Imputation: fit the imputer on the training split only, then apply the
# fitted imputer unchanged to the validation split
my_imputer = SimpleImputer()

# The imputer returns plain arrays, so put the column names AND the original
# row index back; this keeps the frames aligned with y_train / y_valid
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
imputed_X_valid = pd.DataFrame(
    my_imputer.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index,
)

print("MAE from Approach 2 (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

MAE from Approach 2 (Imputation):
178166.46269899711


## 3 - Extended imputation

In [15]:
# Work on copies so the original splits stay untouched
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Make new columns indicating what will be imputed. The column list comes from
# the training split only, so both splits receive the same indicator columns.
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]
for col in cols_with_missing:
    X_train_plus[f"{col}_was_missing"] = X_train_plus[col].isnull()
    X_valid_plus[f"{col}_was_missing"] = X_valid_plus[col].isnull()

# Imputation: fit on the training split, reuse the fitted imputer for validation
my_imputer = SimpleImputer()

# The imputer returns plain arrays, so put the column names AND the original
# row index back; this keeps the frames aligned with y_train / y_valid
imputed_X_train_plus = pd.DataFrame(
    my_imputer.fit_transform(X_train_plus),
    columns=X_train_plus.columns,
    index=X_train_plus.index,
)
imputed_X_valid_plus = pd.DataFrame(
    my_imputer.transform(X_valid_plus),
    columns=X_valid_plus.columns,
    index=X_valid_plus.index,
)

print("MAE from Approach 3 (An Extension to Imputation):")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))

MAE from Approach 3 (An Extension to Imputation):
178927.503183954


Win: Approach 2!

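**Explanation:** The training data has 10864 rows and 12 columns, where three columns contain missing data. For each of those columns, fewer than half of the entries are missing. Thus, dropping the columns removes a lot of useful information, and so it makes sense that imputation would perform better. The extra "was missing" indicator columns of Approach 3 did not help on this dataset: its MAE is slightly higher than that of plain imputation.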