In [21]:
import pandas as pd 
from sklearn.model_selection import train_test_split
# Read the data
X_full = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')
# Remove rows with missing target, separate target from predictors.
# Each step is a reassignment rather than an inplace mutation, so the
# lineage of X_full is explicit; the end state of X_full is unchanged.
X_full = X_full.dropna(axis=0, subset=['SalePrice'])
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])
# To keep things simple, we'll use only numerical predictors
X = X_full.select_dtypes(exclude=['object'])
# Pick the test predictors by the training column names so both frames carry
# exactly the same features in the same order. A KeyError here means test.csv
# lacks a column the model is trained on.
X_test = X_test_full[X.columns]
# Break off validation set from training data (80/20 split, fixed seed)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [27]:
# Training-set dimensions as (num_rows, num_columns)
print(X_train.shape)
# Per-column count of missing entries; show only the columns that have any
missing = X_train.isna().sum()
print(missing.loc[missing.gt(0)])

(1168, 36)
LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64


In [58]:
# Function for comparing different approaches 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_absolute_error 
def dataset_score(X_train, X_valid, y_train, y_valid):
    """Score one preprocessing approach by validation mean absolute error.

    Fits a 10-tree random forest (``random_state=0``) on the training split
    and evaluates its predictions on the validation split.

    Parameters
    ----------
    X_train, X_valid
        Feature tables for the training and validation splits.
    y_train, y_valid
        Targets matching ``X_train`` and ``X_valid`` respectively.

    Returns
    -------
    The value returned by ``mean_absolute_error`` for the validation split.
    """
    rf = RandomForestRegressor(n_estimators=10, random_state=0)
    rf.fit(X_train, y_train)
    preds = rf.predict(X_valid)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    return mean_absolute_error(y_valid, preds)

### Drop Columns with Missing Values
The simplest option is to drop columns with missing values.

Unless most values in the dropped columns are missing, the model loses access to a lot of (potentially useful!) information with this approach. 

In [60]:
# Names of the training columns that contain at least one missing value
missing_cols = X_train.columns[X_train.isnull().any()].tolist()
# Drop those columns from both the training and the validation data
reduced_X_train = X_train.drop(columns=missing_cols)
reduced_X_valid = X_valid.drop(columns=missing_cols)


In [62]:
# Validation MAE for the drop-columns approach
print(
    dataset_score(
        reduced_X_train,
        reduced_X_valid,
        y_train,
        y_valid,
    )
)

18866.728767123288


### Imputation
Imputation fills in the <span style="background-color: #FFFF00">missing values with some number</span>. For instance, we can fill in the mean value along each column.

The imputed value <span style="background-color: #FFFF00">won't be exactly right in most cases</span>, but it usually <span style="background-color: #FFFF00">leads to more accurate models</span> than you would get from dropping the column entirely.

In [70]:
# Fill in the lines below: imputation
from sklearn.impute import SimpleImputer 
# Learn the fill values on the training split only, then apply them to both
# splits (SimpleImputer's default strategy fills with the column mean).
si = SimpleImputer()
# Imputation removes the column names and the row index, so pass both back
# in when rebuilding the frames. Keeping the index keeps every row aligned
# with its label in y_train / y_valid.
imputed_X_train = pd.DataFrame(
    si.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
imputed_X_valid = pd.DataFrame(
    si.transform(X_valid), columns=X_valid.columns, index=X_valid.index
)

### An Extension To Imputation 
Imputation is the standard approach, and it usually works well. However, imputed values may be <span style="background-color: #FFFF00">systematically above or below their actual values</span> (which weren't collected in the dataset). Or rows with missing values may be unique in some other way. In that case, your model would make better predictions <span style="background-color: #FFFF00">by considering which values were originally missing</span>.

In this approach, we impute the missing values, as before. And, additionally, <span style="background-color: #FFFF00">for each column with missing entries in the original dataset, we add a new column that shows the location of the imputed entries</span>.

In some cases, this will meaningfully improve results. In other cases, it doesn't help at all.

In [72]:
# Final preprocessing: mean-impute both splits, fitting on the training split only.
# The rebuilt frames get default integer column labels and a default row index.
impute = SimpleImputer(strategy='mean')
final_X_train = pd.DataFrame(data=impute.fit_transform(X_train))
final_X_valid = pd.DataFrame(data=impute.transform(X_valid))

In [76]:
# Define and fit model.
# The name `random` is kept because the test-prediction cell calls random.predict.
random = RandomForestRegressor(n_estimators=100, random_state=0)
random.fit(final_X_train, y_train)
# Get validation predictions and MAE
preds = random.predict(final_X_valid)
# scikit-learn metrics take (y_true, y_pred), in that order.
print(mean_absolute_error(y_valid, preds))

18062.894611872147


In [82]:
# Preprocess the test data with the imputer already fitted on the training split
final_X_test = pd.DataFrame(data=impute.transform(X_test))
# Get test predictions from the fitted model
preds_test = random.predict(final_X_test)

In [86]:
# Pair each test-row identifier with its predicted sale price.
# NOTE(review): the CSV index column is named 'Id' while this header is 'ID';
# confirm which spelling the consumer of this table expects.
output = pd.DataFrame(
    {
        'ID': X_test.index,
        'SalePrice': preds_test,
    }
)
output

Unnamed: 0,ID,SalePrice
0,1461,125245.50
1,1462,155237.00
2,1463,180755.22
3,1464,184071.50
4,1465,197144.40
...,...,...
1454,2915,87277.12
1455,2916,87025.50
1456,2917,154283.87
1457,2918,107723.50
