In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Read the training and test files, using the 'Id' column as the row index.
X_full = pd.read_csv('train.csv', index_col='Id')
X_full_test = pd.read_csv('test.csv', index_col='Id')

# Discard rows without a target value, then separate the target from the predictors.
X_full = X_full.dropna(subset=['SalePrice'])
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

# Keep only the columns whose dtype is not 'object'.
X = X_full.select_dtypes(exclude=['object'])
X_test = X_full_test.select_dtypes(exclude=['object'])

# Hold out 25% of the rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.75,
    test_size=0.25,
    random_state=2,
)

In [4]:
# Preview the first five training rows.
X_train.head(n=5)

Unnamed: 0_level_0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
38,20,74.0,8532,5,6,1954,1990,650.0,1213,0,...,498,0,0,0,0,0,0,0,10,2009
1371,50,90.0,5400,4,6,1920,1950,0.0,315,105,...,338,0,0,198,0,0,0,0,10,2009
419,50,60.0,8160,5,6,1940,1950,0.0,312,0,...,240,0,0,0,0,0,0,0,4,2007
612,80,,10395,6,6,1978,1978,233.0,605,0,...,564,0,0,0,0,0,0,500,7,2007
1213,30,50.0,9340,4,6,1941,1950,0.0,344,0,...,234,0,113,0,0,0,0,0,8,2009


In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_val, y_train, y_val):
    """Return the validation mean absolute error of a random forest.

    A ``RandomForestRegressor`` (100 trees, ``random_state=3``) is fitted on
    ``X_train`` / ``y_train``; its predictions for ``X_val`` are then compared
    with ``y_val`` using ``mean_absolute_error``.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=3)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_val, forest.predict(X_val))

In [6]:
# Names of the training columns that contain at least one missing value.
cols_with_missing = X_train.columns[X_train.isnull().any().to_numpy()].tolist()

# Remove those columns from both splits so they keep identical features.
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_val = X_val.drop(columns=cols_with_missing)

In [7]:
# Label and score are printed on separate lines.
print(
    'MAE (Dropping columns with missing values): ',
    score_dataset(reduced_X_train, reduced_X_val, y_train, y_val),
    sep='\n',
)

MAE (Dropping columns with missing values): 
20878.2964109589


In [9]:
from sklearn.impute import SimpleImputer

# Fit the imputer on the training split only and reuse the fitted imputer on
# the validation split, so validation rows do not influence the fill values.
my_imputer = SimpleImputer()

# SimpleImputer returns a plain array, so restore both the column names and
# the original 'Id' index; otherwise the frames get a fresh 0..n-1 index and
# no longer align with y_train / y_val by label.
# NOTE(review): assumes no column of X_train is entirely missing -- the
# imputer would drop such a column and the shapes would no longer match.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
imputed_X_val = pd.DataFrame(
    my_imputer.transform(X_val),
    index=X_val.index,
    columns=X_val.columns,
)

In [11]:
# Label and score are printed on separate lines.
print(
    'MAE (Simple Imputer): ',
    score_dataset(imputed_X_train, imputed_X_val, y_train, y_val),
    sep='\n',
)

MAE (Simple Imputer): 
21018.76832876712


In [12]:
# Continue with the column-dropping variant of the train/validation features.
final_X_train, final_X_val = reduced_X_train, reduced_X_val

In [18]:
# Train the final forest on the selected training features.
my_model = RandomForestRegressor(random_state=0, n_estimators=100)
my_model.fit(X=final_X_train, y=y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [19]:
# Predictions of the final model on the held-out validation features.
preds_val = my_model.predict(X=final_X_val)

In [20]:
# Label and validation error are printed on separate lines.
print(
    'MAE (Dropping columns with missing values):',
    mean_absolute_error(y_val, preds_val),
    sep='\n',
)

MAE (Dropping columns with missing values):
20573.4638630137


In [21]:
# Missingness is checked on X_train (not X_test) so the test features lose
# the same columns that were removed from the training features.
cols_with_missing = [
    name for name in X_test.columns if X_train[name].isna().any()
]

reduced_X_test = X_test.drop(columns=cols_with_missing)

In [28]:
# Keep every test row: dropping rows with gaps would remove those Ids from the
# predictions and leave the submission incomplete. Fill the remaining gaps
# with the per-column means of the training features instead (reduced_X_train
# has no missing values, so every column has a defined mean).
# NOTE(review): assumes reduced_X_test and reduced_X_train share the same
# column names, as they are built from the same non-object column selection.
reduced_X_test = reduced_X_test.fillna(reduced_X_train.mean())

In [30]:
# Alias for the test features that are passed to the model for the submission predictions.
final_X_test = reduced_X_test

In [31]:
# Predictions of the final model on the prepared test features.
preds_test = my_model.predict(X=final_X_test)

In [32]:
# Build the submission table: one row per test Id with its predicted price.
# The prediction column is named 'SalePrice' to match the target column read
# from train.csv (the previous 'SalesPrice' spelling did not match it).
output = pd.DataFrame({'Id': final_X_test.index,
                       'SalePrice': preds_test})
# index=False: the Id is already a regular column, so skip the row index.
output.to_csv('Missing Values - Submission.csv', index=False)