In [1]:
from importlib import reload

In [2]:
import numpy as np
import pandas as pd

### Helper functions

In [3]:
from sklearn.impute import SimpleImputer


def get_cols_w_missing(X):
    """Return the labels of the columns of ``X`` that hold at least one missing value."""
    # Column-wise reduction: True for every column with any NaN in it.
    has_missing = X.isna().any(axis=0)
    return X.columns[has_missing]


def create_is_missing_cols(X, cols_w_missing):
    """Return a copy of ``X`` with one boolean missing-value flag per listed column.

    For every ``col`` in ``cols_w_missing`` a column named
    ``col + 'is_missing'`` is added; it is ``True`` where ``X[col]`` is
    missing. The frame passed in is left untouched.
    """
    flagged = X.copy()

    for name in cols_w_missing:
        flag_name = name + 'is_missing'
        flagged[flag_name] = flagged[name].isna()

    return flagged


def impute_missing(X_train, X_test):
    """Fill missing values in both frames using statistics learned from ``X_train``.

    A ``SimpleImputer`` with default settings is fit on ``X_train`` only and
    then applied to both inputs, so nothing is learned from ``X_test``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame the imputer is fit on; its column labels are reused for both
        outputs.
    X_test : pandas.DataFrame
        Frame transformed with the fitted imputer. It is passed to
        ``transform`` unchanged, so it must have the same column layout as
        ``X_train``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_test)`` with missing values filled, carrying
        ``X_train``'s column labels and each input's own row index.
    """
    features = X_train.columns
    imp = SimpleImputer().fit(X_train)

    # The transformed values are wrapped back into DataFrames. Passing each
    # input's own index (instead of letting pandas assign a fresh 0..n-1 one)
    # keeps the rows label-aligned with their targets, e.g. after a shuffled
    # train/test split. Inputs without an ``index`` attribute fall back to the
    # default index.
    X_train_filled = pd.DataFrame(imp.transform(X_train),
                                  columns=features,
                                  index=getattr(X_train, 'index', None))
    X_test_filled = pd.DataFrame(imp.transform(X_test),
                                 columns=features,
                                 index=getattr(X_test, 'index', None))

    return X_train_filled, X_test_filled

## EDA on Training Data

In [4]:
# Load the labelled training data; 'SalePrice' is split off as the target further down.
train_df = pd.read_csv('train.csv')

In [5]:
# print(train_df.shape)

In [6]:
# print(train_df.head())

In [7]:
# print(train_df.iloc[0])

## Try Building a Random Forest with All Features (numeric / non-null)

### Split data into X and y

In [8]:
# Separate the prediction target from the feature columns.
y = train_df['SalePrice']
X = train_df.drop(columns=['SalePrice'])

### Add boolean indicator columns for missing values

In [9]:
# Record which feature columns contain NaNs, then add one boolean
# '<col>is_missing' flag per such column. cols_w_missing is kept around
# because the same list is applied to test.csv later in the notebook.
cols_w_missing = get_cols_w_missing(X)
X = create_is_missing_cols(X, cols_w_missing)

### Add in categorical features (using one-hot)

In [10]:
# One-hot encode the categorical columns; the encoded frame replaces X.
X = pd.get_dummies(X)

### Split data into train and test sets

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1337,
)

### Impute missing values (using sklearn's SimpleImputer)

In [12]:
# The imputer is fit on X_train only and then applied to both splits.
X_train, X_test = impute_missing(X_train, X_test)

### Train random forest estimator

In [13]:
from sklearn.ensemble import RandomForestRegressor

In [14]:
# 500-tree forest with a fixed seed. The split criterion is left at its
# default (squared error, which is what 'mse' named): the 'mse' spelling was
# deprecated in scikit-learn 1.0 and removed in 1.2, so omitting it keeps this
# cell runnable on both old and new releases without changing the model.
rf_model = RandomForestRegressor(n_estimators=500, random_state=1337)
rf_model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
           oob_score=False, random_state=1337, verbose=0, warm_start=False)

## Make predictions on the test set and evaluate MAE

In [15]:
from sklearn.metrics import mean_absolute_error

In [16]:
# Score the hold-out split: mean absolute error between y_test and the
# forest's predictions for X_test.
y_pred = rf_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)

In [17]:
# Show the hold-out MAE computed in the previous cell.
print(mae)

16352.048136986303


## Find best value for max leaf nodes

In [18]:
def get_mae(X_train, y_train, X_test, y_test, mln):
    """Fit a 500-tree random forest capped at ``mln`` leaf nodes and score it.

    Parameters
    ----------
    X_train, y_train
        Features and targets the forest is fit on.
    X_test, y_test
        Features and targets used for scoring.
    mln
        Value passed as ``max_leaf_nodes`` to ``RandomForestRegressor``.

    Returns
    -------
    The mean absolute error between ``y_test`` and the forest's predictions
    for ``X_test``.
    """
    # The criterion is left at its default (squared error). Spelling it as
    # 'mse' was deprecated in scikit-learn 1.0 and removed in 1.2, so omitting
    # the argument keeps the same criterion while working on old and new
    # releases alike.
    rf = RandomForestRegressor(n_estimators=500, max_leaf_nodes=mln, random_state=1337)
    rf.fit(X_train, y_train)
    y_hat = rf.predict(X_test)

    return mean_absolute_error(y_test, y_hat)

In [19]:
# candidate_max_leaf_nodes = [10, 30, 100, 300, 1000, 3000, 10000]

# min_mae = {'mln': np.inf, 'mae': np.inf}

# for mln in candidate_max_leaf_nodes:
#     sample_mae = get_mae(X_train, y_train, X_test, y_test, mln)
    
#     print(sample_mae)
#     if sample_mae < min_mae['mae']:
#         min_mae['mln'], min_mae['mae'] = mln, sample_mae
        
# best_mln = min_mae['mln']
# print(best_mln)

## Perform predictions on test set

### Create the RandomForest

In [20]:
# Unfitted forest for the final submission: 500 trees, fixed seed, no
# leaf-node cap. It is fit on the full training features further down.
final_rf = RandomForestRegressor(n_estimators=500, random_state=1337)

### Align the training data with the test data

In [21]:
# Preview the engineered training features before aligning them with the test file.
X.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0


In [22]:
# Run the test file through the same preprocessing as the training features:
# missing-value flags for the columns recorded in cols_w_missing, then one-hot
# encoding.
X_f = (
    pd.read_csv('test.csv')
    .pipe(create_is_missing_cols, cols_w_missing)
    .pipe(pd.get_dummies)
)

In [23]:
# Preview the test features after the same preprocessing steps.
X_f.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,0,0,0,1,0,0,0,0,1,0
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,0,0,0,1,0,0,0,0,1,0
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,0,0,0,1,0,0,0,0,1,0
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,0,0,0,1,0,0,0,0,1,0
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,0,0,0,1,0,0,0,0,1,0


In [24]:
# Reindex the test features to the training column layout: join='left' on
# axis=1 keeps exactly X's columns, dropping test-only columns and adding
# train-only ones to X_f. fill_value=0 makes each newly added column all zeros
# (an indicator that never occurs in the test file) instead of NaN, which the
# imputation step would otherwise replace with the training mean.
X_t, X_f = X.align(X_f, join='left', axis=1, fill_value=0)

In [25]:
# Preview the aligned training features.
X_t.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0


In [26]:
# Preview the aligned test features.
X_f.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1461,20,80.0,11622,5,6,1961,1961,0.0,468.0,...,0,0,0,1,0,0,0,0,1,0
1,1462,20,81.0,14267,6,6,1958,1958,108.0,923.0,...,0,0,0,1,0,0,0,0,1,0
2,1463,60,74.0,13830,5,5,1997,1998,0.0,791.0,...,0,0,0,1,0,0,0,0,1,0
3,1464,60,78.0,9978,6,6,1998,1998,20.0,602.0,...,0,0,0,1,0,0,0,0,1,0
4,1465,120,43.0,5005,8,5,1992,1992,0.0,263.0,...,0,0,0,1,0,0,0,0,1,0


### Impute values in X_t and X_f

In [27]:
# Fit the imputer on the full training features (X_t) and apply it to both frames.
X_t, X_f = impute_missing(X_t, X_f)

### Train and run RF

In [28]:
# Fit on every labelled row (features X_t, target y), then predict for the test file.
final_rf.fit(X_t, y)
y_hat_final = final_rf.predict(X_f)

In [29]:
# Peek at the sample submission to see the expected layout (Id, SalePrice).
pd.read_csv('sample_submission.csv').head()

Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


In [37]:
# Assemble the submission in the Id / SalePrice layout and write it without the
# DataFrame index column.
# NOTE(review): the astype(int) cast presumably undoes a float conversion of Id
# introduced by the imputation step — confirm.
out = pd.DataFrame({'Id': X_f.Id.astype(int), 'SalePrice': y_hat_final})
out.to_csv('submission.csv', index=False)

In [38]:
# Preview the submission frame that was written to disk.
out.head()

Unnamed: 0,Id,SalePrice
0,1461,127845.6
1,1462,155010.038
2,1463,180774.132
3,1464,181204.7
4,1465,199088.488
