In [None]:
import numpy as np
import pandas as pd

import cleaning as cl

## Try Building a Random Forest with All Features (numeric / non-null)

### Read in data

In [None]:
# train.csv carries the SalePrice target column; test.csv is the set to predict on.
train_df, X_eval = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

### Split training data into X and y

In [None]:
# Target first, then every remaining column as the feature frame.
y = train_df['SalePrice']
X = train_df.drop(columns=['SalePrice'])

### Get clean data to feed into model

In [None]:
# cl.prepare_data returns seven values: replacements for X, y and X_eval, plus
# X_t / X_v / y_t / y_v — presumably a train / validation split of the prepared
# training data; confirm in the cleaning module.
# NOTE(review): X, y and X_eval are rebound here, so re-running this cell on its
# own feeds the already-prepared data back into prepare_data. Re-run from the
# "Read in data" cell instead.
X, y, X_eval, X_t, X_v, y_t, y_v = cl.prepare_data(X, y, X_eval)

### Train random forest estimator

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Baseline forest: 100 trees, fixed seed for reproducibility.
# The split criterion is left at its default (squared error). Spelling it as
# criterion='mse' was deprecated in scikit-learn 1.0 and removed in 1.2, where
# it raises at fit time; omitting it behaves the same on old and new versions.
rf_model = RandomForestRegressor(n_estimators=100, random_state=1337)
rf_model.fit(X_t, y_t)

## Make predictions on the validation set and evaluate MAE

In [None]:
from sklearn.metrics import mean_absolute_error

In [None]:
# Score the baseline forest on the held-out (X_v, y_v) pair.
y_pred = rf_model.predict(X_v)
mae = mean_absolute_error(y_true=y_v, y_pred=y_pred)

In [None]:
# MAE of the 100-tree baseline forest on (X_v, y_v).
print(mae)

## Find Best `n_estimators`

1. Make a list of candidate `n_estimators` values.
2. For each candidate, get its MAE on the validation set.
3. Pick the candidate with lowest MAE, and make a new list of candidates around this value.
4. Repeat steps 2 - 3.

In [None]:
# Initial search interval for n_estimators; passed to find_best_n_est_iters below.
lower_bound = 1
upper_bound = 1000

def make_mult_range(lower_bound, upper_bound, n):
    """Return integer candidates spaced geometrically between two bounds.

    The interval ``[lower_bound, upper_bound]`` is divided into ``n`` equal
    steps on a log scale (``n + 1`` points, both endpoints included). Each
    point is rounded to the nearest integer and clamped to at least 1, and
    repeated values are dropped, so the result can hold fewer than ``n + 1``
    entries when the bounds are close together.

    Parameters
    ----------
    lower_bound, upper_bound : float
        Positive bounds with ``upper_bound > lower_bound``.
    n : int
        Number of multiplicative steps (at least 1).

    Returns
    -------
    list of int
        Strictly increasing candidate values, each >= 1.

    Raises
    ------
    AssertionError
        If ``upper_bound`` is not greater than ``lower_bound``.
    ValueError
        If ``lower_bound`` is not positive or ``n`` is smaller than 1.
    """
    assert upper_bound > lower_bound, 'upper_bound must exceed lower_bound'
    if lower_bound <= 0:
        raise ValueError('lower_bound must be positive')
    if n < 1:
        raise ValueError('n must be at least 1')

    # geomspace returns the endpoints exactly, unlike exp(log(x)) round-trips.
    points = np.geomspace(lower_bound, upper_bound, num=n + 1)

    # Round to nearest instead of truncating (truncation turned 999.999... into
    # 999 and 1.995 into 1), and never emit 0: a forest needs at least one tree.
    rounded = (max(1, int(round(float(p)))) for p in points)

    # dict.fromkeys drops repeats while keeping the ascending order, so callers
    # do not fit the same model twice.
    return list(dict.fromkeys(rounded))

In [None]:
def find_best_n_est(X_t, y_t, X_v, y_v, l_bound, u_bound, n):
    """Pick the ``n_estimators`` candidate with the lowest MAE on ``(X_v, y_v)``.

    Candidates come from ``make_mult_range(l_bound, u_bound, n)``. For each
    one, a ``RandomForestRegressor`` with ``random_state=1337`` is fitted on
    ``(X_t, y_t)`` and scored with mean absolute error on ``(X_v, y_v)``.
    The candidate list and every ``candidate: mae`` pair are printed as
    progress output.

    Returns
    -------
    int
        The candidate with the smallest MAE; ties go to the earliest candidate.
    """
    candidates = make_mult_range(l_bound, u_bound, n)
    print(candidates)

    maes = []
    for c in candidates:
        rf = RandomForestRegressor(n_estimators=c, random_state=1337)
        rf.fit(X_t, y_t)

        # scikit-learn metrics are declared as (y_true, y_pred). MAE is
        # symmetric, so the value is unchanged, but this matches the baseline
        # cell and stays correct if the metric is ever swapped.
        mae = mean_absolute_error(y_v, rf.predict(X_v))

        maes.append(mae)
        print(f'{c}: {mae}')

    # argmin returns the first index of the minimum, i.e. the earliest candidate.
    return candidates[int(np.argmin(maes))]

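**Idea:** `iters` could be chosen automatically instead of being passed in: keep iterating only while `(u_bound - l_bound) / n > 2`, i.e. while the average spacing between candidates is still more than two trees.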

In [None]:
def find_best_n_est_iters(X_t, y_t, X_v, y_v, l_bound, u_bound, n, iters):
    """Refine the ``n_estimators`` search over ``iters`` successively narrower ranges.

    Each round calls ``find_best_n_est`` on the current ``[l_bound, u_bound]``
    interval, then recentres the interval on the winner, extending half of one
    log-scale grid step to either side.

    Parameters
    ----------
    X_t, y_t, X_v, y_v
        Passed through unchanged to ``find_best_n_est``.
    l_bound, u_bound : float
        Initial search interval.
    n : int
        Number of log-scale steps per round.
    iters : int
        Number of rounds (at least 1).

    Returns
    -------
    int
        The best candidate found in the last round.

    Raises
    ------
    ValueError
        If ``iters`` is smaller than 1 (previously this hit an unbound local).
    """
    if iters < 1:
        raise ValueError('iters must be at least 1')

    for _ in range(iters):
        best_c = find_best_n_est(X_t, y_t, X_v, y_v, l_bound, u_bound, n)

        # Half the distance between neighbouring candidates, on the log scale.
        step_size_log = .5 * (np.log(u_bound) - np.log(l_bound)) / n

        # Keep the lower edge at 1 or above: a window reaching below 1 would
        # produce a 0-tree candidate, which the estimator rejects.
        l_bound = max(1.0, np.exp(np.log(best_c) - step_size_log))
        u_bound = np.exp(np.log(best_c) + step_size_log)

    return best_c

In [None]:
# Three refinement rounds with ten log-scale steps each, starting from the full interval.
best_n_est = find_best_n_est_iters(
    X_t, y_t, X_v, y_v,
    l_bound=lower_bound,
    u_bound=upper_bound,
    n=10,
    iters=3,
)

## Perform predictions on test set

### Create the RandomForest

In [None]:
# Final estimator: the tuned tree count, with the same fixed seed used during the search.
final_rf = RandomForestRegressor(n_estimators=best_n_est, random_state=1337)

### Train and run RF

In [None]:
# Fit on the full prepared training data (X, y), then predict for X_eval.
# fit() returns the estimator itself, so the two calls can be chained.
y_hat_final = final_rf.fit(X, y).predict(X_eval)

In [None]:
# Two-column submission frame: integer Id alongside the predicted SalePrice.
out = pd.DataFrame({
    'Id': X_eval['Id'].astype(int),
    'SalePrice': y_hat_final,
})
out.to_csv('submission.csv', index=False)

In [None]:
# Display the tree count selected by the search.
best_n_est