In [5]:
import preprocessing as PRE
import feature_engineering as FE
import load_data as LD

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV as GSCV

In [6]:
def rmse(a, b):
    """Return the root-mean-square error between ``a`` and ``b``.

    Parameters
    ----------
    a, b : array-like
        Numeric inputs (lists, NumPy arrays, pandas Series, ...) whose shapes
        broadcast against each other.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((b - a) ** 2))``.
    """
    # Coerce to float first: with small or unsigned integer dtypes the
    # subtraction and the squaring would silently wrap around.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    diff = b - a
    return np.sqrt(np.mean(diff ** 2))

In [7]:
def get_best_params(model, params, X_train, y_train, cv=5):
    """Grid-search ``params`` for ``model`` and report the outcome.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    params : dict
        Parameter grid handed to ``GridSearchCV``.
    X_train, y_train
        Training data passed to the search's ``fit``.
    cv : int, default 5
        Forwarded as ``GridSearchCV``'s ``cv`` argument.

    Returns
    -------
    tuple
        ``(best_params_, cv_results_.items())`` taken from the fitted search.
    """
    search = GSCV(model, params, cv=cv)
    search.fit(X_train, y_train)
    return search.best_params_, search.cv_results_.items()

In [10]:
def build_model():
    """Fit a random forest for each data-loading configuration and print its RMSE.

    For every combination of ``outliers`` (False, True) and ``frac``
    (0.1, 0.2) the data is loaded through ``LD.load_data``, hyper-parameters
    are picked by ``get_best_params`` with ``cv=4``, a fresh
    ``RandomForestRegressor`` is fitted with the winning parameters, and the
    RMSE on the returned test split is printed together with the parameters
    and the load configuration.

    Returns
    -------
    RandomForestRegressor
        The model fitted in the final iteration (``outliers=True``,
        ``frac=0.2``). NOTE(review): this is the last model trained, not
        necessarily the one with the lowest printed RMSE.
    """
    # Load configurations, in order: (False, 0.1), (False, 0.2), (True, 0.1), (True, 0.2).
    load_params = [
        {'outliers': outliers, 'frac': frac}
        for outliers in (False, True)
        for frac in (0.1, 0.2)
    ]

    # Random Forest
    model_rf = RandomForestRegressor(random_state=0, n_jobs=-1)
    # `random_state` stays in the grid so that it ends up in `best_params`
    # and is therefore carried over to the model built from them below.
    model_rf_params = {'max_depth':[1, 2, 3, 4], 'random_state':[0], 'n_estimators':[50, 100, 200, 500]}

    # Train on each set
    for load in load_params:

        # Load data for this configuration
        X_train, y_train, X_test, y_test = LD.load_data(**load)

        # Select hyper-parameters by cross-validated grid search, then fit a
        # new forest with them on the training split.
        best_params, results = get_best_params(model_rf, model_rf_params, X_train, y_train, cv=4)
        best_params['n_jobs'] = -1
        model = RandomForestRegressor(**best_params)
        model.fit(X_train, y_train)

        # Predict and score on the test split
        y_pred = model.predict(X_test)
        val = rmse(y_pred, y_test)

        print(val, '\n', best_params, '\n', load)
        print(' ')

    return model

In [11]:
# Run the comparison over all load configurations; `rf` is whatever model
# build_model() returns.
rf = build_model()

0.17044053274514528 
 {'max_depth': 4, 'n_estimators': 500, 'random_state': 0, 'n_jobs': -1} 
 {'outliers': False, 'frac': 0.1}
 
0.1755840638250589 
 {'max_depth': 4, 'n_estimators': 500, 'random_state': 0, 'n_jobs': -1} 
 {'outliers': False, 'frac': 0.2}
 
0.1715776132343561 
 {'max_depth': 4, 'n_estimators': 500, 'random_state': 0, 'n_jobs': -1} 
 {'outliers': True, 'frac': 0.1}
 
0.16888468332997902 
 {'max_depth': 4, 'n_estimators': 50, 'random_state': 0, 'n_jobs': -1} 
 {'outliers': True, 'frac': 0.2}
 
