In [7]:
import h2o
import random
import mlflow
import mlflow.h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator


In [9]:
import numpy as np
import pandas as pd

df_raw = pd.read_csv('train.csv')

In [12]:
def data_process(df_raw,remove_outlier = False,remove_hard_to_fit = False,linear_model = False):

	# Make a copy so the original dataframe will not be altered.
    df_processed = df_raw.copy()

	# Feature Transformation - take the logarithm of the features to meet normality assumptions.
    df_processed.SalePrice = np.log(df_processed.SalePrice)
    df_processed.GrLivArea = np.log(df_processed.GrLivArea)
    
	# Remove outliers.
    outlier_list = [524, 1299, 463, 31, 534, 1433, 739, 1159, 108, 1231, 971, 1424]
    df_processed = df_processed.drop(outlier_list)

    ## Missing values
    
    # 259 LotFrontage  - replace missing value with 0 
#     df_processed.LotFrontage = df_processed.LotFrontage.fillna(0)

    # 1369 Alley - replace with None
#     df_processed.Alley = df_processed.Alley.fillna('None')

    # 8 MasVnrType and MasVnrArea - replace MasVnrType with None and MasVnrArea with 0
#     df_processed.MasVnrType = df_processed.MasVnrType.fillna('None')
#     df_processed.MasVnrArea = df_processed.MasVnrArea.fillna(0)

    # 37 basement: BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2- replace with Nb
#     df_processed.BsmtQual = df_processed.BsmtQual.fillna('Nb')
#     df_processed.BsmtCond = df_processed.BsmtCond.fillna('Nb')
#     df_processed.BsmtExposure = df_processed.BsmtExposure.fillna('Nb')
#     df_processed.BsmtFinType1 = df_processed.BsmtFinType1.fillna('Nb')
#     df_processed.BsmtFinType2 = df_processed.BsmtFinType2.fillna('Nb')

    # 690 FireplaceQu - replace with Nf
#     df_processed.FireplaceQu = df_processed.FireplaceQu.fillna('Nf')

    # 81 Garage: GarageType, GarageYrBlt, GarageFinish, GarageQual, GarageCond - replace with Ng and year with 0 
#     df_processed.GarageType = df_processed.GarageType.fillna('Ng')
#     df_processed.GarageFinish = df_processed.GarageFinish.fillna('Ng')
#     df_processed.GarageQual = df_processed.GarageQual.fillna('Ng')
#     df_processed.GarageCond = df_processed.GarageCond.fillna('Ng')
#     df_processed.GarageYrBlt = df_processed.GarageYrBlt.fillna(0)

    # 1453 PoolQC - replace with Np
#     df_processed.PoolQC = df_processed.PoolQC.fillna('Np')

    # 1179 Fence - replace with Nf
#     df_processed.Fence = df_processed.Fence.fillna('Nf')

    # 1406 MiscFeature - replace with None    
#     df_processed.MiscFeature = df_processed.MiscFeature.fillna('None')

    # 1 Electrical
#     df_processed = df_processed[pd.notnull(df_processed.Electrical)]

    ## Combine columns and drop multicollinear columns 
    
    # combine bathroom quanlitity 
    df_processed['BsmtBath'] = df_processed.BsmtFullBath + df_processed.BsmtHalfBath * 0.5
    df_processed['Bath'] = df_processed.FullBath + df_processed.HalfBath * 0.5
    df_processed = df_processed.drop(['BsmtFullBath', 'BsmtHalfBath','FullBath','HalfBath'], axis=1)

    # drop TotalBsmtSF - multicollinearaty
    #df_processed = df_processed.drop(['TotalBsmtSF'], axis=1)

    # drop GrLivArea - multicollinearaty
    #df_processed = df_processed.drop(['GrLivArea'], axis=1)

    # drop GarageArea - higher correlation than GarageACars, results are better as well
    #df_processed = df_processed.drop(['GarageArea'], axis=1) 
    
    
    # drop Id
    df_processed = df_processed.drop(['Id'], axis=1)

	# Categorical Features Processsing

	# MSSubClass processing - MSSubClass 20-90 contains only duplicate information with HouseStyle and YearBuilt.
    #df_processed['MSSubClass'] = df_processed['MSSubClass'].replace(['20','30','40','45','50','60','70','75','80','85'], '0')

    # Convert numerical to categorical. 
    #df_processed[['MSSubClass','OverallQual','OverallCond']] = df_processed[['MSSubClass','OverallQual','OverallCond']].astype(str)



    return df_processed.to_csv('processed_rf.csv')

In [11]:
data_process(df_raw)

In [5]:
df_rf = h2o.import_file(path="processed_rf.csv")

H2OConnectionError: Not connected to a cluster. Did you run `h2o.connect()`?

In [None]:
r = df_rf['SalePrice'].runif()
train = df_rf[r  < 0.7]
test  = df_rf[0.3 <= r]

In [None]:
train 

In [None]:
def train_random_forest(ntrees):
    with mlflow.start_run():
        rf = H2ORandomForestEstimator(ntrees=ntrees,max_depth=max_depth)
        train_cols = [n for n in df_rf.col_names if n != "SalePrice"]
        rf.train(train_cols, "SalePrice", training_frame=train, validation_frame=test)
        #add oor
        mlflow.log_param("ntrees", ntrees)
        mlflow.log_param("max_depth", max_depth)
        
        mlflow.log_metric("rmse", rf.rmse())
        mlflow.log_metric("r2", rf.r2())
        #mlflow.log_metric("mae", rf.mae())
        
        mlflow.h2o.log_model(rf, "model")

In [None]:
for ntrees in [50, 100, 200, 400]:
    for max_depth in [10, 15, 20, 30]:
        train_random_forest(ntrees)
        
        
#max features 

In [None]:
#import yaml

In [None]:
#yaml.safe_dump