In [None]:
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split

In [None]:
# Ask which dataset to model (day/night, with or without UHII as the response),
# then load the matching CSV into `df` with incomplete rows removed.

# Columns that the modelling cells drop from the feature matrix.
variables = ["date", "T_station", "RH_HOBO", "LOC"]

# Accepted answers for each prompt, mapped to the piece of the CSV file name
# they select: winter_X_all_<period>_hour_LCZ<suffix>.csv
PERIOD_BY_ANSWER = {"d": "daytime", "n": "nighttime"}
UHII_SUFFIX_BY_ANSWER = {"y": "_UHII", "n": ""}

period_answer = input("Run for day or night? (d/n): ")
if period_answer not in PERIOD_BY_ANSWER:
    # Stop here: carrying on would leave `df` undefined (or stale from an
    # earlier run of this cell) and the modelling cells would fail or mislead.
    raise ValueError("Invalid option {!r}: expected 'd' or 'n'".format(period_answer))
period = PERIOD_BY_ANSWER[period_answer]
print("Selected: " + period)

uhii_answer = input("Run with UHII as resp. variable? (y/n): ")
if uhii_answer not in UHII_SUFFIX_BY_ANSWER:
    raise ValueError("Invalid option {!r}: expected 'y' or 'n'".format(uhii_answer))
# The original cell stored both replies in `answer`, which ended up holding
# the UHII choice; keep that name bound in case another cell still reads it.
answer = uhii_answer

# load dataset, discarding every row that contains a missing value
dataset_path = "winter_X_all_{}_hour_LCZ{}.csv".format(
    period, UHII_SUFFIX_BY_ANSWER[uhii_answer]
)
df = pd.read_csv(dataset_path).dropna()
print(df.head())

In [None]:
# train/test using unstandardized data
# Baseline random forest: fit on 80 % of the rows and report RMSE, MAE and R2
# on the remaining 20 %.

# The last column of df is the response; every column before it, minus the
# ones listed in `variables`, is a feature.
X = df.iloc[:, :-1].drop(columns=variables)
Y = df.iloc[:, -1]

# Seed the split as well as the forest so the metrics below are reproducible
# from one run to the next.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

mdl = RandomForestRegressor(n_estimators = 1000, random_state = 42, oob_score = True, min_samples_leaf=1, min_samples_split=3, n_jobs=12, verbose=1)

mdl.fit(X_train, Y_train)

# predict
Y_pred = mdl.predict(X_test)

# calculate metrics (scikit-learn convention: y_true first, y_pred second)
RMSE = np.sqrt(mean_squared_error(Y_test, Y_pred))
MAE = np.mean(np.abs(Y_pred - Y_test))
R2 = r2_score(Y_test, Y_pred)

# print metrics
print('RMSE:', np.round(RMSE,4))
print('MAE:', np.round(MAE,4))
print('R2:', np.round(R2,4))

In [None]:
# Hyperparameter search set-up: rebuild the features / response from df,
# make an 80/20 hold-out split and define the grid to explore.
# (GridSearchCV / RandomizedSearchCV are imported in the notebook's import cell.)

# The last column of df is the response; every column before it, minus the
# ones listed in `variables`, is a feature.
X = df.iloc[:, :-1].drop(columns=variables)
Y = df.iloc[:, -1]

# Seeded so the hold-out rows, and therefore the reported scores, are reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

rf = RandomForestRegressor(random_state=42)

parameters = {
    'n_estimators': [250],
    # An integer min_samples_split must be at least 2; a candidate of 1 is
    # rejected by scikit-learn, so the grid starts at 2.
    'min_samples_split': [2, 3, 4, 5, 6],
    'min_samples_leaf': [1, 2, 3, 4, 5, 6],
    'bootstrap': [True, False]
}

def rmse(y_true, y_pred):
    """Return the root-mean-squared error between y_true and y_pred.

    Both arguments are passed straight to sklearn's mean_squared_error, so
    they accept whatever array-likes that function accepts.
    """
    # np.sqrt rather than a bare `sqrt`: no name `sqrt` is imported in the
    # visible cells, so the original call raised NameError.
    return np.sqrt(mean_squared_error(y_true, y_pred))

def mae(y_true, y_pred):
    """Return the mean absolute error between y_true and y_pred.

    Computed with numpy (the same formula the baseline evaluation cell uses)
    because `mean_absolute_error` is not imported in the visible cells.
    Inputs are converted to plain arrays first so values are compared by
    position, not by pandas index label.
    """
    errors = np.asarray(y_true) - np.asarray(y_pred)
    return np.mean(np.abs(errors))

# Wrap the error metrics as scorers. With greater_is_better=False scikit-learn
# negates the value, so the RMSE / MAE entries in cv_results_ are negative.
rmse_scorer = make_scorer(rmse, greater_is_better=False)
mae_scorer = make_scorer(mae, greater_is_better=False)

scoring = {'r2': 'r2', 'RMSE': rmse_scorer, 'MAE': mae_scorer}

# With several metrics, refit has to name the one that picks the best
# candidate; best_params_, best_score_ and predict() all depend on it.
grid_search = GridSearchCV(estimator=rf,
                           param_grid=parameters,
                           verbose=2,
                           n_jobs=12,
                           scoring=scoring,
                           refit='r2')
# Search on the training rows only, so the hold-out rows scored below were
# never seen while choosing or fitting the model.
grid_search.fit(X_train, Y_train)

# Predict the hold-out rows once and reuse the result for both error metrics.
Y_test_pred = grid_search.predict(X_test)

print("Best hyperparameters: ", grid_search.best_params_)
# best_score_ is the mean cross-validated R2 of the selected candidate;
# the RMSE and MAE lines are computed on the hold-out split.
print("Best score - R2: ", grid_search.best_score_)
print("Best score - RMSE: ", rmse(Y_test, Y_test_pred))
print("Best score - MAE: ", mae(Y_test, Y_test_pred))
