In [1]:
import pandas as pd
import numpy as np
import tqdm as tqdm
from sklearn.model_selection import train_test_split
import itertools
import matplotlib.pyplot as plt
from sklearn.preprocessing import *
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

In [2]:
import data_preprocessing

# Location of the raw CSV handed to the project's preprocessing helper.
DATASET_PATH = "Energy Consumption Dataset.csv"

# preprocess_data returns the train and test frames in that order.
train, test = data_preprocessing.preprocess_data(DATASET_PATH)

In [3]:
def split_x_y(df, target="Energy Consumption"):
    """Split a DataFrame into a feature frame and a target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns together with the target column.
        It is not modified.
    target : str, default "Energy Consumption"
        Name of the column to separate out as the prediction target.

    Returns
    -------
    X : pandas.DataFrame
        ``df`` without the ``target`` column.
    y : pandas.Series
        The ``target`` column of ``df``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    y = df[target]
    X = df.drop(columns=[target])
    return X, y

In [4]:
# Separate features from the target for both partitions in one pass.
(X_train, y_train), (X_test, y_test) = map(split_x_y, (train, test))

In [5]:
from sklearn.ensemble import *

def random_forest_reg(args):
    """Objective function: fit a random forest and return its RMSE.

    Parameters
    ----------
    args : sequence of three numbers
        ``(n_estimators, max_depth, min_samples_split)``. The values may
        arrive as floats (e.g. ``420.0``), so each one is cast to ``int``
        before being passed to the estimator.

    Returns
    -------
    float
        Root mean squared error of the fitted model's predictions on
        ``X_test`` / ``y_test`` (lower is better).

    Notes
    -----
    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``.
    """
    n_estimators, max_depth, min_samples_split = args
    reg = RandomForestRegressor(
        random_state=42,
        n_jobs=-1,
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        min_samples_split=int(min_samples_split),
    )
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    # NOTE(review): the loss is measured on the test set, so any search that
    # minimises it tunes hyperparameters against held-out data and the
    # reported score is optimistic. Consider scoring on a validation split
    # or with cross-validation on the training data instead.
    #
    # The `squared=False` keyword of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6; taking the square root explicitly
    # gives the same RMSE for a single-output target on every version.
    return np.sqrt(mean_squared_error(y_test, y_pred))

In [7]:
from hyperopt import hp, space_eval, tpe, fmin

# Search space for the random forest. The order of the entries must match
# the tuple unpacking in random_forest_reg:
# (n_estimators, max_depth, min_samples_split).
space = [
    hp.quniform("n_estimators", 100, 500, 10),
    hp.quniform("max_depth", 2, 18, 1),
    hp.quniform("min_samples_split", 2, 100, 1),
]

# Seed the optimiser so that re-running the notebook reproduces the same
# sequence of trials and the same best result.
# NOTE(review): hyperopt >= 0.2.7 expects a numpy Generator here; older
# releases need np.random.RandomState(42) instead - confirm installed version.
best = fmin(
    fn=random_forest_reg,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    verbose=True,
    rstate=np.random.default_rng(42),
)

100%|██████████| 100/100 [16:50<00:00, 10.10s/trial, best loss: 11116.293656763748]


In [50]:
# Resolve the optimiser's result against the search space; the tuple follows
# the order of `space`: (n_estimators, max_depth, min_samples_split).
best_params = space_eval(space, best)
best_params

(420.0, 18.0, 15.0)