In [1]:
import pandas as pd
import numpy as np
import src.ModelRunner as MR
import re
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from pandas.api.types import CategoricalDtype
from pandas import get_dummies

# Relative paths of the CSV inputs.
# Only DESIGN_MATRIX_PATH is read in the visible cells of this notebook;
# TRIPS_PATH and BLOCKS_PATH are defined here but not referenced in them.
TRIPS_PATH = "datasets/bus_trips.csv"
BLOCKS_PATH = "datasets/blocks.csv"
DESIGN_MATRIX_PATH = "datasets/design_matrix.csv"


def rename_column(col_name, *remove_chars):
    """
    Normalise a column label.

    col_name: str
        Original column label
    remove_chars: str
        Zero or more regular-expression fragments. They are joined with "|"
        and every match is deleted from the label, so regex metacharacters
        (e.g. "[") must be escaped by the caller

    Returns the label lowercased, with spaces and "/" replaced by "_" and
    every match of remove_chars removed
    """
    # Same order as before: spaces -> "_", lowercase, then "/" -> "_"
    normalised = col_name.replace(" ", "_").lower().replace("/", "_")
    # With no fragments the pattern is empty and the substitution is a no-op
    strip_pattern = "|".join(remove_chars)
    return re.sub(strip_pattern, "", normalised)


# Sanity check for rename_column: brackets are stripped, spaces become "_".
# The bracket pattern is a raw string so the regex escape \[ is not parsed as
# an (invalid) string escape sequence, which newer Python versions warn about.
assert rename_column("Electric Heater Energy [kWh]", r"\[", "]") == "electric_heater_energy_kwh"


# Unordered categorical dtype listing the three bus ids
categories = CategoricalDtype(categories=[22901, 22902, 22903], ordered=False)

# Read in CSV and normalise the column labels.
# rename_column is called without remove_chars here, so only the lowercasing
# and the space / "/" replacements are applied.
M = pd.read_csv(DESIGN_MATRIX_PATH)
M = M.rename(columns=rename_column)

# Cast bus to the categorical dtype, then one-hot encode it (first level dropped)
M["bus"] = M["bus"].astype(categories)
M = pd.get_dummies(M, columns=["bus"], drop_first=True, dtype=int)

# Define X & Y to feed into models (positional: column 1 is the response,
# columns 2-3 are the features).
# NOTE(review): get_dummies appends the bus indicator columns at the end of
# the frame, so they presumably fall outside 2:4 and are not used as
# features; confirm this is intended.
Y = M.iloc[:, 1]
X = M.iloc[:, 2:4]

# Instantiate model runner
mr = MR.ModelRunner(Y=Y, X=X)


In [2]:
# Run Random Forest
# Run the random-forest step on the model runner, refresh the scores, then
# show the stored models and their scores, one per line.
mr.run_random_forest_regression()
mr.score_models()
print(mr.models, mr.model_scores, sep="\n")


Fitting 3 folds for each of 100 candidates, totalling 300 fits
{'RF': RandomForestRegressor(max_depth=10, max_features=None, min_samples_leaf=4,
                      min_samples_split=5, n_estimators=200)}
{'RF': (0.8523392493952601, 0.7711506306184926)}


In [3]:
# Run SVR
# Run the SVR step on the model runner, refresh the scores, then show the
# stored models and their scores, one per line.
mr.run_svr()
mr.score_models()
print(mr.models, mr.model_scores, sep="\n")


Fitting 3 folds for each of 400 candidates, totalling 1200 fits
[CV] END bootstrap=False, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=2000; total time=   5.5s
[CV] END bootstrap=False, max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=   2.1s
[CV] END bootstrap=False, max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=2, n_estimators=800; total time=   1.5s
[CV] END bootstrap=True, max_depth=90, max_features=None, min_samples_leaf=4, min_samples_split=2, n_estimators=1800; total time=   2.6s
[CV] END bootstrap=False, max_depth=70, max_features=None, min_samples_leaf=2, min_samples_split=10, n_estimators=600; total time=   1.2s
[CV] END bootstrap=True, max_depth=110, max_features=None, min_samples_leaf=1, min_samples_split=10, n_estimators=1400; total time=   2.5s
[CV] END bootstrap=True, max_depth=110, max_features=None, min_samples_leaf=1, min_samples_split=10, n_est

In [4]:
# Run XGBoost
# NOTE(review): the saved output of this cell contains
# `NameError: name 'xgb_grid' is not defined` and repeated
# "Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used."
# warnings. Both presumably originate inside src.ModelRunner.run_xgboost,
# which is not visible here; confirm and fix there.
mr.run_xgboost()
mr.score_models()
print(mr.models, mr.model_scores, sep="\n")


Fitting 3 folds for each of 2160 candidates, totalling 6480 fits
Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_sp

NameError: name 'xgb_grid' is not defined

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.

Parameters: { "max_features", "min_samples_

In [7]:
# Show the scores currently stored on the model runner.
# NOTE(review): this cell's execution count is out of sequence and the RF
# score recorded below differs from the one printed by the random-forest cell,
# so this output presumably comes from a later re-run; confirm with
# Restart Kernel -> Run All.
print(mr.model_scores)

{'SVR': (0.7638698858885555, 0.7743704989254648), 'RF': (0.8735051760789831, 0.7716024782181223)}


In [138]:
# Linear Regression
from numpy import log, sqrt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


# Response
y = M["power_consumption"]

# Feature frame: temperature and speed, plus a squared-temperature term and a
# log-speed term. The final selection pins the original column order.
X_reg = (
    M[["temperature", "speed"]]
    .assign(
        temp_sq=lambda frame: frame["temperature"] ** 2,
        log_speed=lambda frame: log(frame["speed"]),
    )
    .loc[:, ["temperature", "temp_sq", "speed", "log_speed"]]
)

# 80/20 split with a fixed seed
Xtrain_reg, Xtest_reg, ytrain_reg, ytest_reg = train_test_split(
    X_reg, y, random_state=42, test_size=0.2
)

# Fit on the training split, then report the model's score on the training
# split followed by the held-out split (one value per line)
regr = LinearRegression().fit(Xtrain_reg, ytrain_reg)
print(regr.score(Xtrain_reg, ytrain_reg), regr.score(Xtest_reg, ytest_reg), sep="\n")


0.7554774574679805
0.7732734304442204


In [None]:
import src.ModelRunner as MR


# Read in data
# Uses the shared DESIGN_MATRIX_PATH constant instead of repeating the literal
# path, so the file location is defined in one place only.
# NOTE(review): this rebinds M, X, Y and mr from the first cell. Unlike that
# cell, the columns are not renamed and bus is not encoded here, so
# name-based lookups on M made elsewhere in the notebook may not match after
# this cell runs; confirm this is intended.
M = pd.read_csv(DESIGN_MATRIX_PATH)

# Define X & Y to feed into models (positional: column 1 is the response,
# every column from index 2 onwards is a feature)
X, Y = M.iloc[:, 2:], M.iloc[:, 1]

# Instantiate
# Keyword arguments, as in the first cell, so the response and the features
# cannot be swapped by argument order.
mr = MR.ModelRunner(Y=Y, X=X)

# Run models
mr.run_random_forest_regression()
mr.run_svr()
mr.score_models()
print(mr.model_scores)
