In [2]:
import pandas as pd
import numpy as np
import src.ModelRunner as MR
import re
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from pandas.api.types import CategoricalDtype
from pandas import get_dummies

# Relative paths to the input CSV files.
# Only DESIGN_MATRIX_PATH is read in the cells visible here; TRIPS_PATH and
# BLOCKS_PATH are presumably consumed elsewhere — TODO confirm they are still needed.
TRIPS_PATH = "datasets/bus_trips.csv"
BLOCKS_PATH = "datasets/blocks.csv"
DESIGN_MATRIX_PATH = "datasets/design_matrix.csv"


def _literal_if_invalid(token):
    """Return token as-is when it compiles as a regex, else its escaped literal form."""
    try:
        re.compile(token)
    except re.error:
        return re.escape(token)
    return token


def rename_column(col_name, *remove_chars):
    """
    Normalise a column name into a lowercase, underscore-separated form.

    col_name: str
        Column to rename
    remove_chars: str
        Zero or more strings to delete from the column name. A string that is a
        valid regular expression on its own is used as one (existing callers that
        pass an already-escaped bracket keep working). A string that is not a
        valid regular expression, such as a bare "[" or "(", is matched literally
        instead of raising or silently merging with its neighbours.

    Returns col_name lowercased, with spaces and "/" replaced by "_" and every
    match of remove_chars removed.
    """
    col_name = col_name.replace(" ", "_").replace("/", "_").lower()
    if not remove_chars:
        # Nothing to strip; skip the regex machinery entirely.
        return col_name
    # Validate each token separately: joining raw tokens first can turn two
    # individually broken tokens ("[" and "]") into one valid but wrong pattern.
    pattern = "|".join(_literal_if_invalid(token) for token in remove_chars)
    return re.sub(pattern, "", col_name)


# Sanity check: spaces become underscores, the name is lowercased and brackets are removed.
# The raw string avoids the invalid "\[" escape sequence that newer Python versions warn about.
assert rename_column("Electric Heater Energy [kWh]", r"\[", "]") == "electric_heater_energy_kwh"


# Load the design matrix and normalise every column name
M = pd.read_csv(DESIGN_MATRIX_PATH).rename(columns=rename_column)

# Treat the bus identifier as an unordered categorical with a fixed set of levels
categories = CategoricalDtype(categories=[22901, 22902, 22903], ordered=False)
M["bus"] = M["bus"].astype(categories)

# One-hot encode bus; the first level is dropped
M = pd.get_dummies(M, columns=["bus"], drop_first=True, dtype=int)

# Select predictors (positions 2-3) and response (position 1) by column position
X = M.iloc[:, 2:4]
Y = M.iloc[:, 1]

# Instantiate model runner
mr = MR.ModelRunner(Y=Y, X=X)


In [16]:
# Linear Regression
import joblib
from numpy import log, sqrt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


LIN_REGR_MODEL_OUTPUT_PATH = "models/power_consumption_linear_regression.sav"


# Response is power consumption; predictors are temperature, its square, and speed
# (column order matters for the fitted coefficients, so it is kept as-is).
y = M["power_consumption"]
X_reg = M[["temperature"]].assign(
    temp_sq=M["temperature"] ** 2,
    speed=M["speed"],
)

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
Xtrain_reg, Xtest_reg, ytrain_reg, ytest_reg = train_test_split(
    X_reg, y, test_size=0.2, random_state=42
)

# Fit the linear model on the training split only.
regr = LinearRegression().fit(Xtrain_reg, ytrain_reg)

# Persist the fitted model; the value returned by dump is shown as the cell output.
joblib.dump(regr, LIN_REGR_MODEL_OUTPUT_PATH)


['models/power_consumption_linear_regression.sav']

In [None]:
import src.ModelRunner as MR


# Read in data via the shared path constant rather than repeating the literal.
# NOTE(review): this rebinds M, X, Y and mr from the first cell, and the column
# names loaded here are not passed through rename_column — confirm that is intended.
M = pd.read_csv(DESIGN_MATRIX_PATH)

# Define X & Y to feed into models: every column from position 2 onward vs. column 1
X, Y = M.iloc[:, 2:], M.iloc[:, 1]

# Instantiate (keyword arguments, matching the first cell, so X and Y cannot be swapped)
mr = MR.ModelRunner(Y=Y, X=X)

# Run models, score them, and show the scores
mr.run_random_forest_regression()
mr.run_svr()
mr.score_models()
print(mr.model_scores)
