In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor

In [3]:
from src.data_scripts import *
from src.modeling_scripts import *

In [4]:
raw_data = pd.read_csv('data/ca_energy_demand.csv')

In [5]:
data = clean_data(raw_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, val

In [6]:
# 5 dates to predict from to score each model multiple times and take average score to determine best model

dates_lst = ['2018-06-03', '2019-12-12', '2019-04-10', '2019-07-15', '2019-10-22']
weeks = 1
days = 7
hours = 24

In [7]:
# scores model that takes the average of time period before the same length as period to be predicted

avg_lst = []
for date in dates_lst:
    avg_lst.append(score_avg_model(data, date, weeks=weeks, days=days, hours=hours))
avg_lst

[4757.208374754393,
 3547.622623719834,
 3444.247336382714,
 6429.2945561681245,
 5289.4857514349105]

In [8]:
np.mean(avg_lst), np.std(avg_lst)

(4693.571728491995, 1117.6678994630647)

In [9]:
# scores model that uses previous time period demand to predict future demand

last_period_lst = []
for date in dates_lst:
    last_period_lst.append(score_last_period_model(data, date, weeks=weeks, days=days, hours=hours))
last_period_lst

[3847.070575373522,
 1029.6666907536724,
 2481.4022425406397,
 4296.838698807698,
 3276.5190615521046]

In [11]:
np.mean(last_period_lst), np.std(last_period_lst)

(2986.2994538055273, 1151.173351023021)

In [12]:
# scores model using demand for the same time last year

last_year_lst = []
for date in dates_lst:
    last_year_lst.append(score_last_year_model(data, date, weeks=weeks, days=days, hours=hours))
last_year_lst

[1859.5956597758402,
 1332.8278297700433,
 3354.9758941011783,
 5357.01751321617,
 2414.9982216793214]

In [13]:
np.mean(last_year_lst), np.std(last_year_lst)

(2863.883023708511, 1414.9991146185712)

In [14]:
dummy_data = prep_dum_data(data)

In [15]:
# scores linear regression model

lr = LinearRegression()
lr_score = score_basic_models(lr, dummy_data, dates_lst, weeks=weeks, days=days, hours=hours)
lr_score

[3383.997151959796,
 2703.382406955145,
 3041.440177187874,
 2944.3325832261417,
 2738.1591097490664]

In [16]:
np.mean(lr_score), np.std(lr_score)

(2962.262285815604, 245.63873279297)

In [17]:
# finds optimal alpha and scores lasso model

lasso_alpha_lst = [20]

for i in lasso_alpha_lst:
    lm = Lasso(alpha=i)
    lm_score = score_basic_models(lm, dummy_data, dates_lst, weeks=weeks, days=days, hours=hours)
    print(lm_score)
    print('{:.2f}'.format(np.mean(lm_score)))
    print('{:.2f}'.format(np.std(lm_score)))

[3105.7681360632823, 2444.1098725980128, 2945.868739627086, 3286.896825224769, 2891.340805461577]
2934.80
281.42


In [18]:
# finds optimal alpha and scores ridge model

ridge_alpha_lst = [260]

for i in ridge_alpha_lst:
    rm = Ridge(alpha=i)
    rm_score = score_basic_models(rm, dummy_data, dates_lst, weeks=weeks, days=days, hours=hours)
    print(rm_score)
    print('{:.2f}'.format(np.mean(rm_score)))
    print('{:.2f}'.format(np.std(rm_score)))

[2738.4445867489803, 2461.819417103969, 2778.71891270802, 3480.548840660124, 2800.439477734133]
2851.99
337.12


In [19]:
# grid search to find optimal random forest parameters

n_estimators = [50, 100, 150, 200, 300]
max_depth = [None, 1, 2, 3, 4]
min_samples_split = [2, 3, 4]

rf_lst = []
for est in n_estimators:
    for depth in max_depth:
        for samples in min_samples_split:
            rf_gridsearch = RandomForestRegressor(n_estimators=est, max_depth=depth, min_samples_split=samples)
            rf_gridsearch_score = score_basic_models(rf_gridsearch, dummy_data, dates_lst, weeks=weeks, days=days, hours=hours)
            rf_lst.append([np.mean(rf_gridsearch_score), est, depth, samples])
min(rf_lst)

[2348.3961429326114, 150, None, 4]

In [20]:
# scores random forest model

rf = RandomForestRegressor(n_estimators=150, min_samples_split=4)
rf_score = score_basic_models(rf, dummy_data, dates_lst, weeks=weeks, days=days, hours=hours)
rf_score

[1531.1320679403336,
 1077.9148859213833,
 2452.1997631575414,
 3704.0534808170414,
 3052.685483269264]

In [21]:
np.mean(rf_score), np.std(rf_score)

(2363.5971362211126, 961.820495305498)

In [131]:
# grid search to find optimal gradient boosting parameters

learning_rate = [.01, .1, .25]
n_estimators = [50, 100, 150, 200, 300]
max_depth = [None, 1, 2, 3, 4]
min_samples_split = [2, 3, 4]

gb_lst = []
for lr in learning_rate:
    for est in n_estimators:
        for depth in max_depth:
            for samples in min_samples_split:
                gb_gridsearch = GradientBoostingRegressor(learning_rate = lr, n_estimators=est, max_depth=depth, min_samples_split=samples)
                gb_gridsearch_score = score_basic_models(gb_gridsearch, dummy_data, dates_lst, weeks=weeks, days=days, hours=hours)
                gb_lst.append([np.mean(gb_gridsearch_score), lr, est, depth, samples])
min(gb_lst)

[2274.402008489731, 0.1, 300, 4, 4]

In [132]:
# scores gradient boosting model

gb = GradientBoostingRegressor(learning_rate=.1, n_estimators=300, max_depth=4, min_samples_split=4)
gb_score = score_basic_models(gb, dummy_data, dates_lst, weeks=weeks, days=days, hours=hours)
gb_score

[2073.0872946477093,
 1212.8539466712105,
 2474.9848748329096,
 2832.3916118623915,
 2779.025986000477]

In [133]:
np.mean(gb_score), np.std(gb_score)

(2274.4687428029397, 595.5909874593134)

In [22]:
# score models with test data

pred_date = '2020-01-29'
weeks = 1
days = 7
hours = 24

In [23]:
score_avg_model(data, pred_date, weeks=weeks, days=days, hours=hours)

3202.27367894323

In [24]:
score_last_period_model(data, pred_date, weeks=weeks, days=days, hours=hours)

917.9968227976972

In [25]:
score_last_year_model(data, pred_date, weeks=weeks, days=days, hours=hours)

1828.0521395737048

In [26]:
lr = LinearRegression()
score_basic_models_test(lr, dummy_data, pred_date, weeks=weeks, days=days, hours=hours)

2651.4617022281855

In [27]:
lm = Lasso(alpha=20)
score_basic_models_test(lm, dummy_data, pred_date, weeks=weeks, days=days, hours=hours)

2372.7824915472497

In [28]:
rm = Ridge(alpha=260)
score_basic_models_test(rm, dummy_data, pred_date, weeks=weeks, days=days, hours=hours)

2488.357238868793

In [29]:
rf = RandomForestRegressor()
score_basic_models_test(rf, dummy_data, pred_date, weeks=weeks, days=days, hours=hours)

908.8283379274895

In [30]:
best_rf = RandomForestRegressor(n_estimators=150, min_samples_split=4)
score_basic_models_test(best_rf, dummy_data, pred_date, weeks=weeks, days=days, hours=hours)

917.814002594834

In [31]:
gb = GradientBoostingRegressor()
score_basic_models_test(gb, dummy_data, pred_date, weeks=weeks, days=days, hours=hours)

2190.4921270219656

In [32]:
best_gb = GradientBoostingRegressor(learning_rate = .1, n_estimators=300, max_depth=4, min_samples_split=4)
score_basic_models_test(best_gb, dummy_data, pred_date, weeks=weeks, days=days, hours=hours)

1513.6074149697852