# GreenForecast - Model Training

# Prep


In [None]:
# !pip install -Uqq fastai
# !pip install -Uqq xgboost

import pandas as pd
import numpy as np
import io
import os
from pathlib import Path
import pickle
from datetime import datetime
import timeit
import re
import json
np.random.seed(2)
import matplotlib.pyplot as plt
from scipy.stats import logistic
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error
import joblib
import xgboost

from fastai.tabular.all import *

# autoreload changed modules
%load_ext autoreload
%autoreload 2

# Show floats with two decimal places in pandas output.
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# plt.rcParams["figure.figsize"] = (16,10)
# Default matplotlib figure size for the plots below; some cells override it.
plt.rcParams["figure.figsize"] = (32,12)



# Dataset files are named {DATASET_NAME}_{REGION}.csv, e.g. dataset8_NSW1.csv
DATASET_NAME = 'dataset8'
DATASET_FOLDER = Path('../data')
REGIONIDS = ['NSW1', 'QLD1', 'SA1', 'TAS1', 'VIC1']
# Forecast times to train for: every 2 from 2 to 22, then every 4 from 24 to 168 (= 24*7).
FORECAST_TIMES = list(range(2,24,2)) + list(range(24, 24*7 + 4, 4))
# Rows dated on or after this are the holdout set (see read_dataset()).
HOLDOUT_SET_START = '2022-09-01'
p_min, p_max = -400, 1000 # values to clip price columns to when decoding


# Make sure there is a models subdirectory. exist_ok avoids the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs('models', exist_ok=True)

# Print ALL columns of two rows side by side; shows everything even when pandas truncates its display.
def print_item(df, index=0, offset=1):
    """Print every column of two rows of ``df``, one column per line.

    Each output line holds the column name, the value in the first row and the
    value in the second row, each padded to 32 characters.

    Args:
        df: DataFrame to inspect.
        index: Position of the first row. A string is treated as an index
            label and converted to a position with ``df.index.get_loc``.
        offset: Number of rows between the first and the second row.

    Returns:
        None. The rows are written to stdout.
    """
    if isinstance(index, str):
        # if passing in a string, it's probably going for loc instead of iloc, convert.
        index = df.index.get_loc(index)
    first_row = df.iloc[index]
    second_row = df.iloc[index + offset]
    # A plain loop: the previous list comprehension was run only for its
    # print() side effect and built a throwaway list of None.
    for name, first, second in zip(df.columns, first_row, second_row):
        print(f'{name:32}{first:<32}{second:<32}')

def check_nas(df):
    """Return the rows of ``df`` that contain at least one missing value."""
    has_missing = df.isna().any(axis="columns")
    return df.loc[has_missing]

def pkl(thing, filename):
    """Pickle ``thing`` to the file at ``filename``, replacing any existing file.

    Args:
        thing: Any picklable object.
        filename: Path of the file to write.
    """
    # Write to the path given by the caller (previously the literal string
    # "filename" was used), and close the handle deterministically.
    with open(filename, "wb") as f:
        pickle.dump(thing, f)

def upkl(filename):
    """Load and return the object pickled in the file at ``filename``.

    Args:
        filename: Path of a pickle file.

    Returns:
        The unpickled object.
    """
    # NOTE: pickle.load can execute arbitrary code; only use on files you trust.
    # Read from the path given by the caller (previously the literal string
    # 'filename' was opened).
    with open(filename, 'rb') as f:
        return pickle.load(f)

In [None]:
### READ DATASET FILE
#####################

def ignored_columns(columns, target, lag):
    """Return the columns that should be ignored for a given target forecast.

    Ignored columns are:
    - forecast columns (name contains ``_Tp``, excluding predispatch columns
      containing ``Predis``) whose time is more than 24 away from ``lag``
    - lags of the other variable (i.e. greenness lags if we're predicting price)

    Args:
        columns: Column names; iterated more than once, so pass a list or index.
        target: Target column name. Price rules apply when it contains
            'Price', greenness rules otherwise.
        lag: Forecast time of the target, in the same units as the number
            after ``_Tp`` in the column names.

    Returns:
        List of column names to ignore, in the order they were found.
    """
    ignored = []
    for col in columns:
        if "_Tp" not in col: continue  # only concerned with forecast cols '_Tp'
        if "Predis" in col: continue  # want predispatch columns always, don't ignore
        time = int(col.split("_Tp")[1])
        if time > lag + 24 or time < lag - 24:
            ignored.append(col)

    if 'Price' in target:
        # remove greenness-specific columns from price
        ignored += [col for col in columns if '_Greenness_Tm' in col]
        ignored += [col for col in columns if '_IC_Fossil_In' in col or '_IC_Green_In' in col]
    else:
        # remove price-specific columns from greenness:
        # ignore _Price_Tm20 but keep _Price_Tm30d.
        # Raw string: '\d' and '\Z' are invalid escapes in a normal string literal.
        price_lag = re.compile(r'.*Price_Tm\d+\Z')
        ignored += [col for col in columns if price_lag.match(col)]

    return ignored

def read_dataset(region='VIC1', target="VIC1_Price_Tp84", lag=84, switch='full_training', validation_set=0):
    """Load a region's dataset and prepare it for training one target.

    Reads ``{DATASET_NAME}_{region}.csv`` from ``DATASET_FOLDER`` (or reuses the
    global ``dataset`` when it already has a ``{region}_Price`` column), adds
    the shifted target column(s) selected by ``switch``, and flags validation
    rows in an ``is_validation_set`` column.

    Args:
        region: Region id used in the file name and as column prefix.
        target: Target column name, e.g. ``VIC1_Price_Tp84``. The part before
            ``_Tp`` is the column that gets shifted to build the target.
        lag: Forecast time; targets are the base column(s) shifted by
            ``-12 * lag`` rows. Ignored when ``switch == 'meta_model'``.
        switch: Dataset variant: 'full_training', 'full_training_with_holdout',
            'greenness_by_fuel' or 'meta_model'.
        validation_set: Value of ``(year*12 + month) % 5`` whose rows form the
            validation set. With 'full_training_with_holdout', -1 selects the
            rows from ``HOLDOUT_SET_START`` onwards instead.

    Returns:
        Tuple ``(df, y_names, continuous, categorical)``: the prepared frame
        with columns in alphabetical order, and the sorted lists of target,
        continuous and categorical column names.

    Side effects:
        Assigns the globals ``df``, ``y_names``, ``continuous``,
        ``categorical`` and (when the file is read) ``dataset``.
    """
    global df, y_names, continuous, categorical  # declaring global so can inspect later. Don't edit 
    global dataset  # data as read from the .csv file, before any time-specific changes made.  declaring global so can inspect later and also to keep it static 
    
    # don't read if we've already got the right dataset in memory
    if 'dataset' in globals() and f"{region}_Price" in dataset.columns:    
        print(f"Already got {region} datafile in memory")
    else:
        datafile = DATASET_FOLDER / f'{DATASET_NAME}_{region}.csv'
        print(f"Datafile = {datafile}")

        # we want to read as float32 not double. First, sample 1000 rows of data to determine dtypes.
        df_test = pd.read_csv(datafile, nrows=1000)
        float_cols = [c for c in df_test if str(df_test[c].dtype) in ["float64", "int64", "bool"]]
        float32_cols = {c: np.float32 for c in float_cols}
        
        dataset = pd.read_csv(datafile, parse_dates=[0], index_col=0, engine='c', dtype=float32_cols)

        assert dataset.index.name == "Date"
        
        # Transform price columns: clip them to [p_min, p_max]
        for col in dataset.columns:
            if 'Price' in col:
                dataset[col] = dataset[col].clip(lower=p_min, upper=p_max)

        # convenience: write a file with the column names. 
        # with open(DATASET_FOLDER / f'columns_for_{region}.json', 'w') as f:
        #     f.write(json.dumps(sorted(list(set(dataset.columns) - {'day', 'is_augment_row'})), indent=2))
                
    # work on a copy so the cached `dataset` stays as read from disk
    df = dataset.copy()

    # ignore some columns because it seems to improve accuracy (and faster training)
    ignore_cont = ['day', 'is_augment_row'] +  ignored_columns(df.columns, target, lag)
    
    # Select columns
    # NOTE(review): a `switch` value matching none of the branches below leaves
    # y_names / continuous / categorical at whatever the globals already held.
    if switch == 'full_training' or switch == 'full_training_with_holdout': 
        # add another column for our target forecast
        base_col = target.split('_Tp')[0]  # Either {region}_Price or {region}_Greenness
        df[target] = df[base_col].shift(-12 * lag)
        df = df.dropna()
        categorical = [] # ['month', 'hour', 'weekday']
        y_names = [target]
        continuous = sorted(list(set(df.columns) - set(categorical) - set(y_names) - set(ignore_cont)))

    elif switch == 'greenness_by_fuel': 
        assert 'Greenness' in target
        # add columns for targets - each fuel and the two interconnector columns, shifted into the future.
        all_gen = sorted([f'{region}_GEN_{fuel}' for fuel in ['Coal', 'Gas', 'Hydro', 'Rooftop', 'Solar', 'Wind']] + [f'{region}_IC_Fossil_In', f'{region}_IC_Green_In'])
        y_names = [f'{feature}_Tp{lag}' for feature in all_gen]
                
        df[y_names] = df[all_gen].shift(-12 * lag)

        df = df.dropna()
        categorical = [] # ['month', 'hour', 'weekday']
        continuous = sorted(list(set(df.columns) - set(y_names) - set(ignore_cont)))
        
    elif switch == 'meta_model':
        ignore_cont = ['day', 'is_augment_row']   # for metamodel, want ALL forecasts. 
        # columns for ALL forecast times (lag input parameter above is ignored)
        base_col = target.split('_Tp')[0]  # Either {region}_Price or {region}_Greenness
        lags = pd.DataFrame(index=df.index)
        # note: this loop rebinds the `lag` parameter
        for lag in FORECAST_TIMES:
            lags[f'{base_col}_Tp{lag}'] = df[base_col].shift(-12 * lag)
        df = pd.concat([df, lags], axis=1)
        df = df.dropna()
        categorical = []  # ['month', 'hour', 'weekday']
        y_names = [f'{base_col}_Tp{lag}' for lag in FORECAST_TIMES]
        continuous = sorted(list(set(df.columns) - set(categorical) - set(y_names) - set(ignore_cont)))
                
        
    # validation_type = 'cross-validation'
    # validation_type = 'exclude holdout'
    # if validation_set == -1: validation_type = 'holdout set'
    if switch == 'full_training_with_holdout':
        validation_type = '1month with holdout'
    else: 
        validation_type = 'cross-validation'  # default
    
    # NOTE(review): validation_type is never set to '2020+' above, so this first
    # branch is unreachable; it also does not create `is_validation_set`.
    if validation_type == '2020+':
        df['validation_set'] = df.index.year >= 2020 
    elif validation_type == 'cross-validation':
        # we want 1 month in every 5 (20%) for validation. 
        # also months mod 5 conveniently causes the validation months to rotate each year.
        df['validation_set'] = (df.index.year*12 + df.index.month) % 5
        df['is_validation_set'] = (df.validation_set == validation_set)
    elif validation_type == '1month with holdout':
        # 1 month in every 5 AND we want the rows from HOLDOUT_SET_START onwards to be a holdout set - exclude these
        df['validation_set'] = (df.index.year*12 + df.index.month) % 5
        df['is_validation_set'] = (df.validation_set == validation_set)
        # if switch == 'meta_model':
        #     df = df[(df.validation_set == validation_set) | (df.index >= HOLDOUT_SET_START)]
        #     df = df[df.is_augment_row == False]  # remove all augment data for meta_model
        #     validation_set = -1  # so we keep the holdoutset below
        if validation_set != -1:  # drop the holdout period (but if validation_set = -1, we still want it)
            df = df[df.index < HOLDOUT_SET_START]
        else:  # validation_set == -1, ie we actually want the holdout set.
            df['is_validation_set'] = (df.index >= HOLDOUT_SET_START)
    
    if 'Price' in target:
        # remove any augment data in the validation set
        df = df[~(df.is_validation_set & df.is_augment_row)]
    else:
        # remove all augment data for greenness, it's only for price
        df = df[df.is_augment_row == False]
        
    # make sure df's columns are in alphabetical order
    df = df.sort_index(axis=1)
    
    return df, sorted(y_names), sorted(continuous), sorted(categorical)  

#test:
# df, y_names, continuous, categorical = read_dataset()

# Learning

## FastAI NN

In [None]:
def fastai_nn_prep(df, y_names, continuous, categorical):
    """Build fastai tabular dataloaders from a prepared frame.

    Rows are split into training and validation by the ``is_validation_set``
    column, the only proc applied is ``Normalize``, and the batch size is 128.

    Args:
        df: Frame as returned by read_dataset().
        y_names: Target column names.
        continuous: Continuous feature column names.
        categorical: Categorical feature column names.

    Returns:
        The dataloaders object.
    """
    train_valid_split = ColSplitter('is_validation_set')(df)
    tabular = TabularPandas(
        df,
        procs=[Normalize],  # used to also have Categorify, FillMissing
        cat_names=categorical,
        cont_names=continuous,
        y_names=y_names,
        splits=train_valid_split,
    )
    return tabular.dataloaders(bs=128)
# dls.show_batch()

def fastai_nn_train(dls, y_names, epochs, hypers, run_id):
    """Create a fastai tabular learner and fit it with ``fit_one_cycle``.

    Args:
        dls: Dataloaders to train on.
        y_names: Target column names; the first one selects the output range.
        epochs: Number of epochs passed to ``fit_one_cycle``.
        hypers: Dict with 'layers' (model structure) and 'lr' (learning rate).
        run_id: Identifier used in the checkpoint and history file names.

    Returns:
        The learner after fitting.
    """
    print(f"=================  Fastai NN Fitting  {y_names[0]}")

    # Output range depends on what the first target is: prices use the clip
    # bounds, greenness uses 0..100, anything else is left unbounded.
    first_target = y_names[0]
    y_range = None
    if 'Price' in first_target:
        y_range = (p_min, p_max)
    elif 'Greenness' in first_target:
        y_range = (0,100)

    print(f"\n\n{run_id}: {hypers['layers']}, lr={hypers['lr']}")
    learner = tabular_learner(
        dls,
        layers=hypers['layers'],
        # metrics=[rmse, mae],
        lr=hypers['lr'],  # 0.003 best - prev; default=0.001, 0.0005 was good
        loss_func=L1LossFlat(),  # MSE if this is commented out
        y_range=y_range,
        config=tabular_config(ps=0.1,embed_p=0.1),
    )

    callbacks = [
        # TrackerCallback(monitor='valid_loss'),
        EarlyStoppingCallback(patience=7),
        SaveModelCallback(fname=f"nn_{run_id}"),
        CSVLogger(fname=f"models//history_{run_id}.csv"),
    ]
    learner.fit_one_cycle(n_epoch=epochs, lr_max=hypers['lr'], cbs=callbacks)
    return learner

learn = None  # declaring global so we can inspect later


# save_naked_fastai_nn() exports the model with all callbacks removed, to reduce filesize.
# The recorder callback is the one that takes all the size, but removing just that one seemed to cause errors.
def save_naked_fastai_nn(learn, run_id='tmp'):
    """Strip every callback from ``learn`` and export it to ``models//{run_id}.pkl``.

    Args:
        learn: Learner to export; its callbacks are removed in place.
        run_id: Name of the exported file (without extension).

    Returns:
        The filename the model was exported to.
    """
    # One remove_cbs() pass does not clear everything, so repeat until empty.
    while len(learn.cbs) > 0:
        learn.remove_cbs(learn.cbs)
    # print(learn.cbs)
    filename = f'models//{run_id}.pkl'
    learn.export(filename)
    print(f'Saved model at {filename}, {os.path.getsize(filename) / 1000000.0 : 0.2f}MB')
    return filename

def load_fastai(run_id):
    """Load the exported learner stored at ``models/{run_id}.pkl``."""
    model_path = f"models/{run_id}.pkl"
    return load_learner(model_path)

### Fastai NN 
# train_fastai_nn_model() takes in a dict describing the desired model and trains it. 
# Here's an example model: 
# model_to_train = {
#     'run_id': 'VIC1_Price_Tp84',
#     'type': 'fastai_nn',  # 'fastai_nn' or 'xgboost'
#     'target': 'VIC1_Price_Tp84',
#     'region': 'VIC1',
#     'hours_in_future': 84,  # int hours in future to forecast
#     'model_filename': None,  # initialised to None, contains filename of model after training
#     'layers': [200, 200, 100],  # model structure - found via grid search, specific to this model type
#     'lr': 0.001,  # learning rate - found via grid search, specific to this model type
#     'switch': 'full_training',  # which dataset variant to use. For dataset8: 'full_training' for price and 'greenness_by_fuel' for greenness
#     'validation_set': 0,  # which validation set to use (cross validation)
#     'small_filesize': False,
# }
def train_fastai_nn_model(model_to_train):
    """Train the fastai NN described by ``model_to_train`` and record the result.

    Args:
        model_to_train: Config dict with 'run_id', 'region', 'target',
            'hours_in_future', 'switch', 'validation_set', 'layers', 'lr' and
            optionally 'small_filesize'.

    Returns:
        The same dict, updated in place with 'model_filename',
        'accuracy (MAE)' (lowest valid_loss in the history file) and
        'best_iteration' (the row index of that lowest loss).
    """
    # Global only so they can be played with in jupyter. Don't actually rely on this.
    global dls, learn

    region = model_to_train['region']
    target = model_to_train['target']

    print(f"Read dataset {model_to_train['region']}... ", end="")
    frame, y_names, continuous, categorical = read_dataset(
        region=region,
        target=target,
        lag=model_to_train['hours_in_future'],
        switch=model_to_train['switch'],
        validation_set=model_to_train['validation_set'],
    )

    print("fastai_nn_prep()...")
    dls = fastai_nn_prep(frame, y_names, continuous, categorical)

    hypers = {'layers': model_to_train['layers'], 'lr': model_to_train['lr']}
    learn = fastai_nn_train(dls, y_names, epochs=11, hypers=hypers, run_id=model_to_train['run_id'])

    # get results from the history written by the CSVLogger callback
    history = pd.read_csv(f"models/history_{model_to_train['run_id']}.csv")
    valid_loss = history['valid_loss']
    mae = valid_loss.min()
    epochs = valid_loss.idxmin()

    # save model
    if model_to_train.get('small_filesize') == True:
        filename = save_naked_fastai_nn(learn, model_to_train['run_id'])
    else:
        # learn.save(f'models/nn{region}')
        filename = f"models/{model_to_train['run_id']}.pkl"
        learn.export(filename)

    model_to_train.update({
        'model_filename': filename,
        'accuracy (MAE)': mae,
        'best_iteration': epochs,
    })
    return model_to_train

## NN Experiments

#### single runs

In [None]:
# Single run: fastai NN for the VIC1 price target at forecast time 84.
model_to_train = {
    'run_id': 'VIC1_Price_Tp84_train_all',
    'type': 'fastai_nn',  # 'fastai_nn' or 'xgboost'
    'target': 'VIC1_Price_Tp84',
    'region': 'VIC1',
    'hours_in_future': 84,  # int hours in future to forecast
    'model_filename': None,  # initialised to None, contains filename of model after training
    'layers': [200, 200, 100],  # model structure - found via grid search, specific to this model type
    'lr': 0.001,  # learning rate - found via grid search, specific to this model type
    'switch': 'full_training',  # which dataset variant to use
    'validation_set': 0,  # which validation set to use (cross validation)
    'small_filesize': False,
}

# Trains the model and returns the same dict with the result keys filled in.
train_fastai_nn_model(model_to_train)

In [None]:
# Single run: fastai NN for QLD1 greenness at forecast time 84, using the
# 'greenness_by_fuel' dataset variant.
model_to_train = {
    'run_id': 'QLD1_Greenness_Tp84_fuel',
    'type': 'fastai_nn',  # 'fastai_nn' or 'xgboost'
    'target': 'QLD1_Greenness_Tp84',
    'region': 'QLD1',
    'hours_in_future': 84,  # int hours in future to forecast
    'model_filename': None,  # initialised to None, contains filename of model after training
    'layers': [200, 200, 100],  # model structure - found via grid search, specific to this model type
    'lr': 0.0003,  # learning rate - found via grid search, specific to this model type
    'switch': 'greenness_by_fuel',  # which dataset variant to use
    'validation_set': 1,  # which validation set to use (cross validation)
    'small_filesize': False,
}

# Trains the model and returns the same dict with the result keys filled in.
train_fastai_nn_model(model_to_train)

## Train XGBoost

In [None]:
# TRAIN XGBOOST
###############

def new_xgb_model():
    """Return an unfitted XGBRegressor configured with the shared settings."""
    settings = dict(
        max_depth=7,  # More is better...  but affects filesize. 12 gave 25mb file and 12.2, 9 gave 5mb file and 12.5.
        n_estimators=300,
        min_child_weight=0.5,  # 0.5
        colsample_bytree=0.6,  # 0.6
        subsample=0.8,  # 0.8
        eta=0.04,  # eta is learning rate, 0.01 is best so far
        seed=42,
        eval_metric=["mae"],
        early_stopping_rounds=14,
        tree_method='gpu_hist',  # comment out to use CPU only
        predictor='gpu_predictor',  # comment out to use CPU only
        # booster='dart',
        # rate_drop= 0.1,
        # skip_drop=0.5,
    )
    return xgboost.XGBRegressor(**settings)

def train_xgb(x_train, x_valid, y_train, y_valid, plot_importance=False):
    """Fit a new XGBoost model and predict both splits up to the best iteration.

    Args:
        x_train, y_train: Training features and targets.
        x_valid, y_valid: Validation features and targets; the second entry of
            the evaluation set passed to ``fit``.
        plot_importance: Also used as the ``verbose`` flag for fitting.

    Returns:
        Tuple ``(preds_train, preds_valid, model)``.
    """
    model = new_xgb_model()

    # verbose = (len(y_names)<=2)
    # verbose = True
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_train, y_train), (x_valid, y_valid)],
        verbose=plot_importance,
    )

    # Testing: only use the trees up to and including the best iteration.
    best_range = (0, model.best_iteration + 1)
    preds_train = model.predict(x_train, iteration_range=best_range, validate_features=False)
    preds_valid = model.predict(x_valid, iteration_range=best_range, validate_features=False)

    return preds_train, preds_valid, model


def xgb_prep(df, continuous, target):
    """Split ``df`` into float32 feature arrays and target series for XGBoost.

    Args:
        df: Frame with an ``is_validation_set`` column.
        continuous: Feature column names, in the order the arrays should use.
        target: Name of the target column.

    Returns:
        Tuple ``(x_train, x_valid, y_train, y_valid)``; the x values are
        float32 numpy arrays, the y values are pandas Series.
    """
    flag = df['is_validation_set']
    valid_rows = df[flag == True]
    train_rows = df[flag == False]

    def features(rows):
        # Keep the column order given by `continuous`.
        return rows[continuous].to_numpy(dtype='float32')

    return features(train_rows), features(valid_rows), train_rows[target], valid_rows[target]
    
    
### train_xgb_model(model_to_train) - pass in this config dict:
# model_to_train = {
#     'run_id': f'VIC1_Price_Tp84_xbg_temp',
#     'type': 'xgboost',  # 'fastai_nn' or 'xgboost'
#     'target': 'VIC1_Price_Tp84',
#     'region': 'VIC1',
#     'hours_in_future': 84,  # int hours in future to forecast
#     'model_filename': None,  # initialised to None, contains filename of model after training
#     'switch': 'full_training_with_holdout',  # which dataset variant to use
#     'validation_set': 0,  # which validation set to use (cross validation)
#     'plot_importance': True,
# }
def train_xgb_model(model_to_train):
    """Train the XGBoost model described by ``model_to_train`` and save it.

    Args:
        model_to_train: Config dict with 'run_id', 'region', 'target',
            'hours_in_future', 'switch', 'validation_set' and optionally
            'plot_importance' (truthy to print fitting progress and plot
            feature importance).

    Returns:
        The same dict, updated in place with 'model_filename',
        'accuracy (MAE)' (validation MAE at the best iteration) and
        'best_iteration'.
    """
    start_time = timeit.default_timer()

    region = model_to_train['region']
    target = model_to_train['target']

    print(f"Read dataset {region}... ", end="")
    # Only the frame and the continuous column names are needed here.
    df, _, continuous, _ = read_dataset(region=region,
                                        target=target,
                                        lag=model_to_train['hours_in_future'],
                                        switch=model_to_train['switch'],
                                        validation_set=model_to_train['validation_set'])
    print("xgb_prep()...")
    x_train, x_valid, y_train, y_valid = xgb_prep(df, continuous, target)

    # plot_importance = (target == f'{region}_Price_Tp84')
    # Honour the configured value: previously the mere presence of the key
    # enabled plotting, so 'plot_importance': False still plotted.
    plot_importance = bool(model_to_train.get('plot_importance', False))

    print(f"=================  XGBoost Fitting {model_to_train['run_id']}     {datetime.now()}")
    _, _, xgb = train_xgb(x_train, x_valid, y_train, y_valid, plot_importance)

    if plot_importance:
        # set column names for plot_importance
        xgb.get_booster().feature_names = continuous

        old = plt.rcParams["figure.figsize"]
        plt.rcParams["figure.figsize"] = (16,25)
        try:
            xgboost.plot_importance(xgb)
        finally:
            # restore the figure size even if plotting fails
            plt.rcParams["figure.figsize"] = old

    filename = f"models/{model_to_train['run_id']}.txt"
    xgb.save_model(filename)

    model_to_train['model_filename'] = filename
    model_to_train['accuracy (MAE)'] = xgb.evals_result()['validation_1']['mae'][xgb.best_iteration]
    model_to_train['best_iteration'] = xgb.best_iteration

    print(f"Time: {timeit.default_timer() - start_time:0.1f}s")
    return model_to_train #, xgb, preds_valid_xgb
    

# predictions with XGB
def load_xgb_model(filename):
    """Load the model saved at ``filename`` into a regressor from new_xgb_model()."""
    model = new_xgb_model()
    model.load_model(filename)
    return model


In [None]:
#xgb - train one model

model_to_train = {
    'run_id': 'VIC1_Price_Tp84_xbg_temp',
    'type': 'xgboost',  # 'fastai_nn' or 'xgboost'
    'target': 'VIC1_Price_Tp84',
    'region': 'VIC1',
    'hours_in_future': 84,  # int hours in future to forecast
    'model_filename': None,  # initialised to None, contains filename of model after training
    'switch': 'full_training',  # which dataset variant to use
    'validation_set': 2,  # which validation set to use (cross validation)
    'plot_importance': True,
}

train_xgb_model(model_to_train)

# train_xgb_model() stores the saved file's path in the dict, so report the size
# of that file rather than a hard-coded copy of the path.
print(f"{os.path.getsize(model_to_train['model_filename']) / 1000000:0.2f} MB (baseline = 9.5MB)")

In [None]:
### XGB test 1 model against holdout

forecast = 84

region = 'VIC1'
target = f"{region}_Price_Tp{forecast}"
print(f"Read dataset {region}... ", end="")
# validation_set=-1 (the holdout rows) is only handled by the
# 'full_training_with_holdout' variant of read_dataset(); with any other switch
# no row is flagged as validation and the holdout split comes back empty.
df, y_names, continuous, categorical = read_dataset(region=region, 
                                                    target=target,
                                                    lag=forecast,
                                                    switch='full_training_with_holdout',
                                                    validation_set=-1)

x_valid = df[df.is_validation_set == True][continuous].to_numpy(dtype='float32')
y_valid = df[df.is_validation_set == True][target]

# Same 'models/<run_id>.txt' path that train_xgb_model() saves to; the previous
# backslash separator only resolved on Windows.
filename = f"models/{model_to_train['run_id']}.txt"

xgb = load_xgb_model(filename)

# Predict with the trees up to and including the best iteration, as train_xgb() does.
preds_valid_xgb = xgb.predict(x_valid, iteration_range=(0, xgb.best_iteration + 1), validate_features=False)

mae = mean_absolute_error(y_valid, preds_valid_xgb)
print(f"MAE on holdout dataset: {mae}")

### XGB Grid Search

In [None]:
### Grid Search XGB
###################

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, LeaveOneGroupOut

# Manually add one-hot encodings for categorical variables
# NOTE(review): newer scikit-learn releases renamed `sparse` to `sparse_output` - confirm against the installed version.
enc = preprocessing.OneHotEncoder(sparse=False, dtype='float32') # sparse = false: will return an np array
# NOTE(review): `training` is not defined at module level in the visible code
# (it is only a local inside xgb_prep) - confirm where it is meant to come from.
enc.fit(training[categorical])

def one_hot_encode_categoricals(df, continuous, categorical):
    # continuous columns as float32, with the one-hot encoded categoricals appended as extra columns
    array = df[continuous].to_numpy(dtype='float32')
    return np.append(array, enc.transform(df[categorical]), axis=1)
    
# cross-validation: no separate test/validation set. all goes in x_train for now.
x_train = one_hot_encode_categoricals(df, continuous, categorical)

# only the first target column is searched over
y_train = df[y_names[0]]

xgb = xgboost.XGBRegressor(#n_estimators=40,
                           colsample_bytree=0.6, # colsample_bytree=0.8, 
                           # max_depth=5, # max_depth=10,
                           min_child_weight=0.5, 
                           subsample=0.8, 
                           eta=0.1, # eta is learning rate
                           seed=42,
                           n_jobs=1,
                           eval_metric=["mae"],
                           # early_stopping_rounds = 14,
                           tree_method='gpu_hist', # comment out to use CPU only
                           predictor='gpu_predictor', # comment out to use CPU only
                           )

# values tried for each parameter; commented-out entries are left at the estimator's setting above
param_grid = {
    'n_estimators': [40, 50, 60, 70],
    'eta' : [0.01, 0.02, 0.05, 0.1, 0.2],
    # 'colsample_bytree': [0.5, 0.6, 0.7, 0.8],
    'max_depth': [3, 4, 5, 8], 
    # 'min_child_weight': [0.5], # doesn't make a difference 0.3 ... 0.8
    # 'subsample': [0.7, 0.8, 0.9], # 0.8 beats 0.6, 1.0 by a little bit
    # 'gamma': [0],
}

# 5 splits, each validated on 12*24*180 rows with a gap of 12*24 rows before the validation block.
# (presumably 12 rows per hour, i.e. 180 days of validation and a 1 day gap - TODO confirm)
cross_validation = TimeSeriesSplit(n_splits=5, gap=12*24*1, test_size=12*24*180) # 6 month splits. 

grid_search = GridSearchCV(xgb, param_grid, scoring='neg_mean_absolute_error', cv=cross_validation, 
                           n_jobs=-1, verbose=1, return_train_score=True)
grid_search.fit(x_train, y_train)

# keep the full results table on disk, then show the best 40 rows without the train/params columns
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.to_csv('grid_search_results2.csv')
print("results")
grid_search_results.sort_values('rank_test_score')[[x for x in grid_search_results.columns if 'train' not in x and 'params' not in x]][:40]

grid_search.best_params_, grid_search.best_score_

## Batching Many Models

In [None]:
def train(model_to_train):
    """Train one model, dispatching on ``model_to_train['type']``.

    Args:
        model_to_train: Config dict; 'type' must be 'fastai_nn' or 'xgboost'.

    Returns:
        The dict returned by the matching training function.

    Raises:
        AssertionError: If 'type' is neither 'fastai_nn' nor 'xgboost'.
    """
    model_type = model_to_train['type']
    if model_type == 'fastai_nn':
        return train_fastai_nn_model(model_to_train)
    if model_type == 'xgboost':
        return train_xgb_model(model_to_train)
    # Raised explicitly instead of `assert False`, which is stripped under
    # `python -O` and would let the function silently return None.
    raise AssertionError('invalid model type requested')
        
def print_queue():
    """Return the contents of models_to_train.json as a DataFrame."""
    with open('models_to_train.json') as queue_file:
        entries = json.load(queue_file)
    return pd.DataFrame(entries)

def save_queue(models_to_train):
    """Write the queue to models_to_train.json as JSON, replacing the file."""
    # Serialise before opening, so a serialisation error leaves the file untouched.
    payload = json.dumps(models_to_train)
    with open('models_to_train.json', 'w') as queue_file:
        queue_file.write(payload)

def load_queue():
    """Read the queue from models_to_train.json.

    The file is read up to 5 times; a read of 100 characters or fewer is
    treated as bad and retried.
    NOTE(review): the retries run back-to-back with no delay - confirm
    whether a pause between attempts was intended.

    Returns:
        The parsed JSON content (the list written by save_queue()).

    Raises:
        AssertionError: If every attempt returned 100 characters or fewer.
    """
    for _ in range(5):
        with open('models_to_train.json') as f:
            raw = f.read()
        if len(raw) > 100:
            return json.loads(raw)
    # Raised explicitly instead of `assert False`, which is stripped under
    # `python -O` and would let a truncated file be parsed anyway.
    raise AssertionError("models_to_train.json came back < 100 bytes")

def update_queue(run_id, key, value):
    """Set ``key`` to ``value`` on the first queue entry with this ``run_id``.

    The queue is re-read from disk, updated and written back; it is written
    back even when no entry matches.

    Returns:
        The queue (list of model dicts) as saved.
    """
    queue = load_queue()
    entry = next((m for m in queue if m['run_id'] == run_id), None)
    if entry is not None:
        entry[key] = value
    save_queue(queue)
    return queue

def reset_pending_in_queue():
    """Clear the 'pending' marker from every queue entry and save the queue.

    Prints the run_id of each entry that was pending.

    Returns:
        The queue as re-read from disk after saving.
    """
    print('These are marked "pending":')
    queue = load_queue()
    for entry in queue:
        if entry['model_filename'] != 'pending':
            continue
        print(entry['run_id'])
        entry['model_filename'] = None
    print('... not any more.')
    save_queue(queue)
    return load_queue()

### Train the next thing on the queue

In [None]:
## Train lots of models

# Walk the queue by position, re-reading it from disk on every step.
i = 0
while True:
    # refresh the queue from disk and find the next one to train
    models_to_train = load_queue()
    if i >= len(models_to_train):
        break
    model_to_train = models_to_train[i]
    i += 1

    # skip if already trained / pending
    if model_to_train['model_filename'] is not None: continue
    
    # if model_to_train['type'] != 'fastai_nn': continue  # skip NNs, xgboost only 
    # if model_to_train['region'] != 'VIC1': continue  # Only a specific region
    

    # mark as pending before training starts; if train() raises, the entry stays
    # 'pending' until reset_pending_in_queue() is run
    models_to_train = update_queue(model_to_train['run_id'], 'model_filename', 'pending')
    
    results = train(model_to_train)
    
    # write the results back; model_filename goes last, replacing the 'pending' marker
    models_to_train = update_queue(model_to_train['run_id'], 'best_iteration', results['best_iteration'])
    models_to_train = update_queue(model_to_train['run_id'], 'accuracy (MAE)', results['accuracy (MAE)'])
    models_to_train = update_queue(model_to_train['run_id'], 'model_filename', results['model_filename'])

        

### Build the queue for training EVERYTHING

In [None]:
### build list of models to train
# Never overwrite an existing queue file: it holds the results of finished runs.
if os.path.exists('models_to_train.json'):
    print('models_to_train.json already exists, not overwriting')
    models_to_train = load_queue()
else:
    # create a fresh models_to_train list
    print(f"Forecasts for: {FORECAST_TIMES}")

    # one entry per model type x region x price/greenness x forecast time
    models_to_train = []
    for model_type in ['fastai_nn', 'xgboost']:
        for region in REGIONIDS:
            for price_or_greenness in ['Price', 'Greenness']:
                for forecast_time in FORECAST_TIMES:
                    short_model_type = "nn" if model_type == "fastai_nn" else "xgb"
                    switch = 'full_training' if price_or_greenness == 'Price' else 'greenness_by_fuel'
                    # greenness is only trained with the NN, no xgboost entries for it
                    if price_or_greenness == 'Greenness' and model_type == 'xgboost': continue
                    model_to_train = {
                        'run_id': f"{region}_{price_or_greenness}_Tp{forecast_time}_{short_model_type}",
                        'type': model_type,  # 'fastai_nn' or 'xgboost'
                        'target': f"{region}_{price_or_greenness}_Tp{forecast_time}",  # eg VIC1_Price_Tp84 or NSW1_Greenness_Tp1
                        'region': region,
                        'hours_in_future': forecast_time,  # int hours in future to forecast
                        'model_filename': None,  # initialised to None, contains filename of model after training
                        'layers': [200, 200, 100],  # found via grid search, specific to this model type
                        'lr': 0.0003,  # found via grid search, specific to this model type
                        'switch': switch,  # which dataset variant to use
                        'validation_set': 1,  # which validation set to use (cross validation)
                        'small_filesize': True,
                    }
                    models_to_train.append(model_to_train)

    save_queue(models_to_train)

print(f"Goal: Train {len(models_to_train)} models")
print_queue()

### NN Gridsearch - build a queue

In [None]:
### Fastai NN Gridsearch
# Overwrites the current queue file models_to_train.json !



# Layer structures to try; every structure is combined with every learning rate in `lrs`.
layers_to_try = [[400,400,200],
                 [200,100],
                 [300,100],
                 [300,200]
                ]
# lr = [0.1, 0.03, 0.01, 0.003]
# lr = [0.03, 0.01, 0.003, 0.001]
# lrs = [0.003, 0.001, 0.0003]
# lrs = [0.001]
lrs = [0.0003]

# Build one queue entry per region x forecast x layers x lr combination.
models_to_train = []
for region in ['QLD1']: # REGIONIDS:  # in ['VIC1']
    for forecast in [84]: #[4, 48, 84, 162]:
        for layers in layers_to_try:
            for lr in lrs: #[0.003, 0.001, 0.0003]:
                # learn = fastai_nn_train(dls, y_names, epochs=18, hypers=hypers, run_id=f'Gridsearch_{i}')
                model_to_train = {
                    'run_id':  f'Gridsearch_{len(models_to_train)}',
                    'type': 'fastai_nn',  # 'fastai_nn' or 'xgboost'
                    'target': f'{region}_Greenness_Tp{forecast}',
                    'region': region,
                    'hours_in_future': forecast,  # int hours in future to forecast
                    'model_filename': None,  # initialised to None, contains filename of model after training
                    'layers': layers,  # model structure being searched over
                    'lr': lr,  # learning rate being searched over
                    'switch': 'greenness_by_fuel',  # which dataset variant to use
                    'validation_set': 0,  # which validation set to use (cross validation)
                    'small_filesize': False,
                }

                # train_fastai_nn_model(model_to_train)
                models_to_train.append(model_to_train)

#     mae = pd.read_csv(f"models//history_{run_id}.csv")['valid_loss'].min()
#     # mae = learn.tracker.best
#     print(f"Best = {mae}")
#     hyperparams[i]['Result (Valid_MAE)'] = mae

#     hyperparams_df = pd.DataFrame(hyperparams)
#     hyperparams_df.to_csv(f'results_{region}.csv')
    
# hyperparams_df
print(f'Gridsearch Queue: {len(models_to_train)} combinations')

# save_queue() replaces whatever models_to_train.json held before
save_queue(models_to_train)
print_queue()

# Analysis

In [None]:
# preds_train_xgb 
# preds_valid_xgb 
# NOTE(review): `validation` and `training` are not defined at module level in
# the visible code (they are locals of xgb_prep) - confirm where they come from.
df2 = validation.copy()
df3 = training.copy()

# Attach whichever prediction sets exist in this session as extra columns.
if "preds_valid_xgb" in locals():
    df2['preds_xgb'] = preds_valid_xgb
    df3['preds_xgb'] = preds_train_xgb
if "preds_valid" in locals():
    df2['preds_nn'] = preds_valid
    df3['preds_nn'] = preds_train
if "preds_valid_ens" in locals():
    df2['preds_ens'] = preds_valid_ens
    df3['preds_ens'] = preds_train_ens
    
# Recombine both parts into one frame ordered by index.
df2 = pd.concat([df2, df3]).sort_index()
df3 = 0  # rebind so the training copy is no longer referenced
df2.shape

In [None]:
# Columns to plot: the target plus whichever prediction columns exist in this session.
# NOTE(review): `target_col` is not defined in the visible code - confirm where it is set.
plot_cols=[target_col]
if "preds_valid_xgb" in locals():
    plot_cols.append('preds_xgb')
if "preds_valid" in locals():
    plot_cols.append('preds_nn')
if "preds_valid_ens" in locals():
    plot_cols.append('preds_ens')
    
def plot_month(df, date, columns=plot_cols, days=32):
    """Plot ``columns`` of ``df`` from ``date`` to ``days`` days later.

    Args:
        df: Frame to plot; sliced by label from the start to the end date.
        date: Start date; a string is converted with ``pd.to_datetime``.
        columns: Columns to plot (defaults to the module-level ``plot_cols``).
        days: Length of the plotted window in days.
    """
    start = pd.to_datetime(date) if isinstance(date, str) else date
    end = start + pd.Timedelta(days=days)
    window = df[start:end]
    window.plot(y=columns)

In [None]:
# Ten days of targets and predictions starting 1 May 2022.
plot_month(df2, '2022-5-1', days=10)

In [None]:
# Six ten-day windows starting in March and April 2022, one figure per window.
plot_month(df2, '2022-3-1', days=10)
plot_month(df2, '2022-3-10', days=10)
plot_month(df2, '2022-3-20', days=10)
plot_month(df2, '2022-4-1', days=10)
plot_month(df2, '2022-4-10', days=10)
plot_month(df2, '2022-4-20', days=10)

## Plot days

In [None]:
plt.rcParams["figure.figsize"] = (16,8)
start = 5200  # first row to plot
x=240  # number of rows to plot

# offset=val_start
offset=0

# red: predictions, blue: targets, green dots: the decoded {region}_Price_Tm1 column.
# NOTE(review): `preds_train_rf`, `targs_train_rf` and `decode_price` are not
# defined in the visible code - confirm they still exist.
plt.plot(range(x), preds_train_rf[start:start+x], 'r.-', 
         range(x), targs_train_rf[start:start+x], 'b.-',
         range(x), decode_price(df[f'{region}_Price_Tm1'][offset+start:offset+start+x]), 'g.')

## Plot Greenness errors

In [None]:
# NOTE(review): `validation` and `target_col` are not defined at module level
# in the visible code - confirm where they come from.
val_df = validation.copy()
val_df['preds_xgb'] = preds_valid_xgb
val_df['xgb_error'] = val_df[target_col] - val_df['preds_xgb']
# The 10 days with the largest mean absolute error, worst first.
worst_days = val_df['xgb_error'].abs().resample('D').mean().sort_values(ascending=False).index[:10]
worst_days

In [None]:
def plot_renewables(df, date, region='VIC1'):
    """Area-plot renewable generation from 3 days before to 2 days after ``date``.

    Args:
        df: Frame with ``{region}_GEN_Wind``, ``{region}_GEN_Hydro``,
            ``{region}_GEN_Solar`` and ``{region}_GEN_Rooftop`` columns.
        date: Timestamp the plotted window is placed around.
        region: Region prefix of the generation columns. Defaults to 'VIC1',
            the region that used to be hard-coded.
    """
    start = date - pd.Timedelta(days=3)
    end  =  date + pd.Timedelta(days=2)
    df.loc[start:end].plot.area(
        y=[f'{region}_GEN_{fuel}' for fuel in ('Wind', 'Hydro', 'Solar', 'Rooftop')],
        color=['#2dc3fa', '#075cfa', '#fae22d', '#fca128'],
        title=f"Renewables for dates around {date.date()}")
plot_renewables(df, pd.to_datetime('2020-02-01'))

In [None]:
which_day = 3  # position in `worst_days` of the day to inspect
def plot_day(df, date):
    # Plot target, xgb prediction and error from 3 days before to 2 days after `date`.
    start = date - pd.Timedelta(days=3)
    end  =  date + pd.Timedelta(days=2)
    df.loc[start:end].plot(
        y=[target_col,'preds_xgb', 'xgb_error'], 
        color=['black','blue','r'], 
        title=f"plot_worst_day() - Errors for {date.date()} and nearby days",
        grid=True)
    # return df.loc[start:end]

# NOTE(review): `a` is not defined in the visible code - presumably the frame
# holding the prediction and error columns (val_df?); confirm.
plot_day(a, worst_days[which_day])
plot_renewables(a, worst_days[which_day])
pass

In [None]:
# NOTE(review): `df4` is not defined in the visible code - confirm which frame is meant.
plot_month(df4, '2020-05')

In [None]:
plt.rcParams["figure.figsize"] = (16,16)
# Validation targets against predictions (blue dots), plus the y = x reference line.
# NOTE(review): `targs_valid` and `preds_valid` are not defined in the visible code.
fig = plt.plot(targs_valid, preds_valid, 'b.', [-200,600], [-200,600], '-')
# zoom in on the 0-200 range on both axes
plt.xlim([0,200])
plt.ylim([0,200])