In [2]:
import pandas as pd
import glob
import matplotlib.pyplot as plt
from datetime import datetime, time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
import copy
from meteostat import Stations, Daily
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torcheval.metrics import R2Score

In [None]:
path = "RawData"


def extract_date_time(filename):
    """
    extract the date and time from the filename
    :param filename:
    :return:
    """
    parts = filename.split('.')
    date = parts[1]
    time = parts[2]
    return date, time


def get_date(df, file):
    """get the date from the dataframe and the time from the filename and combine them into a datetime object
    :param df: dataframe containing the date
    :param file: filename containing the time
    :return: datetime object
    """
    #date_str = df[df.iloc[:, 2] == 1].iloc[0]['Date']
    date_str = str(file.split('.')[1])
    time_str = str(file.split('.')[2])
    #date = datetime.strptime(date_str, '%Y-%m-%d')
    date = datetime.strptime(date_str, '%Y%m%d')
    time_value = time(int(time_str), 0)
    combined_datetime = datetime.combine(date.date(), time_value)
    return combined_datetime


degree_days = 'gw_hdd'
ecmwf_files = glob.glob(path + f'/ecmwf.*.[01][02].{degree_days}.csv')
ecmwf_sorted_files = sorted(ecmwf_files, key=lambda x: (x.split('.')[1], x.split('.')[2]))[3:]

ecmwf_eps_files = glob.glob(path + f'/ecmwf-eps.*.[01][02].{degree_days}.csv')
ecmwf_eps_sorted_files = sorted(ecmwf_eps_files, key=lambda x: (x.split('.')[1], x.split('.')[2]))[2:]

gfs_ens_bc_files = glob.glob(path + f'/gfs-ens-bc.*.[01][02].{degree_days}.csv')
gfs_ens_bc_sorted_files = sorted(gfs_ens_bc_files, key=lambda x: (x.split('.')[1], x.split('.')[2]))[2:]

cmc_ens_files = glob.glob(path + f'/cmc-ens.*.[01][02].{degree_days}.csv')
cmc_ens_sorted_files = sorted(cmc_ens_files, key=lambda x: (x.split('.')[1], x.split('.')[2]))[2:]
for _ in range(2):
    set1 = set((extract_date_time(filename) for filename in ecmwf_sorted_files))
    set2 = set((extract_date_time(filename) for filename in ecmwf_eps_sorted_files))

    ecmwf_sorted_files = [filename for filename in ecmwf_sorted_files if extract_date_time(filename) in set2]
    ecmwf_eps_sorted_files = [filename for filename in ecmwf_eps_sorted_files if extract_date_time(filename) in set1]
    cmc_ens_sorted_files = [filename for filename in cmc_ens_sorted_files if extract_date_time(filename) in set1]

    master_set = set((extract_date_time(filename) for filename in cmc_ens_sorted_files))
    gfs_ens_bc_sorted_files = [filename for filename in gfs_ens_bc_sorted_files if
                               extract_date_time(filename) in master_set]

    master_set = set((extract_date_time(filename) for filename in gfs_ens_bc_sorted_files))

    ecmwf_sorted_files = [filename for filename in ecmwf_sorted_files if extract_date_time(filename) in master_set]
    ecmwf_eps_sorted_files = [filename for filename in ecmwf_eps_sorted_files if
                              extract_date_time(filename) in master_set]
    gfs_ens_bc_sorted_files = [filename for filename in gfs_ens_bc_sorted_files if
                               extract_date_time(filename) in master_set]
    cmc_ens_sorted_files = [filename for filename in cmc_ens_sorted_files if extract_date_time(filename) in master_set]

In [None]:
ecmwf_eps_change_df = pd.DataFrame(columns=['ecmwf-eps_9', 'ecmwf-eps_10', 'ecmwf-eps_11', 'ecmwf-eps_12',
                                  'ecmwf-eps_13', 'ecmwf-eps_14'])
passed_rows = []
for i in range(1, len(ecmwf_eps_sorted_files)):
    ecmwf_eps_df = pd.read_csv(ecmwf_eps_sorted_files[i])
    ecmwf_eps_df = ecmwf_eps_df[ecmwf_eps_df[ecmwf_eps_df.columns[2]] >= 1]
    prev_ecmwf_eps_df = pd.read_csv(ecmwf_eps_sorted_files[i-1])
    prev_ecmwf_eps_df = prev_ecmwf_eps_df[prev_ecmwf_eps_df[prev_ecmwf_eps_df.columns[2]] >= 1]

    date = get_date(ecmwf_eps_df, ecmwf_eps_sorted_files[i])
    prev_date = get_date(prev_ecmwf_eps_df, ecmwf_eps_sorted_files[i-1])
    d2 = str(date)[:10]
    d1 = str(prev_date)[:10]

    if d2 != d1:
        offset = 1
    else:
        offset = 0

    changes = []
    try:
        for day in range(8, 14):
            changes.append(ecmwf_eps_df.iloc[day - offset]['Value'] - prev_ecmwf_eps_df.iloc[day]['Value'])
        new_row = pd.DataFrame([changes], columns=ecmwf_eps_change_df.columns, index=[date])
        ecmwf_eps_change_df = pd.concat([ecmwf_eps_change_df, new_row])
    except IndexError:
        print(f"error on {date}")
        passed_rows.append(i)

In [None]:
passed_rows

In [None]:
ecmwf_change_df = pd.DataFrame(columns=['ecmwf_diff_8', 'ecmwf_diff_9',])
passed_rows = []
for i in range(1, len(ecmwf_sorted_files)):
    ecmwf_df = pd.read_csv(ecmwf_sorted_files[i])
    ecmwf_df = ecmwf_df[ecmwf_df[ecmwf_df.columns[2]] >= 1]
    ecmwf_eps_df = pd.read_csv(ecmwf_eps_sorted_files[i-1])
    ecmwf_eps_df = ecmwf_eps_df[ecmwf_eps_df[ecmwf_eps_df.columns[2]] >= 1]

    try:
        ecmwf = ecmwf_df.iloc[8]
        ecmwf_eps = ecmwf_eps_df.iloc[9]
    except IndexError:
        print(f"error on row: {i}")
        passed_rows.append(i)
        continue

    date = get_date(ecmwf_df, ecmwf_sorted_files[i])
    prev_date = get_date(ecmwf_eps_df, ecmwf_eps_sorted_files[i-1])
    d2 = str(date)[:10]
    d1 = str(prev_date)[:10]
    if d2 != d1:
        offset = 1
    else:
        offset = 0

    changes = []
    try:
        for day in range(8,10):
            changes.append(ecmwf_df.iloc[day - offset]['Value'] - ecmwf_eps_df.iloc[day]['Value'])
        new_row = pd.DataFrame([changes], columns=ecmwf_change_df.columns, index=[date])
        ecmwf_change_df = pd.concat([ecmwf_change_df, new_row])
    except IndexError:
        print(f"error on {date}")
        passed_rows.append(i)

In [None]:
passed_rows

In [None]:
gfs_ens_bc_change_df = pd.DataFrame(columns=['gfs-ens-bc_9', 'gfs-ens-bc_10', 'gfs-ens-bc_11', 'gfs-ens-bc_12',
                                  'gfs-ens-bc_13', 'gfs-ens-bc_14'])
passed_rows = []
for i in range(1, len(gfs_ens_bc_sorted_files)):
    gfs_ens_bc_df = pd.read_csv(gfs_ens_bc_sorted_files[i])
    gfs_ens_bc_df = gfs_ens_bc_df[gfs_ens_bc_df[gfs_ens_bc_df.columns[2]] >= 1]
    prev_ecmwf_eps_df = pd.read_csv(ecmwf_eps_sorted_files[i-1])
    prev_ecmwf_eps_df = prev_ecmwf_eps_df[prev_ecmwf_eps_df[prev_ecmwf_eps_df.columns[2]] >= 1]

    try:
        date = get_date(gfs_ens_bc_df, gfs_ens_bc_sorted_files[i])
        prev_date = get_date(prev_ecmwf_eps_df, ecmwf_eps_sorted_files[i-1])
    except IndexError:
        print(f"error on row: {i}")
        passed_rows.append(i)
        continue

    d2 = str(date)[:10]
    d1 = str(prev_date)[:10]
    if d2 != d1:
        offset = 1
    else:
        offset = 0

    changes = []
    try:
        for day in range(8, 14):
            changes.append(gfs_ens_bc_df.iloc[day - offset]['Value'] - prev_ecmwf_eps_df.iloc[day]['Value'])
        new_row = pd.DataFrame([changes], columns=gfs_ens_bc_change_df.columns, index=[date])
        gfs_ens_bc_change_df = pd.concat([gfs_ens_bc_change_df, new_row])
    except IndexError:
        print(f"error on {date}")
        passed_rows.append(i)

In [None]:
passed_rows

In [None]:
cmc_ens_change_df = pd.DataFrame(columns=['cmc-ens_9', 'cmc-ens_10', 'cmc-ens_11', 'cmc-ens_12',
                                  'cmc-ens_13', 'cmc-ens_14'])
passed_rows = []

for i in range(1, len(cmc_ens_sorted_files)):
    cmc_ens_df = pd.read_csv(cmc_ens_sorted_files[i])
    cmc_ens_df = cmc_ens_df[cmc_ens_df[cmc_ens_df.columns[2]] >= 1]
    gfs_ens_bc_df = pd.read_csv(gfs_ens_bc_sorted_files[i])
    gfs_ens_bc_df = gfs_ens_bc_df[gfs_ens_bc_df[gfs_ens_bc_df.columns[2]] >= 1]
    date = get_date(cmc_ens_df, cmc_ens_sorted_files[i])

    changes = []
    try:
        for day in range(8, 14):
            changes.append(cmc_ens_df.iloc[day]['Value'] - gfs_ens_bc_df.iloc[day]['Value'])
        new_row = pd.DataFrame([changes], columns=cmc_ens_change_df.columns, index=[date])
        cmc_ens_change_df = pd.concat([cmc_ens_change_df, new_row])
    except IndexError:
        print(f"error on {date}")
        passed_rows.append(i)

In [None]:
passed_rows

In [None]:
day_8_error = pd.DataFrame(columns=['day_8_error'])
passed_rows = []

for i in range(1, len(ecmwf_eps_sorted_files)):
    ecmwf_eps_df = pd.read_csv(ecmwf_eps_sorted_files[i])
    ecmwf_eps_df = ecmwf_eps_df[ecmwf_eps_df[ecmwf_eps_df.columns[2]] >= 1]
    prev_ecmwf_eps_df = pd.read_csv(ecmwf_eps_sorted_files[i-1])
    prev_ecmwf_eps_df = prev_ecmwf_eps_df[prev_ecmwf_eps_df[prev_ecmwf_eps_df.columns[2]] >= 1]

    date = get_date(ecmwf_eps_df, ecmwf_eps_sorted_files[i])
    prev_date = get_date(prev_ecmwf_eps_df, ecmwf_eps_sorted_files[i-1])
    d2 = str(date)[:10]
    d1 = str(prev_date)[:10]

    if d2 != d1:
        offset = 1
    else:
        offset = 0
    day = 7
    changes = []
    try:
        changes.append(ecmwf_eps_df.iloc[day]['Value'] - prev_ecmwf_eps_df.iloc[day + offset]['Value'])
        new_row = pd.DataFrame([changes], columns=day_8_error.columns, index=[date])
        day_8_error = pd.concat([day_8_error, new_row])
    except IndexError:
        print(f"error on {date}")
        passed_rows.append(i)

In [None]:
errors_df = pd.DataFrame(columns=['error_9', 'error_10', 'error_11', 'error_12', 'error_13', 'error_14'])
passed_rows = []

for i in range(2, len(ecmwf_eps_sorted_files)):
    ecmwf_eps_df = pd.read_csv(ecmwf_eps_sorted_files[i-1])
    ecmwf_eps_df = ecmwf_eps_df[ecmwf_eps_df[ecmwf_eps_df.columns[2]] >= 1]
    prev_ecmwf_eps_df = pd.read_csv(ecmwf_eps_sorted_files[i-2])
    prev_ecmwf_eps_df = prev_ecmwf_eps_df[prev_ecmwf_eps_df[prev_ecmwf_eps_df.columns[2]] >= 1]

    date = get_date(ecmwf_eps_df, ecmwf_eps_sorted_files[i])
    prev_date = get_date(prev_ecmwf_eps_df, ecmwf_eps_sorted_files[i-1])
    d2 = str(date)[:10]
    d1 = str(prev_date)[:10]

    if d2 != d1:
        offset = 1
    else:
        offset = 0

    errors = []
    try:
        for day in range(8, 14):
            errors.append(ecmwf_eps_df.iloc[day - offset]['Value'] - prev_ecmwf_eps_df.iloc[day]['Value'])
        new_row = pd.DataFrame([errors], columns=errors_df.columns, index=[date])
        errors_df = pd.concat([errors_df, new_row])
    except IndexError:
        print(f"error on {date}")
        passed_rows.append(i)

In [None]:
master_df = pd.concat([gfs_ens_bc_change_df, cmc_ens_change_df, ecmwf_change_df, errors_df, day_8_error, ecmwf_eps_change_df], axis=1)
master_df.fillna(0, inplace=True)
display(master_df[-45:-35])

In [None]:
master_df.to_pickle('master_df.pkl')

random forest

In [3]:
master_df = pd.read_pickle('master_df.pkl')

In [None]:
X = master_df.iloc[:, :-6]
y = master_df.iloc[:, -6:]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_model = RandomForestRegressor(n_estimators=1000, max_depth=100, random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

RNN

In [None]:
input_features = master_df.iloc[:, :-6].values ** 2
target_variables = master_df.iloc[:, -6:].values

# Split the data into training, validation, and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    input_features, target_variables, test_size=0.2, shuffle=False)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, shuffle=False)

# Scale the input features based on the training data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Scale the validation and test data based on the training data
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Convert the scaled data to PyTorch tensors

X_train_tensor = torch.Tensor(X_train_scaled)
y_train_tensor = torch.Tensor(y_train)
X_val_tensor = torch.Tensor(X_val_scaled)
y_val_tensor = torch.Tensor(y_val)
X_test_tensor = torch.Tensor(X_test_scaled)
y_test_tensor = torch.Tensor(y_test)

In [None]:
#benchmark error
total_mse = 0
c = 0
for i in range(1, len(y_test_tensor)):
    #mse = mean_squared_error(y_test_tensor[i], y_test_tensor[i-1])
    mse = mean_squared_error(y_test_tensor[i], [0,0,0,0,0,0])
    total_mse += mse
    c += 1

total_mse/c

In [None]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, dropout):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=num_layers, batch_first=True, dropout=dropout)
        self.fc1 = nn.Linear(hidden_size, 64)  # Dense layer with 64 units
        self.fc2 = nn.Linear(64, output_size)  # Final output layer

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)

        out, _ = self.lstm(x, (h0, c0))
        out = F.relu(out)  # Apply ReLU activation between LSTM and first dense layer
        out = self.fc1(out[:, -1, :])
        out = F.relu(out)  # Apply ReLU activation to the output of the first dense layer
        out = self.fc2(out)

        return out

In [None]:
input_size = X_train_tensor.shape[1]
output_size = y_train_tensor.shape[1]
hidden_size = 256
num_layers = 3
dropout = 0.3
lr = 0.01
mps_device = torch.device("mps")

In [None]:
model = RNN(input_size, hidden_size, output_size, num_layers, dropout)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=0.001)

In [None]:
num_epochs = 100
sequence_length = 10  # Number of previous days to consider

best_loss = float('inf')
best_model = None
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    total_loss = 0

    # Loop through each sequence in the training data
    for i in range(sequence_length, X_train_tensor.shape[0]):
        # Extract the current sequence and target
        input_seq = X_train_tensor[i - sequence_length:i].view(1, sequence_length, -1)
        target_seq = y_train_tensor[i]

        # Forward pass
        output = model(input_seq)
        loss = criterion(output, target_seq.unsqueeze(0))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / (X_train_tensor.shape[0] - sequence_length)
    train_losses.append(average_loss)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Training Loss: {average_loss}')

    # Validation stage
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        total_val_loss = 0

        for i in range(sequence_length, X_val_tensor.shape[0]):
            input_seq = X_val_tensor[i - sequence_length:i].view(1, sequence_length, -1)
            target_seq = y_val_tensor[i]

            output = model(input_seq)
            val_loss = criterion(output, target_seq.unsqueeze(0))

            total_val_loss += val_loss.item()

        average_val_loss = total_val_loss / (X_val_tensor.shape[0] - sequence_length)
        val_losses.append(average_val_loss)
        print(f'Epoch [{epoch + 1}/{num_epochs}], Validation Loss: {average_val_loss}')

        # Check if current model is the best based on validation loss
        if average_val_loss < best_loss:
            best_loss = average_val_loss
            best_model = copy.deepcopy(model)

# After training, use the best model for testing
model = best_model

In [None]:
def plot_losses(train_losses, val_losses):
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(train_losses, label='Training Loss')
    ax.plot(val_losses, label='Validation Loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.legend()
    plt.show()
plot_losses(train_losses, val_losses)

In [None]:
model.eval()  # Set the model to evaluation mode

with torch.no_grad():
    test_loss = 0.0
    predictions = []

    for i in range(sequence_length, X_test_tensor.shape[0]):
        input_seq = X_test_tensor[i - sequence_length:i].view(1, sequence_length, -1)
        target_seq = y_test_tensor[i]

        output = model(input_seq)
        loss = criterion(output, target_seq.unsqueeze(0))

        test_loss += loss.item()

        # Extract the scalar value from the tensor and append it to predictions
        predictions.append(output.squeeze().tolist())

    average_test_loss = test_loss / (X_test_tensor.shape[0] - sequence_length)
    print(f'Test Loss: {average_test_loss}')

    # Convert the predictions and target values to numpy arrays
    predictions = np.array(predictions)
    targets = y_test_tensor[sequence_length:].numpy()

    # Evaluate the performance using appropriate metrics
    # For example, calculate mean squared error (MSE)
    mse = mean_squared_error(targets, predictions)
    metric = R2Score()
    r2 = metric.update(torch.tensor(predictions), torch.tensor(targets)).compute()
    print(f'Mean Squared Error (MSE): {mse}')
    print(f'R2 Score: {r2}')

In [None]:
def prediction(input):
    model.eval()

    with torch.no_grad():
        input_seq = torch.tensor(input).view(1, sequence_length, -1)
        output = model(input_seq)
        return output.squeeze().tolist()

In [None]:
pred = prediction(X_test_tensor[-sequence_length:])

In [None]:
pred

In [None]:
y_test_tensor[-1]

# Autogloun

In [4]:
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.common.utils.utils import setup_outputdir
from autogluon.core.utils.loaders import load_pkl
from autogluon.core.utils.savers import save_pkl
from autogluon.features.generators import AutoMLPipelineFeatureGenerator
import os.path

In [5]:
y = master_df.iloc[:, -6:].copy()
y = y.reset_index(drop=True)
X = master_df.iloc[:, :-6].copy()
X['Date'] = X.index
X = X.reset_index(drop=True)

In [10]:
auto_ml_pipeline_feature_generator = AutoMLPipelineFeatureGenerator()
X = auto_ml_pipeline_feature_generator.fit_transform(X=X)

In [11]:
X

Unnamed: 0,gfs-ens-bc_9,gfs-ens-bc_10,gfs-ens-bc_11,gfs-ens-bc_12,gfs-ens-bc_13,gfs-ens-bc_14,cmc-ens_9,cmc-ens_10,cmc-ens_11,cmc-ens_12,...,error_11,error_12,error_13,error_14,day_8_error,Date,Date.year,Date.month,Date.day,Date.dayofweek
0,0.012,0.009,0.006,0.002,0.004,0.011,-0.008,-0.005,-0.001,-0.003,...,0.000,0.000,0.000,0.000,0.000,1531267200000000000,2018,7,11,2
1,0.001,0.001,0.004,0.009,0.011,0.013,-0.003,-0.002,-0.004,-0.008,...,0.000,0.000,0.001,-0.001,0.005,1531310400000000000,2018,7,11,2
2,0.010,0.007,0.011,0.013,0.011,0.008,-0.007,-0.011,-0.013,-0.011,...,0.001,0.000,-0.001,0.000,0.000,1531353600000000000,2018,7,12,3
3,0.006,0.009,0.010,0.009,0.009,0.009,-0.008,-0.009,-0.009,-0.010,...,0.000,0.001,0.001,-0.001,-0.002,1531396800000000000,2018,7,12,3
4,0.012,0.007,0.008,0.007,0.009,0.009,-0.007,-0.009,-0.009,-0.010,...,0.000,0.000,0.001,0.001,0.000,1531440000000000000,2018,7,13,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3477,1.370,0.961,0.180,0.107,0.123,0.015,-0.752,-0.071,-0.003,0.156,...,0.076,0.004,-0.137,-0.039,0.985,1684108800000000000,2023,5,15,0
3478,-1.058,-0.058,0.440,0.334,0.098,-0.206,-0.139,-0.626,-0.916,-0.714,...,-0.115,-0.153,-0.300,-0.210,-0.607,1684152000000000000,2023,5,15,0
3479,-0.018,-0.324,-0.084,0.108,0.380,0.401,0.330,-0.207,-0.039,0.085,...,0.590,0.388,0.165,-0.251,0.375,1684195200000000000,2023,5,16,1
3480,-0.719,-0.732,-0.481,-0.186,0.040,0.155,-0.360,-0.185,-0.309,-0.252,...,-0.003,-0.011,0.082,0.179,-0.624,1684238400000000000,2023,5,16,1


In [12]:
df = pd.concat([X, y], axis=1)

In [13]:
train_len = 0.8
train_data = TabularDataset(df[:int(len(df)*train_len)])
test_data = TabularDataset(df[int(len(df)*train_len):])

In [14]:
labels = ['ecmwf-eps_9', 'ecmwf-eps_10', 'ecmwf-eps_11', 'ecmwf-eps_12', 'ecmwf-eps_13',
          'ecmwf-eps_14']
save_path = 'models'

In [15]:
display(train_data.head())

Unnamed: 0,gfs-ens-bc_9,gfs-ens-bc_10,gfs-ens-bc_11,gfs-ens-bc_12,gfs-ens-bc_13,gfs-ens-bc_14,cmc-ens_9,cmc-ens_10,cmc-ens_11,cmc-ens_12,...,Date.year,Date.month,Date.day,Date.dayofweek,ecmwf-eps_9,ecmwf-eps_10,ecmwf-eps_11,ecmwf-eps_12,ecmwf-eps_13,ecmwf-eps_14
0,0.012,0.009,0.006,0.002,0.004,0.011,-0.008,-0.005,-0.001,-0.003,...,2018,7,11,2,0.0,0.002,0.001,0.0,0.0,0.0
1,0.001,0.001,0.004,0.009,0.011,0.013,-0.003,-0.002,-0.004,-0.008,...,2018,7,11,2,0.001,0.0,0.0,0.0,-0.001,0.001
2,0.01,0.007,0.011,0.013,0.011,0.008,-0.007,-0.011,-0.013,-0.011,...,2018,7,12,3,0.0,0.003,0.001,0.0,0.001,0.0
3,0.006,0.009,0.01,0.009,0.009,0.009,-0.008,-0.009,-0.009,-0.01,...,2018,7,12,3,-0.003,-0.001,0.001,0.001,0.0,0.0
4,0.012,0.007,0.008,0.007,0.009,0.009,-0.007,-0.009,-0.009,-0.01,...,2018,7,13,4,0.0,0.0,-0.002,-0.002,0.001,0.004


In [16]:
class MultilabelPredictor():
    """ Tabular Predictor for predicting multiple columns in table.
        Creates multiple TabularPredictor objects which you can also use individually.
        You can access the TabularPredictor for a particular label via: `multilabel_predictor.get_predictor(label_i)`

        Parameters
        ----------
        labels : List[str]
            The ith element of this list is the column (i.e. `label`) predicted by the ith TabularPredictor stored in this object.
        path : str, default = None
            Path to directory where models and intermediate outputs should be saved.
            If unspecified, a time-stamped folder called "AutogluonModels/ag-[TIMESTAMP]" will be created in the working directory to store all models.
            Note: To call `fit()` twice and save all results of each fit, you must specify different `path` locations or don't specify `path` at all.
            Otherwise files from first `fit()` will be overwritten by second `fit()`.
            Caution: when predicting many labels, this directory may grow large as it needs to store many TabularPredictors.
        problem_types : List[str], default = None
            The ith element is the `problem_type` for the ith TabularPredictor stored in this object.
        eval_metrics : List[str], default = None
            The ith element is the `eval_metric` for the ith TabularPredictor stored in this object.
        consider_labels_correlation : bool, default = True
            Whether the predictions of multiple labels should account for label correlations or predict each label independently of the others.
            If True, the ordering of `labels` may affect resulting accuracy as each label is predicted conditional on the previous labels appearing earlier in this list (i.e. in an auto-regressive fashion).
            Set to False if during inference you may want to individually use just the ith TabularPredictor without predicting all the other labels.
        kwargs :
            Arguments passed into the initialization of each TabularPredictor.

    """

    multi_predictor_file = 'multilabel_predictor.pkl'

    def __init__(self, labels, path=None, problem_types=None, eval_metrics=None, consider_labels_correlation=True, **kwargs):
        if len(labels) < 2:
            raise ValueError("MultilabelPredictor is only intended for predicting MULTIPLE labels (columns), use TabularPredictor for predicting one label (column).")
        if (problem_types is not None) and (len(problem_types) != len(labels)):
            raise ValueError("If provided, `problem_types` must have same length as `labels`")
        if (eval_metrics is not None) and (len(eval_metrics) != len(labels)):
            raise ValueError("If provided, `eval_metrics` must have same length as `labels`")
        self.path = setup_outputdir(path, warn_if_exist=False)
        self.labels = labels
        self.consider_labels_correlation = consider_labels_correlation
        self.predictors = {}  # key = label, value = TabularPredictor or str path to the TabularPredictor for this label
        if eval_metrics is None:
            self.eval_metrics = {}
        else:
            self.eval_metrics = {labels[i] : eval_metrics[i] for i in range(len(labels))}
        problem_type = None
        eval_metric = None
        for i in range(len(labels)):
            label = labels[i]
            path_i = self.path + "Predictor_" + label
            if problem_types is not None:
                problem_type = problem_types[i]
            if eval_metrics is not None:
                eval_metric = eval_metrics[i]
            self.predictors[label] = TabularPredictor(label=label, problem_type=problem_type, eval_metric=eval_metric, path=path_i, **kwargs)

    def fit(self, train_data, tuning_data=None, **kwargs):
        """ Fits a separate TabularPredictor to predict each of the labels.

            Parameters
            ----------
            train_data, tuning_data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                See documentation for `TabularPredictor.fit()`.
            kwargs :
                Arguments passed into the `fit()` call for each TabularPredictor.
        """
        if isinstance(train_data, str):
            train_data = TabularDataset(train_data)
        if tuning_data is not None and isinstance(tuning_data, str):
            tuning_data = TabularDataset(tuning_data)
        train_data_og = train_data.copy()
        if tuning_data is not None:
            tuning_data_og = tuning_data.copy()
        else:
            tuning_data_og = None
        save_metrics = len(self.eval_metrics) == 0
        for i in range(len(self.labels)):
            label = self.labels[i]
            predictor = self.get_predictor(label)
            if not self.consider_labels_correlation:
                labels_to_drop = [l for l in self.labels if l != label]
            else:
                labels_to_drop = [self.labels[j] for j in range(i+1, len(self.labels))]
            train_data = train_data_og.drop(labels_to_drop, axis=1)
            if tuning_data is not None:
                tuning_data = tuning_data_og.drop(labels_to_drop, axis=1)
            print(f"Fitting TabularPredictor for label: {label} ...")
            predictor.fit(train_data=train_data, tuning_data=tuning_data, **kwargs)
            self.predictors[label] = predictor.path
            if save_metrics:
                self.eval_metrics[label] = predictor.eval_metric
        self.save()

    def predict(self, data, **kwargs):
        """ Returns DataFrame with label columns containing predictions for each label.

            Parameters
            ----------
            data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                Data to make predictions for. If label columns are present in this data, they will be ignored. See documentation for `TabularPredictor.predict()`.
            kwargs :
                Arguments passed into the predict() call for each TabularPredictor.
        """
        return self._predict(data, as_proba=False, **kwargs)

    def predict_proba(self, data, **kwargs):
        """ Returns dict where each key is a label and the corresponding value is the `predict_proba()` output for just that label.

            Parameters
            ----------
            data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                Data to make predictions for. See documentation for `TabularPredictor.predict()` and `TabularPredictor.predict_proba()`.
            kwargs :
                Arguments passed into the `predict_proba()` call for each TabularPredictor (also passed into a `predict()` call).
        """
        return self._predict(data, as_proba=True, **kwargs)

    def evaluate(self, data, **kwargs):
        """ Returns dict where each key is a label and the corresponding value is the `evaluate()` output for just that label.

            Parameters
            ----------
            data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                Data to evalate predictions of all labels for, must contain all labels as columns. See documentation for `TabularPredictor.evaluate()`.
            kwargs :
                Arguments passed into the `evaluate()` call for each TabularPredictor (also passed into the `predict()` call).
        """
        data = self._get_data(data)
        eval_dict = {}
        for label in self.labels:
            print(f"Evaluating TabularPredictor for label: {label} ...")
            predictor = self.get_predictor(label)
            eval_dict[label] = predictor.evaluate(data, **kwargs)
            if self.consider_labels_correlation:
                data[label] = predictor.predict(data, **kwargs)
        return eval_dict

    def feature_imp(self,data, **kwargs):
        data = self._get_data(data)
        eval_dict = {}
        for label in self.labels:
            print(f"Evaluating feature importance for label: {label} ...")

    def save(self):
        """ Save MultilabelPredictor to disk. """
        for label in self.labels:
            if not isinstance(self.predictors[label], str):
                self.predictors[label] = self.predictors[label].path
        save_pkl.save(path=self.path+self.multi_predictor_file, object=self)
        print(f"MultilabelPredictor saved to disk. Load with: MultilabelPredictor.load('{self.path}')")

    @classmethod
    def load(cls, path):
        """ Load MultilabelPredictor from disk `path` previously specified when creating this MultilabelPredictor. """
        path = os.path.expanduser(path)
        if path[-1] != os.path.sep:
            path = path + os.path.sep
        return load_pkl.load(path=path+cls.multi_predictor_file)

    def get_predictor(self, label):
        """ Returns TabularPredictor which is used to predict this label. """
        predictor = self.predictors[label]
        if isinstance(predictor, str):
            return TabularPredictor.load(path=predictor)
        return predictor

    def _get_data(self, data):
        if isinstance(data, str):
            return TabularDataset(data)
        return data.copy()

    def _predict(self, data, as_proba=False, **kwargs):
        data = self._get_data(data)
        if as_proba:
            predproba_dict = {}
        for label in self.labels:
            print(f"Predicting with TabularPredictor for label: {label} ...")
            predictor = self.get_predictor(label)
            if as_proba:
                predproba_dict[label] = predictor.predict_proba(data, as_multiclass=True, **kwargs)
            data[label] = predictor.predict(data, **kwargs)
        if not as_proba:
            return data[self.labels]
        else:
            return predproba_dict

In [17]:
multi_predictor = MultilabelPredictor(labels=labels, path=save_path)



In [18]:
multi_predictor.fit(train_data) # add presets='best_quality' for better results, but longer runtime

Beginning AutoGluon training ...
AutoGluon will save models to "models/Predictor_ecmwf-eps_9/"
AutoGluon Version:  0.7.0
Python Version:     3.10.9
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 22.5.0: Mon Apr 24 20:53:44 PDT 2023; root:xnu-8796.121.2~5/RELEASE_ARM64_T8103
Train Data Rows:    2785
Train Data Columns: 26
Label Column: ecmwf-eps_9
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (7.132000000000005, -7.687000000000005, 0.01289, 1.61633)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    1450.04 M

Fitting TabularPredictor for label: ecmwf-eps_9 ...


	-0.7002	 = Validation score   (-root_mean_squared_error)
	2.54s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: CatBoost ...
	-0.6899	 = Validation score   (-root_mean_squared_error)
	1.21s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-0.6844	 = Validation score   (-root_mean_squared_error)
	0.54s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-0.7034	 = Validation score   (-root_mean_squared_error)
	1.77s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: XGBoost ...
	-0.6932	 = Validation score   (-root_mean_squared_error)
	1.25s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-0.6929	 = Validation score   (-root_mean_squared_error)
	6.81s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: LightGBMLarge ...
		`import lightgbm` failed. If you are using Mac OSX, Please try 'brew install libomp'. Detailed info: dlopen(/Users/

Fitting TabularPredictor for label: ecmwf-eps_10 ...


	-0.9032	 = Validation score   (-root_mean_squared_error)
	2.23s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: CatBoost ...
	-0.7988	 = Validation score   (-root_mean_squared_error)
	7.2s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-0.907	 = Validation score   (-root_mean_squared_error)
	0.57s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-0.7624	 = Validation score   (-root_mean_squared_error)
	1.58s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: XGBoost ...
	-0.8348	 = Validation score   (-root_mean_squared_error)
	2.38s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-0.7489	 = Validation score   (-root_mean_squared_error)
	3.91s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: LightGBMLarge ...
		`import lightgbm` failed. If you are using Mac OSX, Please try 'brew install libomp'. Detailed info: dlopen(/Users/n

Fitting TabularPredictor for label: ecmwf-eps_11 ...


	-0.8561	 = Validation score   (-root_mean_squared_error)
	2.74s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: CatBoost ...
	-0.7826	 = Validation score   (-root_mean_squared_error)
	3.63s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-0.8667	 = Validation score   (-root_mean_squared_error)
	0.57s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-0.6875	 = Validation score   (-root_mean_squared_error)
	1.7s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: XGBoost ...
	-0.8502	 = Validation score   (-root_mean_squared_error)
	1.45s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-0.6789	 = Validation score   (-root_mean_squared_error)
	4.75s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: LightGBMLarge ...
		`import lightgbm` failed. If you are using Mac OSX, Please try 'brew install libomp'. Detailed info: dlopen(/Users/n

Fitting TabularPredictor for label: ecmwf-eps_12 ...


	-0.7919	 = Validation score   (-root_mean_squared_error)
	2.39s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: CatBoost ...
	-0.7367	 = Validation score   (-root_mean_squared_error)
	1.44s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-0.7791	 = Validation score   (-root_mean_squared_error)
	0.58s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-0.6341	 = Validation score   (-root_mean_squared_error)
	1.72s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: XGBoost ...
	-0.7478	 = Validation score   (-root_mean_squared_error)
	2.06s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-0.681	 = Validation score   (-root_mean_squared_error)
	4.6s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: LightGBMLarge ...
		`import lightgbm` failed. If you are using Mac OSX, Please try 'brew install libomp'. Detailed info: dlopen(/Users/ni

Fitting TabularPredictor for label: ecmwf-eps_13 ...


	-0.7193	 = Validation score   (-root_mean_squared_error)
	2.5s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: CatBoost ...
	-0.6296	 = Validation score   (-root_mean_squared_error)
	3.8s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-0.7076	 = Validation score   (-root_mean_squared_error)
	0.61s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-0.5927	 = Validation score   (-root_mean_squared_error)
	1.63s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: XGBoost ...
	-0.6874	 = Validation score   (-root_mean_squared_error)
	2.41s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-0.5982	 = Validation score   (-root_mean_squared_error)
	3.94s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: LightGBMLarge ...
		`import lightgbm` failed. If you are using Mac OSX, Please try 'brew install libomp'. Detailed info: dlopen(/Users/n

Fitting TabularPredictor for label: ecmwf-eps_14 ...


	-0.6622	 = Validation score   (-root_mean_squared_error)
	2.72s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: CatBoost ...
	-0.6072	 = Validation score   (-root_mean_squared_error)
	3.87s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-0.6749	 = Validation score   (-root_mean_squared_error)
	0.63s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-0.5591	 = Validation score   (-root_mean_squared_error)
	1.65s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: XGBoost ...
	-0.6522	 = Validation score   (-root_mean_squared_error)
	2.24s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-0.6063	 = Validation score   (-root_mean_squared_error)
	3.57s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: LightGBMLarge ...
		`import lightgbm` failed. If you are using Mac OSX, Please try 'brew install libomp'. Detailed info: dlopen(/Users/

MultilabelPredictor saved to disk. Load with: MultilabelPredictor.load('models/')


In [19]:
multi_predictor = MultilabelPredictor.load(save_path)
test_data_nolab = test_data.drop(columns=labels)
test_data_nolab.head()

Unnamed: 0,gfs-ens-bc_9,gfs-ens-bc_10,gfs-ens-bc_11,gfs-ens-bc_12,gfs-ens-bc_13,gfs-ens-bc_14,cmc-ens_9,cmc-ens_10,cmc-ens_11,cmc-ens_12,...,error_11,error_12,error_13,error_14,day_8_error,Date,Date.year,Date.month,Date.day,Date.dayofweek
2785,-0.324,-0.276,-0.11,-0.182,-0.039,0.143,0.272,0.178,0.121,0.06,...,0.088,0.016,-0.049,-0.072,-0.137,1654214400000000000,2022,6,3,4
2786,-0.233,-0.238,-0.207,-0.074,0.055,0.106,-0.07,-0.17,-0.144,-0.096,...,0.026,0.021,0.002,0.007,-0.055,1654257600000000000,2022,6,3,4
2787,-0.164,-0.103,-0.039,0.044,0.033,0.025,0.201,-0.113,-0.27,-0.269,...,-0.085,-0.028,0.021,0.03,0.034,1654300800000000000,2022,6,4,5
2788,-0.224,-0.029,-0.032,-0.087,-0.065,0.02,-0.119,-0.236,-0.202,-0.056,...,0.036,0.072,0.047,0.033,-0.191,1654344000000000000,2022,6,4,5
2789,0.238,0.247,0.328,0.372,0.151,-0.013,-0.16,-0.389,-0.462,-0.334,...,-0.218,-0.228,-0.159,-0.049,0.387,1654387200000000000,2022,6,5,6


In [20]:
predictions = multi_predictor.predict(test_data_nolab)
print("Predictions:  \n", predictions)

Predicting with TabularPredictor for label: ecmwf-eps_9 ...
Predicting with TabularPredictor for label: ecmwf-eps_10 ...
Predicting with TabularPredictor for label: ecmwf-eps_11 ...
Predicting with TabularPredictor for label: ecmwf-eps_12 ...
Predicting with TabularPredictor for label: ecmwf-eps_13 ...
Predicting with TabularPredictor for label: ecmwf-eps_14 ...
Predictions:  
       ecmwf-eps_9  ecmwf-eps_10  ecmwf-eps_11  ecmwf-eps_12  ecmwf-eps_13  \
2785    -0.127101     -0.126792     -0.097127     -0.141238     -0.043186   
2786    -0.116464     -0.123571     -0.162110     -0.069101      0.040029   
2787     0.018806     -0.007055     -0.057805     -0.013736      0.015636   
2788    -0.188968     -0.071285     -0.120150     -0.131894     -0.028873   
2789     0.234517      0.059488      0.021974      0.073117      0.025593   
...           ...           ...           ...           ...           ...   
3477     1.353362      1.054642      0.258455      0.153893      0.098794   
347

In [21]:
display(predictions.iloc[-1])

ecmwf-eps_9     0.363347
ecmwf-eps_10    0.193944
ecmwf-eps_11    0.226445
ecmwf-eps_12    0.003179
ecmwf-eps_13   -0.181453
ecmwf-eps_14   -0.051098
Name: 3481, dtype: float32

In [22]:
test_data[labels].iloc[-1]

ecmwf-eps_9     0.482
ecmwf-eps_10    0.299
ecmwf-eps_11    0.191
ecmwf-eps_12   -0.095
ecmwf-eps_13   -0.359
ecmwf-eps_14   -0.209
Name: 3481, dtype: float64

In [23]:
evaluations = multi_predictor.evaluate(test_data)
#print(evaluations)
print("Evaluated using metrics:", multi_predictor.eval_metrics)

Evaluation: root_mean_squared_error on test data: -0.6090736574597795
	Note: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.
Evaluations on test data:
{
    "root_mean_squared_error": -0.6090736574597795,
    "mean_squared_error": -0.3709707202114328,
    "mean_absolute_error": -0.3902446393561383,
    "r2": 0.8494874320546517,
    "pearsonr": 0.9220851690553549,
    "median_absolute_error": -0.22452440834045362
}


Evaluating TabularPredictor for label: ecmwf-eps_9 ...
Evaluating TabularPredictor for label: ecmwf-eps_10 ...


Evaluation: root_mean_squared_error on test data: -0.8548174638655262
	Note: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.
Evaluations on test data:
{
    "root_mean_squared_error": -0.8548174638655262,
    "mean_squared_error": -0.7307128965294901,
    "mean_absolute_error": -0.5851445703134903,
    "r2": 0.6747954353689847,
    "pearsonr": 0.8221657400739759,
    "median_absolute_error": -0.3247323484420761
}
Evaluation: root_mean_squared_error on test data: -1.0436577367552093
	Note: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.
Evaluations on test data:
{
    "root_mean_squared_error": -1.0436577367552093,
    "mean_squared_error": -1.0892214714890056,
    "mean_absolute_error": -0.7145156305984719,
    "r2": 0.4148432653122116,
    "pearsonr": 0.6689676080239068,
    "median_absolute_error": -0.4271272525787353
}
Evaluation: root_mean_squared_error on test data: -1.062

Evaluating TabularPredictor for label: ecmwf-eps_11 ...
Evaluating TabularPredictor for label: ecmwf-eps_12 ...


Evaluation: root_mean_squared_error on test data: -0.9978911183051414
	Note: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.
Evaluations on test data:
{
    "root_mean_squared_error": -0.9978911183051414,
    "mean_squared_error": -0.9957866839922858,
    "mean_absolute_error": -0.6772496321475404,
    "r2": 0.21626515466658291,
    "pearsonr": 0.5097039678551379,
    "median_absolute_error": -0.3883330336809152
}


Evaluating TabularPredictor for label: ecmwf-eps_13 ...
Evaluating TabularPredictor for label: ecmwf-eps_14 ...


Evaluation: root_mean_squared_error on test data: -0.9403125556890998
	Note: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.
Evaluations on test data:
{
    "root_mean_squared_error": -0.9403125556890998,
    "mean_squared_error": -0.8841877023865663,
    "mean_absolute_error": -0.6377953638311588,
    "r2": 0.18864241275200255,
    "pearsonr": 0.46767044187488943,
    "median_absolute_error": -0.37786263179778956
}


Evaluated using metrics: {'ecmwf-eps_9': root_mean_squared_error, 'ecmwf-eps_10': root_mean_squared_error, 'ecmwf-eps_11': root_mean_squared_error, 'ecmwf-eps_12': root_mean_squared_error, 'ecmwf-eps_13': root_mean_squared_error, 'ecmwf-eps_14': root_mean_squared_error}


In [24]:
evaluations

{'ecmwf-eps_9': {'root_mean_squared_error': -0.6090736574597795,
  'mean_squared_error': -0.3709707202114328,
  'mean_absolute_error': -0.3902446393561383,
  'r2': 0.8494874320546517,
  'pearsonr': 0.9220851690553549,
  'median_absolute_error': -0.22452440834045362},
 'ecmwf-eps_10': {'root_mean_squared_error': -0.8548174638655262,
  'mean_squared_error': -0.7307128965294901,
  'mean_absolute_error': -0.5851445703134903,
  'r2': 0.6747954353689847,
  'pearsonr': 0.8221657400739759,
  'median_absolute_error': -0.3247323484420761},
 'ecmwf-eps_11': {'root_mean_squared_error': -1.0436577367552093,
  'mean_squared_error': -1.0892214714890056,
  'mean_absolute_error': -0.7145156305984719,
  'r2': 0.4148432653122116,
  'pearsonr': 0.6689676080239068,
  'median_absolute_error': -0.4271272525787353},
 'ecmwf-eps_12': {'root_mean_squared_error': -1.0621893798149786,
  'mean_squared_error': -1.128246278591729,
  'mean_absolute_error': -0.7213494947236219,
  'r2': 0.27029332647959514,
  'pearsonr

In [25]:
predictor_class = multi_predictor.get_predictor('ecmwf-eps_13')
predictor_class.leaderboard(silent=True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.573329,0.011542,9.501996,0.000238,0.128922,2,True,9
1,NeuralNetFastAI,-0.592686,0.006101,1.628658,0.006101,1.628658,1,True,6
2,NeuralNetTorch,-0.598182,0.003432,3.944523,0.003432,3.944523,1,True,8
3,CatBoost,-0.629614,0.001771,3.799893,0.001771,3.799893,1,True,4
4,XGBoost,-0.687358,0.005066,2.407276,0.005066,2.407276,1,True,7
5,ExtraTreesMSE,-0.707621,0.026708,0.606895,0.026708,0.606895,1,True,5
6,RandomForestMSE,-0.719315,0.027498,2.504079,0.027498,2.504079,1,True,3
7,KNeighborsUnif,-1.262602,0.005954,0.005703,0.005954,0.005703,1,True,1
8,KNeighborsDist,-1.379897,0.008375,0.006608,0.008375,0.006608,1,True,2


feature importance

In [78]:
day = 2
df = train_data.iloc[:, :-5+day]

In [79]:
df.head()

Unnamed: 0,gfs-ens-bc_9,gfs-ens-bc_10,gfs-ens-bc_11,gfs-ens-bc_12,gfs-ens-bc_13,gfs-ens-bc_14,cmc-ens_9,cmc-ens_10,cmc-ens_11,cmc-ens_12,...,error_14,day_8_error,Date,Date.year,Date.month,Date.day,Date.dayofweek,ecmwf-eps_9,ecmwf-eps_10,ecmwf-eps_11
0,0.012,0.009,0.006,0.002,0.004,0.011,-0.008,-0.005,-0.001,-0.003,...,0.0,0.0,1531267200000000000,2018,7,11,2,0.0,0.002,0.001
1,0.001,0.001,0.004,0.009,0.011,0.013,-0.003,-0.002,-0.004,-0.008,...,-0.001,0.005,1531310400000000000,2018,7,11,2,0.001,0.0,0.0
2,0.01,0.007,0.011,0.013,0.011,0.008,-0.007,-0.011,-0.013,-0.011,...,0.0,0.0,1531353600000000000,2018,7,12,3,0.0,0.003,0.001
3,0.006,0.009,0.01,0.009,0.009,0.009,-0.008,-0.009,-0.009,-0.01,...,-0.001,-0.002,1531396800000000000,2018,7,12,3,-0.003,-0.001,0.001
4,0.012,0.007,0.008,0.007,0.009,0.009,-0.007,-0.009,-0.009,-0.01,...,0.001,0.0,1531440000000000000,2018,7,13,4,0.0,0.0,-0.002


In [80]:
label = f"ecmwf-eps_{day+9}"

In [81]:
predictor = TabularPredictor(label=label).fit(df)

No path specified. Models will be saved in: "AutogluonModels/ag-20230607_190841/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230607_190841/"
AutoGluon Version:  0.7.0
Python Version:     3.10.9
Operating System:   Darwin
Platform Machine:   x86_64
Platform Version:   Darwin Kernel Version 22.5.0: Mon Apr 24 20:53:44 PDT 2023; root:xnu-8796.121.2~5/RELEASE_ARM64_T8103
Train Data Rows:    2785
Train Data Columns: 28
Label Column: ecmwf-eps_11
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (7.802, -7.863000000000003, 0.02167, 1.37127)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Using Feature Generators to preprocess the data ...
Fitting Auto

In [82]:
shap_values = predictor.feature_importance(test_data)

These features in provided data are not utilized by the predictor and will be ignored: ['ecmwf-eps_12', 'ecmwf-eps_13', 'ecmwf-eps_14']
Computing feature importance via permutation shuffling for 28 features using 697 rows with 5 shuffle sets...
	3.12s	= Expected runtime (0.62s per shuffle set)
	1.08s	= Actual runtime (Completed 5 of 5 shuffle sets)


In [83]:
shap_values

Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
ecmwf-eps_10,1.360374,0.021828,7.95113e-09,5,1.405318,1.315431
gfs-ens-bc_11,1.028955,0.02247,2.72729e-08,5,1.075221,0.982689
gfs-ens-bc_10,0.993484,0.034132,1.669213e-07,5,1.063762,0.923205
ecmwf-eps_9,0.294865,0.01934,2.207935e-06,5,0.334686,0.255045
cmc-ens_11,0.165934,0.006879,3.535929e-07,5,0.180097,0.15177
gfs-ens-bc_9,0.076887,0.009178,2.390719e-05,5,0.095785,0.05799
gfs-ens-bc_13,0.05292,0.001326,4.732346e-08,5,0.055652,0.050189
gfs-ens-bc_12,0.047426,0.006449,4.004073e-05,5,0.060705,0.034147
cmc-ens_9,0.018814,0.002782,5.574089e-05,5,0.024542,0.013085
gfs-ens-bc_14,0.014203,0.002489,0.0001087266,5,0.019329,0.009078
