In [None]:
import os

import pandas
from tqdm import tqdm

In [None]:
figure_directory = "./figures"
os.makedirs(figure_directory, exist_ok=True)

### Data Ingestion

#### Annual Electricity Demand

In [None]:
electricity_annual_demand_folder = "./data/annual_electricity_demand/"

In [None]:
# os.listdir returns entries in arbitrary order. Sorting the names makes the
# order in which the files are concatenated, and therefore every
# order-dependent step that follows (e.g. drop_duplicates keeping the first
# occurrence), reproducible across machines and runs.
ed_annual_files = sorted(
    file_name
    for file_name in os.listdir(electricity_annual_demand_folder)
    if file_name.endswith(".parquet")
)

In [None]:
ed_annual_files[:5]

In [None]:
# Read every annual-demand file, resample it to hourly means and tag it with
# its region code. The per-file frames are collected in a list and
# concatenated once: calling pandas.concat inside the loop re-copies all rows
# accumulated so far on every iteration, which is quadratic in the number of
# files.
annual_demand_frames = []

for file_name in tqdm(ed_annual_files):
    df_current = pandas.read_parquet(
        os.path.join(electricity_annual_demand_folder, file_name)
    )

    # Hourly bins, closed and labelled on their right edge
    df_current = df_current.resample(
        "1h", label="right", closed="right"
    ).mean()

    # Add a column for the region name (file name up to the first ".")
    df_current["region_code"] = file_name.split(".")[0]

    # Reset index to move "Time (UTC)" to a column
    df_current = df_current.reset_index()

    annual_demand_frames.append(df_current)

# pandas.concat raises on an empty list, so fall back to an empty frame
df_annual_demand = (
    pandas.concat(annual_demand_frames, ignore_index=True)
    if annual_demand_frames
    else pandas.DataFrame()
)

In [None]:
print(df_annual_demand.shape)
df_annual_demand.head()

#### Temperature Data

In [None]:
temperature_folder = "./data/temperature/"

In [None]:
# os.listdir returns entries in arbitrary order; sort the names so the
# temperature files are always loaded and concatenated in the same order.
temperature_files = sorted(
    file_name
    for file_name in os.listdir(temperature_folder)
    if file_name.endswith(".parquet")
)

In [None]:
temperature_files[:5]

In [None]:
# Load every temperature file and tag it with its region code. Frames are
# gathered in a list and concatenated once instead of growing the result with
# pandas.concat on every iteration (which copies all previous rows each time).
temperature_frames = []

for file_name in tqdm(temperature_files):
    df_current = pandas.read_parquet(
        os.path.join(temperature_folder, file_name)
    )

    # Add a column for the region name (file name up to "_temp")
    df_current["region_code"] = file_name.split("_temp")[0]

    # Reset index to move "Time (UTC)" to a column
    df_current = df_current.reset_index()

    temperature_frames.append(df_current)

# pandas.concat raises on an empty list, so fall back to an empty frame
df_all_temperature = (
    pandas.concat(temperature_frames, ignore_index=True)
    if temperature_frames
    else pandas.DataFrame()
)

In [None]:
print(df_all_temperature.shape)
df_all_temperature.head()

#### GDP Data

In [None]:
import pickle

# Read in gdp data
# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so this file must come from a trusted source.
with open("./data/gdp_data.pkl", "rb") as f:
    gdp_data = pickle.load(f)

In [None]:
# Flatten the nested {country_code: {region_code: GDP series}} mapping into
# one long table with a row per (region, year). The per-region frames are
# collected and concatenated once rather than re-concatenating the growing
# result inside the nested loop.
gdp_frames = []

for country_code, region_dictionary in gdp_data.items():
    for region_code, gdp_series in region_dictionary.items():
        # The series index becomes the "year" column, its values the "GDP"
        df_current = pandas.DataFrame(gdp_series).reset_index()
        df_current.columns = ["year", "GDP"]
        df_current["year"] = df_current["year"].astype(int)

        df_current["region_code"] = region_code
        df_current["country_code"] = country_code

        gdp_frames.append(df_current)

# pandas.concat raises on an empty list, so fall back to an empty frame
df_gdp_data = (
    pandas.concat(gdp_frames, ignore_index=True)
    if gdp_frames
    else pandas.DataFrame()
)

In [None]:
print(df_gdp_data.shape)
df_gdp_data.head()

#### Electricity Demand Data

In [None]:
electricity_demand_folder = "./data/electricity_demand/"

In [None]:
# os.listdir returns entries in arbitrary order; sort the names so the demand
# files are always loaded and concatenated in the same order.
demand_files = sorted(
    file_name
    for file_name in os.listdir(electricity_demand_folder)
    if file_name.endswith(".parquet")
)

In [None]:
# Load every demand file, resample the load to hourly means and tag it with
# its region code. Frames are gathered in a list and concatenated once instead
# of calling pandas.concat on the growing result in every iteration.
demand_frames = []

for file_name in tqdm(demand_files):
    df_current = pandas.read_parquet(
        os.path.join(electricity_demand_folder, file_name)
    )

    # Make sure the load is numeric before averaging
    df_current["Load (MW)"] = df_current["Load (MW)"].astype(float)

    # Hourly bins, closed and labelled on their right edge
    df_current = df_current.resample(
        "1h", label="right", closed="right"
    ).mean()

    # Add a column for the region name: the file name without its last
    # "_"-separated part
    df_current["region_code"] = "_".join(file_name.split("_")[:-1])

    # Reset index to move "Time (UTC)" to a column
    df_current = df_current.reset_index()

    demand_frames.append(df_current)

# pandas.concat raises on an empty list, so fall back to an empty frame
df_demand = (
    pandas.concat(demand_frames, ignore_index=True)
    if demand_frames
    else pandas.DataFrame()
)

In [None]:
print(df_demand.shape)
df_demand.head()

#### Combine all datasets

In [None]:
# Order each of the three time-indexed tables chronologically before merging
df_annual_demand, df_all_temperature, df_demand = (
    frame.sort_values(by="Time (UTC)")
    for frame in (df_annual_demand, df_all_temperature, df_demand)
)

In [None]:
# Inner join: keep only the (timestamp, region) pairs present in both the
# temperature table and the annual-demand table
combined_dataset = df_all_temperature.merge(
    df_annual_demand,
    how="inner",
    on=["Time (UTC)", "region_code"],
)

In [None]:
print(combined_dataset.shape)
combined_dataset.head()

In [None]:
# Inner join with the hourly load: rows without a matching
# (timestamp, region) load record are dropped
combined_dataset = combined_dataset.merge(
    df_demand,
    how="inner",
    on=["Time (UTC)", "region_code"],
)

In [None]:
print(combined_dataset.shape)
combined_dataset.head()

In [None]:
# Attach the yearly GDP of each region, matching the local year of every row
# against the GDP year. The country code is not carried over.
gdp_without_country = df_gdp_data.drop(columns=["country_code"])
combined_dataset = combined_dataset.merge(
    gdp_without_country,
    how="inner",
    left_on=["Local year", "region_code"],
    right_on=["year", "region_code"],
)

In [None]:
print(combined_dataset.shape)
combined_dataset.head()

In [None]:
combined_dataset = combined_dataset.drop(columns=["year"])

In [None]:
# Two rows count as duplicates when they agree on every column except the
# load; the first occurrence is kept.
row_count = len(combined_dataset)
print("Before removing duplicates:", row_count)

duplicate_key_columns = combined_dataset.columns[
    combined_dataset.columns != "Load (MW)"
].tolist()
combined_dataset = combined_dataset.drop_duplicates(
    subset=duplicate_key_columns
)

rows_after = len(combined_dataset)
print("Without duplicates: ", rows_after)
print("Difference", row_count - rows_after)

In [None]:
# Drop every row that has a missing value in any column and report how many
# rows were removed.
row_count = len(combined_dataset)
print("Before removing NaN values:", row_count)
combined_dataset = combined_dataset.dropna()
# This step removes NaN rows, so the message must not say "duplicates"
print("Without NaN values: ", len(combined_dataset))
print("Difference", row_count - len(combined_dataset))

In [None]:
# Map the descriptive source column names to short snake_case identifiers
column_renames = {
    # Time-related columns
    "Time (UTC)": "time_utc",
    "Local hour of the day": "local_hour",
    "Local weekend indicator": "is_weekend",
    "Local month of the year": "local_month",
    "Local year": "local_year",
    # Temperature columns
    "Temperature - Top 1 (K)": "year_temp_top1",
    "Temperature - Top 3 (K)": "year_temp_top3",
    "Monthly average temperature - Top 1 (K)": "monthly_temp_avg_top1",
    "Monthly average temperature rank - Top 1": "monthly_temp_avg_rank_top1",
    "Annual average temperature - Top 1 (K)": "year_temp_avg_top1",
    "5 percentile temperature - Top 1 (K)": "year_temp_percentile_5",
    "95 percentile temperature - Top 1 (K)": "year_temp_percentile_95",
    # Demand and GDP columns
    "Annual electricity demand (TWh)": "year_electricity_demand",
    "Annual electricity demand per capita (MWh)": "year_electricity_demand_per_capita_mwh",
    "Load (MW)": "load_mw",
    "GDP": "year_gdp_ppp",
}

combined_dataset = combined_dataset.rename(columns=column_renames)

In [None]:
# Scale the annual demand by 1e6 into a new column and retire the original.
# NOTE(review): the source column is in TWh, and TWh x 1e6 is MWh (energy),
# although the new column name ends in "_mw" - confirm the intended unit.
annual_demand_twh = combined_dataset["year_electricity_demand"]
combined_dataset["year_electricity_demand_mw"] = annual_demand_twh * 1_000_000
combined_dataset = combined_dataset.drop(columns="year_electricity_demand")

In [None]:
# Count how many rows are available for every (region, local year) pair
rows_by_region_year = combined_dataset.groupby(["region_code", "local_year"])
list_amount_hours_region = [
    [region_code, local_year, len(rows)]
    for (region_code, local_year), rows in rows_by_region_year
]

df_amount_hours_region = pandas.DataFrame(
    data=list_amount_hours_region,
    columns=["region_code", "local_year", "count_available_hours"],
)

In [None]:
df_amount_hours_region["count_available_hours"].hist(bins=50)

In [None]:
combined_dataset.head()

In [None]:
combined_dataset.to_parquet(
    "./data/combined_dataset.parquet", engine="pyarrow"
)

In [None]:
combined_dataset = pandas.read_parquet(
    "./data/combined_dataset.parquet", engine="pyarrow"
)

### Split into train, test, and validation datasets

In [None]:
# Per region, hold out the most recent local year as the test set and the
# year before it as the validation set. The original index labels of the
# held-out rows are recorded so they can be removed from the training data.
#
# The per-region slices are collected and concatenated once: concatenating
# onto the growing result inside the loop copies all previous rows on every
# iteration.
test_set_indices = []
validation_set_indices = []
test_frames = []
validation_frames = []

for name, group in combined_dataset.groupby("region_code"):
    max_year = group["local_year"].max()

    group_test_set = group[group["local_year"] == max_year].copy()
    test_set_indices.append(group_test_set.index)
    test_frames.append(group_test_set)

    # Empty when the region has no rows for the year before its last one
    group_val_set = group[group["local_year"] == max_year - 1].copy()
    validation_set_indices.append(group_val_set.index)
    validation_frames.append(group_val_set)

# ignore_index=True gives both sets a fresh 0..n-1 index.
# pandas.concat raises on an empty list, so fall back to an empty frame.
test_set = (
    pandas.concat(test_frames, ignore_index=True)
    if test_frames
    else pandas.DataFrame()
)
validation_set = (
    pandas.concat(validation_frames, ignore_index=True)
    if validation_frames
    else pandas.DataFrame()
)

In [None]:
len(test_set) / len(combined_dataset)

In [None]:
len(validation_set) / len(combined_dataset)

In [None]:
# Flatten the per-region index objects into one flat list of row labels each
all_test_set_indices = []
for region_index in test_set_indices:
    all_test_set_indices.extend(region_index)

all_val_set_indices = []
for region_index in validation_set_indices:
    all_val_set_indices.extend(region_index)

In [None]:
# The training set is whatever remains of the combined dataset once the test
# rows and the validation rows have been removed (by index label)
print(len(combined_dataset))
train_set = (
    combined_dataset
    .drop(index=all_test_set_indices)
    .drop(index=all_val_set_indices)
)
print(len(train_set))

In [None]:
train_set.head()

In [None]:
def prepare_data(dataset: pandas.DataFrame):
    """
    Split a dataset into model inputs, target, and region labels.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the feature columns selected below as well as
        the ``load_mw`` and ``region_code`` columns.

    Returns
    -------
    features : pandas.DataFrame
        Copy of the feature columns; the calendar and rank columns are
        converted to the ``category`` dtype.
    target : pandas.Series
        Copy of the ``load_mw`` column.
    groups : pandas.Series
        Copy of the ``region_code`` column.
    """
    # Order matters: this is the column order the model is trained with
    feature_columns = [
        "local_hour",
        "is_weekend",
        "local_month",
        "year_temp_top1",
        "year_temp_top3",
        "monthly_temp_avg_top1",
        "monthly_temp_avg_rank_top1",
        "year_temp_avg_top1",
        "year_temp_percentile_5",
        "year_temp_percentile_95",
        "year_electricity_demand_per_capita_mwh",
        "year_gdp_ppp",
    ]
    # Discrete columns that are treated as categories rather than numbers
    category_columns = (
        "local_hour",
        "is_weekend",
        "local_month",
        "monthly_temp_avg_rank_top1",
    )

    features = dataset.loc[:, feature_columns].astype(
        {column: "category" for column in category_columns}
    )

    return (
        features,
        dataset["load_mw"].copy(),
        dataset["region_code"].copy(),
    )

In [None]:
train_features, train_target, train_groups = prepare_data(train_set)
val_features, val_target, val_groups = prepare_data(validation_set)

test_features, test_target, test_groups = prepare_data(test_set)

### Scale the target variable

In [None]:
import joblib
from sklearn.preprocessing import StandardScaler

In [None]:
train_target_scaler = StandardScaler()
train_target_scaler.fit(train_target.to_numpy().reshape(-1, 1))

In [None]:
train_target_scaler.mean_

In [None]:
train_target_scaled = train_target_scaler.transform(
    train_target.to_numpy().reshape(-1, 1)
)

In [None]:
# Save the scaler for use in future predictions
joblib.dump(train_target_scaler, "./data/target_scaler.bin")

### Training

In [None]:
from sklearn.metrics import mean_absolute_percentage_error
from xgboost import XGBRegressor

In [None]:
# Initialize the XGBoost regressor
xgb_model = XGBRegressor(
    random_state=42,
    enable_categorical=True,
    eval_metric=mean_absolute_percentage_error,
)

In [None]:
# Train the model
xgb_model.fit(
    train_features, train_target, eval_set=[(val_features, val_target)]
)

In [None]:
xgb_model.save_model("./data/xgboost_model.bin")

In [None]:
# Initialize the XGBoost regressor for the scaled target variable
xgb_model_scaled = XGBRegressor(
    random_state=42,
    enable_categorical=True,
    eval_metric=mean_absolute_percentage_error,
)

In [None]:
# Train the model on the standardized target.
# Only the target is scaled, never the features. The evaluation target must
# go through the same scaler as the training target; otherwise the logged
# validation metric compares standardized predictions with raw load values.
val_target_scaled = train_target_scaler.transform(
    val_target.to_numpy().reshape(-1, 1)
)

xgb_model_scaled.fit(
    train_features,
    train_target_scaled,
    eval_set=[(val_features, val_target_scaled)],
)

In [None]:
xgb_model_scaled.save_model("./data/xgboost_model_scaled.bin")

#### Prediction on test set

In [None]:
def calculate_test_mape(
    test_set, test_predictions, test_target, message: str = ""
) -> pandas.DataFrame:
    """
    Calculate the mean absolute percentage error per year and region.

    The rows of ``test_set`` are grouped by ``local_year`` and
    ``region_code``; for every group the error between the true target
    values and the predictions is computed. The result is also written to
    ``data/test_mae_values<message>.parquet`` and ``.csv``.

    Parameters
    ----------
    test_set : pandas.DataFrame
        Frame with ``local_year`` and ``region_code`` columns. Its index
        labels are used as positions into ``test_predictions`` and
        ``test_target``, so it must carry a default 0..n-1 index whose rows
        line up with both.
    test_predictions : numpy.ndarray
        Model predictions, one entry per row of ``test_set``.
    test_target : pandas.Series
        True target values, one entry per row of ``test_set``.
    message : str, optional
        Suffix appended to the output file names.

    Returns
    -------
    pandas.DataFrame
        A DataFrame with the region codes, years,
        and mean absolute percentage errors for the test set.
    """
    list_test_mape_values = []
    for (year, region_code), group in test_set.groupby(
        ["local_year", "region_code"]
    ):
        # The signature is (y_true, y_pred) and the error is normalised by
        # y_true, so the true values must be the first argument.
        current_mape = mean_absolute_percentage_error(
            test_target.iloc[group.index], test_predictions[group.index]
        )

        list_test_mape_values.append([region_code, year, current_mape])

    df_test_mape_values = pandas.DataFrame(
        list_test_mape_values, columns=["region_code", "year", "mape"]
    )

    # File names are kept unchanged ("mae") so existing consumers still work
    df_test_mape_values.to_parquet(
        "data/test_mae_values" + message + ".parquet", engine="pyarrow"
    )
    df_test_mape_values.to_csv("data/test_mae_values" + message + ".csv")

    return df_test_mape_values

In [None]:
# Predict on test set and calculate mae
test_predictions = xgb_model.predict(test_features)
calculate_test_mape(test_set, test_predictions, test_target)

In [None]:
# Predict on scaled test set and calculate mae
test_predictions_scaled = xgb_model_scaled.predict(test_features)
test_predictions_scaled = train_target_scaler.inverse_transform(
    test_predictions_scaled.reshape(-1, 1)
)
calculate_test_mape(test_set, test_predictions_scaled, test_target, "_scaled")

In [None]:
train_predictions = xgb_model.predict(train_features)

In [None]:
calculate_test_mape(
    train_set.reset_index(), train_predictions, train_target, "_train"
)

#### Cross validate

In [None]:
from sklearn.model_selection import LeaveOneGroupOut, cross_validate

In [None]:
# Perform cross-validation
# LeaveOneGroupOut creates one fold per distinct value in train_groups (the
# region codes returned by prepare_data), so every fold is scored on a region
# that was left out of fitting.
# return_indices and return_estimator keep the per-fold row positions and the
# fitted models in cv_results; the indices are used by the cells below.
cv_results = cross_validate(
    xgb_model,
    train_features,
    train_target,
    groups=train_groups,
    cv=LeaveOneGroupOut(),
    scoring=["neg_mean_absolute_percentage_error"],
    return_train_score=True,
    return_indices=True,
    return_estimator=True,
    n_jobs=1,
)

In [None]:
cv_results.keys()

In [None]:
cv_results["indices_train"] = cv_results["indices"]["train"]
cv_results["indices_test"] = cv_results["indices"]["test"]

In [None]:
# The nested "indices" entry is a dict and cannot become a column, so it is
# left out when the results are turned into a table
cv_results_filtered = {
    key: value for key, value in cv_results.items() if key != "indices"
}

# One row per cross-validation fold
df_cv_results = pandas.DataFrame(cv_results_filtered)

# The scorer reports the negated error; flip the sign to get the MAPE itself
for split in ("test", "train"):
    df_cv_results[f"{split}_mape"] = -df_cv_results[
        f"{split}_neg_mean_absolute_percentage_error"
    ]

In [None]:
df_cv_results.head()

In [None]:
# Label every fold with the group of its first held-out row
list_test_group_id = [
    train_groups.iloc[fold_test_indices[0]]
    for fold_test_indices in cv_results["indices"]["test"]
]

df_cv_results["group_id"] = list_test_group_id

In [None]:
df_cv_results.head()

In [None]:
df_cv_output = df_cv_results[
    ["group_id", "train_mape", "test_mape", "fit_time", "score_time"]
]

In [None]:
df_cv_output.to_parquet("./data/cv_results.parquet", engine="pyarrow")
df_cv_output.to_csv("./data/cv_results.csv")

### Synthetic data for all collected data

In [None]:
entire_dataset = pandas.read_parquet(
    "./data/combined_dataset.parquet", engine="pyarrow"
)

In [None]:
# Re-create the estimator with the same categorical setting that was used for
# training (enable_categorical=True) before loading the saved booster, so
# that feature frames with category-typed columns are accepted at prediction
# time.
trained_xgb_model = XGBRegressor(enable_categorical=True)
trained_xgb_model.load_model("./data/xgboost_model.bin")

In [None]:
entire_dataset.head()

In [None]:
input_features_columns = trained_xgb_model.feature_names_in_

In [None]:
# Build the inputs with the same preprocessing that was used for training:
# prepare_data converts the calendar and rank columns to the category dtype.
# Passing the raw integer columns instead would hand the model values in a
# different encoding from the category codes it was fitted on.
input_features = prepare_data(entire_dataset)[0][input_features_columns]

In [None]:
predictions = trained_xgb_model.predict(input_features)

In [None]:
synthetic_dataset = entire_dataset.drop(columns=input_features_columns)

In [None]:
synthetic_dataset["predictions"] = predictions

In [None]:
synthetic_dataset.head()

In [None]:
synthetic_dataset = synthetic_dataset.drop(
    columns=[
        "local_year",
        "load_mw",
        "year_electricity_demand_mw",
    ]
)

In [None]:
synthetic_dataset.head()

In [None]:
synthetic_dataset.to_parquet(
    "./data/synthetic_dataset.parquet", engine="pyarrow"
)