In [None]:
import os

import pandas
from tqdm import tqdm

### Data Ingestion

#### Annual Electricity Demand

In [None]:
# Folder holding the annual electricity demand parquet files.
# The trailing slash matters: file paths are built below by plain string
# concatenation (folder + file_name).
electricity_annual_demand_folder = "./data/annual_electricity_demand/"

In [None]:
# Collect the annual-demand parquet files. os.listdir returns entries in
# arbitrary order, so the names are sorted to make the ingestion order (and
# therefore the row order of the stacked frame) reproducible across machines.
ed_annual_files = sorted(
    file_name
    for file_name in os.listdir(electricity_annual_demand_folder)
    if file_name.endswith(".parquet")
)

In [None]:
ed_annual_files[:5]

In [None]:
# Load every annual-demand file, aggregate it to hourly means and stack the
# results into one frame. The per-file frames are collected in a list and
# concatenated once at the end: calling pandas.concat inside the loop
# re-copies all rows accumulated so far on every iteration.
annual_demand_frames = []

for file_name in tqdm(ed_annual_files):
    df_current = pandas.read_parquet(
        electricity_annual_demand_folder + file_name
    )

    # Hourly bins that are closed and labelled on their right edge: each
    # timestamp holds the mean of the hour ending at that timestamp.
    df_current = df_current.resample(
        "1h", label="right", closed="right"
    ).mean()

    # Add a column for the region name (file name up to the first ".")
    df_current["region_code"] = file_name.split(".")[0]

    # Reset index to move "Time (UTC)" to a column
    df_current = df_current.reset_index()

    annual_demand_frames.append(df_current)

# pandas.concat raises on an empty list, so fall back to an empty frame
# when no files were found.
df_annual_demand = (
    pandas.concat(annual_demand_frames, ignore_index=True)
    if annual_demand_frames
    else pandas.DataFrame()
)

In [None]:
print(df_annual_demand.shape)
df_annual_demand.head()

#### Temperature Data

In [None]:
# Folder holding the temperature parquet files.
# The trailing slash matters: file paths are built below by plain string
# concatenation (folder + file_name).
temperature_folder = "./data/temperature/"

In [None]:
# Collect the temperature parquet files. os.listdir returns entries in
# arbitrary order, so the names are sorted to make the ingestion order
# reproducible.
temperature_files = sorted(
    file_name
    for file_name in os.listdir(temperature_folder)
    if file_name.endswith(".parquet")
)

In [None]:
temperature_files[:5]

In [None]:
# Load every temperature file and stack the results into one frame. Frames
# are collected in a list and concatenated once, because pandas.concat inside
# the loop re-copies all previously accumulated rows on every iteration.
temperature_frames = []

for file_name in tqdm(temperature_files):
    df_current = pandas.read_parquet(temperature_folder + file_name)

    # Add a column for the region name (file name up to the "_temp" marker)
    df_current["region_code"] = file_name.split("_temp")[0]

    # Reset index to move "Time (UTC)" to a column
    df_current = df_current.reset_index()

    temperature_frames.append(df_current)

# pandas.concat raises on an empty list, so fall back to an empty frame
# when no files were found.
df_all_temperature = (
    pandas.concat(temperature_frames, ignore_index=True)
    if temperature_frames
    else pandas.DataFrame()
)

In [None]:
print(df_all_temperature.shape)
df_all_temperature.head()

#### GDP Data

In [None]:
import pickle

# Read in gdp data
# NOTE(security): pickle.load can execute arbitrary code while deserializing,
# so this file must only ever come from a trusted source.
# The loaded object is iterated below as a nested mapping:
# country code -> region code -> GDP series.
with open("./data/gdp_data.pkl", "rb") as f:
    gdp_data = pickle.load(f)

In [None]:
# Flatten the nested GDP mapping (country -> region -> series) into one long
# frame with columns year, GDP, region_code, country_code. The per-region
# frames are collected in a list and concatenated once, because
# pandas.concat inside the loop re-copies all accumulated rows each time.
gdp_frames = []

for country_code, region_dictionary in gdp_data.items():
    for region_code, gdp_series in region_dictionary.items():
        # The series index becomes the "year" column, its values "GDP".
        df_current = pandas.DataFrame(gdp_series).reset_index()
        df_current.columns = ["year", "GDP"]
        # Cast to int so the year can be used as an integer merge key.
        df_current["year"] = df_current["year"].astype(int)

        df_current["region_code"] = region_code
        df_current["country_code"] = country_code

        gdp_frames.append(df_current)

# pandas.concat raises on an empty list, so fall back to an empty frame
# when the mapping contained no regions.
df_gdp_data = (
    pandas.concat(gdp_frames, ignore_index=True)
    if gdp_frames
    else pandas.DataFrame()
)

In [None]:
print(df_gdp_data.shape)
df_gdp_data.head()

#### Electricity Demand Data

In [None]:
# Folder holding the electricity demand (load) parquet files.
# The trailing slash matters: file paths are built below by plain string
# concatenation (folder + file_name).
electricity_demand_folder = "./data/electricity_demand/"

In [None]:
# Collect the electricity demand parquet files. os.listdir returns entries in
# arbitrary order, so the names are sorted to make the ingestion order
# reproducible.
demand_files = sorted(
    file_name
    for file_name in os.listdir(electricity_demand_folder)
    if file_name.endswith(".parquet")
)

In [None]:
# Load every demand file, aggregate it to hourly means and stack the results
# into one frame. Frames are collected in a list and concatenated once,
# because pandas.concat inside the loop re-copies all accumulated rows on
# every iteration.
demand_frames = []

for file_name in tqdm(demand_files):
    df_current = pandas.read_parquet(electricity_demand_folder + file_name)

    # Make sure the load is numeric before averaging.
    df_current["Load (MW)"] = df_current["Load (MW)"].astype(float)

    # Hourly bins that are closed and labelled on their right edge: each
    # timestamp holds the mean of the hour ending at that timestamp.
    df_current = df_current.resample(
        "1h", label="right", closed="right"
    ).mean()

    # Add a column for the region name: everything before the last "_"
    # of the file name.
    df_current["region_code"] = "_".join(file_name.split("_")[:-1])

    # Reset index to move "Time (UTC)" to a column
    df_current = df_current.reset_index()

    demand_frames.append(df_current)

# pandas.concat raises on an empty list, so fall back to an empty frame
# when no files were found.
df_demand = (
    pandas.concat(demand_frames, ignore_index=True)
    if demand_frames
    else pandas.DataFrame()
)

In [None]:
print(df_demand.shape)
df_demand.head()

#### Combine all datasets

In [None]:
# Order the three time-indexed frames chronologically before merging.
# Each assignment rebinds the name to the sorted copy (sort_values is not
# in-place here).
df_annual_demand = df_annual_demand.sort_values(by=["Time (UTC)"])
df_all_temperature = df_all_temperature.sort_values(by=["Time (UTC)"])
df_demand = df_demand.sort_values(by=["Time (UTC)"])

In [None]:
# Join temperature and annual demand on timestamp and region. pandas.merge
# defaults to an inner join, so only (timestamp, region) pairs present in
# both frames are kept.
combined_dataset = pandas.merge(
    df_all_temperature, df_annual_demand, on=["Time (UTC)", "region_code"]
)

In [None]:
print(combined_dataset.shape)
combined_dataset.head()

In [None]:
# Add the hourly load. Inner join again (pandas.merge default): rows without
# a matching (timestamp, region) entry in df_demand are dropped.
# Rebinds combined_dataset, so re-running this cell alone merges a second
# time.
combined_dataset = pandas.merge(
    combined_dataset, df_demand, on=["Time (UTC)", "region_code"]
)

In [None]:
print(combined_dataset.shape)
combined_dataset.head()

In [None]:
# Attach the yearly GDP of each region. "Local year" on the left is matched
# against "year" on the right; country_code is dropped beforehand so it does
# not end up in the merged frame. Inner join (pandas.merge default): rows
# with no GDP entry for their region and year are discarded.
combined_dataset = pandas.merge(
    combined_dataset,
    df_gdp_data.drop(columns=["country_code"]),
    left_on=["Local year", "region_code"],
    right_on=["year", "region_code"],
)

In [None]:
print(combined_dataset.shape)
combined_dataset.head()

In [None]:
# "year" was the right-hand merge key matched against "Local year", so after
# the inner join it holds the same values and is redundant.
# Not idempotent: running this cell twice raises KeyError because the column
# is already gone.
combined_dataset = combined_dataset.drop(columns=["year"])

In [None]:
# Remove duplicated observations and report how many rows were dropped.
row_count = len(combined_dataset)
print("Before removing duplicates:", row_count)
# Rows count as duplicates when every column except "Load (MW)" matches, i.e.
# rows that differ only in their load value are collapsed; the first
# occurrence is kept (drop_duplicates default).
combined_dataset = combined_dataset.drop_duplicates(
    subset=[col for col in combined_dataset.columns if col != "Load (MW)"]
)
print("Without duplicates: ", len(combined_dataset))
print("Difference", row_count - len(combined_dataset))

In [None]:
# Remove every row that contains at least one NaN value and report how many
# rows were dropped.
row_count = len(combined_dataset)
print("Before removing NaN values:", row_count)
combined_dataset = combined_dataset.dropna()
# Label fixed: this step removes NaN rows, not duplicates.
print("Without NaN values: ", len(combined_dataset))
print("Difference", row_count - len(combined_dataset))

In [None]:
# Switch from the descriptive source column names to short snake_case names
# used by the rest of the notebook (e.g. "local_year", "load_mw").
# DataFrame.rename ignores mapping keys that are not present in the frame by
# default, so a misspelled or missing source column is silently left as is.
combined_dataset = combined_dataset.rename(
    columns={
        "Time (UTC)": "time_utc",
        "Local hour of the day": "local_hour",
        "Local weekend indicator": "is_weekend",
        "Local month of the year": "local_month",
        "Local year": "local_year",
        "Temperature - Top 1 (K)": "year_temp_top1",
        "Temperature - Top 3 (K)": "year_temp_top3",
        "Monthly average temperature - Top 1 (K)": "monthly_temp_avg_top1",
        "Monthly average temperature rank - Top 1": "monthly_temp_avg_rank_top1",
        "Annual average temperature - Top 1 (K)": "year_temp_avg_top1",
        "5 percentile temperature - Top 1 (K)": "year_temp_percentile_5",
        "95 percentile temperature - Top 1 (K)": "year_temp_percentile_95",
        "Annual electricity demand (TWh)": "year_electricity_demand",
        "Annual electricity demand per capita (MWh)": "year_electricity_demand_per_capita",
        "Load (MW)": "load_mw",
        "GDP": "year_gdp_ppp",
    }
)

In [None]:
# Persist the merged, de-duplicated and renamed dataset to disk.
combined_dataset.to_parquet(
    "./data/combined_dataset.parquet", engine="pyarrow"
)

In [None]:
# Reload the file written by the previous cell.
# NOTE(review): presumably this lets the cells below be run from the saved
# file without repeating the ingestion — confirm.
combined_dataset = pandas.read_parquet(
    "./data/combined_dataset.parquet", engine="pyarrow"
)

### Split into train, test, and validation datasets

In [None]:
# Hold out, per region, the most recent year as the validation set and the
# year before it as the test set. The row labels of both hold-outs are
# recorded (one Index per region) so they can be removed from the training
# data afterwards.
# Per-region frames are collected in lists and concatenated once at the end,
# because pandas.concat inside the loop re-copies all accumulated rows on
# every iteration.
test_frames = []
test_set_indices = []
validation_frames = []
validation_set_indices = []

for name, group in combined_dataset.groupby("region_code"):
    max_year = group["local_year"].max()

    # Validation: all rows of the region's latest year.
    group_val_set = group[group["local_year"] == max_year].copy()
    validation_set_indices.append(group_val_set.index)
    validation_frames.append(group_val_set)

    # Test: all rows of the preceding year (empty if the region has no data
    # for that year).
    group_test_set = group[group["local_year"] == max_year - 1].copy()
    test_set_indices.append(group_test_set.index)
    test_frames.append(group_test_set)

# pandas.concat raises on an empty list, so fall back to empty frames when
# the dataset contains no regions.
validation_set = (
    pandas.concat(validation_frames, ignore_index=True)
    if validation_frames
    else pandas.DataFrame()
)
test_set = (
    pandas.concat(test_frames, ignore_index=True)
    if test_frames
    else pandas.DataFrame()
)

In [None]:
len(test_set) / len(combined_dataset)

In [None]:
len(validation_set) / len(combined_dataset)

In [None]:
# Drop test and validation sets from the combined dataset.
# test_set_indices and validation_set_indices hold one Index per region, so
# all of them have to be combined before dropping. Removing only element [0]
# would leave the held-out rows of every other region in the training data
# (train/test leakage).
held_out_index = pandas.Index(
    [], dtype=combined_dataset.index.dtype
).append(test_set_indices + validation_set_indices)
train_set = combined_dataset.drop(index=held_out_index)

In [None]:
train_set.head()

In [None]:
def prepare_data(dataset: pandas.DataFrame):
    """
    Separate a dataset into model inputs, target and grouping column.

    The timestamp, the year, the load and the region code are excluded from
    the features; the load becomes the target and the region code the
    grouping key. All three results are copies of the input data.

    Returns
    -------
    features : pandas.DataFrame
        Every column of ``dataset`` except the excluded ones.
    target : pandas.Series
        The ``load_mw`` column.
    groups : pandas.Series
        The ``region_code`` column.
    """
    non_feature_columns = ["time_utc", "local_year", "load_mw", "region_code"]

    groups = dataset["region_code"].copy()
    target = dataset["load_mw"].copy()

    # drop() raises KeyError if any of the excluded columns is missing.
    features = dataset.drop(columns=non_feature_columns)
    features = features.copy()

    return features, target, groups

In [None]:
# Split each of the three datasets into features, target (load_mw) and
# region groups.
train_features, train_target, train_groups = prepare_data(train_set)
test_features, test_target, test_groups = prepare_data(test_set)
val_features, val_target, val_groups = prepare_data(validation_set)

In [None]:
train_features.head()

In [None]:
train_features.describe()

### Training

In [None]:
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

In [None]:
# Initialize the XGBoost regressor
# random_state fixes the seed; eval_metric is given sklearn's
# mean_absolute_error function itself rather than a metric name string.
xgb_model = XGBRegressor(random_state=42, eval_metric=mean_absolute_error)

In [None]:
# Train the model
# The frame named test_set is passed as the eval_set here, while the later
# cells predict on and score validation_set.
xgb_model.fit(
    train_features, train_target, eval_set=[(test_features, test_target)]
)

In [None]:
# Repeats the prepare_data call already made for validation_set before
# training; it rebinds the same three names.
val_features, val_target, val_groups = prepare_data(validation_set)

In [None]:
# Repeats the prepare_data call already made for test_set before training;
# it rebinds the same three names.
test_features, test_target, test_groups = prepare_data(test_set)

In [None]:
val_predictions = xgb_model.predict(val_features)

In [None]:
# Mean absolute error on the validation set. Arguments follow sklearn's
# (y_true, y_pred) order; the value is the same either way because the
# absolute error is symmetric.
mean_absolute_error(val_target, val_predictions)

In [None]:
xgb_model.evals_result()

In [None]:
# Persist the trained model next to the data files.
# NOTE(review): recent XGBoost releases recommend the JSON/UBJSON formats
# (".json" / ".ubj") over the legacy binary format — confirm what a ".bin"
# extension produces with the installed version before relying on this file.
xgb_model.save_model("./data/xgboost_model.bin")