In [2]:
import h5py
import rasterio
from rasterio.mask import mask
from rasterio.transform import from_origin
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd 
import pandas as pd
from dnb_annual import *
from variables import years, composites, ukr_region_map, pol_region_map

import xgboost as xgb

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Resizing, Dropout, BatchNormalization, Activation, Add, GlobalAveragePooling2D, Input, Reshape, Conv2DTranspose, Cropping2D

from keras.optimizers import Adam
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler






In [3]:
# Read the tabular Ukraine dataset from its CSV file into a DataFrame.
ukraine_data = pd.read_csv(filepath_or_buffer='data/tabular_data_ukraine.csv')

Prepare the data

In [4]:
# "region" is left as a plain column here; the one-hot encoding is applied
# later, inside build_train_test_sets, after the feature columns are selected.

# Split the table on the "year" column:
#   train_data      -> rows with year before 2021
#   test_data       -> rows with year equal to 2021
#   pre_war_data    -> rows with year before 2022
#   prediction_data -> rows with year after 2021
train_data = ukraine_data[ukraine_data['year'].lt(2021)]
test_data = ukraine_data[ukraine_data['year'].eq(2021)]
pre_war_data = ukraine_data[ukraine_data['year'].lt(2022)]
prediction_data = ukraine_data[ukraine_data['year'].gt(2021)]

# Prefixes of the feature columns available in the table.
column_prefixes = (
    "nearnad_snow_cov",
    "nearnad_snow_free",
    "offnad_snow_cov",
    "offnad_snow_free",
    "allangle_snow_cov",
    "allangle_snow_free",
    "nearnad_snow_free_hq",
    "offnad_snow_free_hq",
    "allangle_snow_free_hq",
)

# Suffixes of the summary-statistic columns that exist for a prefix.
general_characteristics = (
    "num_zeros",
    "sum",
    "mean",
    "median",
    "sd",
)

In [5]:
def build_train_test_sets(selected_columns, train_data, test_data, scale = False):
    """Select the modelling columns and one-hot encode ``region`` consistently.

    Parameters
    ----------
    selected_columns : sequence of str
        Feature columns to keep in addition to ``real_gdp`` and ``region``.
        Any sequence is accepted (list, tuple, ...).
    train_data, test_data : pandas.DataFrame
        Frames containing ``real_gdp``, ``region`` and every selected column.
        Neither input frame is modified.
    scale : bool, default False
        When True, the selected feature columns are standardised with the
        mean and population standard deviation (ddof=0) of the *training*
        frame; the same statistics are applied to the test frame. Columns
        that are constant in the training frame are centred but not rescaled.
        ``real_gdp`` and the region dummies are never scaled.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Encoded train and test frames with identical columns in identical
        order: ``real_gdp``, the selected columns, then one ``region_*``
        dummy per region present in the training frame.
    """
    feature_columns = list(selected_columns)
    base_columns = ["real_gdp", "region"] + feature_columns

    # one hot encode region (each frame separately, aligned just below)
    train_data_selected = pd.get_dummies(train_data[base_columns], columns=["region"])
    test_data_selected = pd.get_dummies(test_data[base_columns], columns=["region"])

    # Encoding the frames independently yields different dummy columns when
    # they do not contain exactly the same regions. Force the test frame onto
    # the training layout: regions missing from test become all-zero columns,
    # regions unseen in training are dropped (the model has no feature for them).
    missing_in_test = [
        col for col in train_data_selected.columns
        if col not in test_data_selected.columns
    ]
    test_data_selected = test_data_selected.reindex(
        columns=train_data_selected.columns, fill_value=0
    )
    if missing_in_test:
        # Keep the filled-in dummies on the same dtype as their train twins.
        test_data_selected = test_data_selected.astype(
            {col: train_data_selected[col].dtype for col in missing_in_test}
        )

    if scale and feature_columns:
        # Statistics come from the training frame only, so nothing about the
        # test frame leaks into the transformation.
        means = train_data_selected[feature_columns].mean()
        stds = train_data_selected[feature_columns].std(ddof=0)
        # Avoid dividing by zero (or NaN) for constant / degenerate columns.
        stds = stds.fillna(1.0).replace(0, 1.0)

        train_scaled = (train_data_selected[feature_columns] - means) / stds
        test_scaled = (test_data_selected[feature_columns] - means) / stds
        # Assign column by column so each column is replaced outright and
        # integer inputs cleanly become float.
        for col in feature_columns:
            train_data_selected[col] = train_scaled[col]
            test_data_selected[col] = test_scaled[col]

    return train_data_selected, test_data_selected

def build_xgboost_model(train_data, test_data, selected_columns):
    """Fit an XGBoost regressor on ``real_gdp`` and score it on the test frame.

    The frames are first passed through ``build_train_test_sets`` so that only
    ``real_gdp``, the one-hot encoded region and ``selected_columns`` remain.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames handed to ``build_train_test_sets``.
    selected_columns : list of str
        Feature columns to use in addition to the region dummies.

    Returns
    -------
    y_pred
        Predictions of the fitted model for the test features.
    mse
        Mean of the squared differences between prediction and ``real_gdp``.
    mpe
        Mean of ``100 * (prediction - real_gdp) / real_gdp``.
    """
    target = "real_gdp"

    # Reduce both frames to the target plus the chosen features.
    train_frame, test_frame = build_train_test_sets(selected_columns, train_data, test_data)

    # Separate features from the target for each split.
    features_train, gdp_train = train_frame.drop(columns=[target]), train_frame[target]
    features_test, gdp_test = test_frame.drop(columns=[target]), test_frame[target]

    # Fixed random_state keeps repeated runs comparable.
    regressor = xgb.XGBRegressor(objective ='reg:squarederror', random_state=0)
    regressor.fit(features_train, gdp_train)

    y_pred = regressor.predict(features_test)

    # Both metrics are derived from the same signed residuals.
    residuals = y_pred - gdp_test
    mse = np.mean(residuals**2)
    mpe = np.mean(100*residuals / gdp_test)

    return y_pred, mse, mpe

In [6]:
# Column-name prefix shared by every feature set built in this cell.
prefix = "nearnad_snow_free"

# Feature sets: summary statistics, ten "log" bins and ten "idr" bins,
# all named <prefix>_<suffix>.
general_columns = [f"{prefix}_{char}" for char in general_characteristics]
log_bin_columns = [f"{prefix}_log_{i}" for i in range(1, 11)]
idr_bin_columns = [f"{prefix}_idr_{i}" for i in range(1, 11)]

# Fit and evaluate one xgboost model per feature set.
y_pred_general, mse_general, mpe_general = build_xgboost_model(
    train_data, test_data, selected_columns=general_columns
)
y_pred_log_bin, mse_log_bin, mpe_log_bin = build_xgboost_model(
    train_data, test_data, selected_columns=log_bin_columns
)
y_pred_idr_bin, mse_idr_bin, mpe_idr_bin = build_xgboost_model(
    train_data, test_data, selected_columns=idr_bin_columns
)

# Report all MSE values first, then all MPE values, one "<label> <value>" per line.
print(
    "\n".join(
        f"{label} {value!s}"
        for label, value in (
            ("General MSE: ", mse_general),
            ("Log Bin MSE: ", mse_log_bin),
            ("IDR Bin MSE: ", mse_idr_bin),
            ("General MPE: ", mpe_general),
            ("Log Bin MPE: ", mpe_log_bin),
            ("IDR Bin MPE: ", mpe_idr_bin),
        )
    )
)

General MSE:  11410510.001277847
Log Bin MSE:  77482191.46609668
IDR Bin MSE:  115190282.79653612
General MPE:  2.4028691400937676
Log Bin MPE:  8.959852910793174
IDR Bin MPE:  5.49072604468757
