In [40]:
import h5py
import rasterio
from rasterio.mask import mask
from rasterio.transform import from_origin
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd 
import pandas as pd
from dnb_annual import *
from variables import years, composites, ukr_region_map, pol_region_map

import xgboost as xgb

import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Resizing, Dropout, BatchNormalization, Activation, Add, GlobalAveragePooling2D, Input, Reshape, Conv2DTranspose, Cropping2D

from keras.optimizers import Adam
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [41]:
# load tabular ukraine data
ukraine_data = pd.read_csv('data/tabular_data_ukraine.csv')

Prepare the data

In [60]:
# turn the region column into a categorical variable using one hot encoding
# ukraine_data = pd.get_dummies(ukraine_data, columns=["region"])

# get training, test, pre_war and prediction data
train_data = ukraine_data[ukraine_data['year'] < 2021]
test_data = ukraine_data[ukraine_data['year'] == 2021]
pre_war_data = ukraine_data[ukraine_data['year'] < 2022]
prediction_data = ukraine_data[ukraine_data['year'] == 2022]

column_prefixes = ("nearnad_snow_cov", "nearnad_snow_free", "offnad_snow_cov",
                   "offnad_snow_free", "allangle_snow_cov", "allangle_snow_free", 
                   "nearnad_snow_free_hq", "offnad_snow_free_hq", "allangle_snow_free_hq")

general_characteristics = ("mean", "median", "sd", "sum")

In [64]:
def build_train_test_sets(selected_columns, train_data, test_data, log_transform = False, scale = False):

    # select columns
    train_data_selected = train_data[["real_gdp", "region"] + selected_columns]
    test_data_selected = test_data[["real_gdp", "region"] + selected_columns]

    if log_transform:
        # real_gdp and columns that contain the word"sum" are log transformed
        train_data_selected["real_gdp"] = np.log(train_data_selected["real_gdp"])
        test_data_selected["real_gdp"] = np.log(test_data_selected["real_gdp"])

        for column in selected_columns:
            if "sum" in column:
                train_data_selected[column] = np.log(train_data_selected[column])
                test_data_selected[column] = np.log(test_data_selected[column])

    if scale:
        # scale the data
        scaler = StandardScaler()
        train_data_selected[selected_columns] = scaler.fit_transform(train_data_selected[selected_columns])
        test_data_selected[selected_columns] = scaler.transform(test_data_selected[selected_columns])

    # one hot encode region
    train_data_selected = pd.get_dummies(train_data_selected, columns=["region"])
    test_data_selected = pd.get_dummies(test_data_selected, columns=["region"])

    return train_data_selected, test_data_selected

def build_xgboost_model(train_data, test_data, selected_columns, log_transform = False, scale = False):

    # build train and test sets
    train_data_selected, test_data_selected = build_train_test_sets(selected_columns, train_data, test_data, log_transform, scale)

    # get input and output data
    X_train = train_data_selected.drop(columns=["real_gdp"])
    y_train = train_data_selected["real_gdp"]

    X_test = test_data_selected.drop(columns=["real_gdp"])
    y_test = test_data_selected["real_gdp"]

    # build xgboost model
    model_test = xgb.XGBRegressor(objective ='reg:squarederror', random_state=0)

    # fit model
    model_test.fit(X_train, y_train)

    # make predictions
    y_pred = model_test.predict(X_test)

    # calculate mse and mpe
    mse = np.mean((y_pred - y_test)**2)
    mpe = np.mean(100*(y_pred - y_test) / y_test)

    return mse, mpe


def predict_with_xgboost(pre_war_data, prediction_data, selected_columns, data_2021, log_transform = False, scale = False):

    # build pre war and prediction sets
    pre_war_data_selected, prediction_data_selected = build_train_test_sets(selected_columns, pre_war_data, prediction_data, log_transform, scale)

    # get input and output data
    X_pre_war = pre_war_data_selected.drop(columns=["real_gdp"])
    y_pre_war = pre_war_data_selected["real_gdp"]

    X_prediction = prediction_data_selected.drop(columns=["real_gdp"])

    # build xgboost model
    model_pred = xgb.XGBRegressor(objective ='reg:squarederror', random_state=0)

    # fit model
    model_pred.fit(X_pre_war, y_pre_war)

    # make predictions
    y_pred = model_pred.predict(X_prediction)

    # calculate the predicted change in the real gdp on the national level
    if log_transform:
        y_pred = np.exp(y_pred)
    pred_gdp_change = 100*(np.sum(y_pred) - np.sum(data_2021["real_gdp"])) / np.sum(data_2021["real_gdp"])

    return pred_gdp_change

In [68]:
prefix = "nearnad_snow_free_hq"

# create general column names: prefix + general_characteristics
general_columns = [prefix + "_" + char for char in general_characteristics]
log_bin_columns = [prefix + "_log_" + str(i) for i in range(1, 11)] + general_columns
idr_bin_columns = [prefix + "_idr_" + str(i) for i in range(1, 11)] + general_columns

# build xgb models
mse_general, mpe_general = build_xgboost_model(train_data, test_data, general_columns, log_transform = True, scale = True)
mse_log_bin, mpe_log_bin = build_xgboost_model(train_data, test_data, log_bin_columns, log_transform = True, scale = True)
mse_idr_bin, mpe_idr_bin = build_xgboost_model(train_data, test_data, idr_bin_columns, log_transform = True, scale = True)

print("General MSE: ", mse_general)
print("Log Bin MSE: ", mse_log_bin)
print("IDR Bin MSE: ", mse_idr_bin)

print("General MPE: ", mpe_general)
print("Log Bin MPE: ", mpe_log_bin)
print("IDR Bin MPE: ", mpe_idr_bin)

gdp_change_general = predict_with_xgboost(pre_war_data, prediction_data, general_columns, test_data, log_transform = True, scale= True)
gdp_change_log_bin = predict_with_xgboost(pre_war_data, prediction_data, log_bin_columns, test_data, log_transform = True, scale = True)
gdp_change_idr_bin = predict_with_xgboost(pre_war_data, prediction_data, idr_bin_columns, test_data, log_transform = True, scale = True)

print("General GDP Change: ", gdp_change_general)
print("Log Bin GDP Change: ", gdp_change_log_bin)
print("IDR Bin GDP Change: ", gdp_change_idr_bin)

# very promising results with nearnad_snow_free_hq, idr_bin_columns + general_columns (mean, median, sd, sum) with log transform and scale

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_selected["real_gdp"] = np.log(train_data_selected["real_gdp"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_selected["real_gdp"] = np.log(test_data_selected["real_gdp"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_selected[column] = np.log(train_data_selected[co

General MSE:  0.014588929949433727
Log Bin MSE:  0.025760164534868766
IDR Bin MSE:  0.012468430183940086
General MPE:  -0.08996523690173695
Log Bin MPE:  0.42271273017460237
IDR Bin MPE:  0.27337237410386367


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_selected["real_gdp"] = np.log(train_data_selected["real_gdp"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_selected["real_gdp"] = np.log(test_data_selected["real_gdp"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_selected[column] = np.log(train_data_selected[co

General GDP Change:  -48.489493879896386
Log Bin GDP Change:  -43.727423004832445
IDR Bin GDP Change:  -33.34964489283803


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_selected["real_gdp"] = np.log(train_data_selected["real_gdp"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_selected["real_gdp"] = np.log(test_data_selected["real_gdp"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_selected[column] = np.log(train_data_selected[co