In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import warnings

import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso


# Ignore every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides any pandas / scikit-learn
# warnings the cells below might raise — consider narrowing it.
warnings.filterwarnings('ignore')



Prepare the data

In [17]:
# read the tabular Ukraine dataset from disk
ukraine_data = pd.read_csv('data/tabular_data_ukraine.csv')

# split the rows by year:
#   training   -> years before 2021
#   test       -> 2021
#   pre-war    -> years before 2022 (training + test)
#   prediction -> 2022
train_data = ukraine_data.loc[ukraine_data['year'] < 2021]
test_data = ukraine_data.loc[ukraine_data['year'] == 2021]
pre_war_data = ukraine_data.loc[ukraine_data['year'] < 2022]
prediction_data = ukraine_data.loc[ukraine_data['year'] == 2022]

# prefixes of the feature column families used to build column names
column_prefixes = (
    "nearnad_snow_cov",
    "nearnad_snow_free",
    "offnad_snow_cov",
    "offnad_snow_free",
    "allangle_snow_cov",
    "allangle_snow_free",
    "nearnad_snow_free_hq",
    "offnad_snow_free_hq",
    "allangle_snow_free_hq",
)

# summary-statistic suffixes that are appended to a prefix
general_characteristics = ("mean", "median", "sd", "sum")

Define general functions

In [27]:
# last expression of the cell, so the notebook renders the loaded table
ukraine_data

Unnamed: 0,year,region,real_gdp,nearnad_snow_cov_idr_1,nearnad_snow_cov_idr_2,nearnad_snow_cov_idr_3,nearnad_snow_cov_idr_4,nearnad_snow_cov_idr_5,nearnad_snow_cov_idr_6,nearnad_snow_cov_idr_7,...,allangle_snow_free_hq_log_6,allangle_snow_free_hq_log_7,allangle_snow_free_hq_log_8,allangle_snow_free_hq_log_9,allangle_snow_free_hq_log_10,allangle_snow_free_hq_num_zeros,allangle_snow_free_hq_sum,allangle_snow_free_hq_mean,allangle_snow_free_hq_median,allangle_snow_free_hq_sd
0,2012,Vinnytsia_Oblast,33024.000000,25990,16564,4888,2587,1696,1191,892,...,7,0,0,0,0,792401,312720.0,10.173395,6.0,20.219772
1,2013,Vinnytsia_Oblast,34609.152000,12792,9565,3185,1703,1014,657,480,...,11,0,0,0,0,799435,287881.0,12.144316,6.0,24.641935
2,2014,Vinnytsia_Oblast,36201.172992,52833,29585,6386,2994,1641,1037,705,...,11,0,0,0,0,797891,283071.0,11.211177,6.0,23.294181
3,2015,Vinnytsia_Oblast,35151.338975,19965,30187,8175,2141,1127,600,299,...,7,0,0,0,0,804403,230866.0,12.321396,6.0,24.631440
4,2016,Vinnytsia_Oblast,37436.176009,22658,20507,6770,2907,1732,972,645,...,7,0,0,0,0,806626,228765.0,13.852792,6.0,27.176620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,2019,Kyiv,313562.894528,174,437,261,104,83,73,77,...,357,16,0,0,0,817195,734649.0,123.574264,68.0,131.787904
296,2020,Kyiv,304156.007693,155,452,291,168,261,212,141,...,356,1,0,0,0,817294,712673.0,121.907800,64.0,127.682900
297,2021,Kyiv,316322.248000,73,312,348,54,71,81,107,...,490,18,0,0,0,817197,803012.0,135.118963,71.0,142.436598
298,2022,Kyiv,,392,546,354,263,170,169,120,...,16,0,0,0,0,817301,362988.0,62.166124,26.0,69.998102


In [62]:
def build_train_test_sets(selected_columns, train_data, test_data, log_transform = False, scale = False):
    """Build model-ready train and test frames from the selected feature columns.

    Both frames keep "real_gdp", the selected feature columns and a one-hot
    encoding of "region". The input frames are never modified.

    Parameters
    ----------
    selected_columns : sequence of str
        Feature columns to keep in addition to "real_gdp" and "region".
    train_data, test_data : pandas.DataFrame
        Frames that contain "real_gdp", "region" and every selected column.
    log_transform : bool, default False
        If True, "real_gdp" and every selected column whose name contains
        "sum" are replaced by their natural logarithm.
    scale : bool, default False
        If True, the selected columns are standardised with a StandardScaler
        that is fitted on the training frame only and then applied to both.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The transformed train and test frames. The test frame always has
        exactly the columns of the train frame, in the same order.
    """
    selected_columns = list(selected_columns)
    kept_columns = ["real_gdp", "region"] + selected_columns

    # work on explicit copies so the assignments below never write into
    # (a view of) the caller's data
    train_data_selected = train_data[kept_columns].copy()
    test_data_selected = test_data[kept_columns].copy()

    if log_transform:
        # real_gdp and columns that contain the word "sum" are log transformed
        columns_to_log = ["real_gdp"] + [column for column in selected_columns if "sum" in column]
        for column in columns_to_log:
            train_data_selected[column] = np.log(train_data_selected[column])
            test_data_selected[column] = np.log(test_data_selected[column])

    if scale and selected_columns:
        # fit on the training data only, so no test statistics leak into the scaling
        scaler = StandardScaler()
        train_data_selected[selected_columns] = scaler.fit_transform(train_data_selected[selected_columns])
        test_data_selected[selected_columns] = scaler.transform(test_data_selected[selected_columns])

    # one hot encode region
    train_data_selected = pd.get_dummies(train_data_selected, columns=["region"])
    test_data_selected = pd.get_dummies(test_data_selected, columns=["region"])

    # The two frames are encoded independently, so a region that is missing
    # from one of them would produce different columns. Align the test frame
    # with the train layout: missing dummies become 0, unseen ones are dropped.
    test_data_selected = test_data_selected.reindex(columns=train_data_selected.columns, fill_value=0)

    return train_data_selected, test_data_selected

def build_model(train_data, test_data, selected_columns, model_type, log_transform = False, scale = False):
    """Fit one regressor on the training data and score it on the test data.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames passed on to build_train_test_sets.
    selected_columns : list of str
        Feature columns used by the model (region dummies are added on top).
    model_type : {"xgboost", "random_forest", "lasso"}
        Which regressor to fit.
    log_transform, scale : bool, default False
        Forwarded to build_train_test_sets.

    Returns
    -------
    (mae, mpe)
        mae is the mean absolute error and mpe the absolute value of the mean
        signed percentage error. Both are computed on the target as returned
        by build_train_test_sets, i.e. on log(real_gdp) when log_transform
        is True.

    Raises
    ------
    ValueError
        If model_type is not one of the supported names.
    """
    # factories are lazy so only the requested estimator is ever constructed
    # NOTE(review): Lasso(alpha=0) applies no penalty — confirm this is intended.
    model_factories = {
        "xgboost": lambda: xgb.XGBRegressor(objective ='reg:squarederror', random_state=0),
        "random_forest": lambda: RandomForestRegressor(n_estimators=1000, random_state = 0),
        "lasso": lambda: Lasso(alpha=0, random_state=0),
    }

    # fail early and clearly instead of hitting an unbound model variable later
    if model_type not in model_factories:
        raise ValueError(
            "Unknown model_type %r; expected one of %s" % (model_type, sorted(model_factories))
        )

    # build train and test sets
    train_data_selected, test_data_selected = build_train_test_sets(selected_columns, train_data, test_data, log_transform, scale)

    # get input and output data
    X_train = train_data_selected.drop(columns=["real_gdp"])
    y_train = train_data_selected["real_gdp"]

    X_test = test_data_selected.drop(columns=["real_gdp"])
    y_test = test_data_selected["real_gdp"]

    # build and fit the model
    model_test = model_factories[model_type]()
    model_test.fit(X_train, y_train)

    # make predictions
    y_pred = model_test.predict(X_test)

    # calculate mae and mpe
    mae = np.mean(abs(y_pred - y_test))
    mpe = abs(np.mean(100*(y_pred - y_test) / y_test))

    return mae, mpe


def predict_with_model(pre_war_data, prediction_data, selected_columns, data_2021, model_type, log_transform = False, scale = False):
    """Fit on the pre-war data and predict the national real GDP change.

    Parameters
    ----------
    pre_war_data : pandas.DataFrame
        Data the model is fitted on.
    prediction_data : pandas.DataFrame
        Rows to predict; their "real_gdp" values are not used.
    selected_columns : list of str
        Feature columns used by the model (region dummies are added on top).
    data_2021 : pandas.DataFrame
        Baseline rows; the sum of their "real_gdp" is the reference level.
    model_type : {"xgboost", "random_forest", "lasso"}
        Which regressor to fit.
    log_transform, scale : bool, default False
        Forwarded to build_train_test_sets. With log_transform the predictions
        are exponentiated before they are summed.

    Returns
    -------
    float
        Percentage change of the summed predictions relative to the summed
        "real_gdp" of data_2021.

    Raises
    ------
    ValueError
        If model_type is not one of the supported names.
    """
    # factories are lazy so only the requested estimator is ever constructed;
    # no objective is passed to XGBRegressor here, so the library default applies
    # NOTE(review): Lasso(alpha=0) applies no penalty — confirm this is intended.
    model_factories = {
        "xgboost": lambda: xgb.XGBRegressor(random_state=0),
        "random_forest": lambda: RandomForestRegressor(n_estimators=1000, random_state = 0),
        "lasso": lambda: Lasso(alpha=0, random_state=0),
    }

    # fail early and clearly instead of hitting an unbound model variable later
    if model_type not in model_factories:
        raise ValueError(
            "Unknown model_type %r; expected one of %s" % (model_type, sorted(model_factories))
        )

    # build pre war and prediction sets
    pre_war_data_selected, prediction_data_selected = build_train_test_sets(selected_columns, pre_war_data, prediction_data, log_transform, scale)

    # get input and output data
    X_pre_war = pre_war_data_selected.drop(columns=["real_gdp"])
    y_pre_war = pre_war_data_selected["real_gdp"]

    X_prediction = prediction_data_selected.drop(columns=["real_gdp"])

    # build and fit the model
    model_pred = model_factories[model_type]()
    model_pred.fit(X_pre_war, y_pre_war)

    # make predictions
    y_pred = model_pred.predict(X_prediction)

    # bring the predictions back to the original scale before aggregating
    if log_transform:
        y_pred = np.exp(y_pred)

    # calculate the predicted change in the real gdp on the national level
    baseline_gdp = np.sum(data_2021["real_gdp"])
    pred_gdp_change = 100*(np.sum(y_pred) - baseline_gdp) / baseline_gdp

    return pred_gdp_change

In [63]:
def create_column_names(prefix, general_characteristics):
    """Return the three column-name lists derived from one prefix.

    The first list is "<prefix>_<characteristic>" for every entry of
    general_characteristics. The second and third lists hold the ten
    "<prefix>_log_<i>" / "<prefix>_idr_<i>" names (i = 1..10), each followed
    by the names of the first list.
    """
    bin_numbers = range(1, 11)

    general_columns = ["_".join((prefix, name)) for name in general_characteristics]

    log_bin_columns = ["_".join((prefix, "log", str(k))) for k in bin_numbers]
    log_bin_columns.extend(general_columns)

    idr_bin_columns = ["_".join((prefix, "idr", str(k))) for k in bin_numbers]
    idr_bin_columns.extend(general_columns)

    return general_columns, log_bin_columns, idr_bin_columns

def build_model_and_predict(pre_war_data, prediction_data, selected_columns, model_type, log_transform, scale, total_metrics = False):
    """Score a model configuration and predict the national GDP change.

    Parameters
    ----------
    pre_war_data : pandas.DataFrame
        Data used for validation and for fitting the final model.
    prediction_data : pandas.DataFrame
        Rows for which the GDP change is predicted.
    selected_columns : list of str
        Feature columns used by the model.
    model_type : str
        Forwarded to build_model and predict_with_model.
    log_transform, scale : bool
        Forwarded to build_model and predict_with_model.
    total_metrics : bool, default False
        If True, every year from 2012 to 2021 is held out in turn and the
        metrics of all folds are added up. If False, only 2021 is held out.

    Returns
    -------
    (mae, mpe, gdp_change)
        With total_metrics the first two values are sums over the held-out
        years, not averages. gdp_change is the value returned by
        predict_with_model with the 2021 rows as the baseline.
    """
    baseline_year = 2021

    # select the 2021 baseline explicitly instead of relying on whatever
    # test set the validation loop happened to leave behind
    baseline_data = pre_war_data[pre_war_data['year'] == baseline_year]

    held_out_years = range(2012, 2022) if total_metrics else (baseline_year,)

    total_mae = 0
    total_mpe = 0

    for year in held_out_years:
        train_data = pre_war_data[pre_war_data['year'] != year]
        test_data = pre_war_data[pre_war_data['year'] == year]
        mae, mpe = build_model(train_data, test_data, selected_columns, model_type, log_transform, scale)

        total_mae += mae
        total_mpe += mpe

    gdp_change = predict_with_model(pre_war_data, prediction_data, selected_columns, baseline_data, model_type, log_transform, scale)

    return total_mae, total_mpe, gdp_change

XGBoost

In [50]:
# configuration of a single hold-one-year-out fold, run by hand
model_type = "xgboost"
log_transform = True
scale = True
year = 2012
country_data = pre_war_data
prefix = "nearnad_snow_cov"
selected_columns = ["_".join((prefix, char)) for char in general_characteristics]

# hold out `year` and train on every other year
train_data = country_data.loc[country_data['year'] != year]
test_data = country_data.loc[country_data['year'] == year]
mae, mpe = build_model(train_data, test_data, selected_columns, model_type, log_transform, scale)

In [64]:
# very promising results with nearnad_snow_free_hq, idr_bin_columns + general_columns (mean, median, sd, sum) with log transform and scale

# collect one record per (prefix, column set); the frame is built once at the end
# instead of being grown with pd.concat inside the loop
xgb_records = []

for prefix in column_prefixes:
    # create general column names
    general_columns, log_bin_columns, idr_bin_columns = create_column_names(prefix, general_characteristics)

    # build xgb models for each held-out year; with total_metrics=True the returned
    # mae and mpe are summed over the years. Also predict the national gdp change.
    for selected_columns, columns_category in zip([general_columns, log_bin_columns, idr_bin_columns], ["general", "log_bin", "idr_bin"]):
        mae, mpe, gdp_change = build_model_and_predict(pre_war_data, prediction_data, selected_columns, "xgboost", log_transform = True, scale = True, total_metrics = True)
        xgb_records.append({"prefix": prefix, "columns": columns_category, "mae": mae, "mpe": mpe, "national_gdp_change": gdp_change})

xgb_results = pd.DataFrame(xgb_records, columns=["prefix", "columns", "mae", "mpe", "national_gdp_change"])

# sort by mpe, print the results
xgb_results = xgb_results.sort_values(by="mpe")
print(xgb_results)

                   prefix  columns       mae       mpe  national_gdp_change
17     allangle_snow_free  idr_bin  0.651318  1.771794           -37.377913
20   nearnad_snow_free_hq  idr_bin  0.733063  1.825277           -33.349645
24  allangle_snow_free_hq  general  0.745586  1.919596           -52.597119
18   nearnad_snow_free_hq  general  0.681473  1.927914           -48.489494
3       nearnad_snow_free  general  0.719047  1.943860           -48.820596
5       nearnad_snow_free  idr_bin  0.741236  2.052553           -34.492737
26  allangle_snow_free_hq  idr_bin  0.655789  2.064634           -38.781616
16     allangle_snow_free  log_bin  0.626044  2.068407           -53.050333
9        offnad_snow_free  general  0.837833  2.087885           -50.015357
11       offnad_snow_free  idr_bin  0.685696  2.152681           -55.201591
19   nearnad_snow_free_hq  log_bin  0.744478  2.176770           -43.727423
4       nearnad_snow_free  log_bin  0.764520  2.218422           -43.940452
10       off

Random Forest

In [65]:
# collect one record per (prefix, column set); the frame is built once at the end
# instead of being grown with pd.concat inside the loop
rf_records = []

for prefix in column_prefixes:
    # create general column names
    general_columns, log_bin_columns, idr_bin_columns = create_column_names(prefix, general_characteristics)

    # build random forest models for each held-out year; with total_metrics=True the
    # returned mae and mpe are summed over the years. Also predict the national gdp change.
    for selected_columns, columns_category in zip([general_columns, log_bin_columns, idr_bin_columns], ["general", "log_bin", "idr_bin"]):
        mae, mpe, gdp_change = build_model_and_predict(pre_war_data, prediction_data, selected_columns, "random_forest", log_transform = True, scale = True, total_metrics = True)
        rf_records.append({"prefix": prefix, "columns": columns_category, "mae": mae, "mpe": mpe, "national_gdp_change": gdp_change})

rf_results = pd.DataFrame(rf_records, columns=["prefix", "columns", "mae", "mpe", "national_gdp_change"])

# sort by mae, print the results
rf_results = rf_results.sort_values(by="mae")
print(rf_results)

                   prefix  columns       mae       mpe  national_gdp_change
10       offnad_snow_free  log_bin  0.575454  1.386454           -52.382034
22    offnad_snow_free_hq  log_bin  0.576572  1.388298           -52.408583
18   nearnad_snow_free_hq  general  0.606735  1.870962           -33.354871
3       nearnad_snow_free  general  0.621368  1.868365           -32.821282
11       offnad_snow_free  idr_bin  0.643705  2.517445           -55.081732
23    offnad_snow_free_hq  idr_bin  0.649969  2.598795           -55.072029
25  allangle_snow_free_hq  log_bin  0.652402  2.382167           -52.901592
16     allangle_snow_free  log_bin  0.654106  2.402909           -52.913767
19   nearnad_snow_free_hq  log_bin  0.672343  1.680547           -43.213735
26  allangle_snow_free_hq  idr_bin  0.674872  1.756912           -48.313653
17     allangle_snow_free  idr_bin  0.675165  1.781194           -47.521767
24  allangle_snow_free_hq  general  0.676035  2.654021           -53.460059
4       near

Lasso

In [55]:
# collect one record per (prefix, column set); the frame is built once at the end
# instead of being grown with pd.concat inside the loop
lasso_records = []

for prefix in column_prefixes:
    # create general column names
    general_columns, log_bin_columns, idr_bin_columns = create_column_names(prefix, general_characteristics)

    # build lasso models for each held-out year; with total_metrics=True the returned
    # mae and mpe are summed over the years. Also predict the national gdp change.
    for selected_columns, columns_category in zip([general_columns, log_bin_columns, idr_bin_columns], ["general", "log_bin", "idr_bin"]):
        mae, mpe, gdp_change = build_model_and_predict(pre_war_data, prediction_data, selected_columns, "lasso", log_transform = True, scale = True, total_metrics = True)
        lasso_records.append({"prefix": prefix, "columns": columns_category, "mae": mae, "mpe": mpe, "national_gdp_change": gdp_change})

lasso_results = pd.DataFrame(lasso_records, columns=["prefix", "columns", "mae", "mpe", "national_gdp_change"])

# sort by mae, print the results
lasso_results = lasso_results.sort_values(by="mae")
print(lasso_results)

                   prefix  columns       mse       mpe  national_gdp_change
25  allangle_snow_free_hq  log_bin  0.637474  2.373354           -49.219083
22    offnad_snow_free_hq  log_bin  0.642742  2.713852           -44.180048
16     allangle_snow_free  log_bin  0.645108  2.333461           -47.482118
10       offnad_snow_free  log_bin  0.649109  2.776145           -42.830990
19   nearnad_snow_free_hq  log_bin  0.677822  2.356734           -45.145320
4       nearnad_snow_free  log_bin  0.680314  2.372246           -45.180366
26  allangle_snow_free_hq  idr_bin  0.685259  3.401663           -50.658702
17     allangle_snow_free  idr_bin  0.692888  3.382237           -49.578292
5       nearnad_snow_free  idr_bin  0.699907  2.960581           -46.429659
23    offnad_snow_free_hq  idr_bin  0.705588  3.204838           -51.422675
11       offnad_snow_free  idr_bin  0.705919  2.994791           -50.607058
20   nearnad_snow_free_hq  idr_bin  0.707774  3.119322           -45.672186
18   nearnad