In [1]:
# NOTE(review): this star import is the only visible source of `pd`,
# `create_column_names` and `build_model_and_predict`, which the cells below use --
# presumably they all come from `tools`; confirm and consider importing them by name.
from tools import *
import warnings
# Installs an "ignore" filter for every warning category, so pandas / model-library
# warnings (deprecations, convergence notices, ...) are hidden from here on.
warnings.filterwarnings('ignore')






Prepare the data

In [2]:
# Read the tabular Ukraine dataset.
ukraine_data = pd.read_csv('data/tabular_data_ukraine.csv')

# Drop only the "Kyiv_Oblast_City" rows; "Kyiv" and "Kyiv_Oblast" stay in the data.
# NOTE(review): presumably "Kyiv_Oblast_City" is a combined city + oblast entry -- confirm.
ukraine_data = ukraine_data.loc[ukraine_data["region"] != "Kyiv_Oblast_City"]

# Year-based splits, each re-indexed from 0:
#   train      -> years before 2021
#   test       -> 2021 only
#   pre_war    -> years before 2022 (train + test)
#   prediction -> 2022 only
train_data = ukraine_data.loc[ukraine_data['year'] < 2021].reset_index(drop=True)
test_data = ukraine_data.loc[ukraine_data['year'] == 2021].reset_index(drop=True)
pre_war_data = ukraine_data.loc[ukraine_data['year'] < 2022].reset_index(drop=True)
prediction_data = ukraine_data.loc[ukraine_data['year'] == 2022].reset_index(drop=True)

# Feature-column prefixes to evaluate. Only the high-quality all-angle snow-free
# prefix is active; other prefixes listed in earlier configurations of this cell:
#   nearnad_snow_cov, nearnad_snow_free, offnad_snow_cov, offnad_snow_free,
#   allangle_snow_cov, allangle_snow_free, nearnad_snow_free_hq, offnad_snow_free_hq
column_prefixes = ["allangle_snow_free_hq"]

# Summary statistics handed to create_column_names together with each prefix.
general_characteristics = ("mean", "sd")

Define general functions

XGBoost

In [13]:
# Hyper-parameter search space handed to build_model_and_predict for "xgboost".
# (An earlier, smaller grid only varied max_depth over [5, 4, 6] and
# min_child_weight over [6, 4, 5].)
param_grid_xgb = dict(
    eta=[0.01, 0.1, 0.2, 0.3],
    gamma=[10, 20, 50, 100],
    max_depth=[4, 6, 8, 10],
    min_child_weight=[3, 4, 5, 6],
    random_state=[0],  # single fixed seed
)

In [14]:
# Earlier finding, kept for reference: very promising results with
# nearnad_snow_free_hq, idr_bin_columns + general_columns (mean, median, sd, sum)
# with log transform and scaling. The run below uses only the log_bin columns,
# without log transform or scaling.
# NOTE(review): total_metrics=False here but True in the Random Forest and Lasso
# cells -- confirm the resulting mae/mpe values are meant to be comparable.

# Collect one metrics record per (prefix, column set) and build the DataFrame once
# at the end, instead of concatenating onto an empty frame inside the loop.
xgb_records = []
# Predictions of every run, keyed by (prefix, column set), so the values attached
# to prediction_data below belong to the row that is reported as best.
xgb_predictions = {}

for prefix in column_prefixes:
    # column-name groups for this prefix
    general_columns, log_bin_columns, idr_bin_columns = create_column_names(prefix, general_characteristics)

    # fit XGBoost on the pre-war data, collect MAE / MPE and the predicted national GDP change
    for selected_columns, columns_category in zip([log_bin_columns], ["log_bin"]):
        mae, mpe, gdp_change, y_pred = build_model_and_predict(
            pre_war_data, prediction_data, selected_columns, "xgboost", param_grid_xgb,
            log_transform=False, scale=False, total_metrics=False,
        )
        xgb_records.append({"prefix": prefix, "columns": columns_category, "mae": mae, "mpe": mpe, "national_gdp_change": gdp_change})
        xgb_predictions[(prefix, columns_category)] = y_pred

    print(f"Finished {prefix}")

# sort by mae (best first), print the results
xgb_results = pd.DataFrame(
    xgb_records, columns=["prefix", "columns", "mae", "mpe", "national_gdp_change"]
).sort_values(by="mae")
print(xgb_results)

# Attach the predictions of the lowest-MAE run to the prediction data. With a single
# prefix / column set this is the same run as before; with several it no longer
# depends on which run happened to execute last. Skipped when nothing was evaluated.
if not xgb_results.empty:
    best_xgb_run = xgb_results.iloc[0]
    y_pred = xgb_predictions[(best_xgb_run["prefix"], best_xgb_run["columns"])]
    prediction_data["gdp_pred"] = y_pred

Finished allangle_snow_free
               prefix  columns          mae        mpe  national_gdp_change
0  allangle_snow_free  log_bin  4721.050089  11.283239           -30.255199


In [15]:
# Print the predicted GDP attached to the 2022 rows, next to each region name.
print(prediction_data.loc[:, ["region", "gdp_pred"]])


                    region      gdp_pred
0         Vinnytsia_Oblast  36776.000000
1             Volyn_Oblast  47145.269531
2    Dnipropetrovsk_Oblast  19734.585938
3           Donetsk_Oblast  50099.304688
4          Zhytomyr_Oblast  47110.492188
5       Zakarpattia_Oblast  22814.728516
6        Zaporizhia_Oblast  20642.404297
7   Ivano-Frankivsk_Oblast  51976.734375
8              Kyiv_Oblast  24443.671875
9        Kirovohrad_Oblast  46897.398438
10          Luhansk_Oblast  21360.117188
11             Lviv_Oblast  28214.775391
12         Mykolaiv_Oblast  55375.273438
13           Odessa_Oblast  33906.468750
14          Poltava_Oblast  22718.968750
15            Rivne_Oblast  20637.617188
16             Sumy_Oblast  47129.148438
17         Ternopil_Oblast  54086.910156
18          Kharkiv_Oblast  47129.148438
19          Kherson_Oblast  18164.017578
20     Khmelnytskyi_Oblast  22545.201172
21         Cherkasy_Oblast  23848.566406
22       Chernivtsi_Oblast  13694.843750
23        Cherni

In [16]:
# Display region and real GDP for the 2021 rows (test_data).
test_data.loc[:, ["region", "real_gdp"]]

Unnamed: 0,region,real_gdp
0,Vinnytsia_Oblast,43372.874934
1,Volyn_Oblast,19152.748903
2,Dnipropetrovsk_Oblast,128445.647049
3,Donetsk_Oblast,64004.166728
4,Zhytomyr_Oblast,30071.647441
5,Zakarpattia_Oblast,20968.462635
6,Zaporizhia_Oblast,54817.528012
7,Ivano-Frankivsk_Oblast,33126.166246
8,Kyiv_Oblast,73951.522212
9,Kirovohrad_Oblast,25128.837437


Random Forest

In [3]:
# Hyper-parameter search space handed to build_model_and_predict for "random_forest".
param_grid_rf = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[5, 10, 15, 20],         # maximum depth of each tree
    min_samples_split=[2, 4, 6, 8],    # minimum samples required to split an internal node
    min_samples_leaf=[2, 4, 6, 8],     # minimum samples required at a leaf node
    random_state=[0],                  # fixed seed for reproducibility
)

In [5]:
# Collect one metrics record per (prefix, column set) and build the DataFrame once
# at the end, instead of concatenating onto an empty frame inside the loop.
rf_records = []
# Predictions of every run, keyed by (prefix, column set), so the array printed
# below belongs to the row that is reported as best.
rf_predictions = {}

for prefix in column_prefixes:
    # column-name groups for this prefix
    general_columns, log_bin_columns, idr_bin_columns = create_column_names(prefix, general_characteristics)

    # fit a Random Forest on the pre-war data, collect MAE / MPE and the predicted national GDP change
    for selected_columns, columns_category in zip([log_bin_columns], ["log_bin"]):
        mae, mpe, gdp_change, y_pred = build_model_and_predict(
            pre_war_data, prediction_data, selected_columns, "random_forest", param_grid_rf,
            log_transform=False, scale=False, total_metrics=True,
        )
        rf_records.append({"prefix": prefix, "columns": columns_category, "mae": mae, "mpe": mpe, "national_gdp_change": gdp_change})
        rf_predictions[(prefix, columns_category)] = y_pred

    print(f"Finished {prefix}")

# sort by mae (best first), print the results
rf_results = pd.DataFrame(
    rf_records, columns=["prefix", "columns", "mae", "mpe", "national_gdp_change"]
).sort_values(by="mae")
print(rf_results)

# Print the predictions of the lowest-MAE run. With a single prefix / column set this
# is the same array as before; when nothing was evaluated the print is skipped rather
# than showing a stale `y_pred` left over from another cell.
if not rf_results.empty:
    best_rf_run = rf_results.iloc[0]
    y_pred = rf_predictions[(best_rf_run["prefix"], best_rf_run["columns"])]
    print(y_pred)

Finished allangle_snow_free_hq
                  prefix  columns          mae       mpe  national_gdp_change
0  allangle_snow_free_hq  log_bin  4141.319873  7.762472           -27.843676
[28314.78442318 25950.72000971 21401.98477722 46346.73081339
 72136.77912726 22161.07995837 21059.27081207 39423.24622299
 25330.01419429 21418.90632189 21370.41410471 36322.15995263
 72452.00566556 37643.07745155 22617.22906661 21271.14295177
 72147.87078542 69701.14665265 21268.97794756 20889.9130924
 22443.77384051 21656.06011256 12905.53760094 72374.06344949
 99461.13079797]


Lasso

In [None]:
# Hyper-parameter search space handed to build_model_and_predict for "lasso":
# only the alpha value is varied.
param_grid_lasso = dict(alpha=[0.1, 0.5, 1])

In [None]:
# Collect one metrics record per (prefix, column set) and build the DataFrame once
# at the end, instead of concatenating onto an empty frame inside the loop.
lasso_records = []

for prefix in column_prefixes:
    # column-name groups for this prefix
    general_columns, log_bin_columns, idr_bin_columns = create_column_names(prefix, general_characteristics)

    # fit a Lasso model per column set on the pre-war data, collect MAE / MPE and
    # the predicted national GDP change
    for selected_columns, columns_category in zip([general_columns, log_bin_columns, idr_bin_columns], ["general", "log_bin", "idr_bin"]):
        # The fourth return value is treated as the predictions (`y_pred`) at the other
        # build_model_and_predict call sites in this notebook, so it is named
        # accordingly here instead of `best_params`; it is not used further in this cell.
        mae, mpe, gdp_change, lasso_y_pred = build_model_and_predict(
            pre_war_data, prediction_data, selected_columns, "lasso", param_grid_lasso,
            log_transform=False, scale=False, total_metrics=True,
        )
        lasso_records.append({"prefix": prefix, "columns": columns_category, "mae": mae, "mpe": mpe, "national_gdp_change": gdp_change})

    print(f"Finished {prefix}")

# sort by mae (best first), print the results
lasso_results = pd.DataFrame(
    lasso_records, columns=["prefix", "columns", "mae", "mpe", "national_gdp_change"]
).sort_values(by="mae")
print(lasso_results)