In [2]:
from tools import *
import warnings
warnings.filterwarnings('ignore')

from keras.callbacks import EarlyStopping

from keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler







Prepare the data

In [6]:
# load tabular ukraine data
ukraine_data = pd.read_csv('data/tabular_data_ukraine.csv')

# delete Kyiv and Kyiv_Oblast
# ukraine_data = ukraine_data[ukraine_data["region"] != "Kyiv"]
# ukraine_data = ukraine_data[ukraine_data["region"] != "Kyiv_Oblast"]
ukraine_data = ukraine_data[ukraine_data["region"] != "Kyiv_Oblast_City"]


# get training, test, pre_war and prediction data
train_data = ukraine_data[ukraine_data['year'] < 2021]
test_data = ukraine_data[ukraine_data['year'] == 2021]
pre_war_data = ukraine_data[ukraine_data['year'] < 2022]
prediction_data = ukraine_data[ukraine_data['year'] == 2022]

# reset index
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)
pre_war_data = pre_war_data.reset_index(drop=True)
prediction_data = prediction_data.reset_index(drop=True)

# column_prefixes = ("nearnad_snow_cov", "nearnad_snow_free", "offnad_snow_cov",
#                    "offnad_snow_free", "allangle_snow_cov", "allangle_snow_free", 
#                    "nearnad_snow_free_hq", "offnad_snow_free_hq", "allangle_snow_free_hq")

# column_prefixes = ("nearnad_snow_free_hq", "offnad_snow_free_hq", "allangle_snow_free_hq")

column_prefixes = ["allangle_snow_free_hq"]

general_characteristics = ["sum"]

In [7]:
pre_war_data.sort_values(by=['region', 'year'], inplace=True)
pre_war_data_diff = pre_war_data.groupby('region').diff()
pre_war_data_diff['region'] = pre_war_data['region']
pre_war_data_diff['year'] = pre_war_data['year']
pre_war_data_diff.reset_index(drop=True, inplace=True)
pre_war_data_diff = pre_war_data_diff[pre_war_data_diff['year'] != 2012]

prediction_data = pd.concat([ukraine_data[ukraine_data['year'] == 2022], ukraine_data[ukraine_data['year'] == 2021]])
prediction_data.sort_values(by=['region', 'year'], inplace=True)
prediction_data['real_gdp'] = 0
prediction_data_diff = prediction_data.groupby('region').diff()
prediction_data_diff['region'] = prediction_data['region']
prediction_data_diff['year'] = prediction_data['year']
prediction_data_diff.reset_index(drop=True, inplace=True)
prediction_data_diff = prediction_data_diff[prediction_data_diff['year'] == 2022]

Define general functions

XGBoost

In [8]:
# Define parameter grid for XGBoost
param_grid_xgb = {
    'eta': [0.1, 0.2, 0.3, 0.4],
    'gamma': [0, 5, 10, 20],
    'max_depth': [4, 6, 8, 10],
    'min_child_weight': [3, 4, 5, 6],
    'random_state': [0] 
}

In [9]:
# initialise a df to store the results
xgb_results = pd.DataFrame(columns=["prefix", "columns", "mae", "mpe", "national_gdp_change"])

for prefix in column_prefixes:
    # create general column names
    general_columns, log_bin_columns, idr_bin_columns = create_column_names(prefix, general_characteristics)
    
    # build xgb models for each year, calculate average mpe and mse, predict the national gdp change, add the results to the df
    for selected_columns, columns_category in zip([log_bin_columns], ["log_bin"]):
        mae, mpe, gdp_change, y_pred, best_params, metrics = build_model_and_predict(pre_war_data_diff, prediction_data_diff, selected_columns, "xgboost", param_grid_xgb, log_transform = False, scale = False, total_metrics = True, diff = True)
        new_results = pd.DataFrame([{"prefix": prefix, "columns": columns_category, "mae": mae, "mpe": mpe, "national_gdp_change": gdp_change}])
        xgb_results = pd.concat([xgb_results, new_results], ignore_index=True)

    print(f"Finished {prefix}")

# save metrics as a csv
# metrics.to_csv("xgb_results.csv", index=False)

Finished allangle_snow_free_hq


In [12]:
metrics["mae"].mean()

2871.5517195111765

Random Forest

In [9]:
# Define parameter grid for Random Forest
param_grid_rf = {
        'n_estimators': [100, 200, 300, 400],
        'max_depth': [5, 10, 15, 20],  # Maximum depth of the tree
        'min_samples_split': [2, 4, 6, 8],  # Minimum number of samples required to split an internal node
        'min_samples_leaf': [2, 4, 6, 8],  # Minimum number of samples required to be at a leaf node
        'random_state': [0],  # Ensures reproducibility

}

In [10]:
# initialise a df to store the results
rf_results = pd.DataFrame(columns=["prefix", "columns", "mae", "mpe", "national_gdp_change"])

for prefix in column_prefixes:
    # create general column names
    general_columns, log_bin_columns, idr_bin_columns = create_column_names(prefix, general_characteristics)
    
    # build xgb models and predict the national gdp change, add the results to the df
    for selected_columns, columns_category in zip([log_bin_columns], ["log_bin"]):
        mae, mpe, gdp_change, y_pred, best_params, metrics = build_model_and_predict(pre_war_data_diff, prediction_data_diff, selected_columns, "random_forest", param_grid_rf, log_transform = False, scale = False, total_metrics = True, diff = True)
        new_results = pd.DataFrame([{"prefix": prefix, "columns": columns_category, "mae": mae, "mpe": mpe, "national_gdp_change": gdp_change}])
        rf_results = pd.concat([rf_results, new_results], ignore_index=True)

    print(f"Finished {prefix}")

# save metrics as a csv
metrics.to_csv("rf_results.csv", index=False)

Finished allangle_snow_free_hq


In [37]:
# initialise a df to store the results
nn_results = pd.DataFrame(columns = ["mae", "year"])
log_bin_columns = ["allangle_snow_free_hq" + "_log_" + str(i) for i in range(1, 11)]
set_seed(1)

for test_year in range(2013, 2022):
    # calculate differences
    ukraine = pd.read_csv("data/tabular_data_ukraine.csv")
    ukraine = ukraine[ukraine["region"] != "Kyiv_Oblast_City"]
    ukraine_sum = ukraine[['year', 'region', 'real_gdp', 'allangle_snow_free_hq_sum'] + log_bin_columns]
    ukraine = ukraine[ukraine["year"] < 2022]

    full_data = ukraine_sum
    full_data.sort_values(by=['region', 'year'], inplace=True)
    full_data_diff = full_data.groupby('region').diff()
    full_data_diff['region'] = full_data['region']
    full_data_diff['year'] = full_data['year']
    full_data_diff.reset_index(drop=True, inplace=True)
    full_data_diff.dropna(inplace=True)

    train_data_diff = full_data_diff[full_data_diff["year"] != test_year]
    test_data_diff = full_data_diff[full_data_diff["year"] == test_year]
    test_data_diff.reset_index(drop=True, inplace=True)
    train_data_diff.reset_index(drop=True, inplace=True)

    # build the model
    # mae, _, _ = build_model(train_data_diff, test_data_diff, selected_columns, "random_forest", param_grid_rf, 
    #                                                                             log_transform = False, scale = False)

    train_data_nn = train_data_diff.copy()
    pred_data_nn = test_data_diff.copy()

    train_data_nn = pd.get_dummies(train_data_nn, columns=["region"])
    pred_data_nn = pd.get_dummies(pred_data_nn, columns=["region"])

    X_train_nn = train_data_nn.drop(columns=["year", "real_gdp"])
    y_train_nn = train_data_nn["real_gdp"]

    X_test_nn = pred_data_nn.drop(columns=["year", "real_gdp"])
    y_test_nn = pred_data_nn["real_gdp"]

    X_train_nn = np.array(X_train_nn, dtype=np.float32)
    X_test_nn = np.array(X_test_nn, dtype=np.float32)
    y_train_nn = np.array(y_train_nn, dtype=np.float32)
    y_test_nn = np.array(y_test_nn, dtype=np.float32)


    # scale the data
    scaler = StandardScaler()
    X_train_nn = scaler.fit_transform(X_train_nn)
    X_test_nn = scaler.transform(X_test_nn)

    # fit the model on the X_train and y_train
    model = Sequential()
    # model.add(Dense(256, activation="relu", input_dim=X_train_nn.shape[1]))  # First layer
    # model.add(Dense(128, activation="relu"))  # First layer
    # model.add(Dense(64, activation="relu"))  # First layer
    model.add(Dense(32, activation="relu", input_dim=X_train_nn.shape[1]))  # Fourth layer
    model.add(Dense(16, activation="relu"))  # New additional layer
    model.add(Dense(8, activation="relu"))  # New additional layer
    model.add(Dense(1, activation = 'linear')) 

    model.compile(optimizer="adam", loss="mean_squared_error", metrics=["mae"])


    early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
    model.fit(X_train_nn, y_train_nn, epochs=500, batch_size=64, validation_split=0.2, callbacks=[early_stopping])

    y_pred = model.predict(X_test_nn).flatten()
    mae = np.mean(np.abs(y_pred - y_test_nn))

    # add the results to the dataframe
    new_row = pd.DataFrame([{
        "mae": mae,
        "year": test_year
    }])

    nn_results = pd.concat([nn_results, new_row], ignore_index=True)
    print("Results for year = ", test_year)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [41]:
# nn_results.to_csv("nn_results_aa_sf_hq.csv", index=False)
print(nn_results["mae"].mean())


2631.9602


In [42]:
nn_results

Unnamed: 0,mae,year
0,2284.263672,2013
1,3435.992188,2014
2,4568.526367,2015
3,2481.959961,2016
4,1909.524658,2017
5,3141.90332,2018
6,1757.951294,2019
7,2198.557861,2020
8,1908.963745,2021


In [77]:
# predict
log_bin_columns = ["nearnad_snow_cov" + "_log_" + str(i) for i in range(1, 11)]
ukraine = pd.read_csv("data/tabular_data_ukraine.csv")
# ukraine = pd.read_csv("data/tabular_data_poland.csv")
ukraine = ukraine[ukraine["region"] != "Kyiv_Oblast_City"]
ukraine_sum = ukraine[['year', 'region', 'real_gdp', 'nearnad_snow_cov_sum'] + log_bin_columns]
ukraine_sum = ukraine_sum[ukraine_sum["year"] < 2023]

full_data = ukraine_sum
full_data.sort_values(by=['region', 'year'], inplace=True)
full_data_diff = full_data.groupby('region').diff()
full_data_diff['region'] = full_data['region']
full_data_diff['year'] = full_data['year']
full_data_diff.reset_index(drop=True, inplace=True)

train_data_diff = full_data_diff[full_data_diff["year"] != 2022]
train_data_diff.dropna(inplace=True)
test_data_diff = full_data_diff[full_data_diff["year"] == 2022]
test_data_diff.reset_index(drop=True, inplace=True)
train_data_diff.reset_index(drop=True, inplace=True)

# build the model
# mae, _, _ = build_model(train_data_diff, test_data_diff, selected_columns, "random_forest", param_grid_rf, 
#                                                                             log_transform = False, scale = False)

train_data_nn = train_data_diff.copy()
pred_data_nn = test_data_diff.copy()

train_data_nn = pd.get_dummies(train_data_nn, columns=["region"])
pred_data_nn = pd.get_dummies(pred_data_nn, columns=["region"])

X_train_nn = train_data_nn.drop(columns=["year", "real_gdp"])
y_train_nn = train_data_nn["real_gdp"]

X_test_nn = pred_data_nn.drop(columns=["year", "real_gdp"])
y_test_nn = pred_data_nn["real_gdp"]

X_train_nn = np.array(X_train_nn, dtype=np.float32)
X_test_nn = np.array(X_test_nn, dtype=np.float32)
y_train_nn = np.array(y_train_nn, dtype=np.float32)
y_test_nn = np.array(y_test_nn, dtype=np.float32)


# scale the data
scaler = StandardScaler()
X_train_nn = scaler.fit_transform(X_train_nn)
X_test_nn = scaler.transform(X_test_nn)
set_seed(1) 
# fit the model on the X_train and y_train
model = Sequential()
# model.add(Dense(256, activation="relu", input_dim=X_train_nn.shape[1]))  # First layer
# model.add(Dense(128, activation="relu"))  # First layer
# model.add(Dense(64, activation="relu"))  # First layer
model.add(Dense(32, activation="relu", input_dim=X_train_nn.shape[1]))  # Fourth layer
model.add(Dense(16, activation="relu"))  # New additional layer
model.add(Dense(8, activation="relu"))  # New additional layer
model.add(Dense(1, activation = 'linear')) 

model.compile(optimizer="adam", loss="mean_squared_error", metrics=["mae"], )


# add early stopping
early_stopping = EarlyStopping(monitor="val_loss", patience=30, restore_best_weights=True)
model.fit(X_train_nn, y_train_nn, epochs=200, batch_size=64, validation_split=0.2)

y_pred = model.predict(X_test_nn).flatten()


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [78]:
sum(y_pred)/ukraine_sum[ukraine_sum["year"] == 2021]["real_gdp"].sum()

-0.08447469647039074

In [79]:

test_data_diff["gdp_pred"] = y_pred
ukraine_2022_pred = test_data_diff[["region", "gdp_pred"]]
ukraine_2021 = ukraine_sum[ukraine_sum["year"] == 2021]
ukraine_2022_pred = pd.merge(ukraine_2022_pred, ukraine_2021[["region", "real_gdp"]], on=["region"])
ukraine_2022_pred["gdp_pred"] = ukraine_2022_pred["gdp_pred"] + ukraine_2022_pred["real_gdp"]
ukraine_2022_pred = ukraine_2022_pred[["region", "gdp_pred"]]

# save the results
ukraine_2022_pred.to_csv("gdp_predictions_ukraine_nn_sc_lq.csv", index=False)

In [21]:
ukraine_2022_pred

Unnamed: 0,region,gdp_pred
0,Cherkasy_Oblast,25683.734003
1,Chernihiv_Oblast,17936.880791
2,Chernivtsi_Oblast,6675.608835
3,Dnipropetrovsk_Oblast,58510.873612
4,Donetsk_Oblast,33712.934306
5,Ivano-Frankivsk_Oblast,21746.145738
6,Kharkiv_Oblast,45717.755352
7,Kherson_Oblast,16179.705971
8,Khmelnytskyi_Oblast,23206.423099
9,Kirovohrad_Oblast,17040.693394


In [19]:
test_data_diff["gdp_pred"] = y_pred
poland_2022_pred = test_data_diff[["region", "gdp_pred", "real_gdp"]]
poland_2022_pred.to_csv("gdp_predictions_poland_nn.csv", index=False)   