In [1]:
# NOTE(review): star import — the names used below without a visible import
# (pd, create_column_names, build_model_and_predict) presumably come from the
# project-local `tools` module; confirm and consider explicit imports.
from tools import *
import warnings
# Suppress every warning for the rest of the session (this also hides
# deprecation warnings raised by the libraries used below).
warnings.filterwarnings('ignore')






Prepare the data

In [7]:
# Read the tabular Ukraine dataset and drop the rows of the "Kyiv_Oblast_City" region.
# (Dropping "Kyiv" and "Kyiv_Oblast" individually is an alternative that is
# currently not applied.)
ukraine_data = pd.read_csv('data/tabular_data_ukraine.csv')
ukraine_data = ukraine_data.loc[ukraine_data["region"].ne("Kyiv_Oblast_City")]

# Split by year; every subset gets a fresh 0..n-1 index.
#   train_data      -> years before 2021
#   test_data       -> 2021 only
#   pre_war_data    -> years before 2022
#   prediction_data -> 2022 only
train_data = ukraine_data.loc[ukraine_data['year'].lt(2021)].reset_index(drop=True)
test_data = ukraine_data.loc[ukraine_data['year'].eq(2021)].reset_index(drop=True)
pre_war_data = ukraine_data.loc[ukraine_data['year'].lt(2022)].reset_index(drop=True)
prediction_data = ukraine_data.loc[ukraine_data['year'].eq(2022)].reset_index(drop=True)

# Column prefixes to evaluate. Only "allangle_snow_free_hq" is active.
# Other candidate prefixes that were listed here (currently disabled):
#   nearnad_snow_cov, nearnad_snow_free, offnad_snow_cov, offnad_snow_free,
#   allangle_snow_cov, allangle_snow_free,
#   nearnad_snow_free_hq, offnad_snow_free_hq, allangle_snow_free_hq
column_prefixes = ["allangle_snow_free_hq"]

# Characteristics passed to create_column_names together with each prefix.
general_characteristics = ["sum"]

In [8]:
# --- Pre-war period: differences between consecutive rows of each region ---
# Sort by region and year first so diff() compares neighbouring years of one region.
pre_war_data = pre_war_data.sort_values(by=['region', 'year'])
pre_war_data_diff = (
    pre_war_data
    .groupby('region')
    .diff()
    # 'region' and 'year' are (re)set from the sorted source rows, aligned on the index,
    # so they hold the actual values rather than differences
    .assign(region=pre_war_data['region'], year=pre_war_data['year'])
    .reset_index(drop=True)
    # NOTE(review): rows for 2012 are discarded — presumably 2012 is the first year
    # of every region, where no difference can be formed; confirm against the data
    .loc[lambda frame: frame['year'] != 2012]
)

# --- Prediction period: change from 2021 to 2022 per region ---
# prediction_data is rebuilt here from the 2022 and 2021 rows, with real_gdp set to 0
# for all of them (so its difference is 0 as well).
prediction_data = (
    pd.concat([
        ukraine_data[ukraine_data['year'] == 2022],
        ukraine_data[ukraine_data['year'] == 2021],
    ])
    .sort_values(by=['region', 'year'])
    .assign(real_gdp=0)
)
prediction_data_diff = (
    prediction_data
    .groupby('region')
    .diff()
    .assign(region=prediction_data['region'], year=prediction_data['year'])
    .reset_index(drop=True)
    # keep only the 2022 rows
    .loc[lambda frame: frame['year'] == 2022]
)

Define general functions

XGBoost

In [13]:
# Parameter grid for XGBoost: each key maps to the list of candidate values
# handed to build_model_and_predict.
param_grid_xgb = dict(
    eta=[0.1, 0.2, 0.3, 0.4],
    gamma=[10, 20, 50, 100],
    max_depth=[4, 6, 8, 10],
    min_child_weight=[3, 4, 5, 6],
    random_state=[0],
)

In [14]:
# Collect one summary row per (prefix, column category). The rows are gathered in a
# list and turned into a DataFrame once after the loop, instead of concatenating onto
# an initially empty frame on every iteration.
xgb_records = []

for prefix in column_prefixes:
    # create the column-name groups for this prefix
    general_columns, log_bin_columns, idr_bin_columns = create_column_names(prefix, general_characteristics)

    # fit XGBoost on the pre-war differences, predict on the 2022 differences and
    # record MAE, MPE and the predicted national GDP change (log-bin columns only)
    for selected_columns, columns_category in zip([log_bin_columns], ["log_bin"]):
        mae, mpe, gdp_change, y_pred, best_params, metrics = build_model_and_predict(
            pre_war_data_diff,
            prediction_data_diff,
            selected_columns,
            "xgboost",
            param_grid_xgb,
            log_transform=False,
            scale=False,
            total_metrics=True,
            diff=True,
        )
        xgb_records.append({
            "prefix": prefix,
            "columns": columns_category,
            "mae": mae,
            "mpe": mpe,
            "national_gdp_change": gdp_change,
        })

    print(f"Finished {prefix}")

# Passing `columns` keeps the expected schema even when no rows were collected.
xgb_results = pd.DataFrame(
    xgb_records,
    columns=["prefix", "columns", "mae", "mpe", "national_gdp_change"],
)

# save metrics as a csv
# NOTE(review): this writes `metrics` from the last loop iteration only, under the
# name "xgb_results.csv"; the accumulated `xgb_results` frame is never saved.
# Confirm which of the two is meant to be persisted.
metrics.to_csv("xgb_results.csv", index=False)

Finished allangle_snow_free_hq


Random Forest

In [16]:
# Parameter grid for Random Forest: each key maps to the list of candidate values
# handed to build_model_and_predict.
param_grid_rf = dict(
    n_estimators=[100, 200, 300, 400],
    max_depth=[5, 10, 15, 20],          # maximum depth of the tree
    min_samples_split=[2, 4, 6, 8],     # minimum samples required to split an internal node
    min_samples_leaf=[2, 4, 6, 8],      # minimum samples required at a leaf node
    random_state=[0],                   # fixed seed for reproducibility
)

In [17]:
# Collect one summary row per (prefix, column category). The rows are gathered in a
# list and turned into a DataFrame once after the loop, instead of concatenating onto
# an initially empty frame on every iteration.
rf_records = []

for prefix in column_prefixes:
    # create the column-name groups for this prefix
    general_columns, log_bin_columns, idr_bin_columns = create_column_names(prefix, general_characteristics)

    # fit a Random Forest on the pre-war differences, predict on the 2022 differences
    # and record MAE, MPE and the predicted national GDP change (log-bin columns only)
    for selected_columns, columns_category in zip([log_bin_columns], ["log_bin"]):
        mae, mpe, gdp_change, y_pred, best_params, metrics = build_model_and_predict(
            pre_war_data_diff,
            prediction_data_diff,
            selected_columns,
            "random_forest",
            param_grid_rf,
            log_transform=False,
            scale=False,
            total_metrics=True,
            diff=True,
        )
        rf_records.append({
            "prefix": prefix,
            "columns": columns_category,
            "mae": mae,
            "mpe": mpe,
            "national_gdp_change": gdp_change,
        })

    print(f"Finished {prefix}")

# Passing `columns` keeps the expected schema even when no rows were collected.
rf_results = pd.DataFrame(
    rf_records,
    columns=["prefix", "columns", "mae", "mpe", "national_gdp_change"],
)

# save metrics as a csv
# NOTE(review): this writes `metrics` from the last loop iteration only, under the
# name "rf_results.csv"; the accumulated `rf_results` frame is never saved.
# Confirm which of the two is meant to be persisted.
metrics.to_csv("rf_results.csv", index=False)

Finished allangle_snow_free_hq
