In [1]:
from tools import *
import warnings
warnings.filterwarnings('ignore')






Prepare the data

In [2]:
# Read the tabular Ukraine dataset.
ukraine_data = pd.read_csv('data/tabular_data_ukraine.csv')

# Exclude the rows labelled "Kyiv_Oblast_City"; rows labelled "Kyiv" and
# "Kyiv_Oblast" stay in the table (filters for those two are not applied).
ukraine_data = ukraine_data[ukraine_data["region"].ne("Kyiv_Oblast_City")]

# Split by year, giving every subset a fresh 0..n-1 index:
#   train_data      -> years before 2021
#   test_data       -> 2021 only
#   pre_war_data    -> years before 2022
#   prediction_data -> 2022 only
train_data = ukraine_data[ukraine_data['year'].lt(2021)].reset_index(drop=True)
test_data = ukraine_data[ukraine_data['year'].eq(2021)].reset_index(drop=True)
pre_war_data = ukraine_data[ukraine_data['year'].lt(2022)].reset_index(drop=True)
prediction_data = ukraine_data[ukraine_data['year'].eq(2022)].reset_index(drop=True)

# Feature-column prefixes to model. Prefixes that have been tried with this
# notebook (swap entries in or out of the list below to experiment):
#   nearnad_snow_cov,  nearnad_snow_free,  nearnad_snow_free_hq,
#   offnad_snow_cov,   offnad_snow_free,   offnad_snow_free_hq,
#   allangle_snow_cov, allangle_snow_free, allangle_snow_free_hq
column_prefixes = [
    "allangle_snow_free_hq",
]

# Characteristics passed to create_column_names() together with each prefix.
general_characteristics = (
    "sum",
    "mean",
)

In [3]:
def _diff_within_region(frame):
    """Return row-to-row differences computed separately for each region.

    `frame` is expected to be ordered by region and year already, so that each
    difference is taken against the previous row of the same region. The
    original 'region' and 'year' values are written back (aligned on the
    index) in place of whatever diff() produced for them, and the result gets
    a fresh 0..n-1 index.
    """
    delta = frame.groupby('region').diff()
    delta['region'] = frame['region']
    delta['year'] = frame['year']
    return delta.reset_index(drop=True)


# --- training frame of differences -----------------------------------------
# NOTE: pre_war_data itself is left sorted by region/year after this cell.
pre_war_data = pre_war_data.sort_values(by=['region', 'year'])
pre_war_data_diff = _diff_within_region(pre_war_data)
# Drop the 2012 rows (presumably the first year of every region, which has no
# predecessor to difference against - confirm against the data).
pre_war_data_diff = pre_war_data_diff[pre_war_data_diff['year'] != 2012]

# --- prediction frame of differences ---------------------------------------
# NOTE: this rebinds prediction_data to the 2021 + 2022 rows with real_gdp set
# to 0 for both years, so the differenced real_gdp is 0 as well.
prediction_data = pd.concat(
    [ukraine_data[ukraine_data['year'] == yr] for yr in (2022, 2021)]
)
prediction_data = prediction_data.sort_values(by=['region', 'year']).assign(real_gdp=0)
prediction_data_diff = _diff_within_region(prediction_data)
# Keep only the 2022 rows, i.e. the 2021 -> 2022 change of each region.
prediction_data_diff = prediction_data_diff[prediction_data_diff['year'] == 2022]

Define general functions

XGBoost

In [5]:
# Hyperparameter search space for the XGBoost model: four candidate values
# for each tuned parameter, with the random seed pinned to 0.
param_grid_xgb = dict(
    eta=[0.01, 0.1, 0.2, 0.3],
    gamma=[10, 20, 50, 100],
    max_depth=[4, 6, 8, 10],
    min_child_weight=[3, 4, 5, 6],
    random_state=[0],
)

In [6]:
# Fit the XGBoost model on the year-over-year differences for every prefix and
# summarise the outcome. One summary record is collected per (prefix, column
# set) and the results table is built once after the loop, rather than being
# grown with pd.concat onto an empty frame on every pass.
xgb_records = []

for prefix in column_prefixes:
    # column-name groups for this prefix; only the log-bin group is modelled here
    general_columns, log_bin_columns, idr_bin_columns = create_column_names(prefix, general_characteristics)

    # build the xgb model, collect mae and mpe and the predicted national gdp change
    for columns_category, selected_columns in [("log_bin", log_bin_columns)]:
        # build_model_and_predict comes from `tools`; its six return values are
        # unpacked positionally. best_params and metrics are reused after the loop.
        mae, mpe, gdp_change, y_pred, best_params, metrics = build_model_and_predict(pre_war_data_diff, prediction_data_diff, selected_columns, "xgboost", param_grid_xgb, log_transform = False, scale = False, total_metrics = True, diff = True)
        xgb_records.append({"prefix": prefix, "columns": columns_category, "mae": mae, "mpe": mpe, "national_gdp_change": gdp_change})

    print(f"Finished {prefix}")

# summary table, one row per fitted model (empty but with the same columns if nothing ran)
xgb_results = pd.DataFrame(xgb_records, columns=["prefix", "columns", "mae", "mpe", "national_gdp_change"])

# save the metrics returned by the last fitted model as a csv
metrics.to_csv("xgb_results.csv", index=False)

Finished allangle_snow_free_hq


In [7]:
# Show the hyperparameters left in best_params by the preceding model-fitting cell.
print(best_params)
# Tuned values in the recorded run: 'gamma': 10, 'max_depth': 8, 'min_child_weight': 3, 'eta': 0.01

{'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': 10, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 8, 'max_leaves': None, 'min_child_weight': 3, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 0, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'eta': 0.01}


Random Forest

In [8]:
# Hyperparameter search space for the Random Forest model.
param_grid_rf = {
    'n_estimators': [50, 100, 200, 300],  # Number of trees in the forest
    'max_depth': [5, 10, 15, 20],  # Maximum depth of the tree
    # Minimum number of samples required to split an internal node.
    # An integer value must be at least 2: scikit-learn rejects 1, so every
    # grid combination using it could only fail. The grid therefore starts at 2.
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'random_state': [0],  # Ensures reproducibility
}

In [9]:
# Fit the Random Forest model on the year-over-year differences for every
# prefix and summarise the outcome. One summary record is collected per
# (prefix, column set) and the results table is built once after the loop,
# rather than being grown with pd.concat onto an empty frame on every pass.
rf_records = []

for prefix in column_prefixes:
    # column-name groups for this prefix; only the log-bin group is modelled here
    general_columns, log_bin_columns, idr_bin_columns = create_column_names(prefix, general_characteristics)

    # build the random forest model, collect mae and mpe and the predicted national gdp change
    for columns_category, selected_columns in [("log_bin", log_bin_columns)]:
        # build_model_and_predict comes from `tools`; its six return values are
        # unpacked positionally. best_params and metrics are reused after the loop.
        mae, mpe, gdp_change, y_pred, best_params, metrics = build_model_and_predict(pre_war_data_diff, prediction_data_diff, selected_columns, "random_forest", param_grid_rf, log_transform = False, scale = False, total_metrics = True, diff = True)
        rf_records.append({"prefix": prefix, "columns": columns_category, "mae": mae, "mpe": mpe, "national_gdp_change": gdp_change})

    print(f"Finished {prefix}")

# summary table, one row per fitted model (empty but with the same columns if nothing ran)
rf_results = pd.DataFrame(rf_records, columns=["prefix", "columns", "mae", "mpe", "national_gdp_change"])

# save the metrics returned by the last fitted model as a csv
metrics.to_csv("rf_results.csv", index=False)

Finished allangle_snow_free_hq


In [10]:
# Show the hyperparameters left in best_params by the preceding model-fitting cell.
print(best_params)
# Tuned values in the recorded run: 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 50, 'max_depth': 15

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': 15, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}


Lasso

In [None]:
# Hyperparameter search space for the Lasso model: only the regularisation
# strength `alpha` is tuned.
param_grid_lasso = dict(
    alpha=[0.1, 0.5, 1],
)

In [None]:
# Fit Lasso models on the level data (no diff=True) for every prefix and
# column set, then rank them by mae.

# The differencing cell rebinds prediction_data to the 2021 + 2022 rows with
# real_gdp zeroed, which is not what a levels model should predict on. The
# 2022-only frame is therefore rebuilt here from ukraine_data, exactly as the
# data-preparation cell defines it.
lasso_prediction_data = ukraine_data[ukraine_data['year'] == 2022].reset_index(drop=True)

# one summary record per (prefix, column set); the table is built once after
# the loop instead of concatenating onto an empty frame on every pass
lasso_records = []

for prefix in column_prefixes:
    # create the column-name groups for this prefix
    general_columns, log_bin_columns, idr_bin_columns = create_column_names(prefix, general_characteristics)

    # build lasso models and predict the national gdp change, collect the results
    for selected_columns, columns_category in zip([general_columns, log_bin_columns, idr_bin_columns], ["general", "log_bin", "idr_bin"]):
        outcome = build_model_and_predict(pre_war_data, lasso_prediction_data, selected_columns, "lasso", param_grid_lasso, log_transform = False, scale = False, total_metrics = True)
        # The XGBoost and Random Forest cells unpack six return values
        # (mae, mpe, gdp_change, y_pred, best_params, metrics), whereas this
        # cell was written for four (mae, mpe, gdp_change, best_params).
        # Accept either form so the cell does not fail on unpacking.
        mae, mpe, gdp_change, *extras = outcome
        if len(extras) == 1:
            best_params = extras[0]
        elif len(extras) >= 2:
            best_params = extras[1]
        lasso_records.append({"prefix": prefix, "columns": columns_category, "mae": mae, "mpe": mpe, "national_gdp_change": gdp_change})

    print(f"Finished {prefix}")

# sort by mae, print the results
lasso_results = pd.DataFrame(lasso_records, columns=["prefix", "columns", "mae", "mpe", "national_gdp_change"])
lasso_results = lasso_results.sort_values(by="mae")
print(lasso_results)