# Baseline Economic Models
## Nominal Terms Growth Rates VS Levels

In [1]:
# Adjust Notebook Display
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
# Hide Warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import GridSearchCV, HalvingGridSearchCV, RandomizedSearchCV, TimeSeriesSplit
from panelsplit import PanelSplit
from sklearn.ensemble import HistGradientBoostingRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.impute import KNNImputer
from category_encoders import CatBoostEncoder
import shap

In [4]:
# Load Data
econ_df = pd.read_csv('economic_nominal_variables.csv')
econ_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10642 entries, 0 to 10641
Data columns (total 97 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   year               10642 non-null  int64  
 1   country            10642 non-null  object 
 2   geo                10642 non-null  object 
 3   GVA [A]            10642 non-null  float64
 4   GVA [B-E]          10642 non-null  float64
 5   GVA [C]            10642 non-null  float64
 6   GVA [F]            10642 non-null  float64
 7   GVA [G-I]          10642 non-null  float64
 8   GVA [G-J]          10642 non-null  float64
 9   GVA [J]            10642 non-null  float64
 10  GVA [K]            10642 non-null  float64
 11  GVA [K-N]          10642 non-null  float64
 12  GVA [L]            10642 non-null  float64
 13  GVA [M_N]          10642 non-null  float64
 14  GVA [O-Q]          10642 non-null  float64
 15  GVA [O-U]          10642 non-null  float64
 16  GVA [R-U]          106

In [5]:
# Seperate Growth Rates and Levels Data
gr_vars = [item for item in list(econ_df) if 'Gr' in item]
level_df = econ_df.drop(gr_vars, axis=1)
level_df = level_df.sort_values(by='year')
print('Levels Dataframe')
print(level_df.info())
print(' ')
gr_df_vars = gr_vars + ['year', 'country', 'geo']
growth_df = econ_df[[col for col in econ_df.columns if col in gr_df_vars]] 
growth_df = growth_df.sort_values(by='year')
print('Growth Rates Dataframe')
print(growth_df.info())

Levels Dataframe
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10642 entries, 2856 to 738
Data columns (total 50 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   year            10642 non-null  int64  
 1   country         10642 non-null  object 
 2   geo             10642 non-null  object 
 3   GVA [A]         10642 non-null  float64
 4   GVA [B-E]       10642 non-null  float64
 5   GVA [C]         10642 non-null  float64
 6   GVA [F]         10642 non-null  float64
 7   GVA [G-I]       10642 non-null  float64
 8   GVA [G-J]       10642 non-null  float64
 9   GVA [J]         10642 non-null  float64
 10  GVA [K]         10642 non-null  float64
 11  GVA [K-N]       10642 non-null  float64
 12  GVA [L]         10642 non-null  float64
 13  GVA [M_N]       10642 non-null  float64
 14  GVA [O-Q]       10642 non-null  float64
 15  GVA [O-U]       10642 non-null  float64
 16  GVA [R-U]       10642 non-null  float64
 17  GVA [TOTAL]  

In [6]:
# Get Sectoral Dataframes

sectors = ['[A]', '[B-E]', '[C]', '[F]', '[G-I]', '[G-J]', '[J]', '[K]', '[K-N]', '[L]', '[M_N]', '[O-Q]', '[O-U]', '[R-U]', '[TOTAL]']

sec_lev_dfs = []
sec_gr_dfs = []

for i in sectors:
    # Levels Data
    sec_vars_lev = [item for item in list(level_df) if i in item]
    vars_lev = ['year', 'geo', 'GDP [CNT]'] + sec_vars_lev
    sec_lev_dfs.append(level_df[[col for col in level_df.columns if col in vars_lev]])
    # Growth Data
    sec_vars_gr = [item for item in list(growth_df) if i in item]
    vars_gr = ['year', 'geo', 'Gr GDP [CNT]'] + sec_vars_gr
    sec_gr_dfs.append(growth_df[[col for col in growth_df.columns if col in vars_gr]])

In [7]:
# Get Train-Test Split

# Levels
train_lv = []
test_lv = []
for i in range(len(sec_lev_dfs)):
    train_lv.append(sec_lev_dfs[i][sec_lev_dfs[i]['year'] <= 2017])
    test_lv.append(sec_lev_dfs[i][sec_lev_dfs[i]['year'] > 2017])
    
# Growth Rates
train_gr = []
test_gr = []
for i in range(len(sec_gr_dfs)):
    train_gr.append(sec_gr_dfs[i][sec_gr_dfs[i]['year'] <= 2017])
    test_gr.append(sec_gr_dfs[i][sec_gr_dfs[i]['year'] > 2017])

In [8]:
# Parameter Grid

# XGBRegressor
#param_grid = {'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 1, 2, 3, 4, 5],
#              'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5],
#              'min_child_weight': range(1, 10, 2),
#              'max_depth': range(3, 10, 1),
#              'n_estimators': range(50, 200, 50)}

# HistGradientBoostingRegressor
param_grid = {'max_iter': range(50, 200, 50), 
              'max_leaf_nodes': range(20, 160, 20),
              'max_depth': range(3, 10, 1), 
              'min_samples_leaf': [5, 10, 15, 20, 30, 40, 50],
              'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5]}

In [9]:
# Create a scorer
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

### Base Economic Mode
Structure: $GVA_{t}$ = $GVA_{t-1}$ + $EMPLOYMENT-RATE_{t}$ + $GDP-CNT_{t}$

In [10]:
# Hyperparameter Tuning for Levels Data Models

for i in range(len(train_lv)):
    
    print('Sector ', sectors[i])
    print(' ')
    
    # Data Preparation
    train_lv[i].rename(columns=lambda x: x.replace('[', '').replace(']', ''), inplace=True)
    target = 'GVA '+ sectors[i].replace('[', '').replace(']', '')
    lag = target + ' L1'
    train_lv[i] = train_lv[i].dropna(subset = [target, lag])
    
    # Impute Missing Data
    try:
        imputer = KNNImputer(n_neighbors=2)
        X = pd.DataFrame(imputer.fit_transform(train_lv[i].drop(['year', 'geo', target], axis=1)), columns=train_lv[i].columns[-3:].tolist())
    except:
        X = train_lv[i].drop(['year', 'geo', target], axis=1)
    
    # CV Splits
    #panel_split = PanelSplit(train_lv[i].year, n_splits=5, gap=0, test_size=1)
    TSCV = TimeSeriesSplit(n_splits=5)
    
    # Cross Validation for Tuning
    hgb_reg = HistGradientBoostingRegressor(random_state = 42)
    #grid_search = GridSearchCV(hgb_reg, param_grid = param_grid, scoring = mse_scorer, cv = TSCV, n_jobs = 3, verbose = 2, error_score='raise')
    grid_search = HalvingGridSearchCV(hgb_reg, param_grid = param_grid, scoring = mse_scorer, random_state = 42, cv = TSCV, n_jobs = 3, error_score='raise')
    grid_search.fit(X, train_lv[i][target])
    
    print('GridSearch results:')
    print(f"Best parameters: {grid_search.best_params_}")
    print(' ')

Sector  [A]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 6, 'max_iter': 50, 'max_leaf_nodes': 60, 'min_samples_leaf': 5}
 
Sector  [B-E]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 4, 'max_iter': 50, 'max_leaf_nodes': 140, 'min_samples_leaf': 5}
 
Sector  [C]
 
GridSearch results:
Best parameters: {'learning_rate': 0.2, 'max_depth': 4, 'max_iter': 50, 'max_leaf_nodes': 60, 'min_samples_leaf': 5}
 
Sector  [F]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_iter': 50, 'max_leaf_nodes': 20, 'min_samples_leaf': 5}
 
Sector  [G-I]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'max_iter': 50, 'max_leaf_nodes': 80, 'min_samples_leaf': 5}
 
Sector  [G-J]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 7, 'max_iter': 50, 'max_leaf_nodes': 140, 'min_samples_leaf': 5}
 
Sector  [J]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 

In [11]:
# Hyperparameter Tuning for Growth Rates Data Models

for i in range(len(train_gr)):
    
    print('Sector ', sectors[i])
    print(' ')
    
    # Data Preparation
    train_gr[i].rename(columns=lambda x: x.replace('[', '').replace(']', ''), inplace=True)
    target = 'Gr GVA '+ sectors[i].replace('[', '').replace(']', '')
    lag = target + ' L1'
    train_gr[i] = train_gr[i].dropna(subset = [target, lag])
    
    # Impute Missing Data
    try:
        imputer = KNNImputer(n_neighbors=2)
        X = pd.DataFrame(imputer.fit_transform(train_gr[i].drop(['year', 'geo', target], axis=1)), columns=train_gr[i].columns[-3:].tolist())
    except:
        X = train_gr[i].drop(['year', 'geo', target], axis=1)
    
    # CV Splits
    #panel_split = PanelSplit(train_lv[i].year, n_splits=5, gap=0, test_size=1)
    TSCV = TimeSeriesSplit(n_splits=5)
    
    # Cross Validation for Tuning
    hgb_reg = HistGradientBoostingRegressor(random_state = 42)
    #grid_search = GridSearchCV(hgb_reg, param_grid = param_grid, scoring = mse_scorer, cv = TSCV, n_jobs = 3, verbose = 2, error_score='raise')
    grid_search = HalvingGridSearchCV(hgb_reg, param_grid = param_grid, scoring = mse_scorer, random_state = 42, cv = TSCV, n_jobs = 3, error_score='raise')
    grid_search.fit(X, train_gr[i][target])
    
    print('GridSearch results:')
    print(f"Best parameters: {grid_search.best_params_}")
    print(' ')

Sector  [A]
 
GridSearch results:
Best parameters: {'learning_rate': 0.001, 'max_depth': 8, 'max_iter': 150, 'max_leaf_nodes': 100, 'min_samples_leaf': 10}
 
Sector  [B-E]
 
GridSearch results:
Best parameters: {'learning_rate': 0.01, 'max_depth': 8, 'max_iter': 100, 'max_leaf_nodes': 40, 'min_samples_leaf': 10}
 
Sector  [C]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_iter': 50, 'max_leaf_nodes': 40, 'min_samples_leaf': 30}
 
Sector  [F]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 8, 'max_iter': 50, 'max_leaf_nodes': 40, 'min_samples_leaf': 15}
 
Sector  [G-I]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'max_iter': 50, 'max_leaf_nodes': 120, 'min_samples_leaf': 10}
 
Sector  [G-J]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'max_iter': 50, 'max_leaf_nodes': 60, 'min_samples_leaf': 10}
 
Sector  [J]
 
GridSearch results:
Best parameters: {'learning_r

In [17]:
# Check Model Results

# Data Types
model_type = ['Nominal Terms Values', 'Nominal Growth Rates']

# Levels Data Hyperparameters by Sector
learning_rates_lv = [0.1, 0.1, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.1, 0.2, 0.1]
max_depth_lv = [6, 4, 4, 5, 3, 7, 6, 4, 3, 3, 5, 5, 7, 3, 8]
max_iter_lv = [50, 50, 50, 50, 50, 50, 50, 50, 50, 100, 50, 50, 100, 150, 100]
max_leaf_lv = [60, 140, 60, 20, 80, 140, 20, 120, 40, 20, 120, 140, 140, 60, 20]
min_samples_lv = [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]

# Growth Rates Data Hyperparametrs by Sector
learning_rates_gr = [0.001, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 0.01, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.1]
max_depth_gr = [8, 8, 5, 8, 3, 3, 3, 8, 3, 8, 3, 3, 3, 5, 3]
max_iter_gr = [150, 100, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]
max_leaf_gr = [100, 40, 40, 40, 120, 60, 120, 120, 100, 20, 120, 20, 120, 120, 60]
min_samples_gr = [10, 10, 30, 15, 10, 10, 10, 15, 10, 40, 10, 20, 20, 50, 10]

for i in model_type:
    
    print(i)
    print(' ')
    
    for j in range(len(sectors)):
        
        if i == 'Nominal Terms Values':
            
            # Data Preparation
            train_lv[j].rename(columns=lambda x: x.replace('[', '').replace(']', ''), inplace=True)
            test_lv[j].rename(columns=lambda x: x.replace('[', '').replace(']', ''), inplace=True)
            target = 'GVA '+ sectors[j].replace('[', '').replace(']', '')
            lag = target + ' L1'
            train_lv[j] = train_lv[j].dropna(subset = [target, lag])
            test_lv[j] = test_lv[j].dropna(subset = [target, lag])

            # Impute Missing Data
            try:
                imputer = KNNImputer(n_neighbors=2)
                X = pd.DataFrame(imputer.fit_transform(train_lv[j].drop(['year', 'geo', target], axis=1)), columns=train_lv[j].columns[-3:].tolist())
            except:
                X = train_lv[j].drop(['year', 'geo', target], axis=1)

            # Fit HistGradientBoostingRegressor with given best parameters
            hgb = HistGradientBoostingRegressor(max_iter = max_depth_lv[j], max_depth = max_depth_lv[j], 
                                                min_samples_leaf = min_samples_lv[j], 
                                                learning_rate = learning_rates_lv[j], random_state = 42)
            hgb.fit(X, train_lv[j][target])
            
            # Show Results
            print('Sector ', sectors[j])
            print(' ')
            print('In-sample MSE :', mean_squared_error(train_lv[j][target], hgb.predict(X)))
            print('In-sample R Squared :', r2_score(train_lv[j][target], hgb.predict(X)))
            print(' ')
            print('Out of sample MSE :', mean_squared_error(test_lv[j][target], hgb.predict(test_lv[j].drop(['year', 'geo', target], axis=1))))
            print('Out of sample R Squared :', r2_score(test_lv[j][target], hgb.predict(test_lv[j].drop(['year', 'geo', target], axis=1))))
            print(' ')
            
        else:
            
            # Data Preparation
            train_gr[j].rename(columns=lambda x: x.replace('[', '').replace(']', ''), inplace=True)
            test_gr[j].rename(columns=lambda x: x.replace('[', '').replace(']', ''), inplace=True)
            target = 'Gr GVA '+ sectors[j].replace('[', '').replace(']', '')
            lag = target + ' L1'
            train_gr[j] = train_gr[j].dropna(subset = [target, lag])
            test_gr[j] = test_gr[j].dropna(subset = [target, lag])

            # Impute Missing Data
            try:
                imputer = KNNImputer(n_neighbors=2)
                X = pd.DataFrame(imputer.fit_transform(train_gr[j].drop(['year', 'geo', target], axis=1)), columns=train_gr[j].columns[-3:].tolist())
            except:
                X = train_gr[j].drop(['year', 'geo', target], axis=1)
                
                # Fit HistGradientBoostingRegressor with given best parameters
            hgb = HistGradientBoostingRegressor(max_iter = max_depth_gr[j], max_depth = max_depth_gr[j], 
                                                min_samples_leaf = min_samples_gr[j], 
                                                learning_rate = learning_rates_gr[j], random_state = 42)
            hgb.fit(X, train_gr[j][target])
            
            # Show Results
            print('Sector ', sectors[j])
            print(' ')
            print('In-sample MSE :', mean_squared_error(train_gr[j][target], hgb.predict(X)))
            print('In-sample R Squared :', r2_score(train_gr[j][target], hgb.predict(X)))
            print(' ')
            print('Out of sample MSE :', mean_squared_error(test_gr[j][target], hgb.predict(test_gr[j].drop(['year', 'geo', target], axis=1))))
            print('Out of sample R Squared :', r2_score(test_gr[j][target], hgb.predict(test_gr[j].drop(['year', 'geo', target], axis=1))))
            print(' ')

Nominal Terms Values
 
Sector  [A]
 
In-sample MSE : 12243.379242720497
In-sample R Squared : 0.6905565431999592
 
Out of sample MSE : 12243.379242720497
Out of sample R Squared : 0.6905565431999592
 
Sector  [B-E]
 
In-sample MSE : 2654647.262497549
In-sample R Squared : 0.5573471307292337
 
Out of sample MSE : 2654647.262497549
Out of sample R Squared : 0.5573471307292337
 
Sector  [C]
 
In-sample MSE : 794777.3006590288
In-sample R Squared : 0.8166624606174805
 
Out of sample MSE : 794777.3006590288
Out of sample R Squared : 0.8166624606174805
 
Sector  [F]
 
In-sample MSE : 201942.32944799596
In-sample R Squared : 0.6408586468439107
 
Out of sample MSE : 201942.32944799596
Out of sample R Squared : 0.6408586468439107
 
Sector  [G-I]
 
In-sample MSE : 5669270.972374071
In-sample R Squared : 0.4560868788117396
 
Out of sample MSE : 5669270.972374071
Out of sample R Squared : 0.4560868788117396
 
Sector  [G-J]
 
In-sample MSE : 5601032.631778388
In-sample R Squared : 0.767680657232121

### Base Economic Model 
Structure: $GVA_{t}$ = $GVA_{t-1}$ + $EMPLOYMENT-RATE_{t}$ + $GDP-CNT_{t}$ + $GEO$

In [12]:
# Encode 'geo' Variable using CatBoost Encoder

X_train_lv = []
X_test_lv = []
X_train_gr = []
X_test_gr = []

for i in range(len(sectors)):
    
    catboost_encoder = CatBoostEncoder(cols=['geo'])
    
    # Levels Data
    
    train_lv[i].rename(columns=lambda x: x.replace('[', '').replace(']', ''), inplace=True)
    test_lv[i].rename(columns=lambda x: x.replace('[', '').replace(']', ''), inplace=True)
    target = 'GVA '+ sectors[i].replace('[', '').replace(']', '')
    lag = target + ' L1'
    train_lv[i] = train_lv[i].dropna(subset = [target, lag])
    test_lv[i] = train_lv[i].dropna(subset = [target, lag])
    
    X_train_encoded = catboost_encoder.fit_transform(train_lv[i].drop(['year',target], axis=1), train_lv[i][target])
    X_train_lv.append(X_train_encoded)
    test_encoded = catboost_encoder.transform(test_lv[i].drop(['year',target], axis=1))
    X_test_lv.append(test_encoded)
    
    # Growth Rates Data
    
    train_gr[i].rename(columns=lambda x: x.replace('[', '').replace(']', ''), inplace=True)
    test_gr[i].rename(columns=lambda x: x.replace('[', '').replace(']', ''), inplace=True)
    target = 'Gr GVA '+ sectors[i].replace('[', '').replace(']', '')
    lag = target + ' L1'
    train_gr[i] = train_gr[i].dropna(subset = [target, lag])
    test_gr[i] = train_gr[i].dropna(subset = [target, lag])
    
    X_train_encoded = catboost_encoder.fit_transform(train_gr[i].drop(['year',target], axis=1), train_gr[i][target])
    X_train_gr.append(X_train_encoded)
    test_encoded = catboost_encoder.transform(test_gr[i].drop(['year',target], axis=1))
    X_test_gr.append(test_encoded)

In [13]:
# Hyperparameter Tuning for Levels Data Models

for i in range(len(sectors)):
    
    print('Sector ', sectors[i])
    print(' ')
    
    # Define Target
    target = 'GVA '+ sectors[i].replace('[', '').replace(']', '')
    
    # Impute Missing Data
    try:
        imputer = KNNImputer(n_neighbors=2)
        X = pd.DataFrame(imputer.fit_transform(X_train_lv[i]), columns=X_train_lv[i].columns.tolist())
    except:
        X = X_train_lv[i]
    
    # CV Splits
    #panel_split = PanelSplit(train_lv[i].year, n_splits=5, gap=0, test_size=1)
    TSCV = TimeSeriesSplit(n_splits=5)
    
    # Cross Validation for Tuning
    hgb_reg = HistGradientBoostingRegressor(random_state = 42)
    #grid_search = GridSearchCV(hgb_reg, param_grid = param_grid, scoring = mse_scorer, cv = TSCV, n_jobs = 3, verbose = 2, error_score='raise')
    grid_search = HalvingGridSearchCV(hgb_reg, param_grid = param_grid, scoring = mse_scorer, random_state = 42, cv = TSCV, n_jobs = 3, error_score='raise')
    grid_search.fit(X, train_lv[i][target])
    
    print('GridSearch results:')
    print(f"Best parameters: {grid_search.best_params_}")
    print(' ')

Sector  [A]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_iter': 50, 'max_leaf_nodes': 20, 'min_samples_leaf': 5}
 
Sector  [B-E]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'max_iter': 50, 'max_leaf_nodes': 120, 'min_samples_leaf': 5}
 
Sector  [C]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 7, 'max_iter': 100, 'max_leaf_nodes': 20, 'min_samples_leaf': 5}
 
Sector  [F]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 4, 'max_iter': 50, 'max_leaf_nodes': 80, 'min_samples_leaf': 5}
 
Sector  [G-I]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'max_iter': 50, 'max_leaf_nodes': 40, 'min_samples_leaf': 5}
 
Sector  [G-J]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_iter': 50, 'max_leaf_nodes': 80, 'min_samples_leaf': 5}
 
Sector  [J]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 

In [14]:
# Hyperparameter Tuning for Growth Rates Data Models

for i in range(len(sectors)):
    
    print('Sector ', sectors[i])
    print(' ')
    
    # Set Target
    target = 'Gr GVA '+ sectors[i].replace('[', '').replace(']', '')
    
    # Impute Missing Data
    try:
        imputer = KNNImputer(n_neighbors=2)
        X = pd.DataFrame(imputer.fit_transform(X_train_gr[i]), columns=X_train_gr[i].columns.tolist())
    except:
        X = X_train_gr[i]
    
    # CV Splits
    #panel_split = PanelSplit(train_lv[i].year, n_splits=5, gap=0, test_size=1)
    TSCV = TimeSeriesSplit(n_splits=5)
    
    # Cross Validation for Tuning
    hgb_reg = HistGradientBoostingRegressor(random_state = 42)
    #grid_search = GridSearchCV(hgb_reg, param_grid = param_grid, scoring = mse_scorer, cv = TSCV, n_jobs = 3, verbose = 2, error_score='raise')
    grid_search = HalvingGridSearchCV(hgb_reg, param_grid = param_grid, scoring = mse_scorer, random_state = 42, cv = TSCV, n_jobs = 3, error_score='raise')
    grid_search.fit(X, train_gr[i][target])
    
    print('GridSearch results:')
    print(f"Best parameters: {grid_search.best_params_}")
    print(' ')

Sector  [A]
 
GridSearch results:
Best parameters: {'learning_rate': 0.01, 'max_depth': 9, 'max_iter': 50, 'max_leaf_nodes': 40, 'min_samples_leaf': 5}
 
Sector  [B-E]
 
GridSearch results:
Best parameters: {'learning_rate': 0.01, 'max_depth': 8, 'max_iter': 150, 'max_leaf_nodes': 20, 'min_samples_leaf': 10}
 
Sector  [C]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 8, 'max_iter': 50, 'max_leaf_nodes': 20, 'min_samples_leaf': 30}
 
Sector  [F]
 
GridSearch results:
Best parameters: {'learning_rate': 0.01, 'max_depth': 9, 'max_iter': 150, 'max_leaf_nodes': 20, 'min_samples_leaf': 5}
 
Sector  [G-I]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'max_iter': 50, 'max_leaf_nodes': 100, 'min_samples_leaf': 10}
 
Sector  [G-J]
 
GridSearch results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'max_iter': 50, 'max_leaf_nodes': 20, 'min_samples_leaf': 10}
 
Sector  [J]
 
GridSearch results:
Best parameters: {'learning_rate

In [18]:
# Check Model Results

# Data Types
model_type = ['Nominal Terms Values', 'Nominal Growth Rates']

# Levels Data Hyperparameters by Sector
learning_rates_lv = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.1, 0.1, 0.2]
max_depth_lv = [5, 3, 7, 4, 3, 5, 9, 9, 3, 3, 7, 3, 6, 9, 5]
max_iter_lv = [50, 50, 100, 50, 50, 50, 50, 50, 100, 150, 100, 50, 100, 150, 50]
max_leaf_lv = [20, 120, 20, 80, 40, 80, 120, 20, 60, 120, 120, 80, 40, 80, 20]
min_samples_lv = [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]

# Growth Rates Data Hyperparametrs by Sector
learning_rates_gr = [0.01, 0.01, 0.1, 0.01, 0.1, 0.1, 0.1, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.1]
max_depth_gr = [9, 8, 8, 9, 3, 3, 8, 8, 8, 8, 3, 8, 8, 3, 3]
max_iter_gr = [50, 150, 50, 150, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]
max_leaf_gr = [40, 20, 20, 20, 100, 20, 20, 120, 20, 20, 80, 20, 40, 20, 120]
min_samples_gr = [5, 10, 30, 5, 10, 10, 20, 15, 20, 40, 10, 20, 20, 50, 10]

for i in model_type:
    
    print(i)
    print(' ')
    
    for j in range(len(sectors)):
        
        if i == 'Nominal Terms Values':

            # Define Target
            target = 'GVA '+ sectors[j].replace('[', '').replace(']', '')

            # Impute Missing Data
            try:
                imputer = KNNImputer(n_neighbors=2)
                X = pd.DataFrame(imputer.fit_transform(X_train_lv[j]), columns=X_train_lv[j].columns.tolist())
            except:
                X = X_train_lv[j]

            # Fit HistGradientBoostingRegressor with given best parameters
            hgb = HistGradientBoostingRegressor(max_iter = max_depth_lv[j], max_depth = max_depth_lv[j], 
                                                min_samples_leaf = min_samples_lv[j], 
                                                learning_rate = learning_rates_lv[j], random_state = 42)
            hgb.fit(X, train_lv[j][target])
            
            # Show Results
            print('Sector ', sectors[j])
            print(' ')
            print('In-sample MSE :', mean_squared_error(train_lv[j][target], hgb.predict(X)))
            print('In-sample R Squared :', r2_score(train_lv[j][target], hgb.predict(X)))
            print(' ')
            print('Out of sample MSE :', mean_squared_error(test_lv[j][target], hgb.predict(X_test_lv[j])))
            print('Out of sample R Squared :', r2_score(test_lv[j][target], hgb.predict(X_test_lv[j])))
            print(' ')
            
        else:

            # Set Target
            target = 'Gr GVA '+ sectors[j].replace('[', '').replace(']', '')

            # Impute Missing Data
            try:
                imputer = KNNImputer(n_neighbors=2)
                X = pd.DataFrame(imputer.fit_transform(X_train_gr[j]), columns=X_train_gr[j].columns.tolist())
            except:
                X = X_train_gr[j]
                
            # Fit HistGradientBoostingRegressor with given best parameters
            hgb = HistGradientBoostingRegressor(max_iter = max_depth_gr[j], max_depth = max_depth_gr[j], 
                                                min_samples_leaf = min_samples_gr[j], 
                                                learning_rate = learning_rates_gr[j], random_state = 42)
            hgb.fit(X, train_gr[j][target])
            
            # Show Results
            print('Sector ', sectors[j])
            print(' ')
            print('In-sample MSE :', mean_squared_error(train_gr[j][target], hgb.predict(X)))
            print('In-sample R Squared :', r2_score(train_gr[j][target], hgb.predict(X)))
            print(' ')
            print('Out of sample MSE :', mean_squared_error(test_gr[j][target], hgb.predict(X_test_gr[j])))
            print('Out of sample R Squared :', r2_score(test_gr[j][target], hgb.predict(X_test_gr[j])))
            print(' ')

Nominal Terms Values
 
Sector  [A]
 
In-sample MSE : 14742.287536009166
In-sample R Squared : 0.6273982594310971
 
Out of sample MSE : 14394.034533317721
Out of sample R Squared : 0.6362001278415612
 
Sector  [B-E]
 
In-sample MSE : 3293997.9409780935
In-sample R Squared : 0.4507377079642061
 
Out of sample MSE : 3293997.9409780935
Out of sample R Squared : 0.4507377079642061
 
Sector  [C]
 
In-sample MSE : 1042109.0891816892
In-sample R Squared : 0.7596084890442838
 
Out of sample MSE : 1034197.9081306999
Out of sample R Squared : 0.7614334234835227
 
Sector  [F]
 
In-sample MSE : 248512.61235547715
In-sample R Squared : 0.5580364150415298
 
Out of sample MSE : 248512.61235547715
Out of sample R Squared : 0.5580364150415298
 
Sector  [G-I]
 
In-sample MSE : 5669113.122281814
In-sample R Squared : 0.4561020230404734
 
Out of sample MSE : 5645223.970232271
Out of sample R Squared : 0.45839396204235294
 
Sector  [G-J]
 
In-sample MSE : 8492409.380845197
In-sample R Squared : 0.6477522814