# Between model
This model takes as input all static variables, i.e. the OSM variables, the ESA Landcover variables and the WSF variables. In addition, it takes the mean over time of each dynamic variable. The dynamic variables include Nightlights, NDVI, NDWI_Gao and NDWI_McF.

The idea is that the between model captures the variation between clusters; its target variable is therefore the cluster mean $\bar{w}_c = \frac{1}{T_c}\sum_{t=1}^{T_c} w_{c,t}$, where $T_c$ is the number of observations of cluster $c$.

# Within model
To augment the number of training observations, I train the model on deltas rather than on the demeaned variables. This substantially increases the number of training observations and covers a wider range of differences, making the training dataset more versatile and robust. Ideally, this helps the model to learn from a wider range of differences and thus increases the out-of-sample performance when predicting $\tilde{\boldsymbol{w}}_{ct}$.

In [21]:
import os
import pickle

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [22]:
# load the necessary functions from the analysis package

# load the variable names, this allows to access the variables in the feature data in a compact way
from analysis_utils.variable_names import *

# load flagged ids 
from analysis_utils.flagged_uids import *

# load the functions to do spatial k-fold CV
from analysis_utils.spatial_CV import *

# load the helper functions
from analysis_utils.analysis_helpers import *

# load the random forest trainer and cross_validator
import analysis_utils.RandomForest as rf

# load the combined model
from analysis_utils.CombinedModel import CombinedModel

In [23]:
# --- global configuration -------------------------------------------------
# Every input file lives below this directory.
root_data_dir = "../../Data"

# Input files: the LSMS labels and the tabular feature data.
lsms_pth = f"{root_data_dir}/lsms/processed/labels_cluster_v1.csv"
feat_data_pth = f"{root_data_dir}/feature_data/tabular_data.csv"

# Base seeds; the repeated-CV loops add the repetition index to each of them.
random_seed = 423
spatial_cv_random_seed = 348

# Number of folds used in k-fold CV.
n_folds = 5

In [24]:
# Load the label data and drop every observation whose unique_id is flagged.
lsms_df = pd.read_csv(lsms_pth)
# drop=True: without it reset_index() would add the old row labels to the
# frame as an extra 'index' column.
lsms_df = lsms_df[~lsms_df.unique_id.isin(flagged_uids)].reset_index(drop=True)
lsms_df['delta_id'] = lsms_df.unique_id

# Per-cluster mean of each outcome; these serve as the between-model targets.
lsms_df['avg_log_mean_pc_cons_usd_2017'] = lsms_df.groupby('cluster_id')['log_mean_pc_cons_usd_2017'].transform('mean')
lsms_df['avg_mean_asset_index_yeh'] = lsms_df.groupby('cluster_id')['mean_asset_index_yeh'].transform('mean')

# Load the feature data.
feat_df = pd.read_csv(feat_data_pth)

# Describe the training data broadly.
print(f"Number of observations {len(lsms_df)}")
print(f"Number of clusters {len(np.unique(lsms_df.cluster_id))}")
# NOTE(review): the -2 presumably excludes the unique_id and cluster_id columns
# of feat_df — confirm against the file header.
print(f"Number of x vars {len(feat_df.columns)-2}")

Number of observations 6401
Number of clusters 2128
Number of x vars 113


In [25]:
# Attach the feature columns to the label rows. The left join on unique_id
# keeps every label row, whether or not a feature row matches.
lsms_vars = [
    'unique_id',
    'n_households',
    'log_mean_pc_cons_usd_2017',
    'avg_log_mean_pc_cons_usd_2017',
    'mean_asset_index_yeh',
    'avg_mean_asset_index_yeh',
]
df = lsms_df[lsms_vars].merge(feat_df, on='unique_id', how='left')

# Run Training

In [26]:
# define the within and between x variables
# The individual *_vars collections are not defined in this notebook; presumably
# they come from the wildcard import of analysis_utils.variable_names.
# NOTE(review): the introduction lists NDWI_McF among the dynamic variables and
# does not mention precipitation, yet no McF collection is used below while
# precipitation is — confirm which of the two is intended.

# remote-sensing features prefixed avg_ (NDVI, NDWI Gao, nightlights)
avg_rs_vars = avg_ndvi_vars + avg_ndwi_gao_vars + avg_nl_vars
# all OSM features: distances, counts and roads
osm_vars = osm_dist_vars + osm_count_vars + osm_road_vars

# between model: OSM, ESA landcover, WSF, avg_ remote-sensing and avg_ precipitation features
between_x_vars = osm_vars + esa_lc_vars + wsf_vars + avg_rs_vars + avg_preciptiation

# within model: dyn_ remote-sensing features plus precipitation
dyn_rs_vars = dyn_ndvi_vars + dyn_ndwi_gao_vars + dyn_nl_vars
within_x_vars = dyn_rs_vars + precipitation

### Target: Log per capita consumption

In [27]:
# Between data for the consumption target: keep the cluster id, the
# cluster-mean target and the between features, then drop duplicate rows.
between_target_var = 'avg_log_mean_pc_cons_usd_2017'
between_cols = ['cluster_id', between_target_var] + between_x_vars
cl_df = df[between_cols].drop_duplicates().reset_index(drop=True)

# Standardise the features; the target column is excluded.
cl_df_norm = standardise_df(cl_df, exclude_cols=[between_target_var])

In [29]:
# Within data for the consumption target.
within_target_var = 'log_mean_pc_cons_usd_2017'
within_cols = ['cluster_id', 'unique_id', within_target_var] + within_x_vars
within_df = df[within_cols]

# Build the demeaned frame (used in validation) and the delta frame (used in training).
demeaned_df = demean_df(within_df)
print("Creating Delta df")
delta_df = make_delta_df(within_df)

# Give both frames the same id column, stack them and standardise them
# together; the target column is excluded from the standardisation.
demeaned_df = demeaned_df.rename(columns={'unique_id': 'delta_id'})
delta_dmn_df = pd.concat([delta_df, demeaned_df], ignore_index=True)
delta_dmn_df_norm = standardise_df(delta_dmn_df, exclude_cols=[within_target_var])

# Split the standardised rows back into the two groups by their ids.
is_demeaned_row = delta_dmn_df_norm.delta_id.isin(demeaned_df.delta_id)
is_delta_row = delta_dmn_df_norm.delta_id.isin(delta_df.delta_id)
demeaned_df_norm = delta_dmn_df_norm[is_demeaned_row].copy().reset_index(drop=True)
delta_df_norm = delta_dmn_df_norm[is_delta_row].copy().reset_index(drop=True)

print(f"Number of observations in delta df: {len(delta_df_norm)}")

Creating Delta df


  0%|          | 0/2128 [00:00<?, ?it/s]

Number of observations in delta df: 8600


In [9]:
# Repeated cross-validation for the consumption target.
# One list of R2 scores per component; every repetition appends one value to each.
rep_cv_res_cons = {key: [] for key in ('between_r2', 'within_r2', 'delta_r2', 'overall_r2')}

for j in range(10):
    print("=" * 100)
    print(f"Iteration {j}")
    print("=" * 100)

    # Both seeds are shifted by the repetition index, so every repetition
    # gets its own fold split and its own model seed.
    rep_seed = random_seed + j
    fold_ids = split_lsms_spatial(
        lsms_df,
        n_folds=n_folds,
        random_seed=spatial_cv_random_seed + j,
    )

    # Between model, trained on the cluster-level data.
    print('Between training')
    between_cv_trainer = rf.CrossValidator(
        cl_df_norm,
        fold_ids,
        between_target_var,
        between_x_vars,
        id_var='cluster_id',
        random_seed=rep_seed,
    )
    between_cv_trainer.run_cv_training(min_samples_leaf=1)

    # Within model, trained on the delta data.
    print("\nWithin training")
    delta_trainer = rf.CrossValidator(
        delta_df_norm,
        fold_ids,
        within_target_var,
        within_x_vars,
        id_var='delta_id',
        random_seed=rep_seed,
    )
    delta_trainer.run_cv_training(min_samples_leaf=15)

    # Performance on the delta data, recorded before the trainer's
    # predictions are replaced further down.
    delta_res = delta_trainer.compute_overall_performance(use_fold_weights=True)

    # Evaluate the delta-trained model on the demeaned data.
    delta_evaluator = rf.CV_Evaluator(
        demeaned_df_norm,
        fold_ids,
        delta_trainer,
        id_var='delta_id',
    )
    delta_evaluator.evaluate()
    delta_evaluator.compute_overall_performance()

    # Hand the demeaned-data predictions to the trainer that is passed to the combined model.
    delta_trainer.predictions = delta_evaluator.predictions

    # Combine the between and the within model and score the combination.
    combined_model = CombinedModel(lsms_df, between_cv_trainer, delta_trainer)
    combined_model.evaluate()
    combined_results = combined_model.compute_overall_performance(use_fold_weights=True)

    # Record this repetition's R2 scores.
    r2_scores = combined_results['r2']
    rep_cv_res_cons['between_r2'].append(r2_scores['between'])
    rep_cv_res_cons['within_r2'].append(r2_scores['within'])
    rep_cv_res_cons['delta_r2'].append(delta_res['val_r2'])
    rep_cv_res_cons['overall_r2'].append(r2_scores['overall'])

    # Show the results of this repetition.
    print("." * 100)
    print(combined_results)
    print(delta_res['val_r2'])
    print("." * 100)

Iteration 0
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.19
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 179 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 537 seconds
....................................................................................................
{'r2': {'overall': 0.3763864542088609, 'between': 0.47766405472198875, 'within': -0.018147430388866438}, 'mse': {'overall': 0.21855888779176405, 'between': 0.1524115651385797, 'within': 0.17773319230114065}}
-0.01837167233755093
....................................................................................................
Iteration 1
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.19
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 202 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 514 seconds
....................................................................................................
{'r2': {'overall': 0.3495235633569438, 'between': 0.452485541188437, 'within': -0.012277494170292492}, 'mse': {'overall': 0.21732390022132173, 'between': 0.15160646232075717, 'within': 0.17667340605795692}}
-0.011787398455365087
....................................................................................................
Iteration 2
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.18
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 192 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 459 seconds
....................................................................................................
{'r2': {'overall': 0.3716137957526419, 'between': 0.4813245441294979, 'within': -0.016225160545617224}, 'mse': {'overall': 0.21661935955761408, 'between': 0.1498122983236336, 'within': 0.17672298239763898}}
-0.014957765237514157
....................................................................................................
Iteration 3
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.17
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 239 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 493 seconds
....................................................................................................
{'r2': {'overall': 0.34588323540456706, 'between': 0.4505660670254869, 'within': -0.023601695595994787}, 'mse': {'overall': 0.2211368492023913, 'between': 0.15350490870634637, 'within': 0.17818656664942661}}
-0.026701200863838717
....................................................................................................
Iteration 4
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.23
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.16
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 201 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 457 seconds
....................................................................................................
{'r2': {'overall': 0.35446774796551483, 'between': 0.4578227552495696, 'within': -0.023408558546974923}, 'mse': {'overall': 0.2196834554270559, 'between': 0.15318111256398848, 'within': 0.17760864142573343}}
-0.02459103393354441
....................................................................................................
Iteration 5
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.26
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.13
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 285 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 489 seconds
....................................................................................................
{'r2': {'overall': 0.36979817878179694, 'between': 0.4720337870786768, 'within': -0.007958044346077098}, 'mse': {'overall': 0.2130593824690269, 'between': 0.14889884901028436, 'within': 0.17710141692399145}}
-0.008216404224330776
....................................................................................................
Iteration 6
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.18
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 208 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 502 seconds
....................................................................................................
{'r2': {'overall': 0.3773823236854457, 'between': 0.47962406763698506, 'within': -0.010330839825649063}, 'mse': {'overall': 0.21611518384921077, 'between': 0.14960635985083662, 'within': 0.17671986682500754}}
-0.01245741625766459
....................................................................................................
Iteration 7
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.25
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.14
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 209 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 458 seconds
....................................................................................................
{'r2': {'overall': 0.3592767530577625, 'between': 0.4667590487192104, 'within': -0.010001587050515393}, 'mse': {'overall': 0.21389137481011844, 'between': 0.1494449647477894, 'within': 0.17684032779804568}}
-0.011000974591502065
....................................................................................................
Iteration 8
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.23
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.16
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 201 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 521 seconds
....................................................................................................
{'r2': {'overall': 0.35940032268522154, 'between': 0.46790376125867494, 'within': -0.010115951292006774}, 'mse': {'overall': 0.2170716853334766, 'between': 0.15123758738520723, 'within': 0.17479998790640833}}
-0.01166933405764155
....................................................................................................
Iteration 9
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.19
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 188 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 468 seconds
....................................................................................................
{'r2': {'overall': 0.3645485457025137, 'between': 0.4728559804377793, 'within': -0.015896056011614923}, 'mse': {'overall': 0.21655514406445342, 'between': 0.15039117716823575, 'within': 0.1777265255708545}}
-0.014975757597155193
....................................................................................................


In [10]:
# Store the repeated-CV scores of the consumption target on disk.
pth = 'results/baseline/rep_cv_cons_delta.pkl'
# open(..., 'wb') does not create missing folders, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(pth), exist_ok=True)
with open(pth, 'wb') as f:
    pickle.dump(rep_cv_res_cons, f)

In [17]:
# Reload the stored consumption scores and summarise them for the results table.
pth = 'results/baseline/rep_cv_cons_delta.pkl'
with open(pth, 'rb') as f:
    rep_cv_res_cons = pickle.load(f)

# Mean and standard error of each score across repetitions. The standard error
# divides by the actual number of stored repetitions rather than a fixed 10,
# so it stays correct if the number of repetitions changes.
# NOTE(review): np.std defaults to ddof=0 — confirm this is the intended estimator.
mean_r2_cons = {k: np.mean(v) for k, v in rep_cv_res_cons.items()}
se_r2_cons = {k: np.std(v) / np.sqrt(len(v)) for k, v in rep_cv_res_cons.items()}

# print as tex
print(f"& {mean_r2_cons['between_r2']:.4f} & {mean_r2_cons['within_r2']:.4f} & {mean_r2_cons['overall_r2']:.4f} & {mean_r2_cons['delta_r2']:.4f}  \\\\")
print(f"& \\footnotesize({se_r2_cons['between_r2']:.4f}) & \\footnotesize({se_r2_cons['within_r2']:.4f}) & \\footnotesize({se_r2_cons['overall_r2']:.4f}) & \\footnotesize({se_r2_cons['delta_r2']:.4f})\\\\")


& 0.4679 & -0.0148 & 0.3628 & -0.0155  \\
& \footnotesize(0.0033) & \footnotesize(0.0017) & \footnotesize(0.0033) & \footnotesize(0.0018)\\


In [7]:
# Read the stored consumption scores back in under a separate name.
pth = 'results/baseline/rep_cv_cons_delta.pkl'
with open(pth, mode='rb') as f:
    cons_res = pickle.load(f)

In [15]:
# Mean delta R2 and its standard error across repetitions. The divisor is the
# actual number of stored repetitions instead of a hard-coded 10.
print(f"Delta R2: {np.mean(cons_res['delta_r2']):.4f}")
print(f"Delta SE: {np.std(cons_res['delta_r2'])/np.sqrt(len(cons_res['delta_r2'])):.4f}")

Delta R2: -0.0155
Delta SE: 0.0018


### Target: Asset index

In [12]:
# Between data for the asset-index target: keep the cluster id, the
# cluster-mean target and the between features, then drop duplicate rows so
# the data only varies at the cluster level.
between_target_var = 'avg_mean_asset_index_yeh'
between_cols = ['cluster_id', between_target_var] + between_x_vars
cl_df = df[between_cols].drop_duplicates().reset_index(drop=True)

# Standardise the features; the target column is excluded.
cl_df_norm = standardise_df(cl_df, exclude_cols=[between_target_var])

In [13]:
# Within data for the asset-index target.
within_target_var = 'mean_asset_index_yeh'
within_cols = ['cluster_id', 'unique_id', within_target_var] + within_x_vars
within_df = df[within_cols]

# Build the demeaned frame (used in validation) and the delta frame (used in training).
demeaned_df = demean_df(within_df)
print("Creating Delta df")
delta_df = make_delta_df(within_df)

# Give both frames the same id column, stack them and standardise them
# together; the target column is excluded from the standardisation.
demeaned_df = demeaned_df.rename(columns={'unique_id': 'delta_id'})
delta_dmn_df = pd.concat([delta_df, demeaned_df], ignore_index=True)
delta_dmn_df_norm = standardise_df(delta_dmn_df, exclude_cols=[within_target_var])

# Split the standardised rows back into the two groups by their ids.
is_demeaned_row = delta_dmn_df_norm.delta_id.isin(demeaned_df.delta_id)
is_delta_row = delta_dmn_df_norm.delta_id.isin(delta_df.delta_id)
demeaned_df_norm = delta_dmn_df_norm[is_demeaned_row].copy().reset_index(drop=True)
delta_df_norm = delta_dmn_df_norm[is_delta_row].copy().reset_index(drop=True)

Creating Delta df


  0%|          | 0/2128 [00:00<?, ?it/s]

In [14]:
# Repeated cross-validation for the asset-index target.
# One list of R2 scores per component; every repetition appends one value to each.
rep_cv_res_asset = {key: [] for key in ('between_r2', 'within_r2', 'delta_r2', 'overall_r2')}

for j in range(10):
    print("=" * 100)
    print(f"Iteration {j}")
    print("=" * 100)

    # Both seeds are shifted by the repetition index, so every repetition
    # gets its own fold split and its own model seed.
    rep_seed = random_seed + j
    fold_ids = split_lsms_spatial(
        lsms_df,
        n_folds=n_folds,
        random_seed=spatial_cv_random_seed + j,
    )

    # Between model, trained on the cluster-level data.
    print('Between training')
    between_cv_trainer = rf.CrossValidator(
        cl_df_norm,
        fold_ids,
        between_target_var,
        between_x_vars,
        id_var='cluster_id',
        random_seed=rep_seed,
    )
    between_cv_trainer.run_cv_training(min_samples_leaf=1)

    # Within model, trained on the delta data.
    print("\nWithin training")
    delta_trainer = rf.CrossValidator(
        delta_df_norm,
        fold_ids,
        within_target_var,
        within_x_vars,
        id_var='delta_id',
        random_seed=rep_seed,
    )
    delta_trainer.run_cv_training(min_samples_leaf=15)

    # Performance on the delta data, recorded before the trainer's
    # predictions are replaced further down.
    delta_res = delta_trainer.compute_overall_performance(use_fold_weights=True)

    # Evaluate the delta-trained model on the demeaned data.
    delta_evaluator = rf.CV_Evaluator(
        demeaned_df_norm,
        fold_ids,
        delta_trainer,
        id_var='delta_id',
    )
    delta_evaluator.evaluate()
    delta_evaluator.compute_overall_performance()

    # Hand the demeaned-data predictions to the trainer that is passed to the combined model.
    delta_trainer.predictions = delta_evaluator.predictions

    # Combine the between and the within model and score the combination.
    combined_model = CombinedModel(lsms_df, between_cv_trainer, delta_trainer)
    combined_model.evaluate()
    combined_results = combined_model.compute_overall_performance(use_fold_weights=True)

    # Record this repetition's R2 scores.
    r2_scores = combined_results['r2']
    rep_cv_res_asset['between_r2'].append(r2_scores['between'])
    rep_cv_res_asset['within_r2'].append(r2_scores['within'])
    rep_cv_res_asset['delta_r2'].append(delta_res['val_r2'])
    rep_cv_res_asset['overall_r2'].append(r2_scores['overall'])

    # Show the results of this repetition.
    print("." * 100)
    print(combined_results)
    print(delta_res['val_r2'])
    print("." * 100)

Iteration 0
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.19
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 193 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 449 seconds
....................................................................................................
{'r2': {'overall': 0.49734093118670436, 'between': 0.5269073580780452, 'within': 0.02411204374755973}, 'mse': {'overall': 1.1166528436352692, 'between': 1.1501767333281865, 'within': 0.2558077407467569}}
0.024820225470310948
....................................................................................................
Iteration 1
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.19
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 206 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 552 seconds
....................................................................................................
{'r2': {'overall': 0.4776733904862209, 'between': 0.5141778707953452, 'within': 0.014532102738887076}, 'mse': {'overall': 1.110836571768237, 'between': 1.1323349221025276, 'within': 0.26018208370507645}}
0.014667535840295513
....................................................................................................
Iteration 2
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.18
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 184 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 455 seconds
....................................................................................................
{'r2': {'overall': 0.48983703257624556, 'between': 0.5227742028540355, 'within': 0.03592547114635547}, 'mse': {'overall': 1.113255813738592, 'between': 1.1417236269024638, 'within': 0.2543618798788891}}
0.03588764742145118
....................................................................................................
Iteration 3
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.17
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 251 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 468 seconds
....................................................................................................
{'r2': {'overall': 0.4817925602173824, 'between': 0.5084122798849106, 'within': 0.008563284832168616}, 'mse': {'overall': 1.1188688905858584, 'between': 1.1503970613830434, 'within': 0.2512048125547214}}
0.011576087831282264
....................................................................................................
Iteration 4
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.23
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.16
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 227 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 487 seconds
....................................................................................................
{'r2': {'overall': 0.47596220629071956, 'between': 0.5032669435267736, 'within': 0.025518128616211232}, 'mse': {'overall': 1.1032535543661561, 'between': 1.1438875304998806, 'within': 0.25259616486349845}}
0.02530934650114543
....................................................................................................
Iteration 5
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.26
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.13
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 218 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 526 seconds
....................................................................................................
{'r2': {'overall': 0.46220138702057767, 'between': 0.4830192127123397, 'within': 0.022199367988587702}, 'mse': {'overall': 1.1073295924895978, 'between': 1.162587849423459, 'within': 0.2507068867026979}}
0.02181055106571556
....................................................................................................
Iteration 6
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.18
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 235 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 519 seconds
....................................................................................................
{'r2': {'overall': 0.49084177610347907, 'between': 0.5202195909821432, 'within': 0.013603933867819329}, 'mse': {'overall': 1.1260823376746611, 'between': 1.146001294653968, 'within': 0.2552104932535819}}
0.013034543698398132
....................................................................................................
Iteration 7
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.25
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.14
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 255 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 458 seconds
....................................................................................................
{'r2': {'overall': 0.47479013023433625, 'between': 0.500887746647168, 'within': 0.03213531530366209}, 'mse': {'overall': 1.091222656912666, 'between': 1.143938381572645, 'within': 0.2497061353600604}}
0.032541453650170075
....................................................................................................
Iteration 8
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.23
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.16
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 237 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 511 seconds
....................................................................................................
{'r2': {'overall': 0.4791299438615249, 'between': 0.5047568392187705, 'within': 0.020985063574567597}, 'mse': {'overall': 1.0992611650895314, 'between': 1.13844925655747, 'within': 0.2545852511174326}}
0.02019142726721898
....................................................................................................
Iteration 9
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.19
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 256 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 555 seconds
....................................................................................................
{'r2': {'overall': 0.47925980476805763, 'between': 0.5172519639484899, 'within': 0.021787048219215243}, 'mse': {'overall': 1.1087578749009597, 'between': 1.1291558565306428, 'within': 0.26003149824188637}}
0.021741358868822665
....................................................................................................


In [15]:
# Store the repeated-CV scores of the asset-index target on disk.
pth = 'results/baseline/rep_cv_asset_delta.pkl'
# open(..., 'wb') does not create missing folders, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(pth), exist_ok=True)
with open(pth, 'wb') as f:
    pickle.dump(rep_cv_res_asset, f)

In [18]:
# Reload the stored asset-index scores and summarise them for the results table.
pth = 'results/baseline/rep_cv_asset_delta.pkl'
with open(pth, 'rb') as f:
    rep_cv_res_asset = pickle.load(f)

# Mean and standard error of each score across repetitions. The standard error
# divides by the actual number of stored repetitions rather than a fixed 10,
# so it stays correct if the number of repetitions changes.
# NOTE(review): np.std defaults to ddof=0 — confirm this is the intended estimator.
mean_r2_asset = {k: np.mean(v) for k, v in rep_cv_res_asset.items()}
se_r2_asset = {k: np.std(v) / np.sqrt(len(v)) for k, v in rep_cv_res_asset.items()}

# print as tex
print(f"& {mean_r2_asset['between_r2']:.4f} & {mean_r2_asset['within_r2']:.4f} & {mean_r2_asset['overall_r2']:.4f} & {mean_r2_asset['delta_r2']:.4f} \\\\")
print(f"& \\footnotesize({se_r2_asset['between_r2']:.4f}) & \\footnotesize({se_r2_asset['within_r2']:.4f}) & \\footnotesize({se_r2_asset['overall_r2']:.4f}) & \\footnotesize({se_r2_asset['delta_r2']:.4f}) \\\\")

& 0.5102 & 0.0219 & 0.4809 & 0.0222 \\
& \footnotesize(0.0039) & \footnotesize(0.0025) & \footnotesize(0.0030) & \footnotesize(0.0024) \\


In [11]:
# Read the stored asset-index scores back in under a separate name.
pth = 'results/baseline/rep_cv_asset_delta.pkl'
with open(pth, mode='rb') as f:
    asset_res = pickle.load(f)

In [14]:
# Print the mean of the stored delta R2 scores and their standard deviation
# divided by the square root of the number of stored scores.
# NOTE(review): np.std defaults to ddof=0 (population standard deviation) —
# confirm this is the intended estimator for a standard error.
print(f"Delta R2: {np.mean(asset_res['delta_r2']):.4f}")
print(f"Delta SE: {np.std(asset_res['delta_r2'])/np.sqrt(len(asset_res['delta_r2'])):.4f}")

Delta R2: 0.0222
Delta SE: 0.0024
