# Between model
This model takes as input all static variables, i.e. the OSM variables, the ESA Landcover variables and the WSF variables. In addition, it takes the mean over time of each dynamic variable. The dynamic variables include Nightlights, NDVI, and NDWI_Gao as well as NDWI_McF.

The idea is that the between model captures the variation between clusters; its target variable is therefore the cluster mean $\bar{w}_c = \frac{1}{T_c}\sum_{t=1}^{T_c} w_{c,t}$, where $T_c$ is the number of observations available for cluster $c$.

# Within model
To augment the number of training observations, I train the model on deltas rather than on the demeaned variables. This substantially increases the number of training observations and covers a wider range of differences, making the training dataset more versatile and robust. Ideally, this helps the model to learn from a wider range of differences and thus increases the out-of-sample performance when predicting $\tilde{\boldsymbol{w}}_{ct}$.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import pickle
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [2]:
# load the necessary functions from the analysis package

# load the variable names, this allows to access the variables in the feature data in a compact way
from analysis_utils.variable_names import *

# load flagged ids 
from analysis_utils.flagged_uids import *

# load the functions to do spatial k-fold CV
from analysis_utils.spatial_CV import *

# load the helper functions
from analysis_utils.analysis_helpers import *

# load the random forest trainer and cross_validator
import analysis_utils.RandomForest as rf

# load the combined model
from analysis_utils.CombinedModel import CombinedModel

In [3]:
# Notebook-wide configuration: data locations, seeds and CV settings.

# every data file lives below this directory
root_data_dir = "../../Data"

# LSMS label file and tabular feature file
lsms_pth = root_data_dir + "/lsms/processed/labels_cluster_v1.csv"
feat_data_pth = root_data_dir + "/feature_data/tabular_data.csv"

# base seeds: one for the model training, one for the spatial fold split
random_seed, spatial_cv_random_seed = 423, 348

# number of folds used in the k-fold cross-validation
n_folds = 5

In [4]:
# Read the LSMS labels and discard every observation whose id is flagged.
lsms_df = pd.read_csv(lsms_pth)
is_flagged_row = lsms_df.unique_id.isin(flagged_uids)
# reset_index() without drop=True keeps the previous index as an 'index' column
lsms_df = lsms_df.loc[~is_flagged_row].reset_index()

# the delta id of an original observation is simply its unique id
lsms_df['delta_id'] = lsms_df.unique_id

# cluster-level means of both targets (used as between-model targets)
for label_col in ('log_mean_pc_cons_usd_2017', 'mean_asset_index_yeh'):
    lsms_df['avg_' + label_col] = lsms_df.groupby('cluster_id')[label_col].transform('mean')

# Read the tabular feature data.
feat_df = pd.read_csv(feat_data_pth)

# Short summary of the training data.
print(f"Number of observations {lsms_df.shape[0]}")
print(f"Number of clusters {np.unique(lsms_df.cluster_id).size}")
print(f"Number of x vars {feat_df.shape[1] - 2}")

Number of observations 6401
Number of clusters 2128
Number of x vars 113


In [5]:
# Join the selected label columns with the features; the left join keeps
# every row of the label data.
lsms_vars = [
    'unique_id',
    'n_households',
    'log_mean_pc_cons_usd_2017',
    'avg_log_mean_pc_cons_usd_2017',
    'mean_asset_index_yeh',
    'avg_mean_asset_index_yeh',
]
label_subset = lsms_df[lsms_vars]
df = pd.merge(label_subset, feat_df, how='left', on='unique_id')

# Run Training

In [6]:
# Feature sets of the two models, assembled from the variable-name lists
# provided by analysis_utils.variable_names.

# between model: OSM, ESA land cover, WSF, averaged remote-sensing variables,
# averaged precipitation and the median RGB variables
osm_vars = osm_dist_vars + osm_count_vars + osm_road_vars
avg_rs_vars = avg_ndvi_vars + avg_ndwi_gao_vars + avg_nl_vars
between_x_vars = (
    osm_vars
    + esa_lc_vars
    + wsf_vars
    + avg_rs_vars
    + avg_preciptiation
    + median_rgb_vars
)

# within model: dynamic remote-sensing variables, precipitation and dynamic RGB
dyn_rs_vars = dyn_ndvi_vars + dyn_ndwi_gao_vars + dyn_nl_vars
within_x_vars = (
    dyn_rs_vars
    + precipitation
    + dyn_rgb_vars
)

### Target: Log per capita consumption

In [7]:
# Between data for the consumption target: keep the cluster id, the target and
# the between features, then drop duplicated rows.
between_target_var = 'avg_log_mean_pc_cons_usd_2017'
between_cols = ['cluster_id', between_target_var, *between_x_vars]
cl_df = df[between_cols].drop_duplicates().reset_index(drop=True)

# standardise the data; the target column is excluded from standardisation
cl_df_norm = standardise_df(cl_df, exclude_cols=[between_target_var])

In [8]:
# Within data for the consumption target.
within_target_var = 'log_mean_pc_cons_usd_2017'
within_cols = ['cluster_id', 'unique_id', within_target_var, *within_x_vars]
within_df = df[within_cols]

# build the demeaned frame first, then the delta frame (same order as before)
demeaned_df = demean_df(within_df)
print("Creating Delta df")
delta_df = make_delta_df(within_df)

# give the demeaned rows a 'delta_id' column so both frames can be stacked
demeaned_df = demeaned_df.rename(columns={'unique_id': 'delta_id'})
delta_dmn_df = pd.concat([delta_df, demeaned_df], ignore_index=True)

# standardise the stacked frame in one go; the target column is excluded
delta_dmn_df_norm = standardise_df(delta_dmn_df, exclude_cols=[within_target_var])

# split the standardised frame again by id:
# demeaned rows are used for validation, delta rows for training
is_demeaned_row = delta_dmn_df_norm.delta_id.isin(demeaned_df.delta_id)
is_delta_row = delta_dmn_df_norm.delta_id.isin(delta_df.delta_id)
demeaned_df_norm = delta_dmn_df_norm[is_demeaned_row].copy().reset_index(drop=True)
delta_df_norm = delta_dmn_df_norm[is_delta_row].copy().reset_index(drop=True)

Creating Delta df


  0%|          | 0/2128 [00:00<?, ?it/s]

In [9]:
# Repeated spatial k-fold cross-validation for the consumption target.
# Each repetition uses a fresh fold split and a fresh model seed, fits the
# between and the within (delta) model and stores the resulting R2 values.
rep_cv_res_cons = {
    metric: [] for metric in ('between_r2', 'within_r2', 'delta_r2', 'overall_r2')
}

for rep_idx in range(10):
    print("=" * 100)
    print(f"Iteration {rep_idx}")
    print("=" * 100)
    rep_seed = random_seed + rep_idx

    # spatial split of the observations into k folds
    fold_ids = split_lsms_spatial(
        lsms_df,
        n_folds=n_folds,
        random_seed=spatial_cv_random_seed + rep_idx,
    )

    # between model, trained at the cluster level
    print('Between training')
    between_cv_trainer = rf.CrossValidator(
        cl_df_norm, fold_ids, between_target_var, between_x_vars,
        id_var='cluster_id', random_seed=rep_seed,
    )
    between_cv_trainer.run_cv_training(min_samples_leaf=1)

    # within model, trained on the delta data
    print("\nWithin training")
    delta_trainer = rf.CrossValidator(
        delta_df_norm, fold_ids, within_target_var, within_x_vars,
        id_var='delta_id', random_seed=rep_seed,
    )
    delta_trainer.run_cv_training(min_samples_leaf=15)

    # performance of the delta model on the delta data itself
    delta_res = delta_trainer.compute_overall_performance(use_fold_weights=True)

    # evaluate the delta model on the demeaned data
    delta_evaluator = rf.CV_Evaluator(demeaned_df_norm, fold_ids, delta_trainer, id_var='delta_id')
    delta_evaluator.evaluate()
    delta_evaluator.compute_overall_performance()

    # hand the demeaned predictions to the trainer so the combined model uses them
    delta_trainer.predictions = delta_evaluator.predictions

    # combine the between and the within model
    combined_model = CombinedModel(lsms_df, between_cv_trainer, delta_trainer)
    combined_model.evaluate()
    combined_results = combined_model.compute_overall_performance(use_fold_weights=True)

    # record the scores of this repetition
    combined_r2 = combined_results['r2']
    for component in ('between', 'within', 'overall'):
        rep_cv_res_cons[component + '_r2'].append(combined_r2[component])
    rep_cv_res_cons['delta_r2'].append(delta_res['val_r2'])

    # report the scores of this repetition
    print("." * 100)
    print(combined_results)
    print(delta_res['val_r2'])
    print("." * 100)

Iteration 0
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.19
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 265 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 1057 seconds
....................................................................................................
{'r2': {'overall': 0.38271232479511413, 'between': 0.4680397028161642, 'within': 0.006810988732453738}, 'mse': {'overall': 0.21630467821898658, 'between': 0.15544672368474663, 'within': 0.17326758372499557}}
0.006998940108383306
....................................................................................................
Iteration 1
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.19
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 246 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 958 seconds
....................................................................................................
{'r2': {'overall': 0.35063185616927034, 'between': 0.4373700346429812, 'within': 0.010345259978969557}, 'mse': {'overall': 0.21766941603620527, 'between': 0.1563233863099223, 'within': 0.17276267063291537}}
0.010561350985600657
....................................................................................................
Iteration 2
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.18
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 230 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 957 seconds
....................................................................................................
{'r2': {'overall': 0.37981546062737476, 'between': 0.4703481901733915, 'within': 0.011306661934008932}, 'mse': {'overall': 0.213593991224197, 'between': 0.15275229044326138, 'within': 0.17195988289590186}}
0.012205073723492475
....................................................................................................
Iteration 3
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.17
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 282 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 947 seconds
....................................................................................................
{'r2': {'overall': 0.3578106055103658, 'between': 0.44387537729956195, 'within': 0.012292412874956082}, 'mse': {'overall': 0.21686914943764413, 'between': 0.15538676518979067, 'within': 0.17175639150861724}}
0.011533019857124637
....................................................................................................
Iteration 4
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.23
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.16
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 261 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 933 seconds
....................................................................................................
{'r2': {'overall': 0.36030068703582047, 'between': 0.44668153536773103, 'within': -0.0009854483690895366}, 'mse': {'overall': 0.2183578765551149, 'between': 0.1568348807528071, 'within': 0.1738459527144281}}
-0.001666249802815344
....................................................................................................
Iteration 5
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.26
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.13
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 238 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 928 seconds
....................................................................................................
{'r2': {'overall': 0.3710241810439191, 'between': 0.4603291929287872, 'within': 0.017218215524484236}, 'mse': {'overall': 0.2129789319803504, 'between': 0.15233050590403518, 'within': 0.1727091826890664}}
0.017119103713046036
....................................................................................................
Iteration 6
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.18
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 251 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 926 seconds
....................................................................................................
{'r2': {'overall': 0.38002196668478866, 'between': 0.4653559140463946, 'within': 0.016522642848943492}, 'mse': {'overall': 0.21496686795048936, 'between': 0.1537418243454254, 'within': 0.17188283079589373}}
0.01540047735836468
....................................................................................................
Iteration 7
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.25
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.14
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 248 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 948 seconds
....................................................................................................
{'r2': {'overall': 0.35898990753135274, 'between': 0.45284080864933723, 'within': 0.018099772852773244}, 'mse': {'overall': 0.21471061721000903, 'between': 0.15363118655566205, 'within': 0.17189883521659294}}
0.01687722204149326
....................................................................................................
Iteration 8
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.23
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.16
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 241 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 927 seconds
....................................................................................................
{'r2': {'overall': 0.3629047040673096, 'between': 0.4575603867917326, 'within': 0.015129805569607857}, 'mse': {'overall': 0.2159450871518056, 'between': 0.15413245592899416, 'within': 0.17027618475361225}}
0.014693753005610806
....................................................................................................
Iteration 9
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.19
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 323 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 1031 seconds
....................................................................................................
{'r2': {'overall': 0.36766780733947696, 'between': 0.4588353562899485, 'within': 0.010227655154473391}, 'mse': {'overall': 0.21568768351186762, 'between': 0.15447896843223147, 'within': 0.17300039718954113}}
0.011045623924569611
....................................................................................................


In [10]:
# Persist the repeated-CV results of the consumption target as a pickle file.
import os

pth = 'results/baseline_ls/rep_cv_cons_delta.pkl'
# Create the output folder first: open(..., 'wb') raises FileNotFoundError when
# the folder is missing, which would throw away the results of the CV run.
os.makedirs(os.path.dirname(pth), exist_ok=True)
with open(pth, 'wb') as f:
    pickle.dump(rep_cv_res_cons, f)

In [8]:
# Reload the stored repeated-CV results of the consumption target and print
# the mean R2 values and their standard errors as LaTeX table rows.
pth = 'results/baseline_ls/rep_cv_cons_delta.pkl'
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(pth, 'rb') as f:
    rep_cv_res_cons = pickle.load(f)

mean_r2_cons = {k: np.mean(v) for k, v in rep_cv_res_cons.items()}
# Standard error over the repetitions: divide by the actual number of stored
# repetitions instead of a hard-coded 10, so the value stays correct if the
# number of repetitions changes.
# NOTE(review): np.std defaults to ddof=0 (population std); confirm whether the
# sample std (ddof=1) is wanted for the reported standard errors.
se_r2_cons = {k: np.std(v) / np.sqrt(len(v)) for k, v in rep_cv_res_cons.items()}

# print as tex
print(f"& {mean_r2_cons['between_r2']:.4f} & {mean_r2_cons['within_r2']:.4f} & {mean_r2_cons['overall_r2']:.4f} & {mean_r2_cons['delta_r2']:.4f}  \\\\")
print(f"& \\footnotesize({se_r2_cons['between_r2']:.4f}) & \\footnotesize({se_r2_cons['within_r2']:.4f}) & \\footnotesize({se_r2_cons['overall_r2']:.4f}) & \\footnotesize({se_r2_cons['delta_r2']:.4f})\\\\")


& 0.4561 & 0.0117 & 0.3672 & 0.0115  \\
& \footnotesize(0.0033) & \footnotesize(0.0017) & \footnotesize(0.0033) & \footnotesize(0.0017)\\


### Target: Asset index

In [12]:
# get a dataset that only varies at the cluster level
between_target_var = 'avg_mean_asset_index_yeh'
cl_df = df[['cluster_id', between_target_var] + between_x_vars].drop_duplicates().reset_index(drop = True)

# normalise the feature data
cl_df_norm = standardise_df(cl_df, exclude_cols = [between_target_var])

In [13]:
# Within data for the asset-index target.
within_target_var = 'mean_asset_index_yeh'
within_cols = ['cluster_id', 'unique_id', within_target_var, *within_x_vars]
within_df = df[within_cols]

# build the demeaned frame first, then the delta frame (same order as before)
demeaned_df = demean_df(within_df)
print("Creating Delta df")
delta_df = make_delta_df(within_df)

# give the demeaned rows a 'delta_id' column so both frames can be stacked
demeaned_df = demeaned_df.rename(columns={'unique_id': 'delta_id'})
delta_dmn_df = pd.concat([delta_df, demeaned_df], ignore_index=True)

# standardise the stacked frame in one go; the target column is excluded
delta_dmn_df_norm = standardise_df(delta_dmn_df, exclude_cols=[within_target_var])

# split the standardised frame again by id:
# demeaned rows are used for validation, delta rows for training
is_demeaned_row = delta_dmn_df_norm.delta_id.isin(demeaned_df.delta_id)
is_delta_row = delta_dmn_df_norm.delta_id.isin(delta_df.delta_id)
demeaned_df_norm = delta_dmn_df_norm[is_demeaned_row].copy().reset_index(drop=True)
delta_df_norm = delta_dmn_df_norm[is_delta_row].copy().reset_index(drop=True)

Creating Delta df


  0%|          | 0/2128 [00:00<?, ?it/s]

In [14]:
# Repeated spatial k-fold cross-validation for the asset-index target.
# Each repetition uses a fresh fold split and a fresh model seed, fits the
# between and the within (delta) model and stores the resulting R2 values.
rep_cv_res_asset = {
    metric: [] for metric in ('between_r2', 'within_r2', 'delta_r2', 'overall_r2')
}

for rep_idx in range(10):
    print("=" * 100)
    print(f"Iteration {rep_idx}")
    print("=" * 100)
    rep_seed = random_seed + rep_idx

    # spatial split of the observations into k folds
    fold_ids = split_lsms_spatial(
        lsms_df,
        n_folds=n_folds,
        random_seed=spatial_cv_random_seed + rep_idx,
    )

    # between model, trained at the cluster level
    print('Between training')
    between_cv_trainer = rf.CrossValidator(
        cl_df_norm, fold_ids, between_target_var, between_x_vars,
        id_var='cluster_id', random_seed=rep_seed,
    )
    between_cv_trainer.run_cv_training(min_samples_leaf=1)

    # within model, trained on the delta data
    print("\nWithin training")
    delta_trainer = rf.CrossValidator(
        delta_df_norm, fold_ids, within_target_var, within_x_vars,
        id_var='delta_id', random_seed=rep_seed,
    )
    delta_trainer.run_cv_training(min_samples_leaf=15)

    # performance of the delta model on the delta data itself
    delta_res = delta_trainer.compute_overall_performance(use_fold_weights=True)

    # evaluate the delta model on the demeaned data
    delta_evaluator = rf.CV_Evaluator(demeaned_df_norm, fold_ids, delta_trainer, id_var='delta_id')
    delta_evaluator.evaluate()
    delta_evaluator.compute_overall_performance()

    # hand the demeaned predictions to the trainer so the combined model uses them
    delta_trainer.predictions = delta_evaluator.predictions

    # combine the between and the within model
    combined_model = CombinedModel(lsms_df, between_cv_trainer, delta_trainer)
    combined_model.evaluate()
    combined_results = combined_model.compute_overall_performance(use_fold_weights=True)

    # record the scores of this repetition
    combined_r2 = combined_results['r2']
    for component in ('between', 'within', 'overall'):
        rep_cv_res_asset[component + '_r2'].append(combined_r2[component])
    rep_cv_res_asset['delta_r2'].append(delta_res['val_r2'])

    # report the scores of this repetition
    print("." * 100)
    print(combined_results)
    print(delta_res['val_r2'])
    print("." * 100)

Iteration 0
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.19
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 289 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 971 seconds
....................................................................................................
{'r2': {'overall': 0.4895710676639905, 'between': 0.511914212918517, 'within': 0.05526805260284837}, 'mse': {'overall': 1.1349368844973553, 'between': 1.186998658499354, 'within': 0.2492365926466408}}
0.05617366358121622
....................................................................................................
Iteration 1
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.19
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 294 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 1051 seconds
....................................................................................................
{'r2': {'overall': 0.46777569830324456, 'between': 0.49889247755311256, 'within': 0.04827216020656302}, 'mse': {'overall': 1.1309125204593002, 'between': 1.168068975150059, 'within': 0.25285409794071784}}
0.048162552737970744
....................................................................................................
Iteration 2
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.18
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 264 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 821 seconds
....................................................................................................
{'r2': {'overall': 0.4806965167298979, 'between': 0.5095096586567669, 'within': 0.0631028987076105}, 'mse': {'overall': 1.1315165000590193, 'between': 1.1727883654121796, 'within': 0.24898652595671908}}
0.06354480491593804
....................................................................................................
Iteration 3
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.17
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 218 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 801 seconds
....................................................................................................
{'r2': {'overall': 0.47394923292595514, 'between': 0.49386352260951977, 'within': 0.043333613755008014}, 'mse': {'overall': 1.1354464622665743, 'between': 1.1852278707296993, 'within': 0.2438410417988072}}
0.04358334228828824
....................................................................................................
Iteration 4
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.23
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.16
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 218 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 800 seconds
....................................................................................................
{'r2': {'overall': 0.46454610286385783, 'between': 0.4851591626062994, 'within': 0.05890594838603016}, 'mse': {'overall': 1.1275742148573558, 'between': 1.185947457502159, 'within': 0.24654596360612366}}
0.05861892581355295
....................................................................................................
Iteration 5
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.26
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.13
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 216 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 782 seconds
....................................................................................................
{'r2': {'overall': 0.45263077986204325, 'between': 0.46913729535421445, 'within': 0.04843577107281423}, 'mse': {'overall': 1.1253915972500088, 'between': 1.1925117338690798, 'within': 0.24536874201234218}}
0.04829492751328881
....................................................................................................
Iteration 6
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.18
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 205 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 770 seconds
....................................................................................................
{'r2': {'overall': 0.484849937037314, 'between': 0.5074997725684639, 'within': 0.054497593064964}, 'mse': {'overall': 1.1377404205665322, 'between': 1.1757306084233676, 'within': 0.24461861562975912}}
0.0535588799080074
....................................................................................................
Iteration 7
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.25
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.14
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 213 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 784 seconds
....................................................................................................
{'r2': {'overall': 0.46154825389276, 'between': 0.48317115568569446, 'within': 0.054507912988030816}, 'mse': {'overall': 1.1159936968364157, 'between': 1.1840521054884658, 'within': 0.24536574008385587}}
0.05456706000865173
....................................................................................................
Iteration 8
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.23
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.16
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 217 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 784 seconds
....................................................................................................
{'r2': {'overall': 0.46876194812047833, 'between': 0.487822956737478, 'within': 0.0498463880989168}, 'mse': {'overall': 1.1209719741524455, 'between': 1.1776162092659368, 'within': 0.24814330203142704}}
0.049756154208269406
....................................................................................................
Iteration 9
Fold 0, specified test ratio: 0.2 - Actual test ratio 0.21
Fold 1, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 2, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 3, specified test ratio: 0.2 - Actual test ratio 0.20
Fold 4, specified test ratio: 0.2 - Actual test ratio 0.19
Between training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 200 seconds

Within training
Initialising training


  0%|          | 0/5 [00:00<?, ?it/s]

Finished training after 771 seconds
....................................................................................................
{'r2': {'overall': 0.47037096271522205, 'between': 0.5019702444655766, 'within': 0.05812843509249175}, 'mse': {'overall': 1.1265190057014047, 'between': 1.1641378535155429, 'within': 0.2517340183980058}}
0.05859139923610297
....................................................................................................


In [15]:
# Persist the repeated-CV results of the asset-index target as a pickle file.
import os

pth = 'results/baseline_ls/rep_cv_asset_delta.pkl'
# Create the output folder first: open(..., 'wb') raises FileNotFoundError when
# the folder is missing, which would throw away the results of the CV run.
os.makedirs(os.path.dirname(pth), exist_ok=True)
with open(pth, 'wb') as f:
    pickle.dump(rep_cv_res_asset, f)

In [16]:
# Mean R2 values and their standard errors over the repetitions for the
# asset-index target, printed as LaTeX table rows.
mean_r2_asset = {k: np.mean(v) for k, v in rep_cv_res_asset.items()}
# Divide by the actual number of stored repetitions instead of a hard-coded 10,
# so the standard error stays correct if the number of repetitions changes.
# NOTE(review): np.std defaults to ddof=0 (population std); confirm whether the
# sample std (ddof=1) is wanted for the reported standard errors.
se_r2_asset = {k: np.std(v) / np.sqrt(len(v)) for k, v in rep_cv_res_asset.items()}

# print as tex
print(f"& {mean_r2_asset['between_r2']:.4f} & {mean_r2_asset['within_r2']:.4f} & {mean_r2_asset['overall_r2']:.4f} \\\\")
print(f"& \\footnotesize({se_r2_asset['between_r2']:.4f}) & \\footnotesize({se_r2_asset['within_r2']:.4f}) & \\footnotesize({se_r2_asset['overall_r2']:.4f})\\\\")

& 0.4949 & 0.0534 & 0.4715 \\
& \footnotesize(0.0041) & \footnotesize(0.0018) & \footnotesize(0.0033)\\


In [9]:
# Reload the stored repeated-CV results of the asset-index target and print
# the mean R2 values and their standard errors as LaTeX table rows.
pth = 'results/baseline_ls/rep_cv_asset_delta.pkl'
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open(pth, 'rb') as f:
    rep_cv_res_asset = pickle.load(f)

mean_r2_asset = {k: np.mean(v) for k, v in rep_cv_res_asset.items()}
# Standard error over the repetitions: divide by the actual number of stored
# repetitions instead of a hard-coded 10, so the value stays correct if the
# number of repetitions changes.
# NOTE(review): np.std defaults to ddof=0 (population std); confirm whether the
# sample std (ddof=1) is wanted for the reported standard errors.
se_r2_asset = {k: np.std(v) / np.sqrt(len(v)) for k, v in rep_cv_res_asset.items()}

# print as tex
print(f"& {mean_r2_asset['between_r2']:.4f} & {mean_r2_asset['within_r2']:.4f} & {mean_r2_asset['overall_r2']:.4f} & {mean_r2_asset['delta_r2']:.4f} \\\\")
print(f"& \\footnotesize({se_r2_asset['between_r2']:.4f}) & \\footnotesize({se_r2_asset['within_r2']:.4f}) & \\footnotesize({se_r2_asset['overall_r2']:.4f}) & \\footnotesize({se_r2_asset['delta_r2']:.4f}) \\\\")

& 0.4949 & 0.0534 & 0.4715 & 0.0535 \\
& \footnotesize(0.0041) & \footnotesize(0.0018) & \footnotesize(0.0033) & \footnotesize(0.0018) \\
