In [1]:
import pandas as pd
import os
import json
import yaml
from pathlib import Path
import collections
import shutil
from util_xgb import xgb_process_data, xgb_hyperparam_search, xgb_multi_seed_inference

In [2]:
# Name of the EcoPerceiver experiment run to build on. It must be an existing
# directory under runs/, because its config.yml supplies the train/val/test
# site split reused below.
run = 'default'

# Filesystem layout (all paths are relative to the working directory).
ALL_RUN_DIR = Path('runs')
RUN_DIR = ALL_RUN_DIR / run
XGB_DIR = RUN_DIR / 'xgb'
CONFIG_PATH = RUN_DIR / 'config.yml'
DATA_DIR = Path('data') / 'carbonsense'

with CONFIG_PATH.open('r') as config_file:
    config = yaml.safe_load(config_file)

# Site lists exactly as stored under the `data` section of the run's config.
site_splits = config['data']
TRAIN_SITES = site_splits['train_sites']
VAL_SITES = site_splits['val_sites']
TEST_SITES = site_splits['test_sites']

In [14]:
def extract_igbp(site):
    """Return the value stored under the 'IGBP' key of a site's meta.json.

    The metadata file is looked up one directory level below
    ``DATA_DIR / site`` (i.e. ``DATA_DIR/<site>/<subdir>/meta.json``).

    Args:
        site: Name of the site directory inside ``DATA_DIR``.

    Returns:
        The 'IGBP' entry of the first (lexicographically) meta.json found.

    Raises:
        FileNotFoundError: If no sub-directory of the site contains a meta.json.
        KeyError: If the metadata file has no 'IGBP' entry.
    """
    site_dir = DATA_DIR / site
    # os.listdir() returns entries in arbitrary order and may include stray
    # files (e.g. .DS_Store), so blindly taking element [0] is fragile.
    # Globbing only matches sub-directories that really hold a meta.json, and
    # sorting makes the choice reproducible.
    meta_files = sorted(site_dir.glob('*/meta.json'))
    if not meta_files:
        raise FileNotFoundError(f'No meta.json found in any sub-directory of {site_dir}')
    with open(meta_files[0], 'r') as f:
        d = json.load(f)
    return d['IGBP']


# Visualize how many of each site type were in each set
def site_configuration():
    """Count sites per IGBP class in the train/val/test splits and save as CSV.

    Builds a table with one row per IGBP class and the columns 'train', 'val'
    and 'test', and writes it to ``RUN_DIR / 'site_type_distribution_2.csv'``.

    Returns:
        The ``pandas.DataFrame`` that was written to disk.
    """
    # Read each site's metadata exactly once (one list of IGBP labels per split).
    igbp_by_split = [
        [extract_igbp(site) for site in sites]
        for sites in (TRAIN_SITES, VAL_SITES, TEST_SITES)
    ]

    # Sorted so the row order of the CSV is reproducible; iterating a set of
    # strings directly gives a different order from one interpreter run to the next.
    igbp_values = sorted({igbp for labels in igbp_by_split for igbp in labels}, key=str)
    train_val_igbp = {i: [0, 0, 0] for i in igbp_values}
    for column, labels in enumerate(igbp_by_split):
        for igbp in labels:
            train_val_igbp[igbp][column] += 1

    # All three positional columns get a name; previously column 2 (test) was
    # left out of the mapping and ended up in the CSV under the header "2".
    site_type_distribution = pd.DataFrame(data=train_val_igbp).T.rename(
        columns={0: 'train', 1: 'val', 2: 'test'}
    )
    site_type_distribution.to_csv(RUN_DIR / 'site_type_distribution_2.csv')
    return site_type_distribution

In [4]:
# Check for other runs with the same distribution
def find_identical_run():
    """Find another run whose site split equals this one and that has XGB output.

    Scans every entry of ``ALL_RUN_DIR`` (except ``RUN_DIR``) that contains both
    a ``config.yml`` file and an ``xgb`` directory, and compares its
    train/val/test site lists with ``TRAIN_SITES``/``VAL_SITES``/``TEST_SITES``.
    The comparison ignores the order of sites but respects duplicates.

    Returns:
        The path of the first matching run directory (in sorted name order),
        or ``None`` if no run matches.
    """
    split_keys = ('train_sites', 'val_sites', 'test_sites')
    # The reference multisets do not change between candidates; build them once.
    wanted = tuple(
        collections.Counter(sites) for sites in (TRAIN_SITES, VAL_SITES, TEST_SITES)
    )

    # sorted(): os.listdir() order is arbitrary, which would make the chosen
    # run non-deterministic when several runs share the same split.
    for d in sorted(os.listdir(ALL_RUN_DIR)):
        other_run_dir = ALL_RUN_DIR / d
        if other_run_dir == RUN_DIR:
            continue
        other_config_path = other_run_dir / 'config.yml'
        # Skip candidates without a config or without an xgb directory before
        # parsing any YAML; the caller copies the xgb directory, so it must be
        # an actual directory.
        if not other_config_path.is_file() or not (other_run_dir / 'xgb').is_dir():
            continue

        with open(other_config_path, 'r') as f:
            other_config = yaml.safe_load(f)

        # An empty config.yml parses to None, and a bare `data:` key yields
        # None as well; treat both as "no sites" instead of crashing on .get().
        other_data = other_config.get('data') if isinstance(other_config, dict) else None
        if not isinstance(other_data, dict):
            other_data = {}

        found = tuple(collections.Counter(other_data.get(key) or []) for key in split_keys)
        if found == wanted:
            return other_run_dir
    return None

In [5]:
# Only set up the XGBoost directory once. If another run used the very same
# site split, copy its xgb directory; otherwise start from an empty directory.
if not XGB_DIR.exists():
    identical_run = find_identical_run()
    if identical_run is None:
        XGB_DIR.mkdir(parents=True, exist_ok=True)
    else:
        shutil.copytree(identical_run / 'xgb', XGB_DIR)

In [15]:
# Write the per-split site-type table for this run.
site_configuration()

# XGBoost is trained on the training sites followed by the validation sites;
# the test sites are passed separately.
xgb_train_sites = [*TRAIN_SITES, *VAL_SITES]
xgb_process_data(DATA_DIR, xgb_train_sites, TEST_SITES, XGB_DIR)

Processing data for XGBoost...
  train data already compiled
  test data already compiled


In [7]:
# The hyperparameter search took about an hour, so its result is cached on
# disk: the search only runs when params.json does not exist yet.
param_path = XGB_DIR / 'params.json'
score_path = XGB_DIR / 'score.txt'

if os.path.exists(param_path):
    print('Already found XGB parameters')
else:
    best_params, best_score = xgb_hyperparam_search(XGB_DIR, n_iter=50, target='NEE_VUT_REF')

    # Serialize BEFORE touching the file: json.dump() streams into an already
    # opened file, so a serialization error would leave a truncated params.json
    # behind that the check above would then mistake for a valid cache.
    params_json = json.dumps(best_params)

    # params.json is the cache sentinel, so it is written last.
    with open(score_path, 'w') as f:
        f.write(f'{str(best_score)}\n')
    with open(param_path, 'w') as f:
        f.write(params_json)

Already found XGB parameters


In [8]:
# Always reload the parameters from disk so that `best_params` is defined
# whether the search above ran or was skipped because of the cache.
best_params = json.loads(param_path.read_text())
best_params

{'subsample': 0.7,
 'scale_pos_weight': 0.5,
 'n_estimators': 150,
 'min_child_weight': 9,
 'max_depth': 9,
 'lambda': 0,
 'gamma': 0.4,
 'eta': 0.1,
 'colsample_bytree': 0.7,
 'colsample_bylevel': 0.8,
 'alpha': 0.1}

In [9]:
# Run XGBoost with the tuned hyperparameters; the output below shows one
# training pass per seed (0, 10, ..., 90).
# NOTE(review): every seed emits a pandas SettingWithCopyWarning at
# `test_final[f'XGBoost_{seed}'] = predictions` inside util_xgb -- presumably
# test_final is a slice of another DataFrame; confirm and take an explicit
# .copy() there so the prediction columns are guaranteed to be stored.
xgb_multi_seed_inference(XGB_DIR, best_params)

Training with seed: 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_final[f'XGBoost_{seed}'] = predictions


Training with seed: 10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_final[f'XGBoost_{seed}'] = predictions


Training with seed: 20


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_final[f'XGBoost_{seed}'] = predictions


Training with seed: 30


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_final[f'XGBoost_{seed}'] = predictions


Training with seed: 40


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_final[f'XGBoost_{seed}'] = predictions


Training with seed: 50


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_final[f'XGBoost_{seed}'] = predictions


Training with seed: 60


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_final[f'XGBoost_{seed}'] = predictions


Training with seed: 70


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_final[f'XGBoost_{seed}'] = predictions


Training with seed: 80


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_final[f'XGBoost_{seed}'] = predictions


Training with seed: 90


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_final[f'XGBoost_{seed}'] = predictions
