In [1]:
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
from pathlib import Path
import json
import math

In [2]:
# --- Paths ---------------------------------------------------------------
DATA_DIR = Path('data')
RENAMED_DIR = DATA_DIR / 'renamed'   # read from by the processing cell
CLEANED_DIR = DATA_DIR / 'cleaned'   # written to by the processing cell
CONFIG_FILE = Path('config.json')

# --- Harmonization settings ----------------------------------------------
# Only the 'harmonize_columns' section of the config file is used here.
with CONFIG_FILE.open('r') as f:
    config = json.load(f)
config = config['harmonize_columns']

TIMESTAMP_COL = config['timestamp_column']
METEO_COLS = config['meteorological_columns']
SOIL_COLS = config['soil_columns']
FLUX_COLS = config['flux_columns']
DEFAULT_GAPFILL_QC_FLAGS = config['default_gapfill_qc_flags']

# Single mapping of harmonized column name -> candidate source aliases.
# On duplicate keys the later group wins (meteo < soil < flux).
ALL_COLS = {**METEO_COLS, **SOIL_COLS, **FLUX_COLS}

# Site metadata fields carried over into each site's meta.json.
META_FIELDS = ['SITE_ID', 'SITE_NAME', 'LOCATION_LON', 'LOCATION_LAT', 'LOCATION_ELEV', 'IGBP', 'MAT', 'MAP']

In [7]:
def default_qc_value(alias):
    """Return the default QC flag to assume for the source column ``alias``.

    Every key of ``DEFAULT_GAPFILL_QC_FLAGS`` that occurs as a substring of
    ``alias`` is a candidate; the largest candidate value is returned, with a
    floor of 0 when no key matches (or no matching value exceeds 0).
    """
    matching_flags = [
        flag for key, flag in DEFAULT_GAPFILL_QC_FLAGS.items() if key in alias
    ]
    # The leading 0 is the floor; max() keeps the first maximum, which matches
    # a strict "greater than" scan over the flags in order.
    return max([0, *matching_flags])


def process_site_dataframe(df):
    """Harmonize one site's source columns into the common column set.

    For every harmonized column in ``ALL_COLS`` the candidate aliases are
    visited in order. Row by row, an alias value is taken when it is not NaN
    and either no value has been chosen yet or the alias carries a strictly
    lower QC flag than the value chosen so far (so on equal QC the earlier
    alias is kept). The QC flag of an alias comes from its ``<alias>_qc``
    column when present (NaNs replaced by ``default_qc_value(alias)``),
    otherwise ``default_qc_value(alias)`` is used for every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Site data containing ``TIMESTAMP_COL`` and any subset of the aliases.
        The value ``-9999.0`` is treated as missing.

    Returns
    -------
    h_df : pandas.DataFrame
        ``timestamp`` followed by ``<col>`` and ``<col>_QC`` for every
        harmonized column; rows with no chosen value have NaN in both.
    audit : dict
        ``{'step': 'harmonize_columns', 'info': {...}}`` where ``info`` maps
        each harmonized column to the aliases found in ``df`` and lists the
        remaining input columns under ``'unused'``.
    """
    df = df.replace(-9999.0, np.nan)
    n_rows = df.shape[0]

    # Sentinel QC meaning "no value chosen yet"; assumed to be larger than
    # any real QC flag so the first valid alias value always wins.
    unset_qc = 9999.0

    audit = {'timestamp': [TIMESTAMP_COL]}  # keep track of column sources

    # Collect the output columns in a dict and build the frame once at the
    # end. Inserting them into a DataFrame one at a time fragments its
    # internal blocks and triggers pandas' PerformanceWarning on wide schemas.
    harmonized = {'timestamp': df[TIMESTAMP_COL]}

    for col, col_aliases in ALL_COLS.items():
        audit[col] = []

        col_np = np.full(n_rows, np.nan, dtype=float)
        colqc_np = np.full(n_rows, unset_qc, dtype=float)

        for alias in col_aliases:
            if alias not in df.columns:
                continue
            alias_np = df[alias].values

            aliasqc = f'{alias}_qc'
            alias_default_qc = default_qc_value(alias)
            if aliasqc in df.columns:
                aliasqc_np = np.nan_to_num(df[aliasqc].values, nan=alias_default_qc).astype(int)
            else:
                aliasqc_np = np.full(n_rows, alias_default_qc, dtype=int)

            # Take the alias value where it exists and is either the first
            # candidate for that row or has a strictly better (lower) QC.
            has_value = ~np.isnan(alias_np)
            fill_index = has_value & (np.isnan(col_np) | (colqc_np > aliasqc_np))

            col_np[fill_index] = alias_np[fill_index]
            colqc_np[fill_index] = aliasqc_np[fill_index]
            audit[col].append(alias)

        # Rows that never received a value get a missing QC instead of the sentinel.
        colqc_np[colqc_np == unset_qc] = np.nan
        harmonized[col] = col_np
        harmonized[f'{col}_QC'] = colqc_np

    h_df = pd.DataFrame(harmonized, index=df.index)

    # Set membership keeps the "unused" scan linear in the number of columns.
    used_columns = {c for cs in audit.values() for c in cs}
    audit['unused'] = [c for c in df.columns if c not in used_columns]
    audit = {
        'step': 'harmonize_columns',
        'info': audit
    }
    return h_df, audit

In [8]:
# A source is a sub-directory of RENAMED_DIR that carries a site_data.csv.
# Filtering here keeps stray entries (.DS_Store, .ipynb_checkpoints, ...)
# from aborting the run.
# NOTE(review): os.listdir order is filesystem-dependent, and when a site
# appears in several sources the metadata of the source processed last wins.
# Confirm whether a fixed source order is wanted.
sources = [
    s for s in os.listdir(RENAMED_DIR)
    if (RENAMED_DIR / s / 'site_data.csv').is_file()
]
site_meta_dict = {}  # site id -> metadata fields plus per-source 'COVERAGE'
audit_dict = {}      # site id -> list of audit records, one per source file
print('Cleaning site data...')
for source in sources:
    source_dir = RENAMED_DIR / source
    meta_df = pd.read_csv(source_dir / 'site_data.csv')
    # Only per-site CSV files; the site id is the file name without '.csv'.
    site_csvs = [
        name for name in os.listdir(source_dir)
        if name != 'site_data.csv' and name.lower().endswith('.csv')
    ]
    print(source)
    for csv_name in tqdm(site_csvs):
        site = csv_name[:-4]
        if site not in site_meta_dict:
            os.makedirs(CLEANED_DIR / site, exist_ok=True)
            site_meta_dict[site] = {'COVERAGE': {}}
            audit_dict[site] = []

        df = pd.read_csv(source_dir / csv_name)
        df, audit = process_site_dataframe(df)
        time_min = df['timestamp'].min()
        time_max = df['timestamp'].max()
        df.to_csv(CLEANED_DIR / site / f'{source}_{time_min}_{time_max}.csv', index=False)

        meta_rows = meta_df[meta_df['SITE_ID'] == site].to_dict('records')
        if not meta_rows:
            # Fail with context instead of a bare IndexError on [0].
            raise ValueError(
                f"No row with SITE_ID == {site!r} in {source_dir / 'site_data.csv'}"
            )
        site_meta = {k: v for k, v in meta_rows[0].items() if k in META_FIELDS}
        site_meta_dict[site].update(site_meta)
        site_meta_dict[site]['COVERAGE'][source] = [time_min, time_max]
        audit_dict[site].append(audit)

Cleaning site data...
fluxnet


100%|██████████| 195/195 [09:24<00:00,  2.89s/it]


ameriflux


100%|██████████| 192/192 [10:23<00:00,  3.25s/it]


icos-2023


100%|██████████| 58/58 [02:25<00:00,  2.51s/it]


icos-ww


100%|██████████| 73/73 [08:45<00:00,  7.20s/it]


fluxnet-ch4


100%|██████████| 79/79 [01:49<00:00,  1.38s/it]


In [9]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that also accepts NumPy integers, floats and arrays."""

    def default(self, obj):
        """Convert NumPy objects to builtins; defer everything else to the base class."""
        if isinstance(obj, np.ndarray):
            converted = obj.tolist()
        elif isinstance(obj, np.floating):
            converted = float(obj)
        elif isinstance(obj, np.integer):
            converted = int(obj)
        else:
            # The base implementation raises TypeError for unsupported objects.
            converted = super().default(obj)
        return converted

def _nan_to_none(value):
    """Return ``value`` with every float NaN replaced by ``None``, recursively.

    Dicts and lists/tuples are traversed (tuples come back as lists, which is
    how JSON would serialise them anyway). Both builtin floats and NumPy
    floating scalars are recognised; an exact ``type(x) == float`` test would
    miss ``np.float64`` and let ``NaN`` through as an invalid JSON token.
    """
    if isinstance(value, dict):
        return {key: _nan_to_none(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_nan_to_none(item) for item in value]
    if isinstance(value, (float, np.floating)) and math.isnan(value):
        return None
    return value


# Scrub NaNs (including those nested under 'COVERAGE') so meta.json is valid JSON.
for site in site_meta_dict:
    site_meta_dict[site] = _nan_to_none(site_meta_dict[site])

print('Writing metadata and audit logs...')
for site in tqdm(site_meta_dict.keys()):
    with open(CLEANED_DIR / site / 'meta.json', 'w') as f:
        f.write(json.dumps(site_meta_dict[site], cls=NpEncoder))
    with open(CLEANED_DIR / site / 'audit.json', 'w') as f:
        f.write(json.dumps({'harmonize_columns': audit_dict[site]}))

Writing metadata and audit logs...


100%|██████████| 417/417 [00:00<00:00, 5832.67it/s]
