In [1]:
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
from pathlib import Path
import json
import math

In [2]:
# Filesystem layout: inputs are read from RENAMED_DIR, outputs go to CLEANED_DIR.
DATA_DIR = Path('data')
RENAMED_DIR = DATA_DIR.joinpath('renamed')
CLEANED_DIR = DATA_DIR.joinpath('cleaned')
CONFIG_FILE = Path('config.json')

# Only the 'harmonize_columns' section of the config file is used by this notebook.
config = json.loads(CONFIG_FILE.read_text())['harmonize_columns']

# Column naming settings pulled out of the config section.
TIMESTAMP_COL = config['timestamp_column']
PREDICTOR_COLS = config['predictor_columns']
FLUX_COLS = config['flux_columns']
# Maps each gap-fill suffix to the QC flag used when no QC value is available.
DEFAULT_GAPFILL_QC_FLAGS = config['default_gapfill_qc_flags']

# Site metadata fields that are carried over into each site's meta.json.
META_FIELDS = [
    'SITE_ID',
    'SITE_NAME',
    'LOCATION_LON',
    'LOCATION_LAT',
    'LOCATION_ELEV',
    'IGBP',
    'MAT',
    'MAP',
]

In [3]:
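# TODO: variables such as TS_F_MDS_1 and TS_F_MDS_1_QC are currently missed here,
# presumably because suffixes are only ever appended to the end of the base column name
# (so a trailing layer index after the gap-fill suffix never matches) - confirm and handle.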

def process_site_dataframe(df, timestamp_col=None, columns=None, gapfill_qc_flags=None):
    """Harmonize one site's dataframe into a common value/QC column layout.

    For every requested variable ``col`` the result contains a value column
    ``col`` and a quality-control column ``f'{col}_QC'``. Values are taken from
    the verbatim column when it exists, and then from each suffixed variant
    (``col + suffix``) on rows where the current value is missing or where the
    variant's QC flag is strictly lower than the QC flag currently held.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; the value ``-9999.0`` is replaced by NaN before processing.
    timestamp_col : str, optional
        Name of the timestamp column in ``df``. Defaults to ``TIMESTAMP_COL``.
    columns : list of str, optional
        Variables to harmonize. Defaults to ``PREDICTOR_COLS + FLUX_COLS``.
    gapfill_qc_flags : dict, optional
        Maps each allowed suffix to the QC flag assigned to rows filled from
        that suffixed column when no QC value is available for them.
        Defaults to ``DEFAULT_GAPFILL_QC_FLAGS``.

    Returns
    -------
    tuple
        ``(h_df, audit)`` where ``h_df`` is the harmonized dataframe (with the
        timestamp column renamed to ``'timestamp'``) and ``audit`` is
        ``{'step': 'harmonize_columns', 'info': {...}}``; ``info`` lists, per
        output column, the input columns that contributed to it, plus an
        ``'unused'`` entry with the input columns that were never referenced.
    """
    # Fall back to the notebook-level configuration for anything not passed in.
    if timestamp_col is None:
        timestamp_col = TIMESTAMP_COL
    if columns is None:
        columns = PREDICTOR_COLS + FLUX_COLS
    if gapfill_qc_flags is None:
        gapfill_qc_flags = DEFAULT_GAPFILL_QC_FLAGS

    df = df.replace(-9999.0, np.nan)
    audit = {}  # output column -> list of input columns it was built from

    # All data is copied into a fresh harmonized dataframe sharing df's index.
    h_df = df[[timestamp_col]].copy()
    h_df = h_df.rename(columns={timestamp_col: 'timestamp'})
    audit['timestamp'] = [timestamp_col]

    for col in columns:
        colqc = f'{col}_QC'
        audit[col] = []
        audit[colqc] = []

        # Start from the verbatim value / QC columns when they exist.
        if col in df.columns:
            h_df[col] = df[col]
            audit[col].append(col)
        else:
            h_df[col] = np.nan

        if colqc in df.columns:
            h_df[colqc] = df[colqc]
            audit[colqc].append(colqc)
        else:
            h_df[colqc] = np.nan

        # Default QC of 0 for present values, applied only where no QC flag
        # was provided so that flags from a verbatim QC column are preserved.
        needs_default_qc = h_df[col].notna() & h_df[colqc].isna()
        h_df.loc[needs_default_qc, colqc] = 0

        # Merge in every suffixed version of the column (usually gapfilled).
        for suffix, default_flag in gapfill_qc_flags.items():
            scol = col + suffix
            if scol not in df.columns:
                continue
            scolqc = f'{scol}_QC'
            audit[col].append(scol)

            variant_present = df[scol].notna()
            fill_index = h_df[col].isna() & variant_present

            if scolqc in df.columns:
                # Also take the variant where its QC flag is strictly lower
                # than the one currently held (NaN comparisons are False).
                fill_index = fill_index | (variant_present & (h_df[colqc] > df[scolqc]))
                h_df.loc[fill_index, col] = df.loc[fill_index, scol]
                h_df.loc[fill_index, colqc] = df.loc[fill_index, scolqc]
                audit[colqc].append(scolqc)
            else:
                # No QC column for this variant: only gaps are filled.
                h_df.loc[fill_index, col] = df.loc[fill_index, scol]

            # Rows just filled that still have no QC get the suffix's default flag.
            remaining_qc_ind = fill_index & h_df[colqc].isna()
            h_df.loc[remaining_qc_ind, colqc] = default_flag

    used_columns = {c for sources in audit.values() for c in sources}
    audit['unused'] = [c for c in df.columns if c not in used_columns]
    return h_df, {'step': 'harmonize_columns', 'info': audit}

In [4]:
# Only sub-directories of RENAMED_DIR are data sources. They are sorted because
# os.listdir returns entries in arbitrary order and a later source overwrites
# the metadata fields of an earlier one for the same site (dict.update below).
sources = sorted(
    entry for entry in os.listdir(RENAMED_DIR) if (RENAMED_DIR / entry).is_dir()
)
site_meta_dict = {}  # site id -> metadata fields plus per-source 'COVERAGE'
audit_dict = {}      # site id -> list of audit records, one per source

print('Cleaning site data...')
for source in sources:
    source_dir = RENAMED_DIR / source
    meta_df = pd.read_csv(source_dir / 'site_data.csv')
    # Every other CSV in the source directory is treated as one site's data file.
    site_csvs = sorted(
        name for name in os.listdir(source_dir)
        if name != 'site_data.csv' and name.lower().endswith('.csv')
    )
    print(source)
    for csv_name in tqdm(site_csvs):
        site = csv_name[:-4]  # strip the '.csv' extension

        # Fail early, with a readable message, if the site has no metadata row.
        meta_rows = meta_df[meta_df['SITE_ID'] == site]
        if meta_rows.empty:
            raise ValueError(
                f"Site '{site}' has no row in {source_dir / 'site_data.csv'}"
            )

        if site not in site_meta_dict:
            os.makedirs(CLEANED_DIR / site, exist_ok=True)
            site_meta_dict[site] = {'COVERAGE': {}}
            audit_dict[site] = []

        df = pd.read_csv(source_dir / csv_name)
        df, audit = process_site_dataframe(df)
        time_min = df['timestamp'].min()
        time_max = df['timestamp'].max()
        df.to_csv(CLEANED_DIR / site / f'{source}_{time_min}_{time_max}.csv', index=False)

        site_meta = meta_rows.to_dict('records')[0]
        site_meta = {k: v for k, v in site_meta.items() if k in META_FIELDS}
        site_meta_dict[site].update(site_meta)
        site_meta_dict[site]['COVERAGE'][source] = [time_min, time_max]
        audit_dict[site].append(audit)


Cleaning site data...
fluxnet


100%|██████████| 195/195 [06:41<00:00,  2.06s/it]


ameriflux


100%|██████████| 192/192 [06:40<00:00,  2.08s/it]


icos-2023


100%|██████████| 58/58 [01:42<00:00,  1.77s/it]


icos-ww


100%|██████████| 73/73 [06:27<00:00,  5.31s/it]


fluxnet-ch4


100%|██████████| 79/79 [01:23<00:00,  1.06s/it]


In [5]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types."""

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        Handles NumPy booleans, integers, floating-point scalars and arrays;
        any other object is passed to the base class, which raises TypeError.
        """
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

# Replace NaN metadata values with None so they are written as JSON null
# instead of the bare token NaN, which is not valid JSON. isinstance() also
# catches NumPy floating scalars, which an exact type() comparison would miss.
for site_meta in site_meta_dict.values():
    for field, value in site_meta.items():
        if isinstance(value, (float, np.floating)) and math.isnan(value):
            site_meta[field] = None

print('Writing metadata and audit logs...')
for site in tqdm(site_meta_dict.keys()):
    site_dir = CLEANED_DIR / site
    # NpEncoder converts NumPy values held in the metadata to built-in types.
    with open(site_dir / 'meta.json', 'w') as f:
        json.dump(site_meta_dict[site], f, cls=NpEncoder)
    with open(site_dir / 'audit.json', 'w') as f:
        json.dump({'harmonize_columns': audit_dict[site]}, f)

Writing metadata and audit logs...


100%|██████████| 417/417 [00:00<00:00, 5370.35it/s]
