In [1]:
import os
from tqdm import tqdm
import numpy as np
import pandas as pd
from pathlib import Path
import shutil
import json

In [3]:
DATA_DIR = Path('data')
CLEANED_DIR = DATA_DIR / 'cleaned'
MERGED_DIR = DATA_DIR / 'merged'
CONFIG_FILE = Path('config.json')

with open(CONFIG_FILE, 'r') as f:
    config = json.load(f)

TIMESTAMP_COL = config['harmonize_columns']['timestamp_column']
METEO_COLS = list(config['harmonize_columns']['meteorological_columns'].keys())
SOIL_COLS = list(config['harmonize_columns']['soil_columns'].keys())
FLUX_COLS = list(config['harmonize_columns']['flux_columns'].keys())
ALL_COLS = METEO_COLS + SOIL_COLS + FLUX_COLS
QC_COLS = [f'{c}_QC' for c in ALL_COLS]
SOURCE_ORDER = {s: i for i, s in enumerate(config['combine_sources']['source_priority'])}

In [None]:
def merge_dataframes(df1, df2):
    df_merged = pd.merge(df1,df2, on='timestamp', how='outer', suffixes=('_df1', '_df2'))
    df_final = df_merged[['timestamp']].copy()
    for col in ALL_COLS:
        colqc = f'{col}_QC'
        df_final[[col]] = np.nan
        df_final[[colqc]] = np.nan
        df1qc = df_merged[f'{colqc}_df1']
        df2qc = df_merged[f'{colqc}_df2']

        # Merging rules:
        #   - For any row and any variable, use the one which has the lower QC value between the two sources
        #   - If it's a tie, use the value from the newer source (assumed df1 for this function)
        df1_copy_indices = ((df1qc.notna()) & (df2qc.isna())) | ((df1qc.notna()) & (df2qc.notna()) & (df1qc <= df2qc))
        df_final.loc[df1_copy_indices, col] = df_merged.loc[df1_copy_indices, f'{col}_df1']
        df_final.loc[df1_copy_indices, colqc] = df_merged.loc[df1_copy_indices, f'{colqc}_df1']
        df2_copy_indices = ((df2qc.notna()) & (df1qc.isna())) | ((df2qc.notna()) & (df1qc.notna()) & (df2qc < df1qc))
        df_final.loc[df2_copy_indices, col] = df_merged.loc[df2_copy_indices, f'{col}_df2']
        df_final.loc[df2_copy_indices, colqc] = df_merged.loc[df2_copy_indices, f'{colqc}_df2']

    return df_final.sort_values('timestamp').reset_index(drop=True)

def merge_data(path):
    files = [f for f in os.listdir(path) if f not in ['audit.json', 'meta.json']]
    dfs = [(SOURCE_ORDER.get(f.split('_')[0], 9999), pd.read_csv(f'{path}/{f}')) for f in files]
    dfs = [x[1] for x in sorted(dfs, key=lambda y: y[0])]
    if len(dfs) == 1:
        df = dfs[0]
    else:
        # Merge dataframes
        df = dfs.pop(0)
        while len(dfs) > 0:
            df = merge_dataframes(df, dfs.pop(0))

        # Fill missing timestamps. This prevents having multiple files per site for disjoint sources
        df['timestamp'] = df['timestamp']
        ts_col = pd.to_datetime(df['timestamp'], format='%Y%m%d%H%M')
        timestamp_range = pd.date_range(start=ts_col.min(), end=ts_col.max(), freq='30T')
        timestamp_range_int = timestamp_range.strftime('%Y%m%d%H%M').astype(int)
        existing_timestamps = set(df['timestamp'])
        missing_timestamps = timestamp_range_int[~timestamp_range_int.isin(existing_timestamps)]
        if len(missing_timestamps) > 0:
            print(path)
            missing_data = {c: [np.nan for _ in range(len(missing_timestamps))] for c in ALL_COLS + QC_COLS}
            missing_data['timestamp'] = missing_timestamps
            missing_df = pd.DataFrame(missing_data)
            df = pd.concat([df, missing_df], axis=0).sort_values('timestamp').reset_index(drop=True)
    cols = list(df.columns)
    ts_col = pd.to_datetime(df['timestamp'], format='%Y%m%d%H%M')
    df['DOY'] = ts_col.dt.dayofyear.astype(float) - 1.0
    df['TOD'] = ts_col.dt.hour.astype(float)
    df['TOD'] += 0.5 * (ts_col.dt.minute.astype(float) == 30).astype(float)
    df = df[['timestamp', 'DOY', 'TOD'] + cols[1:]]
    return df

In [5]:
class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)

sites = os.listdir(CLEANED_DIR)
for site in tqdm(sites):
    df = merge_data(CLEANED_DIR / site)
    
    os.makedirs(MERGED_DIR / site, exist_ok=True)
    shutil.copy(CLEANED_DIR / site / 'audit.json', MERGED_DIR / site / 'audit.json')

    with open(CLEANED_DIR / site / 'meta.json', 'r') as f:
        meta = json.loads(f.read())
    meta['MIN_DATE'] = df['timestamp'][0]
    meta['MAX_DATE'] = df['timestamp'][len(df)-1]
    with open(MERGED_DIR / site / 'meta.json', 'w') as f:
        f.write(json.dumps(meta,  cls=NpEncoder))
    
    df.to_csv(MERGED_DIR / site / 'data.csv', index=False)

 36%|███▌      | 151/417 [10:34<11:43,  2.64s/it]

data/cleaned/GL-NuF


 44%|████▍     | 185/417 [12:58<18:35,  4.81s/it]

data/cleaned/US-Ivo


 55%|█████▌    | 230/417 [15:04<08:31,  2.74s/it]

data/cleaned/CZ-BK1


 58%|█████▊    | 242/417 [15:34<04:58,  1.70s/it]

data/cleaned/FR-Hes


 62%|██████▏   | 258/417 [16:12<07:30,  2.83s/it]

data/cleaned/FR-Pue


 71%|███████   | 294/417 [18:16<05:32,  2.70s/it]

data/cleaned/US-Atq


 81%|████████  | 336/417 [20:12<03:52,  2.87s/it]

data/cleaned/RU-Che


 86%|████████▋ | 360/417 [21:32<03:14,  3.41s/it]

data/cleaned/GL-ZaH


 88%|████████▊ | 365/417 [21:50<02:23,  2.76s/it]

data/cleaned/CZ-Lnz


100%|██████████| 417/417 [25:00<00:00,  3.60s/it]
