# Process FLUXNET data
- Replace the `-9999` missing-value sentinel with `NaN`.
- Downsample half-hourly records to hourly (only for the sites that need it).

In [1]:
import os
import numpy as np
import pandas as pd
import shutil

In [2]:
# Value columns kept from every site file, in output order.
COLS_VA = [
    'TA_F', 'SW_IN_F', 'LW_IN_F', 'VPD_F', 'PA_F', 'P_F',
    'WS_F', 'WD', 'RH', 'USTAR', 'NETRAD',
    'PPFD_IN', 'PPFD_DIF', 'PPFD_OUT',
    'SW_DIF', 'SW_OUT', 'LW_OUT',
    'CO2_F_MDS', 'G_F_MDS', 'LE_F_MDS', 'H_F_MDS',
    'NEE_VUT_REF',
    'RECO_NT_VUT_REF', 'RECO_DT_VUT_REF',
    'GPP_NT_VUT_REF', 'GPP_DT_VUT_REF',
]
# One companion "<name>_QC" column per value column.
COLS_QC = [name + '_QC' for name in COLS_VA]
# Timestamp column(s) carried through unchanged.
COLS_TS = ['TIMESTAMP_START']

# Name of the data collection; used in the paths below and in output file names.
collection = 'fluxnet'

_raw_root = os.path.join('data', 'raw', collection)
INPUT_DIR = os.path.join(_raw_root, 'unzipped')
META_FILE = os.path.join(_raw_root, 'site_data.csv')
OUTPUT_DIR = os.path.join('data', 'intermediate', 'test_int_1', collection)

In [3]:
def process_site_dataframe(df, downsample=True):
    """Clean one site's table and optionally aggregate row pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw site table. It must contain every column listed in ``COLS_TS``.
        Columns from ``COLS_VA`` / ``COLS_QC`` that are absent are added and
        filled with NaN.
    downsample : bool, default True
        When True, each pair of consecutive rows (0-1, 2-3, ...) is merged
        into one row: value columns are averaged, ``P_F`` is then doubled,
        and the timestamp and QC columns are taken from the first row of
        each pair. When False, rows are returned at their original
        resolution and no column is rescaled.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ``COLS_TS + COLS_VA + COLS_QC`` in that
        order. The input frame is not modified.
    """
    # replace() returns a new frame, so adding columns below does not touch
    # the caller's object.
    df = df.replace(-9999.0, np.nan)
    for column in COLS_VA + COLS_QC:
        if column not in df.columns:
            df[column] = np.nan

    df_ts = df[COLS_TS]
    df_va = df[COLS_VA]
    df_qc = df[COLS_QC]

    if not downsample:
        # Nothing was averaged, so precipitation must NOT be rescaled.
        return pd.concat([df_ts, df_va, df_qc], axis=1)

    # Rows 0-1 form group 0, rows 2-3 form group 1, and so on.
    pair_id = np.arange(len(df_va)) // 2
    df_va = df_va.groupby(pair_id).mean().reset_index(drop=True)

    # Double precipitation, as this should not be averaged: doubling the pair
    # mean restores the pair total. The mean ignores NaN, so a pair with one
    # missing value yields twice the remaining value.
    df_va['P_F'] = df_va['P_F'] * 2.0

    # Timestamp and QC flags come from the first row of each pair.
    df_ts = df_ts.iloc[::2, :].reset_index(drop=True)
    df_qc = df_qc.iloc[::2, :].reset_index(drop=True)

    return pd.concat([df_ts, df_va, df_qc], axis=1)

In [4]:
# Build the work list: one (site, csv_path, source, downsample) tuple per site folder.
data = []
for site in sorted(os.listdir(INPUT_DIR)):  # sorted -> reproducible processing order
    site_path = os.path.join(INPUT_DIR, site)
    if not os.path.isdir(site_path):
        continue  # ignore stray files (e.g. .DS_Store) next to the site folders

    csv_files = sorted(f for f in os.listdir(site_path) if f.lower().endswith('.csv'))
    if not csv_files:
        print(f'Skipping {site}: no CSV file found')
        continue
    file = csv_files[0]  # only 1 file per dir here

    # FLUXNET2015 file names encode the temporal resolution ('HH' = half-hourly,
    # 'HR' = hourly). Files already marked hourly must not be downsampled again;
    # anything else keeps the previous behaviour (downsample).
    downsample = '_HR_' not in file
    data.append((site, os.path.join(site_path, file), collection, downsample))

In [5]:
# Write one cleaned CSV per site into OUTPUT_DIR/<site>/, named after the
# first and last timestamp it contains.
os.makedirs(OUTPUT_DIR, exist_ok=True)  # no check-then-create race

for site, file, source, downsample in data:
    print(f'Processing {file}...')
    site_dir = os.path.join(OUTPUT_DIR, site)
    os.makedirs(site_dir, exist_ok=True)

    site_df = pd.read_csv(file)
    processed_df = process_site_dataframe(site_df, downsample=downsample)
    if processed_df.empty:
        # min()/max() of an empty column is NaN, which would produce a
        # meaningless 'nan_nan_<source>.csv' file.
        print(f'Skipping {file}: no rows')
        continue

    min_time = processed_df['TIMESTAMP_START'].min()
    max_time = processed_df['TIMESTAMP_START'].max()
    outfile = os.path.join(site_dir, f'{min_time}_{max_time}_{source}.csv')
    processed_df.to_csv(outfile, index=False)

Processing data/raw/fluxnet/unzipped/US-ORv/FLX_US-ORv_FLUXNET2015_SUBSET_HH_2011-2011_1-4.csv...
Processing data/raw/fluxnet/unzipped/US-Los/FLX_US-Los_FLUXNET2015_SUBSET_HH_2000-2014_2-4.csv...
Processing data/raw/fluxnet/unzipped/DK-Fou/FLX_DK-Fou_FLUXNET2015_SUBSET_HH_2005-2005_1-4.csv...
Processing data/raw/fluxnet/unzipped/US-ARb/FLX_US-ARb_FLUXNET2015_SUBSET_HH_2005-2006_1-4.csv...
Processing data/raw/fluxnet/unzipped/US-Me2/FLX_US-Me2_FLUXNET2015_SUBSET_HH_2002-2014_1-4.csv...
Processing data/raw/fluxnet/unzipped/IT-Tor/FLX_IT-Tor_FLUXNET2015_SUBSET_HH_2008-2014_2-4.csv...
Processing data/raw/fluxnet/unzipped/FR-LBr/FLX_FR-LBr_FLUXNET2015_SUBSET_HH_1996-2008_1-4.csv...
Processing data/raw/fluxnet/unzipped/CH-Lae/FLX_CH-Lae_FLUXNET2015_SUBSET_HH_2004-2014_1-4.csv...
Processing data/raw/fluxnet/unzipped/AR-SLu/FLX_AR-SLu_FLUXNET2015_SUBSET_HH_2009-2011_1-4.csv...
Processing data/raw/fluxnet/unzipped/CH-Dav/FLX_CH-Dav_FLUXNET2015_SUBSET_HH_1997-2014_1-4.csv...
Processing data/raw/

In [3]:
# Place a copy of the site metadata table alongside the processed site folders.
meta_target = os.path.join(OUTPUT_DIR, 'site_data.csv')
shutil.copyfile(META_FILE, meta_target)

'data/intermediate/test_int_1/fluxnet/site_data.csv'