# Process Ameriflux data
- replace sentinel values with NaN
- downsample from half-hourly to hourly (if needed for each site)

In [1]:
import os
import numpy as np
import pandas as pd
import shutil

In [2]:
COLS_VA = ['TA_F', 'SW_IN_F', 'LW_IN_F', 'VPD_F', 'PA_F', 'P_F', 'WS_F', 'WD', 'RH', 'USTAR', 'NETRAD', 'PPFD_IN', 'PPFD_DIF', 'PPFD_OUT', 'SW_DIF', 'SW_OUT', 'LW_OUT',
        'CO2_F_MDS', 'G_F_MDS', 'LE_F_MDS', 'H_F_MDS', 'NEE_VUT_REF', 'RECO_NT_VUT_REF', 'RECO_DT_VUT_REF', 'GPP_NT_VUT_REF', 'GPP_DT_VUT_REF']
COLS_QC = [f'{c}_QC' for c in COLS_VA]
COLS_TS = ['TIMESTAMP_START']

collection = 'ameriflux'

INPUT_DIR = os.path.join('data', 'raw', collection, 'unzipped')
META_FILE = os.path.join('data', 'raw', collection, 'site_data.csv')
OUTPUT_DIR = os.path.join('data', 'intermediate', 'test_int_1', collection)

In [3]:
def process_site_dataframe(df, downsample=True):
    df = df.replace(-9999.0, np.nan)
    for column in COLS_VA + COLS_QC:
        if column not in df.columns:
            df[column] = np.nan
    
    df_ts = df[COLS_TS]
    df_va = df[COLS_VA]
    df_qc = df[COLS_QC]

    if downsample:
        # Average to hourly data
        grouping_key = np.arange(len(df_va)) // 2
        df_va = df_va.groupby(grouping_key).mean().reset_index(drop=True)
        df_ts = df_ts.iloc[::2,:].reset_index(drop=True)
        df_qc = df_qc.iloc[::2,:].reset_index(drop=True)
    
    # Double precipitation, as this should not be averaged
    df_va['P_F'] = df_va['P_F'] * 2.0
    df = pd.concat([df_ts, df_va, df_qc], axis=1)
    return df

In [4]:
data = []
for site in os.listdir(INPUT_DIR):
    files = os.listdir(os.path.join(INPUT_DIR, site))
    fluxnet_compatible_files = [f for f in files if 'FLUXNET_SUBSET_HH' in f  and 'VARINFO' not in f]
    if len(fluxnet_compatible_files) == 1:
        data.append((site, os.path.join(INPUT_DIR, site, fluxnet_compatible_files[0]), collection, True))
    else:
        fluxnet_compatible_files = [f for f in files if 'FLUXNET_SUBSET_HR' in f  and 'VARINFO' not in f]
        if len(fluxnet_compatible_files) != 1:
            print(f'No valid file found for {site}')
            continue
        data.append((site, os.path.join(INPUT_DIR, site, fluxnet_compatible_files[0]), collection, False))

In [5]:
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

for site, file, source, downsample in data:
    print(f'Processing {file}...')
    site_dir = os.path.join(OUTPUT_DIR, site)
    if not os.path.exists(site_dir):
        os.makedirs(site_dir)
    
    site_df = pd.read_csv(file)
    processed_df = process_site_dataframe(site_df, downsample=downsample)
    min_time = processed_df['TIMESTAMP_START'].min()
    max_time = processed_df['TIMESTAMP_START'].max()
    outfile = os.path.join(site_dir, f'{min_time}_{max_time}_{source}.csv')
    processed_df.to_csv(outfile, index=False)

Processing data/raw/ameriflux/unzipped/US-ORv/AMF_US-ORv_FLUXNET_SUBSET_HH_2011-2016_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-ARb/AMF_US-ARb_FLUXNET_SUBSET_HH_2005-2006_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-Me2/AMF_US-Me2_FLUXNET_SUBSET_HH_2002-2020_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-xHE/AMF_US-xHE_FLUXNET_SUBSET_HH_2017-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-BZB/AMF_US-BZB_FLUXNET_SUBSET_HH_2011-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-CS1/AMF_US-CS1_FLUXNET_SUBSET_HH_2018-2019_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-xUN/AMF_US-xUN_FLUXNET_SUBSET_HH_2017-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-SRS/AMF_US-SRS_FLUXNET_SUBSET_HH_2011-2018_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-Vcp/AMF_US-Vcp_FLUXNET_SUBSET_HH_2007-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-KS1/AMF_US-KS1_FLUXNET_SUBSET_HH_2002-2003_3-5.csv...
Processing data/raw/ameriflux/unzipped/U

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_va['P_F'] = df_va['P_F'] * 2.0


Processing data/raw/ameriflux/unzipped/US-Kon/AMF_US-Kon_FLUXNET_SUBSET_HH_2004-2019_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-UMd/AMF_US-UMd_FLUXNET_SUBSET_HH_2007-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-Sne/AMF_US-Sne_FLUXNET_SUBSET_HH_2016-2020_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-RGA/AMF_US-RGA_FLUXNET_SUBSET_HH_2021-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-Ton/AMF_US-Ton_FLUXNET_SUBSET_HH_2001-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-Mo2/AMF_US-Mo2_FLUXNET_SUBSET_HH_2018-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-Rwf/AMF_US-Rwf_FLUXNET_SUBSET_HH_2014-2020_3-5.csv...
Processing data/raw/ameriflux/unzipped/CA-NS4/AMF_CA-NS4_FLUXNET_SUBSET_HH_2001-2005_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-EML/AMF_US-EML_FLUXNET_SUBSET_HH_2008-2020_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-Syv/AMF_US-Syv_FLUXNET_SUBSET_HH_2001-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/C

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_va['P_F'] = df_va['P_F'] * 2.0


Processing data/raw/ameriflux/unzipped/US-ONA/AMF_US-ONA_FLUXNET_SUBSET_HH_2015-2020_3-5.csv...
Processing data/raw/ameriflux/unzipped/CA-SF1/AMF_CA-SF1_FLUXNET_SUBSET_HH_2003-2006_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-ICt/AMF_US-ICt_FLUXNET_SUBSET_HH_2007-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/CA-NS3/AMF_CA-NS3_FLUXNET_SUBSET_HH_2001-2005_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-HB2/AMF_US-HB2_FLUXNET_SUBSET_HH_2019-2019_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-CF3/AMF_US-CF3_FLUXNET_SUBSET_HH_2017-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-Whs/AMF_US-Whs_FLUXNET_SUBSET_HH_2007-2020_3-5.csv...
Processing data/raw/ameriflux/unzipped/CA-DBB/AMF_CA-DBB_FLUXNET_SUBSET_HH_2014-2020_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-Wi8/AMF_US-Wi8_FLUXNET_SUBSET_HH_2002-2002_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-NGB/AMF_US-NGB_FLUXNET_SUBSET_HH_2012-2019_3-5.csv...
Processing data/raw/ameriflux/unzipped/U

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_va['P_F'] = df_va['P_F'] * 2.0


Processing data/raw/ameriflux/unzipped/US-xYE/AMF_US-xYE_FLUXNET_SUBSET_HH_2018-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-HB1/AMF_US-HB1_FLUXNET_SUBSET_HH_2019-2019_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-UMB/AMF_US-UMB_FLUXNET_SUBSET_HH_2007-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-Wjs/AMF_US-Wjs_FLUXNET_SUBSET_HH_2007-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-Ro4/AMF_US-Ro4_FLUXNET_SUBSET_HH_2014-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-Hn3/AMF_US-Hn3_FLUXNET_SUBSET_HH_2017-2018_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-xBA/AMF_US-xBA_FLUXNET_SUBSET_HH_2019-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/CA-MA1/AMF_CA-MA1_FLUXNET_SUBSET_HH_2009-2011_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-Wi9/AMF_US-Wi9_FLUXNET_SUBSET_HH_2004-2005_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-xBR/AMF_US-xBR_FLUXNET_SUBSET_HH_2017-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/U

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_va['P_F'] = df_va['P_F'] * 2.0


Processing data/raw/ameriflux/unzipped/US-EDN/AMF_US-EDN_FLUXNET_SUBSET_HH_2018-2019_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-xTR/AMF_US-xTR_FLUXNET_SUBSET_HH_2017-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-Wi7/AMF_US-Wi7_FLUXNET_SUBSET_HH_2005-2005_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-xRM/AMF_US-xRM_FLUXNET_SUBSET_HH_2017-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-Rpf/AMF_US-Rpf_FLUXNET_SUBSET_HH_2008-2021_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-Mpj/AMF_US-Mpj_FLUXNET_SUBSET_HH_2008-2020_3-5.csv...
Processing data/raw/ameriflux/unzipped/US-KLS/AMF_US-KLS_FLUXNET_SUBSET_HH_2012-2019_3-5.csv...


In [3]:
shutil.copyfile(META_FILE, os.path.join(OUTPUT_DIR, 'site_data.csv'))

'data/intermediate/test_int_1/ameriflux/site_data.csv'