# Prepare data for experiments
- Steps more specific to my architecture
- Custom normalization, remove QC columns, index by date
- Combine MODIS data and site metadata into the same leaf directories (organized by time chunk)

In [1]:
import numpy as np
import os
import ast
import pickle as pkl
import pandas as pd
import json

In [2]:
# Locations of the inputs and outputs of this notebook.
# int_2 should contain the merged datasets; the data in it is unnormalized.
DATA_DIR = os.path.join('data', 'intermediate', 'int_2')
OUTPUT_DIR = os.path.join('data', 'processed', 'v3')

# One pickle per site is read from each of the MODIS directories.
MODIS_A4_DIR = os.path.join('data', 'modis_a4')
MODIS_A2_DIR = os.path.join('data', 'modis_a2')

# Every entry directly under DATA_DIR is treated as a site.
SITES = os.listdir(DATA_DIR)

In [3]:
# Columns slated for removal, each paired with its `<column>_QC` flag column.
# (The QC name previously read 'PPFD_DIFF_QC', which did not match 'PPFD_DIF'.)
DROP_COLS = ['PPFD_DIF', 'SW_DIF', 'PPFD_DIF_QC', 'SW_DIF_QC']

# [low, high] bounds for every input column. When the tables are loaded, values
# outside the bounds become NaN and the same bounds are used to rescale the
# column around the midpoint of the interval.
COL_THRESHOLDS = {
    'TA_F': [-80.0, 80.0],
    'SW_IN_F': [0, 2000.0],
    'LW_IN_F': [0, 2000.0],
    'VPD_F': [0.0, 110.0],
    'PA_F': [20.0, 150.0],
    'P_F': [0.0, 300.0],
    'WS_F': [0.0, 100.0],
    'WD': [0.0, 360.0],
    'RH': [0.0, 100.0],
    'USTAR': [0.0, 10.0],
    'NETRAD': [-1000.0, 2000.0],
    'PPFD_IN': [-200.0, 3000.0],
    # 'PPFD_DIF': [],
    'PPFD_OUT': [-200.0, 3000.0],
    # 'SW_DIF': [],
    'SW_OUT': [-100.0, 2000.0],
    'LW_OUT': [-100.0, 2000.0],
    'CO2_F_MDS': [0.0, 2000.0], # air CO2 concentration. Shouldn't be too far outside typical ranges.
    'G_F_MDS': [-1000.0, 2000.0],
    'LE_F_MDS': [-1000.0, 2000.0],
    'H_F_MDS': [-1000.0, 2000.0],
    'DOY': [0, 366],  # derived column: zero-based day of year
    'TOD': [0, 24],   # derived column: hour of the day
    # 'NEE_VUT_REF': [],
    # 'RECO_NT_VUT_REF': [],
    # 'GPP_NT_VUT_REF': []
}

# Target columns: kept in the output but neither thresholded nor rescaled.
PREDICT_COLS = ['NEE_VUT_REF', 'RECO_NT_VUT_REF', 'GPP_NT_VUT_REF']

# Columns that wrap around; they are rescaled with half the range of the others.
CYCLIC_COLS = ['WD', 'DOY', 'TOD'] # DOY = day of year, TOD = time of day

In [4]:
# Build the list of (path to data.csv, site id) pairs: one pair for every
# sub-directory entry found under each site. Reused by the cells below.
csvs = [
    (os.path.join(DATA_DIR, site, e, 'data.csv'), site)
    for site in SITES
    for e in os.listdir(os.path.join(DATA_DIR, site))
]

In [5]:
# Load every site table, blank out unusable values, rescale each input column
# and keep a running total of the number of rows.
col_minmax = {name: [None, None] for name in COL_THRESHOLDS}
site_hours = 0
cols_to_keep = [*COL_THRESHOLDS, *PREDICT_COLS]

data = []

for file, site in csvs:
    df = pd.read_csv(file)

    # Derive the time features from the start timestamp, then index by it.
    df['timestamp'] = pd.to_datetime(df['TIMESTAMP_START'], format='%Y%m%d%H%M')
    stamps = df['timestamp'].dt
    df['DOY'] = stamps.dayofyear.astype(float) - 1.0
    df['TOD'] = stamps.hour.astype(float)
    df = df.set_index('timestamp')

    for col, (low, high) in COL_THRESHOLDS.items():
        # A QC flag of 3 marks a bad interpolation: discard those values.
        col_qc = f'{col}_QC'
        if col_qc in df.columns:
            flagged = df[col_qc] == 3
            df.loc[flagged, col] = np.nan

        # Anything outside [low, high] is treated as an outlier.
        in_bounds = df[col].between(low, high)
        df.loc[~in_bounds, col] = np.nan

        # Rough normalization: center on the midpoint and divide by the range
        # (cyclic columns use half the range, i.e. they end up in [-1, 1]).
        v_mid = (high + low) / 2
        v_range = high - low
        if col in CYCLIC_COLS:
            v_range = v_range / 2
        df[col] = (df[col] - v_mid) / v_range

        # NOTE: per-column min/max tracking is disabled, so col_minmax is
        # left at its initial [None, None] entries.

    df = df[cols_to_keep]
    site_hours += len(df)
    data.append((df, site))

# Histogram of values for a column
# cols_to_examine = ['G_F_MDS', 'LW_IN_F', 'CO2_F_MDS', 'P_F']
# big_df = pd.concat([d[cols_to_examine]for d in dfs], axis=0)
# big_df['G_F_MDS'].hist(bins=200)

In [18]:
# Land/water mask code -> binary flag (1 = water, 0 = land).
# For now the mask is simply binarized.
water_dict = dict((
    (0, 1),    # shallow ocean
    (1, 0),    # land
    (2, 0),    # ocean coastlines and lake shorelines
    (3, 1),    # shallow inland water
    (4, 1),    # ephemeral water
    (5, 1),    # deep inland water
    (6, 1),    # moderate or continental ocean
    (7, 1),    # deep ocean
    (255, 0),  # fill value, treat as land for simplicity
))

def clean_a4_data(arr):
    """Rescale raw A4 pixel values and mark invalid ones.

    Values above 30000 or below 0 are flagged as invalid and come out as -1.0.
    Remaining values are capped at 10000 and divided by 10000, so valid pixels
    end up in [0, 1]. The result is returned as float32.
    """
    invalid = (arr > 30000) | (arr < 0)
    flagged = np.where(invalid, -10000, arr)
    capped = np.where(flagged > 10000, 10000, flagged)
    scaled = capped / 10000.0
    return scaled.astype(np.float32)

def clean_a2_data(arr):
    """Extract the snow and water layers from an A2 pixel stack as float32.

    Args:
        arr: array-like whose element 0 is the snow layer
            (0 = no snow, 1 = snow, 255 = fill) and whose element 2 holds the
            land/water mask codes that `water_dict` understands.

    Returns:
        float32 array stacking, along axis 0, the snow layer (fill replaced by
        -1) and the binarized water mask taken from `water_dict`.
    """
    # Convert to float32 *before* writing the -1 marker. Injecting a negative
    # Python int into an unsigned integer band via np.where overflows under
    # NumPy's NEP 50 promotion rules. np.array() copies, so the caller's
    # data is never modified in place.
    snow_arr = np.array(arr[0], dtype=np.float32)
    snow_arr[snow_arr == 255] = -1.0

    # Explicit otypes: the output dtype no longer depends on the first element
    # and empty inputs are accepted (np.vectorize rejects size-0 inputs
    # when otypes is not set).
    to_binary = np.vectorize(water_dict.get, otypes=[np.float32])
    water_arr = to_binary(arr[2])

    return np.stack((snow_arr, water_arr), axis=0)


# Get modis and metadata
modis_time = '12:00:00' # average of Terra and Aqua satellites
df_meta = pd.read_csv('processed_site_meta.csv')


def _load_modis_pixels(modis_dir, site, index):
    """Load a site's MODIS pickle and keep the acquisitions present in `index`.

    The pickle must hold a mapping with a 'pixel_values' entry keyed by
    'YYYY_MM_DD' strings. Each key is turned into a timestamp at 12:00 so it can
    be matched against the index of the site table.
    """
    # NOTE: pickle.load runs arbitrary code; only point this at trusted files.
    with open(os.path.join(modis_dir, f'{site}.pkl'), 'rb') as f:
        pixels = pkl.load(f)['pixel_values']
    stamped = {pd.to_datetime(k, format='%Y_%m_%d').replace(hour=12): v for k, v in pixels.items()}
    return {k: v for k, v in stamped.items() if k in index}


def nan_to_null(x):
    """Return a JSON-friendly scalar: None for NaN, a plain Python value otherwise."""
    if pd.isna(x):
        return None
    # NumPy integer scalars (e.g. int64) are rejected by json.dump, so unwrap
    # NumPy scalars into their Python equivalents.
    return x.item() if isinstance(x, np.generic) else x


# For every site table: attach the matching MODIS images and the site metadata,
# then write everything to OUTPUT_DIR/<site>/<start>_<end>/.
for df, site in data:
    modis_a4_pixels = _load_modis_pixels(MODIS_A4_DIR, site, df.index)
    modis_a2_pixels = _load_modis_pixels(MODIS_A2_DIR, site, df.index)

    # Normalize and join MODIS data
    modis_data = {}
    for k, a4_values in modis_a4_pixels.items():
        a2_values = modis_a2_pixels.get(k)
        if a2_values is None:
            print(f'Mismatch in MODIS keys for {site}: {k}')
            continue

        clean_a4 = clean_a4_data(a4_values)
        clean_a2 = clean_a2_data(a2_values)
        # Skip images in which more than half of the A4 values are invalid
        # (clean_a4_data marks invalid values as negative).
        if np.sum(clean_a4 < 0) / clean_a4.size > 0.5:
            continue
        modis_data[k] = np.concatenate((clean_a4, clean_a2), axis=0)

    meta_row = df_meta.loc[df_meta['SITE_ID'] == site]
    meta = {
        'SITE_ID': site,
        'LOCATION_LAT': nan_to_null(meta_row['LOCATION_LAT'].values[0]),
        'LOCATION_LON': nan_to_null(meta_row['LOCATION_LON'].values[0]),
        'LOCATION_ELEV': nan_to_null(meta_row['LOCATION_ELEV'].values[0]),
        'IGBP': nan_to_null(meta_row['IGBP'].values[0]),
    }
    # TIME_INFO is stored as the text of a Python literal: {source: [start, end]}.
    timeinfo = ast.literal_eval(meta_row['TIME_INFO'].values[0])

    timebound_string = ''
    # Defined up front so the diagnostics below work even if timeinfo is empty.
    start = end = None
    # find out which source we're dealing with: the one whose start date
    # (at 12:00) appears in this table's index
    for source, bounds in timeinfo.items():
        start = bounds[0].replace('_', '-')
        end = bounds[1].replace('_', '-')
        pd_dt = pd.to_datetime(start, format='%Y-%m-%d').replace(hour=12)
        if pd_dt in df.index:
            meta['SOURCES'] = source
            meta['TIME'] = [start, end]
            timebound_string = f'{start}_{end}'

    if not timebound_string:
        # No source matched: dump the context and stop so it can be inspected.
        print(f'error with {site}...')
        print(timeinfo)
        print(start)
        print(end)
        print(df)
        break

    # Write it all out
    site_dir = os.path.join(OUTPUT_DIR, site)
    section_dir = os.path.join(site_dir, timebound_string)
    # Creates site_dir as well when it does not exist yet.
    os.makedirs(section_dir, exist_ok=True)

    df.to_csv(os.path.join(section_dir, 'data.csv'))
    with open(os.path.join(section_dir, 'modis.pkl'), 'wb') as f:
        pkl.dump(modis_data, f)
    with open(os.path.join(section_dir, 'meta.json'), 'w') as f:
        json.dump(meta, f)




# Normalize MODIS data
- Fill value is 32767, and valid range is 0-32766

In [None]:
# # Examine distribution of MODIS data
# modis_files = []
# for root, dirs, files in os.walk(OUTPUT_DIR):
#     if 'modis.pkl' in files:
#         modis_files.append(os.path.join(root, 'modis.pkl'))

In [None]:
# def clean_array(arr):
#     arr = np.where((arr > 30000) | (arr < 0), -10000, arr)
#     arr = np.where(arr > 10000, 10000, arr)
#     arr = arr / 10000.0
#     return arr

# for file in modis_files:
#     with open(file, 'rb') as f:
#         modis_data = pkl.load(f)
    
#     cleaned_pixel_values = {}
#     for k,v in modis_data.items():
#         clean = clean_array(v)
#         if np.sum(clean < 0) / len(clean.flatten()) <= 0.5:
#             cleaned_pixel_values[k] = clean

#     with open(file, 'wb') as f:
#         pkl.dump(cleaned_pixel_values, f)


Discarded 127455 out of 391 images
