# Prepare data for experiments
- Steps more specific to my architecture
- Custom normalization, remove QC columns, index by date
- Combine MODIS data and site metadata all into the same leaf directories (organize by time chunks)

In [None]:
import numpy as np
import os
import ast
import pickle as pkl
import pandas as pd
import yaml
import json
from pathlib import Path
from tqdm import tqdm

In [2]:
# Input / output locations, relative to the notebook's working directory.
INTERMEDIATE_DIR_2 = Path('data/intermediate/int_2')
# One sub-directory per site. Keep directories only (a stray file such as
# .DS_Store would break the per-site os.listdir further down) and sort the
# names so the processing order is reproducible across machines.
SITES = sorted(p.name for p in INTERMEDIATE_DIR_2.iterdir() if p.is_dir())
META_DIR = Path('data/meta')
MODIS_A4_DIR = Path('data/modis_a4')
MODIS_A2_DIR = Path('data/modis_a2')
OUTPUT_DIR = Path('data/processed/CarbonSense')

In [3]:
# MODIS A2 land/water type codes, binarized: 1 = water, 0 = land.
# (The original note names the product "MCD42A2"; presumably MCD43A2 is
# meant - TODO confirm.)
_WATER_CODES = (
    0,  # shallow ocean
    3,  # shallow inland water
    4,  # ephemeral water
    5,  # deep inland water
    6,  # moderate or continental ocean
    7,  # deep ocean
)
_LAND_CODES = (
    1,    # land
    2,    # ocean coastlines and lake shorelines
    255,  # fill value, treat as land for simplicity
)
water_dict = {
    code: int(code in _WATER_CODES)
    for code in sorted(_WATER_CODES + _LAND_CODES)
}

# For all MODIS bands, -1 (after scaling) marks a fill / invalid pixel
def clean_a4_data(arr):
    """Scale raw A4 band values to float32, marking invalid pixels as -1.

    Raw values below 0 or above 30000 are replaced by the fill marker;
    remaining values are capped at 10000 and everything is divided by 10000,
    so valid output lies in [0, 1] and fill pixels come out as -1.0.
    """
    is_fill = (arr < 0) | (arr > 30000)
    capped = np.minimum(arr, 10000)
    scaled = np.where(is_fill, -10000, capped) / 10000.0
    return scaled.astype(np.float32)

def clean_a2_data(arr):
    """Turn raw A2 layers into a float32 (snow, water) stack.

    Args:
        arr: indexable of layers; layer 0 is read as the snow flag
            (0 = no snow, 1 = snow, 255 = fill) and layer 2 as the land/water
            type code that `water_dict` binarizes. Layer 1 is not used.

    Returns:
        float32 array stacked on axis 0: the snow layer with fill replaced by
        -1, then the water mask (1 = water, 0 = land). Codes missing from
        `water_dict` are treated as land, like the fill value.
    """
    snow_raw = np.asarray(arr[0])
    # Cast before substituting the fill: writing -1 into an unsigned integer
    # array either fails or wraps around, depending on the NumPy version.
    snow_arr = np.where(snow_raw == 255, np.float32(-1), snow_raw.astype(np.float32))

    # Vectorized membership test instead of np.vectorize(dict.get), which is a
    # Python-level loop and breaks on unmapped codes and on empty arrays.
    water_codes = [code for code, is_water in water_dict.items() if is_water]
    water_arr = np.isin(np.asarray(arr[2]), water_codes).astype(np.float32)
    return np.stack((snow_arr, water_arr), axis=0)


# MODIS timestamp convention and site metadata table
# MODIS observations are stamped at noon (average of Terra and Aqua satellites)
modis_time = '12:00:00'
site_meta_csv = META_DIR / 'processed_site_meta.csv'
df_meta = pd.read_csv(site_meta_csv)

In [4]:
# Input file list, reused throughout: one (path to data.csv, site id) pair for
# every entry found under each site's intermediate directory.
csvs = [
    (os.path.join(INTERMEDIATE_DIR_2, site, entry, 'data.csv'), site)
    for site in SITES
    for entry in os.listdir(os.path.join(INTERMEDIATE_DIR_2, site))
]

# Per-variable normalization settings (keep/target flags, QC limits, min/max)
with open('normalization_config.yml', 'r') as cfg_file:
    config = yaml.safe_load(cfg_file)

In [5]:
def _mask_gap_filled(df, var, var_config):
    """Set `var` to NaN wherever its `<var>_QC` flag exceeds `max_qc_flag` (default 2)."""
    qc_col = f'{var}_QC'
    if qc_col in df.columns:
        max_qc = var_config.get('max_qc_flag', 2)
        # Series.mask upcasts integer columns to float when it inserts NaN,
        # unlike `df.loc[mask, var] = np.nan`.
        df[var] = df[var].mask(df[qc_col] > max_qc)


def normalize_tabular_data(df, config):
    """Split a site's table into normalized predictors and QC-filtered targets.

    Args:
        df: raw table with a `TIMESTAMP_START` column (format %Y%m%d%H%M), one
            column per variable named in `config`, and optional `<var>_QC`
            columns. The caller's frame is left untouched.
        config: mapping of variable name -> settings dict. Keys read here:
            `keep`, `target`, `max_qc_flag`, `normalization_min`,
            `normalization_max`, `cyclic`.

    Returns:
        (df_p, df_t), both indexed by `timestamp`:
        df_p holds `DOY`, `TOD` and every kept non-target variable; df_t holds
        every kept target variable (QC-filtered, not rescaled).
    """
    kept = {name: cfg for name, cfg in config.items() if cfg.get('keep', False)}
    targets = [name for name, cfg in kept.items() if cfg.get('target', False)]
    predictors = [name for name, cfg in kept.items() if not cfg.get('target', False)]

    # Work on a copy so the caller's DataFrame is not modified as a side effect.
    df = df.copy()

    for p in predictors:
        p_cfg = config[p]

        # Filter out gap fills based on config
        _mask_gap_filled(df, p, p_cfg)

        # Filter outlier values (anything outside [min, max] becomes NaN)
        max_v = p_cfg.get('normalization_max', None)
        min_v = p_cfg.get('normalization_min', None)
        df[p] = df[p].where(df[p].between(min_v, max_v))

        # Min-max normalization around the midpoint: regular variables land in
        # [-0.5, 0.5]; cyclic ones use half the range and land in [-1, 1].
        v_mid = (max_v + min_v) / 2
        v_range = max_v - min_v
        if p_cfg.get('cyclic', False):
            v_range /= 2
        df[p] = (df[p] - v_mid) / v_range

    # Add timestamp index
    df['timestamp'] = pd.to_datetime(df['TIMESTAMP_START'], format='%Y%m%d%H%M')

    # Add day of year / time of day (and normalize as cyclic)
    # NOTE(review): TOD uses the hour only, so HH:00 and HH:30 get the same value.
    df['DOY'] = (df['timestamp'].dt.dayofyear.astype(float) - 1.0 - 183) / 183
    df['TOD'] = (df['timestamp'].dt.hour.astype(float) - 12) / 12

    for t in targets:
        # Filter out gap fills based on config
        _mask_gap_filled(df, t, config[t])

    df_p = df[['timestamp', 'DOY', 'TOD'] + predictors].set_index('timestamp', drop=True)
    df_t = df[['timestamp'] + targets].set_index('timestamp', drop=True)
    return df_p, df_t

In [6]:
def _load_modis_pixels(path, index):
    """Load one MODIS pickle and keep the acquisitions whose noon timestamp is in `index`."""
    # NOTE: pickle can execute arbitrary code on load - only read trusted files.
    with open(path, 'rb') as f:
        pixels = pkl.load(f)['pixel_values']
    # Keys are '%Y_%m_%d' date strings; re-key them to that day's 12:00 timestamp.
    stamped = (
        (pd.to_datetime(day, format='%Y_%m_%d').replace(hour=12), values)
        for day, values in pixels.items()
    )
    return {ts: values for ts, values in stamped if ts in index}


def normalize_modis_data(site, df):
    """Load, clean and join the A4 and A2 MODIS pixels for one site.

    Args:
        site: site id; `<site>.pkl` is looked up in MODIS_A4_DIR and MODIS_A2_DIR.
        df: frame whose index holds the timestamps to keep.

    Returns:
        dict mapping noon timestamp -> array of the cleaned A4 bands followed
        by the cleaned A2 layers (concatenated on axis 0), or None when either
        MODIS pickle is missing for the site.
    """
    a4_path = MODIS_A4_DIR / f'{site}.pkl'
    a2_path = MODIS_A2_DIR / f'{site}.pkl'
    # Both products are required; a missing A2 file used to raise here while a
    # missing A4 file returned None.
    if not (os.path.exists(a4_path) and os.path.exists(a2_path)):
        return None

    modis_a4_pixels = _load_modis_pixels(a4_path, df.index)
    modis_a2_pixels = _load_modis_pixels(a2_path, df.index)

    # Normalize and join MODIS data
    modis_data = {}
    for k, a4_values in modis_a4_pixels.items():
        a2_values = modis_a2_pixels.get(k, None)
        if a2_values is None:
            print(f'Mismatch in MODIS keys for {site}: {k}')
            continue

        clean_a4 = clean_a4_data(a4_values)
        clean_a2 = clean_a2_data(a2_values)
        # Skip acquisitions where more than half of the A4 values are fill (< 0)
        if np.mean(clean_a4 < 0) > 0.5:
            continue
        modis_data[k] = np.concatenate((clean_a4, clean_a2), axis=0)
    return modis_data

In [7]:
def get_site_meta(site, df_t, df_meta):
    """Build the JSON-ready metadata dict for one site / time chunk.

    Args:
        site: value to match against `df_meta['SITE_ID']`.
        df_t: frame whose index is checked for the chunk's starting day at 12:00.
        df_meta: site table with `SITE_ID`, `LOCATION_LAT`, `LOCATION_LON`,
            `LOCATION_ELEV`, `IGBP` and `TIME_INFO`, where `TIME_INFO` is the
            string form of a dict `{source: [start, end]}` with '%Y_%m_%d' dates.

    Returns:
        (meta, timebound_string): `meta` holds the site fields plus `SOURCES`
        and `TIME` of the matching source; `timebound_string` is
        '<start>_<end>' with dashes in the dates. If several sources match,
        the last one wins.

    Raises:
        ValueError: no source's start date (at 12:00) appears in `df_t.index`.
    """
    meta_row = df_meta.loc[df_meta['SITE_ID'] == site]

    def first_value(column):
        # NaN -> None; NumPy scalars -> native Python so json.dump accepts them
        # (e.g. an int64 elevation is not JSON serializable as-is).
        value = meta_row[column].values[0]
        if pd.isna(value):
            return None
        return value.item() if hasattr(value, 'item') else value

    meta = {
        'SITE_ID': site,
        'LOCATION_LAT': first_value('LOCATION_LAT'),
        'LOCATION_LON': first_value('LOCATION_LON'),
        'LOCATION_ELEV': first_value('LOCATION_ELEV'),
        'IGBP': first_value('IGBP'),
    }
    timeinfo = ast.literal_eval(meta_row['TIME_INFO'].values[0])

    timebound_string = ''
    # find out which source we're dealing with
    for source, bounds in timeinfo.items():
        start = bounds[0].replace('_', '-')
        end = bounds[1].replace('_', '-')
        noon_of_start = pd.to_datetime(start, format='%Y-%m-%d').replace(hour=12)
        if noon_of_start in df_t.index:
            meta['SOURCES'] = source
            meta['TIME'] = [start, end]
            timebound_string = f'{start}_{end}'

    if not timebound_string:
        # Report every candidate range; the loop variables are unbound when
        # TIME_INFO is empty and only describe the last source otherwise.
        raise ValueError(
            f'error with {site}: no source in TIME_INFO starts on a day '
            f'present in the data: {timeinfo}'
        )

    return meta, timebound_string

In [8]:
# Process every (data.csv, site) pair and write one output directory per
# site / time chunk: targets.csv, predictors.csv, modis.pkl and meta.json.
for file, site in tqdm(csvs):
    df = pd.read_csv(file)

    df_p, df_t = normalize_tabular_data(df, config)
    modis_data = normalize_modis_data(site, df_t)
    if modis_data is None:
        print(f'WARNING: No MODIS data found for {site}, discarding.')
        continue
    meta, time_string = get_site_meta(site, df_t, df_meta)

    # Write it all out
    site_dir = os.path.join(OUTPUT_DIR, site)
    section_dir = os.path.join(site_dir, time_string)
    # makedirs creates the site directory as well; exist_ok replaces the
    # separate check-then-create steps.
    os.makedirs(section_dir, exist_ok=True)

    df_t.to_csv(os.path.join(section_dir, 'targets.csv'))
    df_p.to_csv(os.path.join(section_dir, 'predictors.csv'))
    with open(os.path.join(section_dir, 'modis.pkl'), 'wb') as f:
        pkl.dump(modis_data, f)
    with open(os.path.join(section_dir, 'meta.json'), 'w') as f:
        json.dump(meta, f)


 58%|█████▊    | 226/392 [06:37<02:34,  1.07it/s]



100%|██████████| 392/392 [10:54<00:00,  1.67s/it]
