# Fluxnet Hainich
> Load Hainich dataset from fluxnet

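For licensing reasons the data is not included in the repository. Download the FLUXNET2015 files for the DE-Hai (Hainich) site from FLUXNET and place them in `data/Hainich/` under the project root.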

In [None]:
#| hide
#| default_exp data

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| export
from pathlib import Path
from pyprojroot import here
import pandas as pd
import numpy as np

In [None]:
# NOTE(review): bare expression with no side effects. Presumably a scratch list of
# candidate variables to add to the dataset; confirm its purpose or delete this cell.
['PA', 'P', 'WS', 'WD', 'LW_IN', 'NETRAD']

['PA', 'P', 'WS', 'WD', 'LW_IN', 'NETRAD']

In [None]:
#| exports
# csv column name -> short variable name for the default (small) variable set.
# "LW_IN_F" and "PA" are currently left out on purpose.
_def_meteo_vars = dict(
    TA_F="TA",
    SW_IN_F="SW_IN",
    VPD_F="VPD",
)


# csv column name -> short variable name for the extended variable set:
# the gap-filled `<var>_F` columns plus soil water content and soil temperature.
meteo_vars_big = {
    **{f"{name}_F": name for name in ('TA', 'SW_IN', 'LW_IN', 'VPD', 'WS', 'PA', 'P')},
    'SWC_F_MDS_1': 'SWC',
    'TS_F_MDS_1': 'TS',
}


# Unit label for each variable of the default set ('LW_IN' is currently left out)
units = {'TA': '°C', 'SW_IN': 'W m-2', 'VPD': 'hPa'}

# Unit label for each variable of the extended set ('NETRAD' is currently left out)
units_big = {
    **units,
    'PA': 'hPa',
    'P': 'mm',
    'WS': 'm s-1',
    'LW_IN': 'W m-2',
    'TS': '°C',
    'SWC': '%',
}

# Raw csv and the parquet files derived from it, all under `data/Hainich` in the project root
hai_path_raw = here("data/Hainich").joinpath("FLX_DE-Hai_FLUXNET2015_FULLSET_HH_2000-2012_1-4.csv")
hai_path = here("data/Hainich").joinpath("FLX_DE-Hai_FLUXNET2015_FULLSET_HH_2000-2012_1-4_float32.parquet")
hai_path64 = here("data/Hainich").joinpath("FLX_DE-Hai_FLUXNET2015_FULLSET_HH_2000-2012_1-4_float64.parquet")
hai_big_path = here("data/Hainich").joinpath("FLX_DE-Hai_FLUXNET2015_FULLSET_HH_2000-2012_1-4_float64_big.parquet")

In [None]:
#| exporti

def get_dtype(col_name: str, num_dtype=np.float32):
    "Choose the dtype used to read a column, based only on its name"
    # the two timestamp columns are read as text
    if col_name in ("TIMESTAMP_END", "TIMESTAMP_START"):
        return 'str'
    # columns whose name ends in "QC" get no explicit dtype (could be pd.CategoricalDtype);
    # every other column is numeric
    return None if col_name.endswith("QC") else num_dtype

def col_types(cols, num_dtype=np.float32):
    "Map every column name in `cols` to the dtype chosen by `get_dtype`"
    dtypes = {}
    for name in cols:
        dtypes[name] = get_dtype(name, num_dtype)
    return dtypes

def read_col_names(path):
    "Return the column names of the csv at `path` without reading any data row"
    header_only = pd.read_csv(path, nrows=0)
    return header_only.columns

In [None]:
read_col_names(hai_path_raw)

Index(['TIMESTAMP_START', 'TIMESTAMP_END', 'TA_F_MDS', 'TA_F_MDS_QC', 'TA_ERA',
       'TA_F', 'TA_F_QC', 'SW_IN_POT', 'SW_IN_F_MDS', 'SW_IN_F_MDS_QC',
       ...
       'GPP_DT_CUT_MEAN', 'GPP_DT_CUT_SE', 'GPP_DT_CUT_05', 'GPP_DT_CUT_16',
       'GPP_DT_CUT_25', 'GPP_DT_CUT_50', 'GPP_DT_CUT_75', 'GPP_DT_CUT_84',
       'GPP_DT_CUT_95', 'RECO_SR'],
      dtype='object', length=238)

In [None]:
col_types(read_col_names(hai_path_raw)[:10]) # only for 10 cols for testing

{'TIMESTAMP_START': 'str',
 'TIMESTAMP_END': 'str',
 'TA_F_MDS': numpy.float32,
 'TA_F_MDS_QC': None,
 'TA_ERA': numpy.float32,
 'TA_F': numpy.float32,
 'TA_F_QC': None,
 'SW_IN_POT': numpy.float32,
 'SW_IN_F_MDS': numpy.float32,
 'SW_IN_F_MDS_QC': None}

In [None]:
#| export
def read_fluxnet_csv(path,
                     nrows:int = None, # number of rows to read, `None` reads the whole file
                     meteo_vars: dict[str, str] = _def_meteo_vars, # csv column name -> name in the returned frame
                     num_dtype = np.float32 # type for numerical columns
                    ):
    """Read fluxnet csv in Pandas with correct parsing of csv

    Returns a DataFrame indexed by `time` (the parsed `TIMESTAMP_END` column) that
    contains only the columns listed in `meteo_vars`, renamed to their short names.
    Columns of `meteo_vars` that are absent from the file are silently skipped.
    `-9999` and `-9999.99` are read as missing values.
    Raises `KeyError` if the file has no `TIMESTAMP_END` column.
    """
    timestamp_cols = ("TIMESTAMP_START", "TIMESTAMP_END")
    wanted = set(meteo_vars) | {"TIMESTAMP_END"}
    # Parse only the columns that survive the final `filter` (plus the index column).
    # The Hainich FULLSET file has 238 columns, and converting all of them only to
    # drop most of them afterwards dominates the read time.
    selected = [col for col in read_col_names(path) if col in wanted]
    return (pd.read_csv(path, na_values=["-9999", "-9999.99"],
                        usecols=selected,
                        # timestamps are selected by name because positions shift once `usecols` is used
                        parse_dates=[col for col in timestamp_cols if col in selected],
                        nrows=nrows,
                        dtype=col_types(selected, num_dtype)
                       )
           .rename(columns={'TIMESTAMP_END': "time"})
           .set_index("time")
           # `filter` also fixes the column order to the one of `meteo_vars`
           .filter(items=list(meteo_vars), axis='columns')
           .rename(columns=meteo_vars))

Default Hainich data frame: read the raw csv once and cache the result as parquet.

In [None]:
hai_path_raw

PosixPath('/home/simone/Documents/uni/Thesis/GPFA_imputation/data/FLX_DE-Hai_FLUXNET2015_FULLSET_HH_2000-2012_1-4.csv')

In [None]:
%time hai = read_fluxnet_csv(hai_path_raw, None)

CPU times: user 25.2 s, sys: 1min 48s, total: 2min 13s
Wall time: 2min 14s


In [None]:
hai.to_parquet(hai_path)

In [None]:
%time hai64 = read_fluxnet_csv(hai_path_raw, None, num_dtype=np.float64)

CPU times: user 23.5 s, sys: 1.44 s, total: 25 s
Wall time: 25.1 s


In [None]:
hai64.to_parquet(hai_path64)

In [None]:
%time pd.read_parquet(hai_path64)

CPU times: user 16.2 ms, sys: 6.29 ms, total: 22.5 ms
Wall time: 14.6 ms


Unnamed: 0_level_0,TA,SW_IN,VPD
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01 00:30:00,-0.60,0.0,0.222
2000-01-01 01:00:00,-0.65,0.0,0.122
2000-01-01 01:30:00,-0.58,0.0,0.090
2000-01-01 02:00:00,-0.51,0.0,0.110
2000-01-01 02:30:00,-0.49,0.0,0.102
...,...,...,...
2012-12-31 22:00:00,4.75,0.0,2.249
2012-12-31 22:30:00,4.48,0.0,2.154
2012-12-31 23:00:00,4.32,0.0,2.108
2012-12-31 23:30:00,4.02,0.0,1.996


In [None]:
meteo_vars_big

{'TA_F': 'TA',
 'SW_IN_F': 'SW_IN',
 'LW_IN_F': 'LW_IN',
 'VPD_F': 'VPD',
 'WS_F': 'WS',
 'PA_F': 'PA',
 'P_F': 'P',
 'SWC_F_MDS': 'SWC',
 'TS_F_MDS': 'TS'}

In [None]:
read_fluxnet_csv(hai_path_raw, None, meteo_vars = meteo_vars_big, num_dtype=np.float64).to_parquet(hai_big_path)

In [None]:
hai_big = pd.read_parquet(hai_big_path)

In [None]:
# fraction of records with exactly zero precipitation (vectorised instead of the builtin `sum`)
(hai_big.P == 0.0).mean()

0.9071997613532674

In [None]:
#| export
# Load the cached float32 Hainich data frame from `hai_path`
try:
    hai = pd.read_parquet(hai_path)
except FileNotFoundError: # for CI
    # the parquet file is missing (the data is not part of the repo):
    # fall back to an empty frame instead of failing
    hai = pd.DataFrame()

In [None]:
hai.dtypes

TA       float32
SW_IN    float32
VPD      float32
dtype: object

In [None]:
hai

Unnamed: 0_level_0,TA,SW_IN,VPD
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01 00:30:00,-0.60,0.0,0.222
2000-01-01 01:00:00,-0.65,0.0,0.122
2000-01-01 01:30:00,-0.58,0.0,0.090
2000-01-01 02:00:00,-0.51,0.0,0.110
2000-01-01 02:30:00,-0.49,0.0,0.102
...,...,...,...
2012-12-31 22:00:00,4.75,0.0,2.249
2012-12-31 22:30:00,4.48,0.0,2.154
2012-12-31 23:00:00,4.32,0.0,2.108
2012-12-31 23:30:00,4.02,0.0,1.996


## ERA

In [None]:
#| export
# Raw ERAI csv for Hainich and the parquet files derived from it, all under `data/Hainich`
hai_era_path_raw = here("data/Hainich").joinpath("FLX_DE-Hai_FLUXNET2015_ERAI_HH_1989-2014_1-4.csv")
hai_era_path = here("data/Hainich").joinpath("FLX_DE-Hai_FLUXNET2015_ERAI_HH_1989-2014_1-4_float32.parquet")
hai_era_path64 = here("data/Hainich").joinpath("FLX_DE-Hai_FLUXNET2015_ERAI_HH_1989-2014_1-4_float64.parquet")
hai_era_big_path = here("data/Hainich").joinpath("FLX_DE-Hai_FLUXNET2015_ERAI_HH_1989-2014_1-4_float64_big.parquet")

In [None]:
_def_meteo_vars

{'TA_F': 'TA', 'SW_IN_F': 'SW_IN', 'VPD_F': 'VPD'}

In [None]:
#| export
# ERA columns keep their csv names, so both mappings are identity mappings
era_vars = {name: name for name in ('TA_ERA', 'SW_IN_ERA', 'VPD_ERA')}

era_vars_big = {
    name: name
    for name in (f"{var}_ERA" for var in ('TA', 'SW_IN', 'VPD', 'PA', 'P', 'WS', 'LW_IN'))
}

In [None]:
%time hai_era = read_fluxnet_csv(hai_era_path_raw, None, meteo_vars = era_vars)

CPU times: user 41 s, sys: 32.4 ms, total: 41.1 s
Wall time: 41.3 s


In [None]:
hai_era.to_parquet(hai_era_path)

In [None]:
%time hai_era = pd.read_parquet(hai_era_path)

CPU times: user 21.2 ms, sys: 10.3 ms, total: 31.5 ms
Wall time: 18.4 ms


In [None]:
hai_era64 = read_fluxnet_csv(hai_era_path_raw, None, meteo_vars = era_vars, num_dtype=np.float64)

In [None]:
hai_era64.to_parquet(hai_era_path64)

In [None]:
%time hai_era64 = read_fluxnet_csv(hai_era_path_raw, None, meteo_vars = era_vars_big, num_dtype=np.float64).to_parquet(hai_era_big_path)

CPU times: user 39.8 s, sys: 65.6 ms, total: 39.9 s
Wall time: 40.1 s


### Control map

In [None]:
#| export
# Maps each `<var>_ERA` column to the plain variable name `<var>`
control_map = dict(
    (f"{var}_ERA", var)
    for var in ('TA', 'SW_IN', 'VPD', 'PA', 'P', 'WS', 'LW_IN')
)

## Plotting

Color scale that gives each variable the same color in every plot.

In [None]:
#| export
import altair as alt

In [None]:
units_big.keys()

dict_keys(['TA', 'SW_IN', 'VPD', 'PA', 'P', 'WS', 'LW_IN', 'TS', 'SWC'])

In [None]:
#| export
# The 8 colours of the ColorBrewer "Dark2" palette
dark2 = ['#1B9E77', '#D95F02', '#7570B3', '#E7298A', '#66A61E', '#E6AB02', '#A6761D', '#666666']

# The domain lists 9 variables while `dark2` has only 8 colours. An ordinal scale whose
# range is shorter than its domain reuses colours from the start of the range, so 'P'
# would be drawn with the same colour as 'TA'. One extra colour keeps every variable distinct.
scale_meteo = alt.Scale(domain = ['TA', 'SW_IN', 'LW_IN', 'VPD', 'WS', 'PA', 'SWC', 'TS', 'P'],
                        range = dark2 + ['#1F78B4'])

In [None]:
df = pd.DataFrame({'vars' : units_big.keys()})

In [None]:
scale_meteo

Scale({
  domain: ['TA', 'SW_IN', 'LW_IN', 'VPD', 'WS', 'PA', 'SWC', 'TS', 'P'],
  range: ['#1B9E77', '#D95F02', '#7570B3', '#E7298A', '#66A61E', '#E6AB02', '#A6761D', '#666666']
})

In [None]:
alt.Chart(df).mark_rect().encode(x = 'vars', color = alt.Color('vars', scale= scale_meteo))

If we remove one variable, the colors assigned to the remaining variables do not change.

In [None]:
alt.Chart(df[df.vars != 'SW_IN']).mark_rect().encode(x = 'vars', color = alt.Color('vars', scale= scale_meteo))

## Export 

In [None]:
#| hide
from nbdev import nbdev_export
nbdev_export()