# Fluxnet Hainich
> Load the Hainich dataset from FLUXNET

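For licensing reasons the data is not included in the repository, but you can download it from FLUXNET and place it in the project's `data` directory as `FLX_DE-Hai_FLUXNET2015_FULLSET_HH_2000-2012_1-4.csv`.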

In [None]:
#| hide
#| default_exp data

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| export
from pathlib import Path
from pyprojroot import here
import pandas as pd
import numpy as np

In [None]:
#| exports
# Column name in the FLUXNET csv -> short variable name used after loading.
# Commented-out entries are variables that are currently disabled.
_def_meteo_vars = dict(
    TA_F="TA",
    SW_IN_F="SW_IN",
    # LW_IN_F="LW_IN",
    VPD_F="VPD",
    # PA="PA",
)


# Unit string for each short variable name.
units = dict(
    TA='°C',
    SW_IN='W m-2',
    # LW_IN='W m-2',
    VPD='hPa',
)

# Location where the Hainich FLUXNET csv is expected: inside the `data`
# directory resolved by pyprojroot's `here`.
hai_path = here("data") / "FLX_DE-Hai_FLUXNET2015_FULLSET_HH_2000-2012_1-4.csv"

In [None]:
#| exporti

def get_dtype(col_name: str):
    "Pick the dtype a column should be read with, based on its name"
    timestamp_cols = ("TIMESTAMP_END", "TIMESTAMP_START")
    if col_name in timestamp_cols:
        # timestamps are read as plain strings
        return 'str'
    if col_name.endswith("QC"):
        # no explicit dtype for QC columns (pd.CategoricalDtype is left disabled)
        return None
    # every remaining column is read as single-precision float
    return np.float32

def col_types(cols):
    "Build a `{column name: dtype}` dict by applying `get_dtype` to each column name"
    dtype_by_col = {}
    for name in cols:
        dtype_by_col[name] = get_dtype(name)
    return dtype_by_col

def read_col_names(path):
    "Return the csv header as a pandas Index without reading any data rows"
    # nrows=0 makes pandas parse the header line only
    header_only = pd.read_csv(path, nrows=0)
    return header_only.columns

In [None]:
col_types(read_col_names(hai_path)[:10]) # check the dtype mapping on the first 10 columns only

{'TIMESTAMP_START': 'str',
 'TIMESTAMP_END': 'str',
 'TA_F_MDS': numpy.float32,
 'TA_F_MDS_QC': None,
 'TA_ERA': numpy.float32,
 'TA_F': numpy.float32,
 'TA_F_QC': None,
 'SW_IN_POT': numpy.float32,
 'SW_IN_F_MDS': numpy.float32,
 'SW_IN_F_MDS_QC': None}

In [None]:
#| export
def read_fluxnet_csv(path,
                     nrows:int,
                     meteo_vars: dict[str, str] = _def_meteo_vars,):
    """Read fluxnet csv in Pandas with correct parsing of csv

    Parameters
    ----------
    path :
        Location of the FLUXNET csv file. The header is read in a separate
        pass to build the dtype mapping, so this should be something that can
        be opened twice (e.g. a file path).
    nrows : int
        Number of data rows to read, forwarded to `pd.read_csv`.
    meteo_vars : dict[str, str]
        Mapping from column names in the csv to the names used in the result.
        Only the renamed columns are returned, in the order of this mapping.

    Returns
    -------
    pd.DataFrame
        Frame indexed by `time` (the parsed `TIMESTAMP_END` column) holding
        the columns named by the values of `meteo_vars`.
    """
    return (pd.read_csv(path, na_values=["-9999", "-9999.99"],
                        # the first two columns are parsed as dates
                        # (expected to be TIMESTAMP_START / TIMESTAMP_END)
                        parse_dates=[0, 1],
                        nrows=nrows,
                        # dtypes are derived from the header of the file being
                        # read, not from the default Hainich file
                        dtype=col_types(read_col_names(path))
                       )
           .rename(columns=meteo_vars)
           .rename(columns={'TIMESTAMP_END': "time"})
           .set_index("time")
           # pass a real list as the column selector instead of a dict view
           .loc[:, list(meteo_vars.values())])

Default Hainich data frame (first 200 rows of the dataset)

In [None]:
#| export
try:
    # default frame: only the first 200 rows of the Hainich file
    hai = read_fluxnet_csv(hai_path, 200)
except FileNotFoundError: # for CI
    # the data file is not part of the repo, so fall back to an empty frame
    hai = pd.DataFrame()

In [None]:
# dtypes of the columns kept in the default frame
hai.dtypes

TA       float32
SW_IN    float32
VPD      float32
dtype: object

In [None]:
# preview of the default frame
hai

Unnamed: 0_level_0,TA,SW_IN,VPD
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2000-01-01 00:30:00,-0.60,0.0,0.222
2000-01-01 01:00:00,-0.65,0.0,0.122
2000-01-01 01:30:00,-0.58,0.0,0.090
2000-01-01 02:00:00,-0.51,0.0,0.110
2000-01-01 02:30:00,-0.49,0.0,0.102
...,...,...,...
2000-01-05 02:00:00,4.74,0.0,1.191
2000-01-05 02:30:00,4.75,0.0,1.057
2000-01-05 03:00:00,4.76,0.0,0.935
2000-01-05 03:30:00,4.62,0.0,1.162


## Export 

In [None]:
#| hide
# run nbdev's export step for this project
from nbdev import nbdev_export
nbdev_export()