In [1]:
from pathlib import Path
import polars as pl
from datetime import datetime
from fastcore.utils import * # support of ls for paths

In [2]:
out_dir = Path("../../fluxnet/gap_stat")
download_dir = Path("/run/media/simone/Simone DATI/fluxnet_all")

In [3]:
site_info = pl.read_parquet(out_dir / "../site_info.parquet").select([
    pl.col("start").cast(pl.Utf8).str.strptime(pl.Datetime, "%Y%m%d%H%M"),
    pl.col("end").cast(pl.Utf8).str.strptime(pl.Datetime, "%Y%m%d%H%M"),
    pl.col("site").cast(pl.Categorical).sort()
])

In [4]:
site_info.head()

start,end,site
datetime[μs],datetime[μs],cat
2009-01-01 00:30:00,2012-01-01 00:00:00,"""AR-SLu"""
2009-01-01 00:30:00,2013-01-01 00:00:00,"""AR-Vir"""
2002-01-01 00:30:00,2013-01-01 00:00:00,"""AT-Neu"""
2007-01-01 00:30:00,2010-01-01 00:00:00,"""AU-Ade"""
2010-01-01 00:30:00,2015-01-01 00:00:00,"""AU-ASM"""


In [5]:
def duration_n_obs(duration):
    "converts a duration into a n of fluxnet observations"
    return abs(int(duration.total_seconds() / (30 * 60)))

In [6]:
duration_n_obs(site_info[1, "start"] - site_info[1, "end"])

70127

In [7]:
# maybe this code should actually go in 20_gap_finding
files = out_dir.ls()
files.sort() # need to sort to match the site_info
sites = []
for i, path in enumerate(files):
    sites.append(pl.scan_parquet(path).with_columns([
        pl.lit(site_info[i, "site"]).alias("site"),
        pl.lit(duration_n_obs(site_info[i, "start"] -  site_info[i, "end"])).alias("total_obs"),
        pl.col("TIMESTAMP_END").cast(pl.Utf8).str.strptime(pl.Datetime, "%Y%m%d%H%M").alias("end"),
    ]))

In [9]:
gap_stat = pl.concat(sites)

In [12]:
gap_stat.head().fetch(5)

TIMESTAMP_END,gap_len,variable,site,total_obs,end
i64,u32,str,str,i32,datetime[μs]
200901010030,16992,"""TA_F_MDS_QC""","""AR-SLu""",52559,2009-01-01 00:30:00
200912211100,5,"""TA_F_MDS_QC""","""AR-SLu""",52559,2009-12-21 11:00:00
200912211700,1,"""TA_F_MDS_QC""","""AR-SLu""",52559,2009-12-21 17:00:00
201001061300,1,"""TA_F_MDS_QC""","""AR-SLu""",52559,2010-01-06 13:00:00
201001071300,3,"""TA_F_MDS_QC""","""AR-SLu""",52559,2010-01-07 13:00:00


In [24]:
def filter_variables(variables = ["TA_F_QC", "SW_IN_QC", "LW_IN_QC", "VPD_F_QC"]):
    expr = False
    for var in variables:
        expr |= pl.col("variable") == var
    return expr

some sites have a lot of data missing, with the avg gap length of several years, so is seems that the year can have an impact

Important! here the 3 possibles gap value of a QC variable are considered as one (null, 1, 2) we should co

In [35]:
gap_stat.filter(
    pl.col("variable") == "TA_F_QC" 
).groupby("site").agg([
    pl.col("gap_len").mean().alias("mean"),
    (pl.col("gap_len").sum() / pl.col("total_obs").first()).alias("frac_gap")
]).collect()

site,mean,frac_gap
str,f64,f64
"""DE-Zrk""",623.818182,0.195839
"""US-Wi8""",3328.333333,0.569953
"""CN-Du2""",346.526316,0.125154
"""FI-Hyy""",16.82732,0.039199
"""US-Wi7""",43.500938,1.323477
"""CN-Cng""",2228.0,0.158855
"""US-IB2""",640.923077,0.118812
"""IT-SR2""",1.5,0.000086
"""CN-Qia""",1.5,0.000057
"""BR-Sa3""",153.370787,0.311306


In [38]:
gaps_year_site_ta = gap_stat.filter(
    pl.col("variable") == "TA_F_QC" 
).with_column(
  pl.col("end").dt.year().alias("year")  
).groupby(["site", "year"]).agg([
    pl.col("gap_len").mean().alias("mean"),
    (pl.col("gap_len").sum() / (48 * 365)).alias("frac_gap")
]).sort(["site", "year"]).collect()

In [45]:
[y for y in gaps_year_site_ta["year"].unique()]

[1991,
 1992,
 1993,
 1994,
 1995,
 1996,
 1997,
 1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015]

In [74]:
def visualize_by_site(df):
    sites = df["site"].unique()
    for site in sites:
        yield df.filter(pl.col("site") == site)

In [95]:
by_site = list(visualize_by_site(gaps_year_site_ta))

In [99]:
from IPython.display import display
from ipywidgets import widgets

In [100]:
i = 0

In [103]:
button_next = widgets.Button(description="Next", icon="arrow-right")
button_prev = widgets.Button(description="Previous", icon="arrow-left")
output = widgets.Output()

display(button_next, button_prev, output)

i = 0

def update_view():
    with output:
        print(f"{i} of {len(by_site)}")
        display(by_site[i])
    output.clear_output(wait=True)

def on_next(b):
    global i
    if i < len(by_site):
        i +=1
    else:
        button.disabled = True
    update_view()

def on_prev(b):
    global i
    if i > 0:
        i -=1
    else:
        button.disabled = True
    update_view()
    

button_next.on_click(on_next)
button_prev.on_click(on_prev)

Button(description='Next', icon='arrow-right', style=ButtonStyle())

Button(description='Previous', icon='arrow-left', style=ButtonStyle())

Output()

In [102]:
gap_stat.with_columns(
    (pl.col("gap_len") / pl.col("total_obs")).alias("frac_gap")
).head().collect()

TIMESTAMP_END,gap_len,variable,site,total_obs,end,frac_gap
i64,u32,str,str,i32,datetime[μs],f64
200901010030,16992,"""TA_F_MDS_QC""","""AR-SLu""",52559,2009-01-01 00:30:00,0.323294
200912211100,5,"""TA_F_MDS_QC""","""AR-SLu""",52559,2009-12-21 11:00:00,9.5e-05
200912211700,1,"""TA_F_MDS_QC""","""AR-SLu""",52559,2009-12-21 17:00:00,1.9e-05
201001061300,1,"""TA_F_MDS_QC""","""AR-SLu""",52559,2010-01-06 13:00:00,1.9e-05
201001071300,3,"""TA_F_MDS_QC""","""AR-SLu""",52559,2010-01-07 13:00:00,5.7e-05


In [None]:
gap_stat.with_columns(
    (pl.col("gap_len") / pl.col("total_obs")).alias("frac_gap")
).groupby(["variable", "site"]).agg(pl.col("gap_len").sum()).head().collect()

In [None]:
gap_stat.with_columns(
    (pl.col("gap_len") / pl.col("total_obs")).alias("frac_gap")
).groupby(["variable", "site"]).agg(
    pl.col("gap_len").sum() / pl.col("total_obs").first()
).groupby("variable").agg(pl.col("gap_len").mean()).collect()

In [None]:
gap_stat.with_columns(
    (pl.col("gap_len") / pl.col("total_obs")).alias("frac_gap")
).groupby(["variable", "site"]).agg(
    pl.col("gap_len").sum() / pl.col("total_obs").first()
).filter(pl.col("variable") == "TA_F_QC").collect()