# Gap Length Variation

In [None]:
%load_ext autoreload
%autoreload 2

## Setup

In [None]:
from meteo_imp.gpfa.imputation import *
from meteo_imp.data_preparation import *
from meteo_imp.results import *
from meteo_imp.results import _display_as_row 

import torch
import pandas as pd
import numpy as np
from pathlib import Path
from pyprojroot import here
import matplotlib.pyplot as plt

from meteo_imp.utils import cache_disk

from itertools import combinations, repeat, zip_longest

from ipywidgets import interact
from tqdm.auto import tqdm

from multiprocessing import Pool

import pickle

ModuleNotFoundError: No module named 'meteo_imp.gpfa.data_preparation'

### Load Data

take the first 200 rows from the Hainich dataset

In [None]:
# File name of the Hainich dataset, joined onto here("data") below.
hai_path = Path("FLX_DE-Hai_FLUXNET2015_FULLSET_HH_2000-2012_1-4.csv")
# Read only the first 200 rows; "-9999" and "-9999.99" are treated as missing
# values and the first two columns are parsed as dates.
hai_raw = pd.read_csv(here("data") / hai_path, na_values=["-9999", "-9999.99"], parse_dates=[0, 1], nrows=200)

In [None]:
# Maps raw column names in the CSV to the short variable names used in this
# notebook. The commented-out entries are variables currently left out.
meteo_vars = {
    "TA_F": "TA",
    "SW_IN_F": "SW_IN",
    #"LW_IN_F": "LW_IN",
    "VPD_F": "VPD",
    #"PA": "PA"
}

# Unit label for each short variable name; passed as `units=` when building
# results and assigned to `.units` before displaying them.
units = {
    'TA': '°C',
    'SW_IN': 'W m-2',
    # 'LW_IN': 'W m-2',
    'VPD': 'hPa'
}

# Rename the raw columns, index by TIMESTAMP_END and keep only the selected
# variables (in the order they appear in `meteo_vars`).
hai = (hai_raw
       .rename(columns=meteo_vars)
       .set_index("TIMESTAMP_END")
       .loc[:, meteo_vars.values()])
# Bare expression: displays the data frame as the cell output.
hai

###  Functions

All the slow computations are done here and cached on disk.

In [None]:
n_obs = 200  # number of leading rows of `hai` used to build the test data (`hai[:n_obs]`)
n_latent = 1  # value passed as `latent_dims` in the exploratory cells and used in `model_path`
total_iter = 100  # only used to build the saved-model file name; presumably the number of training iterations — TODO confirm

In [None]:
# Directory (relative to the project root) that the pre-trained models are loaded from.
model_save_dir = here() / "analysis/trained_models"

# Pickle for the module-level `n_latent`; `to_result_pretrained` rebuilds the
# same file-name pattern with its own `n_latent` argument.
model_path = model_save_dir / f"GPFA_l_{n_latent}_train_{total_iter}_1ker_{n_obs}_obs.pickle"

In [None]:
data = GPFADataTest(hai[:n_obs])

In [None]:
# inspired from https://datagy.io/python-combinations-of-a-list/
def all_comb(l):
    """Return every non-empty combination of the elements of `l`.

    Combinations are ordered by size (1 up to the number of elements) and,
    within one size, in the order produced by `itertools.combinations`.

    Args:
        l: Any iterable. It is materialised once, so one-shot iterators
            (generators, `map` objects, ...) work as well as sized
            containers such as lists or `dict.values()` views.

    Returns:
        A list of tuples; empty when `l` has no elements.
    """
    # Materialise first: `len()` is not available on plain iterators and the
    # elements are traversed once per combination size.
    items = tuple(l)
    return [
        comb
        for size in range(1, len(items) + 1)
        for comb in combinations(items, size)
    ]

In [None]:
all_comb(meteo_vars.values())

In [None]:
def to_result_pretrained(gap_len, n_latent, var_sel, gap_start=None):
    """Build a result for one artificial gap using a saved, pre-trained model.

    The first `n_obs` rows of `hai` are wrapped in `GPFADataTest` and a gap is
    added with `add_gap(gap_len, var_sel, gap_start)`. The model pickle that
    matches `n_latent` is then loaded into a `GPFAImputationExplorer` learner.

    Args:
        gap_len: Gap length forwarded to `add_gap`.
        n_latent: Number of latent dimensions; also selects the model file.
        var_sel: Variables forwarded to `add_gap`.
        gap_start: Gap start forwarded to `add_gap` (default `None`).

    Returns:
        Whatever `GPFAImputationExplorer.to_result` returns for the complete
        tidy data, with the module-level `units` attached.
    """
    gap_data = GPFADataTest(hai[:n_obs]).add_gap(gap_len, var_sel, gap_start)
    explorer = GPFAImputationExplorer(gap_data.data, latent_dims=n_latent)
    # The file name depends on the *argument* n_latent, not the module-level one.
    pickle_name = f"GPFA_l_{n_latent}_train_{total_iter}_1ker_{n_obs}_obs.pickle"
    explorer.learner.load(model_save_dir / pickle_name)
    return explorer.to_result(gap_data.data_compl_tidy, units=units)

In [None]:
# to_result_pretrained(10, 1, ['TA'])

In [None]:
#| hide
# @cache_disk(here() / ".cache/diff_gaps")
# def diff_gaps(gap_start=30):
#     return {n_lat:
#             {var_sel:{ 
#                 gap_len: to_result_pretrained(GPFADataTest(hai[:n_obs]).add_gap(gap_len, ['TA'], gap_start), n_lat)
#                 for gap_len in [2, 4, 5, 7, 10 , 15, 20, 30, 50, 100]
#                 }
#                 for var_sel in all_comb(meteo_vars.values())}
#             for n_lat in range(1,4)}


In [None]:
# Gap lengths and gap start positions tried for every variable selection;
# each (gap_len, gap_start) pair is forwarded to `to_result_pretrained`.
# Presumably both are expressed in number of observations — TODO confirm.
gaps = [2, 5, 7, 10, 20, 30, 50, 100]
gap_starts = [0, 30, 60, 90]

In [None]:
# Directory holding one pickle per (variable selection, number of latents)
# combination; written by `process_var_sel` and read by `load_diff_gaps`.
path_base = here() / ".cache/diff_gap_partial"
# path_base.rmdir()

In [None]:
def process_var_sel(args, path_base=path_base):
    """Compute and cache the results for one (variable selection, n latents) pair.

    For every gap length in `gaps` and every start in `gap_starts` a result is
    built with `to_result_pretrained` and the nested dict
    `{gap_len: {gap_start: result}}` is pickled to
    `path_base / "<vars joined by '-'>__l_<n_lat>.pickle"`.

    Args:
        args: Tuple `(var_sel, n_lat)`; packed into one argument because the
            pool's map functions pass a single item per call.
        path_base: Directory for the cache files. It is created if missing.

    Returns:
        None. Nothing is computed when the cache file already exists.
    """
    var_sel, n_lat = args # limitations in python map...
    f_name = path_base / f"{'-'.join(var_sel)}__l_{n_lat}.pickle"
    if f_name.exists(): return
    # Make sure the cache directory exists before doing the slow work.
    path_base.mkdir(parents=True, exist_ok=True)
    out = {
        gap_len: {
            gap_start: to_result_pretrained(gap_len, n_latent=n_lat, var_sel=var_sel, gap_start=gap_start)
            for gap_start in gap_starts
        }
        for gap_len in gaps
    }
    # Write to a temporary file and rename it afterwards: an interrupted run
    # must not leave a truncated pickle that the `exists()` check above would
    # later treat as a finished result.
    tmp_name = f_name.with_name(f_name.name + ".tmp")
    with open(tmp_name, "wb") as f:
        pickle.dump(out, f)
    tmp_name.replace(f_name)

In [None]:
#| hide
# # this is going to run on the process
# # @cache_disk(here() / ".cache/diff_gaps/diff_gaps")
# def process_n_lat(n_lat):
#     out = {}
#     for var_sel in tqdm(all_comb(meteo_vars.values()), desc=f"latent: {n_lat}"):
#         out[var_sel] = {}
#         for gap_len in tqdm(gaps, desc=f"sel: {var_sel}"):
#             out[var_sel][gap_len] = {}
#             for gap_start in tqdm(gap_starts, desc=f"gap len: {gap_len}"):
#                 out[var_sel][gap_len][gap_start] = to_result_pretrained(gap_len, n_latent=n_lat, var_sel = var_sel, gap_start=gap_start)
#     return out

In [None]:
# @cache_disk(here() / ".cache/diff_gaps/diff_gaps")
def compute_diff_gaps(gap_start=30):
    """Run `process_var_sel` for every variable selection and 1 to 3 latents.

    A fresh pool of 4 worker processes is created for each number of latents.
    Results are not returned; `process_var_sel` writes them to disk.

    Args:
        gap_start: Not used by this function; kept so existing calls still work.
    """
    for n_lat in tqdm(range(1,4)):
        # One task per variable selection, each paired with the current n_lat.
        jobs = zip(all_comb(meteo_vars.values()), repeat(n_lat))
        with Pool(processes=4) as pool:
            # `imap` is lazy: consume it fully so all tasks finish before the
            # pool is torn down when the `with` block exits.
            list(pool.imap(process_var_sel, jobs))

### Compute

this is memory intensive! (maybe there is a leak to fix somewhere ...)

In [None]:
# compute_diff_gaps()

In [None]:
#| hide
# @cache_disk(here() / ".cache/diff_gaps/diff_gaps")
# def diff_gaps(gap_start=30):
#     with Pool(processes=4) as pool:
#         out = pool.map(res, range(1,4)
#         for n_lat in tqdm(range(1,4)):
#             out[n_lat] = {}
#             for var_sel in tqdm(all_comb(meteo_vars.values()), desc=f"latent: {n_lat}"):
#                 out[n_lat][var_sel] = {}
#                 for gap_len in tqdm(gaps, desc=f"sel: {var_sel}"):
#                     out[n_lat][var_sel][gap_len] = {}
#                     for gap_start in tqdm(gap_starts, desc=f"gap len: {gap_len}"):
#                         out[n_lat][var_sel][gap_len][gap_start] = to_result_pretrained(gap_len, n_latent=n_lat, var_sel = var_sel, gap_start=gap_start)
#     return out


In [None]:
#| hide
# @cache_disk(here() / ".cache/diff_gaps")
# def diff_gaps(gap_start=30):
#     with Pool(processes=4) as pool:
#         out = {}  
#         for n_lat in tqdm(range(1,4)):
#             out[n_lat] = {}
#             for var_sel in tqdm(all_comb(meteo_vars.values()), desc=f"latent: {n_lat}"):
#                 out[n_lat][var_sel] = {}
#                 for gap_len in tqdm(gaps, desc=f"sel: {var_sel}"):
#                     out[n_lat][var_sel][gap_len] = {}
#                     f = lambda gap_start: to_result_pretrained(gap_len, n_latent=n_lat, var_sel = var_sel, gap_start=gap_start)
#                     results = pool.map(f, gap_starts)
#                     for gap_start, res in zip(gap_starts, results):
#                         out[n_lat][var_sel][gap_len][gap_start] = res
#     return out


In [None]:
#| hide
# diff_gaps_res = diff_gaps()

In [None]:
# loads computations from disk
def load_diff_gaps():
    """Load all cached results written by `process_var_sel`.

    Returns:
        A nested dict `{n_lat: {var_sel: <unpickled content>}}` for `n_lat`
        in 1..3 and every variable selection from `all_comb`. Each file is
        read from `path_base`; a missing file raises from `open`.
    """
    var_selections = all_comb(meteo_vars.values())
    loaded = {}
    for n_lat in tqdm(range(1,4)):
        per_selection = {}
        for var_sel in var_selections:
            cache_file = path_base / f"{'-'.join(var_sel)}__l_{n_lat}.pickle"
            with open(cache_file, "rb") as fh:
                per_selection[var_sel] = pickle.load(fh)
        loaded[n_lat] = per_selection
    return loaded

In [None]:
diff_gaps_res = load_diff_gaps()

## Results

What I am doing here:

- take a dataset with 200 obs and 3 variables
- [distribution and correlation between vars](var_distribution.ipynb)
- fit the kernel parameters using gradient descent on the whole dataset and save the trained model [notebook](Train multiple latent.ipynb)
- create a dataset with all combinations of gap_len, gap_start, n latents and variable missing
- predict the model for all 200 Obs also when there are no gaps!
- **Note: when the gap is not in all variables, the variables with the gap get the (correct) prediction conditioned on the other variables, but the variables without a gap get the base model prediction (which is often bad) and should not be considered**


#### Explanation

What we can see from this result:

##### Latent

- 1 latent: the Lambda is almost 1 for `TA`, 0 for `SW_IN` and .4 for `VPD`. Hence it is good for `TA`, horrible for `SW_IN` and somewhat okay for `VPD`
- 2 latents: good for `TA` and `SW_IN`, still limited for `VPD`
- 3 latents: quite good fit for all 3 models

**comments**

- correlation between `SW_IN` and the other variables is pretty low
- therefore with 1-2 latents the model cannot accurately model more than 1 variable

In [None]:
#| echo: false
hai.corr()

##### Gap Len


- when gaps are short <~10 the model works kind of well, but there are issues in some locations (eg: TA, len: 10 start: 60)

**comments**

the lengthscale of the kernel is quite small (3 latents):

- 5.2 z0
- 1.8 z1
- 4.0 z2

so for longer gaps (in only one var) the main driver for the predictions are observations from the other variables, otherwise the model's predictions are constant as there is no way to use more information (eg. gap_len: 50, gap_start: 30, gaps in all vars)

**notes**
- when SW_IN and VPD are close to 0 the gap filling is not that great also for shorter gaps (eg. `SW_IN`, len: 7, start: 30, n_lat: 3)


##### Gaps multiple variables

the interesting aspect is when there is a long gap, but in only 1-2 variables

- for gaps only in `TA` with len up to 50 the model manages to follow the variations in the measurements, but with an error
    - this is pretty similar if there are gaps also in `SW_IN`, but not if there are gaps in `VPD`
    - with gap len over 100 it gets way worse
- for gaps only in `VPD` with len up to 50 the model overall manages to follow the variations in the measurements, but with a considerable error (measurements are still within the error bars) and the model has a lot of variations which are not present in the data
- the predictions for `SW_IN` are bad (underestimates a lot the values) during the day for long gaps 

### Visualization

In [None]:
#| include: false
@interact(n_lat = range(1,4), TA=True, SW_IN=True, VPD=True, gap_len=gaps, gap_start=gap_starts)
def show_diff_gaps_res(n_lat, TA, SW_IN, VPD, gap_len, gap_start):
    """Display the cached result for the selected configuration.

    Args:
        n_lat: Number of latents (first key of `diff_gaps_res`).
        TA, SW_IN, VPD: Whether each variable is part of the selection.
        gap_len: Gap length key.
        gap_start: Gap start key.

    When no variable is ticked a short message is printed instead: the cached
    results only contain non-empty variable selections, so looking up the
    empty tuple would raise `KeyError`.
    """
    # Keep the TA, SW_IN, VPD order so the tuple matches the dict keys.
    flags = (('TA', TA), ('SW_IN', SW_IN), ('VPD', VPD))
    var_sel = tuple(name for name, ticked in flags if ticked)
    if not var_sel:
        print("Select at least one variable to display a result.")
        return
    result = diff_gaps_res[n_lat][var_sel][gap_len][gap_start]
    result.units = units
    result.display_results()
    

## Next steps

### Improvements GPFA

- [more kernels](Additional latent kernel.ipynb) -> can have different timescales. However with 150 obs both kernels have the same timescale, should use more data but then there are computation issues (with 1500 it would take more than 20 hours to do the training)
- [log transform](Log transform - Multi latent - Imputation GPFA - Hainich.ipynb)
- more variables

#### Technical

- model performance:
at the moment it takes ~8 minutes to train with 200 obs and ~20 seconds for inference
    - profile current model
    - use SparseGP
    - CUDA support
- [parameters init](Init_parameters_effect.ipynb)
- [learning rate and stability of parameters over training](Train multiple latent.ipynb)
- variable transformation:
    - all vars are now normalized (0 mean, 1 std)
    - time is encoded as an integer increasing at steps of 1. Maybe not a good idea?

### Next step

- use ERA5-Land (world-wide dataset with complete meteo vars, but a coarse spatial-temporal scale)
- compare performance with state of art models
- model where relation between variables changes over time
- understand gap distribution in real world:
    - average gap len (tentative results are: a lot of short gaps (<10) and some pretty long gaps (>10.0000))
    - correlation between variable gaps
    - site distributions of gaps?

## Other

In [None]:
%time r = to_result_pretrained(12, 3, ['SW_IN', 'TA'], gap_start=60)

In [None]:
r.display_results()

In [None]:
data = GPFADataTest(hai[:n_obs]).add_gap(50, ['TA'], 30)

In [None]:
imp = GPFAImputationExplorer(data.data, latent_dims = n_latent)
imp.learner.load(model_path)

In [None]:
# NOTE(review): `result_pretrained` is not defined anywhere in this notebook
# (only `to_result_pretrained`, which takes different arguments). Unless one
# of the star imports provides it, this cell raises NameError — confirm.
result_pretrained(data)