# Single latent

First analysis of the Hainich data using GPFA for filling the gaps

In [None]:
# Enable autoreload so edits to imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from meteo_imp.gpfa.imputation import *
from meteo_imp.data_preparation import *

import torch
import pandas as pd
import numpy as np
from pathlib import Path
from pyprojroot import here
import matplotlib.pyplot as plt

In [None]:
# Seed the torch and NumPy global random number generators with the same fixed value.
SEED = 27
torch.manual_seed(SEED)
np.random.seed(SEED);

## Load Data

Take the first 200 rows of the Hainich dataset.

In [None]:
# FLUXNET2015 half-hourly file for the Hainich site, resolved against the project's data directory.
hai_path = Path("FLX_DE-Hai_FLUXNET2015_FULLSET_HH_2000-2012_1-4.csv")
hai_raw = pd.read_csv(
    here("data") / hai_path,
    na_values=["-9999", "-9999.99"],  # sentinel strings that are read as NaN
    parse_dates=[0, 1],               # the first two columns are parsed as timestamps
    nrows=200,
)

In [None]:
# Source column name -> short variable name used in the rest of the notebook.
meteo_vars = dict(
    TA_F="TA",
    SW_IN_F="SW_IN",
    LW_IN_F="LW_IN",
    VPD_F="VPD",
    # PA="PA",
)

# Unit label of each variable, handed to the plotting calls.
units = dict(
    TA='°C',
    SW_IN='W m-2',
    LW_IN='W m-2',
    VPD='hPa',
)

# Rename, index by the end timestamp of each record and keep only the selected variables.
hai = hai_raw.rename(columns=meteo_vars).set_index("TIMESTAMP_END")[list(meteo_vars.values())]
hai

Unnamed: 0_level_0,TA,SW_IN,LW_IN,VPD
TIMESTAMP_END,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-01 00:30:00,-0.60,0.0,302.475,0.222
2000-01-01 01:00:00,-0.65,0.0,302.475,0.122
2000-01-01 01:30:00,-0.58,0.0,301.677,0.090
2000-01-01 02:00:00,-0.51,0.0,301.677,0.110
2000-01-01 02:30:00,-0.49,0.0,301.677,0.102
...,...,...,...,...
2000-01-05 02:00:00,4.74,0.0,330.202,1.191
2000-01-05 02:30:00,4.75,0.0,330.202,1.057
2000-01-05 03:00:00,4.76,0.0,330.202,0.935
2000-01-05 03:30:00,4.62,0.0,330.202,1.162


## Random Gaps

In [None]:
# Wrap a copy of the data and inject random missing values.
# The copy mirrors the later cells (which pass `.copy()`): `hai` is reused for the
# continuous-gap experiment, so it must not be altered here.
# NOTE(review): whether GPFADataTest modifies its input in place is not visible here — confirm.
gpfa_data = GPFADataTest(hai.copy()).add_random_missing()

In [None]:
# Build the imputation object from the gappy data plus the complete data in tidy form
# (with the is_missing flag), as returned by tidy_df.
gpfa_hai = GPFAImputation(
    gpfa_data.data,
    gpfa_data.tidy_df(complete=True, is_missing=True),
)

TypeError: rand(): argument 'size' must be tuple of ints, but found element of type DataFrame at pos 2

In [None]:
# Display the imputation object's notebook representation.
gpfa_hai

In [None]:
# Run the imputation and time it; the recorded run took about 6 minutes of wall time.
%time imputed = gpfa_hai.impute()

  0%|          | 0/100 [00:00<?, ?it/s]

torch.linalg.solve_triangular has its arguments reversed and does not return a copy of one of the inputs.
X = torch.triangular_solve(B, A).solution
should be replaced with
X = torch.linalg.solve_triangular(A, B). (Triggered internally at  ../aten/src/ATen/native/BatchLinearAlgebra.cpp:2183.)
  res = torch.triangular_solve(right_tensor, self.evaluate(), upper=self.upper).solution
 does not have profile information (Triggered internally at  ../torch/csrc/jit/codegen/cuda/graph_fuser.cpp:104.)
  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


CPU times: user 5min 57s, sys: 485 ms, total: 5min 57s
Wall time: 5min 59s


In [None]:
# Show the imputation result (columns: time, variable, mean, std).
imputed

Unnamed: 0,time,variable,mean,std
0,0.0,TA,-0.600000,
1,2.0,TA,-0.580000,
2,3.0,TA,-0.510000,
3,4.0,TA,-0.490000,
4,11.0,TA,-0.230000,
...,...,...,...,...
403,189.0,VPD,0.826632,0.252326
404,190.0,VPD,0.827371,0.252322
405,192.0,VPD,1.213000,0.000000
406,193.0,VPD,0.826446,0.252319


In [None]:
# Plot the predictions for every variable.
hai_plot = gpfa_hai.plot_pred(units=units, properties={'height': 190, 'width': 380})

# Save next to the other figures, anchored on the project root instead of the current
# working directory, so the file lands in the same place wherever the kernel was started.
# The path is still handed over as a plain string, as in the original call.
hai_plot.save(str(here("analysis/plots") / "plot_hai_winter_4_var_200_obs_random_gaps_row_20_value_10.vl.json"))
hai_plot

  for col_name, dtype in df.dtypes.iteritems():


In [None]:
#gpfa_hai.plot_pred(complete= gpfa_data.tidy_df(complete=True, is_missing=True) )

In [None]:
# RMSE of the imputation for the random-gap experiment.
gpfa_hai.rmse()

AttributeError: module 'sklearn' has no attribute 'metrics'

In [None]:
# R2 score of the imputation for the random-gap experiment.
gpfa_hai.r2()

In [None]:
# Collect the losses stored on the learner into a one-column frame.
loss_values = gpfa_hai.learner.losses.cpu().numpy()
losses = pd.DataFrame(loss_values, columns=['loss'])

# Plot the loss curve and write it to the project's plot directory.
p = losses.plot()
plt.savefig(
    here('analysis/plots/loss_plot_hai_winter_4_var_200_obs_random_gaps_row_20_value_10.png')
)
p

Lambda parameter, the latent variable is very similar to the 

In [None]:
# Pairwise correlation between the variables in the data held by the imputation object.
gpfa_hai.data.corr()

NameError: name 'gpfa_hai' is not defined

In [None]:
# Lambda parameter of the model's covariance module, detached from the autograd graph.
gpfa_hai.learner.model.covar_module.Lambda.detach()

this is the value of the length scale of the RBF latent kernel

In [None]:
# Lengthscale of the latent kernel, detached from the autograd graph.
gpfa_hai.learner.model.covar_module.latent_kernel.lengthscale.detach()

In [None]:
# psi parameter of the model's covariance module, detached from the autograd graph.
gpfa_hai.learner.model.covar_module.psi.detach()

### Summer

The low correlation between `SW_IN` and `TA` is likely due to cloud cover, which is hard to predict with a daily cycle. Hence we look at summer days, where the correlation is much better.

In [None]:
# Read a longer stretch of the same file: 7 * 30 * 24 * 2 rows.
# NOTE(review): presumably about seven months of half-hourly records — confirm.
hai_raw2 = pd.read_csv(
    here("data") / hai_path,
    na_values=["-9999", "-9999.99"],
    parse_dates=[0, 1],
    nrows=7 * 30 * 24 * 2,
)

NameError: name 'pd' is not defined

In [None]:
# NOTE(review): meteo_vars and units repeat the definitions from the "Load Data" section.
# Source column name -> short variable name.
meteo_vars = dict(
    TA_F="TA",
    SW_IN_F="SW_IN",
    LW_IN_F="LW_IN",
    VPD_F="VPD",
    # PA="PA",
)

# Unit label of each variable, handed to the plotting calls.
units = dict(
    TA='°C',
    SW_IN='W m-2',
    LW_IN='W m-2',
    VPD='hPa',
)

# Same preparation as for the short sample, applied to the longer extract.
hai2 = hai_raw2.rename(columns=meteo_vars).set_index("TIMESTAMP_END")[list(meteo_vars.values())]
hai2

In [None]:
# Incoming shortwave radiation over the 300-row window used for the summer experiment.
hai2.iloc[-800:-500]["SW_IN"].plot()

In [None]:
# Pairwise correlation between the variables inside the same 300-row window.
hai2.iloc[-800:-500].corr()

In [None]:
# Random gaps on a copy of the summer window, so hai2 itself stays untouched.
gpdata2 = GPFADataTest(hai2.iloc[-800:-500].copy()).add_random_missing()

In [None]:
# Imputation object for the summer window: gappy data plus the complete tidy frame.
gp_imp2 = GPFAImputation(
    gpdata2.data,
    gpdata2.tidy_df(complete=True, is_missing=True),
)

In [None]:
# Run the imputation on the summer window and report how long it takes.
%time data_imp2 = gp_imp2.impute()

In [None]:
# Plot the predictions for the summer window, labelled with the units defined above.
gp_imp2.plot_pred(units=units)

In [None]:
# Show the imputation result for the summer window.
data_imp2

In [None]:
# RMSE of the imputation for the summer window.
gp_imp2.rmse()

In [None]:
# Pairwise correlation between the variables in the gappy summer data.
gpdata2.data.corr()

In [None]:
# Lambda parameter of the summer model's covariance module, detached from the autograd graph.
gp_imp2.learner.model.covar_module.Lambda.detach()

In [None]:
# psi parameter of the summer model's covariance module, detached from the autograd graph.
gp_imp2.learner.model.covar_module.psi.detach()

this is the value of the length scale of the RBF latent kernel

In [None]:
# Lengthscale of the summer model's latent kernel, detached from the autograd graph.
gp_imp2.learner.model.covar_module.latent_kernel.lengthscale.detach()

#### Only TA and SW_IN

In [None]:
# Same summer window restricted to TA and SW_IN; random gaps are added on a copy.
gpdata3 = GPFADataTest(hai2.iloc[-800:-500][["TA", "SW_IN"]].copy()).add_random_missing()

In [None]:
# Imputation object for the two-variable case: gappy data plus the complete tidy frame.
gp_imp3 = GPFAImputation(
    gpdata3.data,
    gpdata3.tidy_df(complete=True, is_missing=True),
)

In [None]:
# Run the imputation for the two-variable case and report how long it takes.
%time data_imp3 = gp_imp3.impute()

In [None]:
# Plot the predictions for TA and SW_IN, with bind_interaction switched off.
gp_imp3.plot_pred(units=units, bind_interaction=False)

In [None]:
# Show the imputation result for the two-variable case.
data_imp3

In [None]:
# Correlation between TA and SW_IN in the gappy data.
gpdata3.data.corr()

In [None]:
# Lambda parameter of the two-variable model's covariance module, detached from the autograd graph.
gp_imp3.learner.model.covar_module.Lambda.detach()

this is the value of the length scale of the RBF latent kernel

In [None]:
# Lengthscale of the two-variable model's latent kernel, detached from the autograd graph.
gp_imp3.learner.model.covar_module.latent_kernel.lengthscale.detach()

In [None]:
# psi parameter of the two-variable model's covariance module.
# This cell previously repeated the lengthscale cell just above it; the other experiments
# report Lambda, lengthscale and psi, so psi is shown here to complete the set.
gp_imp3.learner.model.covar_module.psi.detach()

## Continuous gaps

Trying to see how the model works with a continuous gap of 10% of the length of the dataset for all variables

In [None]:
# Add one continuous gap of 20 rows to every variable.
# A copy of `hai` is wrapped, as the summer cells do, so this experiment neither changes
# the shared frame nor depends on what an earlier experiment may have done to it.
# NOTE(review): whether GPFADataTest modifies its input in place is not visible here — confirm.
gpd_gap = GPFADataTest(hai.copy()).add_gap(20, variables=['TA', 'SW_IN', 'LW_IN', 'VPD'])

In [None]:
# Imputation object for the continuous-gap case: gappy data plus the complete tidy frame.
gp_gap = GPFAImputation(
    gpd_gap.data,
    gpd_gap.tidy_df(complete=True, is_missing=True),
)

In [None]:
# Display the imputation object's notebook representation.
gp_gap

In [None]:
# Run the imputation for the continuous gap and report how long it takes; the result is not stored in a variable.
%time gp_gap.impute()

In [None]:
# Plot the predictions for the continuous-gap experiment.
gap_plot = gp_gap.plot_pred(units=units, properties={'height': 190, 'width': 380})

# The file name used to begin with a space (" plot_hai_..."), which ended up in the name
# of the file on disk; it is removed here.
gap_plot.save(here("analysis/plots") / "plot_hai_winter_4_var_200_obs_gap_20.vl.json")
gap_plot

In [None]:
# Print the RMSE table rendered as Markdown, without the index column.
print(gp_gap.rmse().to_markdown(index=False))

In [None]:
# Print the Lambda parameter as a Markdown table.
# .cpu() is applied before .numpy(), as is done for the losses, because a tensor on a
# CUDA device cannot be converted to NumPy directly; on a CPU tensor it is a no-op.
print(
    pd.DataFrame(gp_gap.learner.model.covar_module.Lambda.detach().cpu().numpy())
    .to_markdown(index=False)
)

In [None]:
# Print psi as a Markdown table with the variable names in the first column.
# .cpu() is applied before .numpy(), as is done for the losses, because a tensor on a
# CUDA device cannot be converted to NumPy directly; on a CPU tensor it is a no-op.
psi = pd.DataFrame(gp_gap.learner.model.covar_module.psi.detach().cpu().numpy())
# Pass an explicit list rather than a dict view as the new column's values.
psi.insert(0, "variable", list(meteo_vars.values()))
print(psi.to_markdown(index=False))

In [None]:
# Lengthscale of the continuous-gap model's latent kernel, detached from the autograd graph.
gp_gap.learner.model.covar_module.latent_kernel.lengthscale.detach()

In [None]:
# Collect the losses stored on the learner into a one-column frame.
gap_loss_values = gp_gap.learner.losses.cpu().numpy()
losses = pd.DataFrame(gap_loss_values, columns=['loss'])

# Plot the loss curve and write it to the project's plot directory.
p = losses.plot()
plt.savefig(
    here('analysis/plots/') / 'loss_plot_hai_winter_4_var_200_obs_gap_20.png'
)
p