# load data

In [None]:
pwd

In [None]:
%matplotlib inline
from netCDF4 import Dataset

import numpy as np
import matplotlib.pyplot as plt
import seaborn

import predictability_utils as pu
from predictability_utils.utils import helpers, io

# location of the data files, relative to the notebook's working directory
root_data = '../../data/pyrina'


def _load_anomalies(variable, region):
    """Call `io.data_load` for the 'anomalies' version of one field.

    Thin wrapper that pins the data kind ('anomalies') and the root directory
    (`root_data`), so each call below only names the variable and the region.
    Returns whatever `io.data_load` returns (unpacked below as a pair).
    """
    return io.data_load(variable, region, 'anomalies', root_data)


# air pressure at mean sea level, North Atlantic & EU
msl_naeu_anomalies, _ = _load_anomalies('msl', 'NA-EU')

# sea surface temperatures, North Atlantic & EU (second output is kept as the mask)
sst_naeu_anomalies, sst_naeu_anomalies_mask = _load_anomalies('sst', 'NA-EU')

# volumetric soil water layer 1, EU
swvl1_eu_anomalies, _ = _load_anomalies('swvl1', 'EU')

# temperature at 2m, EU
t2m_eu_anomalies, _ = _load_anomalies('t2m', 'EU')

# sea surface temperatures, TNA (second output is kept as the mask)
sst_tna_anomalies, sst_tna_anomalies_mask = _load_anomalies('sst', 'TNA')

# training data time stamps and map shape
nc_fn = root_data + "/t2m_ERA20c_monthly_1900-2010.EU.mv.nc"

# Open the netCDF file once and close it as soon as both variables are read.
# Slicing a variable with [:] reads its values into memory, so nothing below
# needs the file handle to stay open.
with Dataset(nc_fn, 'r') as nc:
    # np.ma.getdata returns the underlying plain ndarray whether or not the
    # values come back as a masked array
    ts = np.ma.getdata(nc.variables['time'][:])
    t2m_eu = np.ma.getdata(nc.variables['t2m'][:])

# shape of a single map: every axis of t2m_eu after the first one; used further
# down to reshape flattened per-pixel results back into maps for plotting
map_shape = t2m_eu.shape[1:]

# months used on the source side (train_months) and on the target side (test_months)
# NOTE(review): presumably 0-based month indices (Mar-May -> Jun-Aug), given the
# Spring/Summer wording in the experiment cells — confirm against helpers.split_train_data
train_months, test_months = [2,3,4], [5,6,7]
y_train = 51

# index arrays selecting source/target time stamps for the training and test periods
tmp = helpers.split_train_data(ts, y_train, train_months, test_months)
idx_source_train, idx_target_train, idx_source_test, idx_target_test = tmp

# let's not miss a year: train + test indices together must cover every
# selected month of every year in the series (12 time stamps per year)
n_years = len(ts) / 12

n_source_idx = np.prod(idx_source_train.shape) + np.prod(idx_source_test.shape)
assert n_source_idx == n_years * len(train_months)

n_target_idx = np.prod(idx_target_train.shape) + np.prod(idx_target_test.shape)
assert n_target_idx == n_years * len(test_months)

# Overview figure with three panels side by side: a mean t2m map and the two SST masks.
plt.figure(figsize=(16,8))

# left: mean over every 12th time step of t2m_eu
# NOTE(review): with monthly data (as the file name suggests) [::12] picks one
# sample per year, so this averages a single calendar month rather than all months — confirm intent
plt.subplot(1,3,1)
plt.imshow(t2m_eu[::12,:,:].mean(axis=0))
plt.title('avg t2m map for EU')

# middle: first entry along the leading axis of the TNA SST mask
plt.subplot(1,3,2)
plt.imshow(sst_tna_anomalies_mask[0,:,:])
plt.title('TNA mask (SSTs)')  # fixed: title had an unbalanced closing parenthesis

# right: first entry along the leading axis of the NA-EU SST mask
plt.subplot(1,3,3)
plt.imshow(sst_naeu_anomalies_mask[0,:,:])
plt.title('NA EU mask (SSTs)')

plt.show()


In [None]:
# number of latent dimensions; passed as `n_latents` to both CCA_method and
# LR_lin_method in the cells below, so the two models are compared at equal rank
n_latents = 5

# recreate CCA analysis
- Canonical correlation analysis to identify subspaces $U$, $V$ in source space $X$ and target space $Y$, respectively, such that $(UX)_i$ and $(VY)_i$ are maximally correlated.
- In a second step, fit a linear mapping $Q$ such that $VY \approx Q UX$, in order to predict $VY$ from $UX$.
- Predict new $Y$ via $Y \approx V^\dagger Q UX$, where $V^\dagger$ denotes the pseudo-inverse of $V$.

In [None]:
from predictability_utils.methods.cca_method import CCA_method
from predictability_utils.utils import viz

# Flatten each field to 2D once: len(ts) rows, all remaining axes merged into columns.
swvl1_flat = swvl1_eu_anomalies.reshape(len(ts), -1)
t2m_flat = t2m_eu_anomalies.reshape(len(ts), -1)

# Training pair (1900 - 1950): soil moisture levels in Spring -> T2ms in Summer.
# Indexing with the index arrays and taking the mean over axis 0 collapses the
# leading index axis (presumably the months within a season — TODO confirm).
X = swvl1_flat[idx_source_train,:].mean(axis=0)
Y = t2m_flat[idx_target_train,:].mean(axis=0)

# fit CCA-based model
ccam = CCA_method(n_latents=n_latents)
ccam.fit(X,Y)

# Test pair (1951 - 2010): inputs and ground truth are built exactly like X and Y.
X_f = swvl1_flat[idx_source_test,:].mean(axis=0)
out_true = t2m_flat[idx_target_test,:].mean(axis=0)

# predict T2ms for the test inputs
out_pred = ccam.predict(X_f)

# evaluate prediction performance
anomaly_corrs = helpers.compute_anomaly_corrs(out_true, out_pred)

# visualize anomaly correlations, reshaped back to map layout
viz.visualize_anomaly_corrs(anomaly_corrs.reshape(*map_shape))

# visualize example predictions for selected years
viz.visualize_example_preds(out_true, out_pred, map_shape, y_train, years=[0,15,30,45])


# simple low-rank linear prediction (pixel MSEs) 

- set up simple model $Y = W X$ with $W = U V$
- low-rank: if $Y \in \mathbb{R}^N, X \in \mathbb{R}^M$, then $W \in \mathbb{R}^{N \times M}$, but $U \in \mathbb{R}^{N \times k}, V \in \mathbb{R}^{k \times M}$ with $k \ll M, N$!
- the low-rank structure saves parameters: $M N$ parameters in $W$, but only $N k + k M$ in $U$ and $V$, which helps prevent overfitting at small sample sizes

In [None]:
from predictability_utils.methods.lrlin_method import LR_lin_method

import torch
# fix the torch RNG seed (presumably so that the fit below is reproducible)
torch.manual_seed(42)

# Flatten each field to 2D once: len(ts) rows, all remaining axes merged into columns.
swvl1_flat = swvl1_eu_anomalies.reshape(len(ts), -1)
t2m_flat = t2m_eu_anomalies.reshape(len(ts), -1)

# Training pair: soil moisture levels in Spring -> T2ms in Summer, as torch tensors.
# The mean over axis 0 collapses the leading axis introduced by the index arrays.
X = torch.tensor(swvl1_flat[idx_source_train,:].mean(axis=0))
Y = torch.tensor(t2m_flat[idx_target_train,:].mean(axis=0))

# fit low-rank linear model; fit() returns the loss values that are plotted next
lrlm = LR_lin_method(n_latents=n_latents)
loss_vals = lrlm.fit(X,Y, n_epochs=3000)

# loss curve on a log-scaled x axis, leaving out the first 100 recorded values
plt.semilogx(loss_vals[100:])
plt.title('loss curve')
plt.show()

# Test pair (1951 - 2010), built like X and Y but left as numpy arrays.
# NOTE(review): X_f is not wrapped in torch.tensor although X and Y are —
# confirm that LR_lin_method.predict accepts numpy input
X_f = swvl1_flat[idx_source_test,:].mean(axis=0)
out_true = t2m_flat[idx_target_test,:].mean(axis=0)

# predict T2ms for the test inputs
out_pred = lrlm.predict(X_f)

# evaluate prediction performance
anomaly_corrs = helpers.compute_anomaly_corrs(out_true, out_pred)

# visualize anomaly correlations, reshaped back to map layout
viz.visualize_anomaly_corrs(anomaly_corrs.reshape(*map_shape))

# visualize example predictions for selected years
viz.visualize_example_preds(out_true, out_pred, map_shape, y_train, years=[0,15,30,45])


# debug