# load data

In [1]:
# Show the kernel's current working directory (leftover sanity check; note that
# the recorded output exposes a user-specific absolute path).
pwd

'/mnt/lustre01/pf/g/g260086/Python/SeasonalPred/notebooks'

In [2]:
#!pip install --user netCDF4
#!pip install --user torch
#!pip uninstall torch --yes

In [None]:
%matplotlib inline
from netCDF4 import Dataset

import numpy as np
import matplotlib.pyplot as plt
import seaborn

from predictability_utils.utils import helpers, io
from predictability_utils.methods.lrlin_method import run_lrlin
from predictability_utils.methods.cca_method import run_cca

import torch
# Seed torch's random number generator.
torch.manual_seed(42)

# Use the GPU when torch can see one, otherwise fall back to the CPU, and make
# newly created tensors default to the matching float tensor type.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
else:
    print("CUDA not available")
    torch.set_default_tensor_type("torch.FloatTensor")

    
# Root directory of the input files (cluster-specific absolute path).
#root_data = '/gpfs/work/nonnenma/data/forecast_predictability/pyrina/'
root_data = '/work/gg0304/g260086/HZG-ML-work/Data'

# Analysis configuration.
n_latents = 5                                 # passed to run_cca / run_lrlin
train_months, test_months = [2,3,4], [5,6,7]  # passed to helpers.split_train_data
y_train = 46                                  # passed to helpers.split_train_data
y_test = 40                                   # NOTE(review): not referenced in the visible cells

# Volumetric soil water layer 1 (EU) ANOMALIES
source_data, _ = io.data_load('swvl1', 'EU', 'anomalies', root_data)

# Temperature at 2m (EU) ANOMALIES
target_data, _ = io.data_load('t2m', 'EU', 'anomalies', root_data)

# training data time stamps and map shape
nc_fn = root_data + "/t2m_ERA20c_monthly_1900-2010.EU.mv.nc"

# Open the file once and close it deterministically; both arrays are read into
# memory inside the block, so they remain usable after the file is closed.
with Dataset(nc_fn, 'r') as nc:
    ts_or = nc.variables['time'].__array__().data
    t2m_eu = nc.variables['t2m'].__array__().data

# Everything after the leading axis of the t2m array is treated as the map grid.
map_shape = t2m_eu.shape[1:]

# recreate CCA analysis
- Canonical correlation analysis to identify subspaces $U$, $V$ in source space $X$ and target space $Y$, respectively, such that $(UX)_i$ and $(VY)_i$ are maximally correlated.
- In a second step, fit a (linear) map $Q$ such that $VY \approx Q UX$, so that $VY$ can be predicted from $UX$.
- predict new $Y$ from $Y \approx V^\dagger Q UX$

In [63]:
# Start offsets into the monthly time axis, one per year for 25 years
# (0, 12, ..., 288). The last offset drops the first 24 years, so for a record
# beginning in 1900 the final series starts in 1924.
TAr = list(range(0, 12 * 25, 12))
# Print once, after the list is complete, instead of once per appended element.
print(TAr)

In [None]:
# Run the CCA once per start offset in TAr and average the resulting predictions.
#
# NOTE(review): `idcs` is computed from the truncated `ts`, while run_cca gets
# the untruncated source_data/target_data — confirm that split_train_data
# returns indices that are valid for the full arrays.
# NOTE(review): the final mean assumes every `out_pred` has the same shape —
# confirm this holds as `ts` gets shorter.
Preds = []
for n in TAr:
    # Drop the first n entries of the time axis.
    ts = ts_or[n:]
    print(len(ts))
    idcs = helpers.split_train_data(ts, y_train, train_months, test_months)
    idx_source_train, idx_target_train, idx_source_test, idx_target_test = idcs

    # let's not miss a year
    n_years = len(ts) / 12
    assert np.prod(idx_source_train.shape) + np.prod(idx_source_test.shape) == n_years * len(train_months)
    assert np.prod(idx_target_train.shape) + np.prod(idx_target_test.shape) == n_years * len(test_months)

    # prediction using CCA
    out_pred, params = run_cca(source_data, target_data, n_latents, idcs, if_plot=False, map_shape=map_shape)
    Preds.append(out_pred)
    # Progress output. The newest entry is Preds[-1]; indexing Preds[24] here
    # raised IndexError until the 25th iteration.
    print(len(Preds)); print(len(Preds[0])); print(len(Preds[-1]))

# Mean prediction over all start offsets.
m_pred = np.asarray(Preds).mean(axis=0)
# Report the shape only, rather than dumping the whole array.
print(m_pred.shape)

In [None]:
# Observed target values for the test period, compared against the mean
# prediction m_pred.
# NOTE(review): idx_target_test is whatever the last iteration of the CCA loop
# left behind, while m_pred averages over all iterations — confirm these match.
T = source_data.shape[0]
# Flatten the target to (leading axis, everything else) using its own leading
# axis length, so a source/target length mismatch cannot silently mis-shape it.
target_flat = target_data.reshape(target_data.shape[0], -1)
out_true = target_flat[idx_target_test, :].mean(axis=0)
anomaly_corrs = helpers.compute_anomaly_corrs(out_true, m_pred)

- Plot: Canonical correlation analysis

In [None]:
# Put the flat correlation vector back onto the map grid. map_shape is taken
# from the t2m array in the same file, replacing the hard-coded (37, 42).
ACC = anomaly_corrs.reshape(map_shape)

# Read both coordinate vectors from one file handle and close it afterwards.
with Dataset(nc_fn, 'r') as nc:
    lat = nc.variables['latitude'].__array__().data
    lon = nc.variables['longitude'].__array__().data
#[lons, lats] = np.meshgrid(lon,lat)

In [None]:
# Environment notes from the original author:
#conda install -c anaconda basemap
#conda install -c anaconda proj4 #it doesn't work with proj 6.?.
import os
# PROJ_LIB is set before the basemap import below (presumably basemap reads it
# at import time — TODO confirm).
os.environ['PROJ_LIB'] = '/work/gg0304/g260086/anaconda3/share/proj'
from mpl_toolkits.basemap import Basemap

# Cylindrical map whose corners are the extremes of the lon/lat vectors.
map_corners = dict(
    llcrnrlon=min(lon),
    llcrnrlat=min(lat),
    urcrnrlon=max(lon),
    urcrnrlat=max(lat),
)
m = Basemap(projection='cyl', **map_corners)

# 21 contour levels spanning [-1, 1].
clevs = np.linspace(-1, 1, 21)

# Grid of lon/lat points, converted to map coordinates.
lon_grid, lat_grid = np.meshgrid(lon, lat)
lons, lats = m(lon_grid, lat_grid)

# Filled contours of the anomaly correlation map.
h = m.contourf(lons, lats, ACC, clevs, cmap=plt.cm.RdBu_r)
# Alternative rendering kept for reference:
#h = m.pcolormesh(lons, lats, ACC, shading='flat',latlon=True, cmap='jet', vmin=-cmax, vmax=cmax)

m.drawcoastlines()
# Parallels and meridians.
m.drawparallels(np.arange(35., 70., 10.), labels=[1, 0, 0, 1], fontsize=14)
m.drawmeridians(np.arange(0., 25., 10.), labels=[1, 0, 0, 1], fontsize=14)
m.drawmapboundary(fill_color='aqua')

m.colorbar(h, location='bottom', size='8%', pad="15%", label='[$r$]')
plt.title('ACC CCA', fontsize=14)

# simple low-rank linear prediction (pixel MSEs) 

- set up simple model $Y = W X$ with $W = U V$
- low-rank: if $Y \in \mathbb{R}^N, X \in \mathbb{R}^M$, then $W \in \mathbb{R}^{N \times M}$, but $U \in \mathbb{R}^{N \times k}, V \in \mathbb{R}^{k \times M}$ with $k \ll M, N$!
- low-rank structure saves us parameters: $M N$ parameters in $W$, but only $N k + k M$ in $U$ and $V$, which helps prevent overfitting at small sample sizes

In [None]:
# To be fixed
# Fit the low-rank linear model on the same source/target data.
# NOTE(review): `idcs` is whatever the last iteration of the CCA loop left
# behind, and this call overwrites the `anomaly_corrs` and `params` computed in
# the CCA section.
lrlin_kwargs = dict(
    if_plot=False,
    map_shape=map_shape,
    n_epochs=2000,
    lr=1e-1,
    batch_size=None,
)
anomaly_corrs, params = run_lrlin(source_data, target_data, n_latents, idcs, **lrlin_kwargs)

In [None]:
# Put the flat correlation vector back onto the map grid. map_shape is taken
# from the t2m array in the same file, replacing the hard-coded (37, 42).
ACC = anomaly_corrs.reshape(map_shape)

# Read both coordinate vectors from one file handle and close it afterwards.
with Dataset(nc_fn, 'r') as nc:
    lat = nc.variables['latitude'].__array__().data
    lon = nc.variables['longitude'].__array__().data
#[lons, lats] = np.meshgrid(lon,lat)

# Cylindrical map whose corners are the extremes of the lon/lat vectors.
m = Basemap(projection='cyl', llcrnrlon=min(lon), llcrnrlat=min(lat),
    urcrnrlon=max(lon), urcrnrlat=max(lat))
# 21 contour levels spanning [-1, 1].
clevs = np.linspace(-1, 1, 21)
# Grid of lon/lat points, converted to map coordinates.
lons, lats = m(*np.meshgrid(lon, lat))
h = m.contourf(lons, lats, ACC, clevs, cmap=plt.cm.RdBu_r)

m.drawcoastlines()
# draw parallels and meridians.
m.drawparallels(np.arange(35.,70.,10.),labels=[1,0,0,1], fontsize=14)
m.drawmeridians(np.arange(0.,25.,10.),labels=[1,0,0,1], fontsize=14);
m.drawmapboundary(fill_color='aqua')

m.colorbar(h, location='bottom', size='8%', pad="15%", label='[$r$]')
plt.title('ACC LR', fontsize=14)

# debug