In [13]:
from numba import njit
import numpy as np

@njit
def dcov_cb(x:np.float32,y:np.bool8):
    """
    Oleg Kanakov implementation of distance covariance
    22 Feb 2023
    Same as dcor library for binary outcome variable (y) 
    Time complexity is O(n), n = len(x) = len(y)
    x must be sorted before, y_i should corresponds to x_i
    """
    if x.size != y.size:
        raise ValueError('Array lengths do not match!')

    N=x.size
    
    n=0; nw=0
    z=0.; zw=0.
    d=0.
    for i in range(N):
        if y[i]==True:
            d += x[i]*nw-zw
            n += 1
            z += x[i]
        else:
            d  += x[i]*n-z
            nw += 1
            zw += x[i]

    qN=nw; pN=n
    sx=z+zw

    add=0.
    sab=0.
    s=0.
    for i in range(N):
        aid = -(N-2.*i)*x[i] -2.*s +sx
        s += x[i]
        if y[i]==True:
            bid = qN
        else:
            bid = pN
        
        add += aid
        sab += aid*bid

    bdd = 2.*qN*pN

    return 2.*d/N**2. - 2.*sab/N**3. + add*bdd/N**4.


In [14]:
@njit
def dcor_cb(x:np.float32,y:np.bool8):
    """
    Oleg Kanakov implementation of distance correlation
    22 Feb 2023
    Same as dcor library for binary outcome variable (y) 
    Time complexity is O(n), n = len(x) = len(y)
    x must be sorted before, y_i should corresponds to x_i
    """
    if x.size != y.size:
        raise ValueError('Array lengths do not match!')

    N=x.size
    
    n=0; nw=0
    z=0.; zw=0.
    d=0.
    sx2=0.
    for i in range(N):
        sx2 += x[i]*x[i]
        if y[i]==True:
            d += x[i]*nw-zw
            n += 1
            z += x[i]
        else:
            d  += x[i]*n-z
            nw += 1
            zw += x[i]

    qN=nw; pN=n
    sx=z+zw
    
    add=0.
    sab=0.
    sai2=0.
    s=0.
    for i in range(N):
        aid = -(N-2.*i)*x[i] -2.*s +sx
        s += x[i]
        if y[i]==True:
            bid = qN
        else:
            bid = pN
        
        add += aid
        sai2 += aid*aid
        sab += aid*bid

    bdd = 2.*qN*pN

    dxx = 2.*(N*sx2-sx*sx)
    
    dcov = 2.*d/N**2. - 2.*sab/N**3. + add*bdd/N**4.
    dcovxx = dxx/N**2. -2.*sai2/N**3. + add*add/N**4.
    dcovyy = bdd*bdd/N**4.

    return dcov/(dcovxx*dcovyy)**0.5 


# Functions test

In [15]:
N = 1000000;
x = np.random.uniform(0,1,N)
y = (x + np.random.normal(0,0.2,N)) > 0.95
ysur1 = np.roll(y,100) # time-shifted "surrogate"
ysur2 = np.roll(y,200) # another one
k = np.argsort(x) # find sorting permutation
x = x[k]
y = y[k]
ysur1 = ysur1[k]
ysur2 = ysur2[k]
del k

In [115]:
dcor_cb(x,y)

0.18972462796801534

In [116]:
dcor_cb(x,ysur1)

2.5658451970047904e-06

In [117]:
dcor_cb(x,ysur2)

6.737465220168448e-07

In [118]:
import dcor

dcor.distance_covariance_sqr(x.astype(np.float64),y.astype(np.float64),method='avl')

0.007626302076594704

In [119]:
dcor.distance_covariance_sqr(x.astype(np.float64),y.astype(np.float64),method='mergesort')

0.0076263020765869605

In [120]:
dcor.distance_correlation_sqr(x.astype(np.float64),y.astype(np.float64),method='mergesort')

0.18972462796827202

# Cyclone test

In [16]:
%load_ext autoreload
%autoreload 2
import numpy as np
import re
import time
%matplotlib
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import textwrap
import scipy.stats as stats

from scipy import stats
import scipy
from functools import partial

import matplotlib.pylab as pylab
import seaborn as sns
sns.set_style('whitegrid', {'legend.frameon':True})
sns.set_palette(sns.color_palette("Set1", 12))
#sns.set_context("paper")
fontsize = 12
params = {'legend.fontsize': fontsize,
  'figure.figsize': (18, 15),
  'axes.labelsize': fontsize,
  'axes.titlesize':fontsize,
  'axes.edgecolor':"0.3",
  'xtick.labelsize':fontsize,
  'ytick.labelsize':fontsize,
  'legend.fontsize':10,
  'font.size':fontsize,
  'font.family':'serif'}
pylab.rcParams.update(params)
plt.rc('axes', labelsize=fontsize) 

#plt.style.use('ggplot')
%matplotlib inline

from tqdm.notebook import tqdm
import pandas as pd
from pathlib2 import Path
import os

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Using matplotlib backend: QtAgg


In [17]:
import sys
sys.path.append('../')

from metric_store import save_metrics, save_metric, load_metrics, get_metric_names, load_metric
from network_metrics import prepare_metric
from pipeline.pipeline import load_config

In [18]:
config_name = "pipeline.config"
config = load_config(config_name)

In [19]:
from corr_network import load_data, get_available_mask
data = load_data(config)
available_mask = get_available_mask(data)

In [20]:
metric_names = [
    #'input_data/MSLP_preproc',
    #'probability_for_metrics/input_data/MSLP',
    #'probability_for_metrics/input_data/MSLP_preproc',
    #'probability_for_metrics/network_metrics/LCC_w',
    #'probability_for_metrics/network_metrics/LCC_0.9',
    'probability_for_metrics/network_metrics/LCC_0.95',
]
for metric_name in metric_names:
    config.metrics_plot_options['metric_name'] = metric_name
    metric = load_metric(config, metric_name)
    metric = prepare_metric(metric_name, metric, available_mask)
    print(metric_name, metric.shape)

probability_for_metrics/network_metrics/LCC_0.95 (36, 69, 113960)


In [21]:
prefix = 'network_metrics'
metric_names = list(get_metric_names(config, prefix = prefix).keys())
metric_names

['network_metrics/LCC_w',
 'network_metrics/GCC_w',
 'network_metrics/degree_w',
 'network_metrics/EVC_w',
 'network_metrics/closeness_w',
 'network_metrics/LCC_0.9',
 'network_metrics/GCC_0.9',
 'network_metrics/degree_0.9',
 'network_metrics/EVC_0.9',
 'network_metrics/closeness_0.9',
 'network_metrics/LCC_0.95',
 'network_metrics/GCC_0.95',
 'network_metrics/degree_0.95',
 'network_metrics/EVC_0.95',
 'network_metrics/closeness_0.95']

In [44]:
prefix = 'sond'
metric_names = list(get_metric_names(config, prefix = prefix).keys())
metric_names

['sond/probability_for_metrics/network_metrics/LCC_w',
 'sond/probability_for_metrics/network_metrics/degree_w',
 'sond/probability_for_metrics/network_metrics/EVC_w',
 'sond/probability_for_metrics/network_metrics/closeness_w',
 'sond/probability_for_metrics/network_metrics/LCC_0.9',
 'sond/probability_for_metrics/network_metrics/degree_0.9',
 'sond/probability_for_metrics/network_metrics/EVC_0.9',
 'sond/probability_for_metrics/network_metrics/closeness_0.9',
 'sond/probability_for_metrics/network_metrics/LCC_0.95',
 'sond/probability_for_metrics/network_metrics/degree_0.95',
 'sond/probability_for_metrics/network_metrics/EVC_0.95',
 'sond/probability_for_metrics/network_metrics/closeness_0.95',
 'sond/probability_for_metrics/diff_metrics/network_metrics/LCC_w',
 'sond/probability_for_metrics/diff_metrics/network_metrics/degree_w',
 'sond/probability_for_metrics/diff_metrics/network_metrics/EVC_w',
 'sond/probability_for_metrics/diff_metrics/network_metrics/closeness_w',
 'sond/proba

In [22]:
cyclone_events = np.load('../cyclones_events.npz')['cyclone_events_2']

In [53]:
from plot_network_metrics.utils import get_times_lats_lots
from plot_network_metrics.utils import get_sond_times

all_times, all_lats, all_lons = get_times_lats_lots(config)
sond_time_inds = get_sond_times(config, all_times)
cyclone_events_sond = cyclone_events[:, :, sond_time_inds]

In [42]:
from tqdm.notebook import tqdm

def compute_rolled_dcor(metric, cyclone_events, roll_shift=1):
    is_ok = ~np.isnan(metric)
    cyclone_events_rolled = cyclone_events.copy()
    x = metric[is_ok].astype('float32')
    ids = np.argsort(x)
    xs = x[ids]

    start_dcor = None
    dcor_counts = [0, 0]

    dcor_on_roll = []
    rolls = []
    cur_roll = 0

    nt = metric.shape[2]
    
    tqdm_traverse = tqdm(range(nt // roll_shift))

    for id_roll in tqdm_traverse:
        y = cyclone_events_rolled[is_ok]
        ys = y[ids]

        cur_dcor = dcor_cb(xs, ys)
        if start_dcor is None:
            start_dcor = cur_dcor
        else:
            dcor_counts[cur_dcor < start_dcor] += 1
        dcor_on_roll += [cur_dcor]
        rolls += [cur_roll]
        
        pv = dcor_counts[1] / max(1, dcor_counts[0] + dcor_counts[1])

        description = f"{start_dcor:0.3g} > {cur_dcor:0.3g}. | "
        description += f"{dcor_counts[0]} + {dcor_counts[1]} = {dcor_counts[0] + dcor_counts[1]}. | "
        description += f"p-value = {pv:0.3g}"
        tqdm_traverse.set_description(description)

        cyclone_events_rolled = np.roll(cyclone_events_rolled, roll_shift)
        cur_roll += roll_shift
    roll_hours = np.array(rolls) * 3
    df_dcor_rolls = pd.DataFrame({
        'roll': rolls, 
        'roll_hours': roll_hours, 
        'dcor': dcor_on_roll
    })
    return df_dcor_rolls

In [55]:
from metric_store import Metrics
metric_names = [
    #'input_data/MSLP_preproc',
    #'probability_for_metrics/input_data/MSLP',
    #'probability_for_metrics/input_data/MSLP_preproc',
    'probability_for_metrics/network_metrics/LCC_w',
    'probability_for_metrics/network_metrics/LCC_0.9',
    'probability_for_metrics/network_metrics/LCC_0.95',
]
is_sond = True
cyclone_events_selected = cyclone_events_sond.copy() if is_sond else cyclone_events.copy()
if is_sond:
    metric_names = ['sond/' + metric_name for metric_name in metric_names]
print(metric_names)
for metric_name, metric in Metrics(config, metric_names=metric_names):
    metric = prepare_metric(metric_name, metric, available_mask)
    print(metric_name, metric.shape)
    df_dcor_rolls = compute_rolled_dcor(metric, cyclone_events_selected, roll_shift=8)
    short_metric_name = metric_name.replace('/', '$')
    file_name = config.distance_covariance_options['work_dir'] / f'dcor_{short_metric_name}.tsv'
    file_name.parent.mkdir(parents=True, exist_ok=True)
    df_dcor_rolls.to_csv(file_name, sep='\t')

['sond/probability_for_metrics/network_metrics/LCC_w', 'sond/probability_for_metrics/network_metrics/LCC_0.9', 'sond/probability_for_metrics/network_metrics/LCC_0.95']
sond/probability_for_metrics/network_metrics/LCC_0.9 (36, 69, 38064)


  0%|          | 0/4758 [00:00<?, ?it/s]

sond/probability_for_metrics/network_metrics/LCC_0.95 (36, 69, 38064)


  0%|          | 0/4758 [00:00<?, ?it/s]

sond/probability_for_metrics/network_metrics/LCC_w (36, 69, 38064)


  0%|          | 0/4758 [00:00<?, ?it/s]

In [41]:
file_name

WindowsPath('Z:/Research/Climate/data/ERA5/ERA5_MSL_1982_2020_3h_0.75/distance_covariance_window_2d_delay_0d/dcor_probability_for_metrics$network_metrics$LCC_0.95.tsv')

In [None]:
df_dcor_rolls

Unnamed: 0,roll,roll_hours,dcor
0,0,0,0.0002067696
1,8,24,0.0002378156
2,16,48,0.0001945169
3,24,72,3.906063e-05
4,32,96,4.516938e-06
5,40,120,6.338086e-07
6,48,144,2.077307e-07
7,56,168,1.423717e-07
8,64,192,5.358369e-07
9,72,216,5.791252e-07


In [10]:
a = np.array([1, 2, 3])
b = a[[1, 0]]
b[0] = 7
b
a

array([1, 2, 3])

In [17]:
# LCC_w
# 0.000316 > 6.87e-07. | 0 + 87 = 87. | p-value = 1.0: 2%

# LCC_0.9
# 0.000224 > 6.46e-07. | 0 + 3560 = 3560. | p-value = 1.0: 100%

# LCC_0.95
# 0.000206 > 3.36e-07. 0 + 182 = 182 p-value = 1.0: 5%

dtype('float16')