In [1]:
from pathlib import Path

import numpy as np
from numba import jit

from joblib import Parallel, delayed

In [2]:
def run_perc(data, thresh):
    """Sparsify ``data`` row by row, in place.

    For every row, entries strictly below that row's ``thresh``-th
    percentile are set to zero, and any entries that are still negative
    are set to zero as well.

    Parameters
    ----------
    data : numpy.ndarray
        Array whose first axis is iterated row by row. It is modified in
        place.
    thresh : float
        Percentile (0-100) forwarded to ``np.percentile``.

    Returns
    -------
    numpy.ndarray
        The same ``data`` object that was passed in, after modification.

    Notes
    -----
    If a row contains NaN, ``np.percentile`` returns NaN for it, the
    ``row < cutoff`` comparison is False everywhere, and only the negative
    entries of that row are cleared.
    """
    # Iterating a NumPy array yields views, so writing through ``row``
    # updates ``data`` itself.
    for row in data:
        cutoff = np.percentile(row, thresh)
        # One pass replaces the former two loops: anything under the
        # percentile cutoff or under zero ends up as zero either way.
        row[(row < cutoff) | (row < 0.)] = 0.
    return data

def compute_similarities(input_path, output_path):
    """Build a thresholded similarity matrix from one ``.npy`` file and save it.

    Steps, in order:

    1. Load the array and replace NaN entries with 0.
    2. Compute the Pearson correlation between every pair of rows.
    3. Sparsify each row with ``run_perc(K, 90)`` (entries below the row's
       90th percentile, and negative entries, become 0).
    4. Compute the Gram matrix ``K.T @ K`` of the sparsified matrix and
       normalise it by the column norms, i.e. the cosine similarity
       between columns.
    5. Write both matrices to ``similarity.<tag>.npz`` inside
       ``output_path``.

    Parameters
    ----------
    input_path : pathlib.Path or str
        Path of the ``.npy`` file to load. The file name must contain at
        least one dot; the second dot-separated token of the name is used
        as the tag in the output file name.
        NOTE(review): this presumably expects names shaped like
        ``<prefix>.<tag>.npy`` -- for a plain ``<tag>.npy`` the tag would
        be ``npy``; confirm against the real input files.
    output_path : pathlib.Path or str
        Directory that receives the archive. It is created if missing.

    Returns
    -------
    None
        The result is written to disk with keys ``K`` (the Gram matrix
        from step 4) and ``aff`` (its cosine-normalised form).

    Notes
    -----
    Rows with zero variance have an undefined (0/0) correlation. Those
    entries are set to 0; left as NaN they would turn every row's
    percentile cutoff into NaN and then spread through ``K.T @ K`` to the
    whole output. Columns whose norm is 0 after thresholding likewise get
    an affinity of 0 instead of NaN.
    """
    input_path = Path(input_path)
    fname = input_path.name.split('.')[1]

    K = np.load(input_path)
    K[np.isnan(K)] = 0.0

    # Row-wise Pearson correlation: centre each row, then divide the
    # pairwise dot products by the product of the row norms.
    A_mA = K - K.mean(1)[:, None]
    Adot = A_mA.dot(A_mA.T)

    ssA = (A_mA ** 2).sum(1)
    Asq = np.sqrt(np.dot(ssA[:, None], ssA[None]))
    # Zero-variance rows give 0/0 here; silence the warning and clean up
    # the undefined entries explicitly on the next line.
    with np.errstate(divide='ignore', invalid='ignore'):
        K = Adot / Asq
    K[~np.isfinite(K)] = 0.0

    # Release the large intermediates before allocating further matrices.
    del A_mA, ssA, Asq, Adot

    # thresholding
    K = run_perc(K, 90)

    # cosine similarity between the columns of the thresholded matrix
    norm = (K * K).sum(0, keepdims=True) ** .5
    K = K.T @ K
    with np.errstate(divide='ignore', invalid='ignore'):
        aff = K / norm / norm.T
    # Columns that are entirely zero have norm 0 and an undefined cosine.
    aff[~np.isfinite(aff)] = 0.0

    # Honour the caller-supplied directory instead of a hard-coded one.
    # exist_ok=True keeps this safe when several workers race to create it.
    out_dir = Path(output_path)
    out_dir.mkdir(parents=True, exist_ok=True)
    # np.savez appends ".npz" to the name when it is not already present.
    np.savez(str(out_dir / f'similarity.{fname}'), K=K, aff=aff)

In [None]:
# Directory searched for the input ``*.npy`` files.
p = Path('./timeseries/')
# Directory that receives the similarity archives.
out = Path('./similarities/')
# np.savez does not create missing parent directories; make sure the target
# exists up front so a worker cannot fail at the save step after doing all
# of the computation.
out.mkdir(parents=True, exist_ok=True)

# Sorted so the processing order is the same on every run.
files = sorted(p.glob('*.npy'))

In [None]:
# Run compute_similarities once per input file across 52 joblib workers.
# The list of return values is bound to ``_`` so the notebook does not echo it.
_ = Parallel(n_jobs=52, verbose=10)(
    delayed(compute_similarities)(npy_file, out) for npy_file in files
)