In [11]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm
import ksig

In [2]:
n_levels = 5 
# number of signature levels to use

n_components = 100
# number of components to use in the static features and the randomized projections

static_kernel = ksig.static.kernels.RBFKernel() 
# an RBF kernel for vector-valued data

static_feat = ksig.static.features.NystroemFeatures(static_kernel, n_components=n_components)
# Nystroem features with an RBF base kernel

proj = ksig.projections.CountSketchRandomProjection(n_components=n_components)
# a CountSketch random projection 

lr_sig_kernel = ksig.kernels.LowRankSignatureKernel(n_levels=n_levels, static_features=static_feat, projection=proj)
# a low-rank signature kernel, which additionally to working as a callable for kernel matrix computations
# also implements a fit method, which must be used to fit the kernel (and its subobjects) to the data
# and a transform method, which can be used to transform an array of paths to their corresponding low-rank features

# n_seq, l_seq, n_feat = 1000, 200, 100
# X = np.random.randn(n_seq, l_seq, n_feat)
# # generate 1000 sequences of length 200 with 100 features

# lr_sig_kernel.fit(X)
# # fit the kernel to the data

# K_XX = lr_sig_kernel(X) # K_XX has shape (1000, 1000)
# # compute the low-rank signature kernel matrix k(X, X)

# n_seq2, l_seq2 = 800, 250
# Y = np.random.randn(n_seq2, l_seq2, n_feat)
# # generate another array of 800 sequences of length 250 and 100 features

# K_XY = lr_sig_kernel(X, Y) # K_XY has shape (1000, 800)
# # compute the kernel matrix between arrays X and Y
# # the kernel does not have to be fitted a second time

# P_X = lr_sig_kernel.transform(X) # P_X has shape (n_seq, 1+n_levels*n_components) i.e. (1000, 501) in this case
# P_Y = lr_sig_kernel.transform(Y) # P_Y shape shape (800, 501)
# # alternatively, one may directly compute the low-rank representations for both X and Y
# # and then use these features to compute the kernel matrices K_XX and K_XY

# print(np.linalg.norm(K_XX - P_X @ P_X.T)) # 1.5336806154787045e-14
# print(np.linalg.norm(K_XY - P_X @ P_Y.T)) # 0.0

In [7]:
# simulate geometric Brownian motion paths
mu_x = 0.1
sigma_x = 0.2
mu_y = 0.2
sigma_y = 0.3
n_paths = 1000
n_steps = 200 - 1
dt = 0.01
X = np.exp((mu_x - 0.5 * sigma_x**2) * dt + sigma_x * np.sqrt(dt) * np.random.randn(n_paths, n_steps))
Y = np.exp((mu_y - 0.5 * sigma_y**2) * dt + sigma_y * np.sqrt(dt) * np.random.randn(n_paths, n_steps))
X = np.cumprod(X, axis=1)
Y = np.cumprod(Y, axis=1)
X = np.concatenate([np.ones((n_paths, 1)), X], axis=1)
Y = np.concatenate([np.ones((n_paths, 1)), Y], axis=1)
X = X[..., np.newaxis]
Y = Y[..., np.newaxis]
X.shape, Y.shape

((1000, 200, 1), (1000, 200, 1))

In [8]:
lr_sig_kernel.fit(X)



In [18]:
def two_sample_permutation_test(test_statistic, X, Y, num_permutations, prog_bar=True):
    assert X.ndim == Y.ndim
    
    statistics = np.zeros(num_permutations)
    
    range_ = range(num_permutations)
    if prog_bar:
        range_ = tqdm(range_)
    for i in range_:
        # concatenate samples
        if X.ndim == 1:
            Z = np.hstack((X,Y))
        elif X.ndim > 1:
            Z = np.vstack((X,Y))
        print(Z.shape)
        # permute samples and compute test statistic
        perm_inds = np.random.permutation(len(Z))
        Z = Z[perm_inds]
        X_ = Z[:len(X)]
        Y_ = Z[len(X):]
        my_test_statistic = test_statistic(X_, Y_)
        statistics[i] = my_test_statistic
    return statistics

def quadratic_time_mmd(X,Y,kernel):
    # assert X.ndim == Y.ndim == 2
    K_XX = kernel(X,X)
    K_XY = kernel(X,Y)
    K_YY = kernel(Y,Y)
       
    n = len(K_XX)
    m = len(K_YY)
    
    # unbiased MMD statistic (could also use biased, doesn't matter if we use permutation tests)
    np.fill_diagonal(K_XX, 0)
    np.fill_diagonal(K_YY, 0)
    mmd = np.sum(K_XX) / (n*(n-1))  + np.sum(K_YY) / (m*(m-1))  - 2*np.sum(K_XY)/(n*m)
    return mmd

def plot_permutation_samples(null_samples, statistic=None):
    plt.hist(null_samples)
    plt.axvline(x=np.percentile(null_samples, 2.5), c='b')
    legend = ["95% quantiles"]
    if statistic is not None:
        plt.axvline(x=statistic, c='r')
        legend += ["Actual test statistic"]
    plt.legend(legend)
    plt.axvline(x=np.percentile(null_samples, 97.5), c='b')
    plt.xlabel("Test statistic value")
    plt.ylabel("Counts")

In [19]:
num_permutations = 200

my_kernel = lambda X,Y : lr_sig_kernel(X, Y)
my_mmd = lambda X,Y : quadratic_time_mmd(X[:,np.newaxis],Y[:,np.newaxis], my_kernel)

statistics = two_sample_permutation_test(my_mmd, X, Y, num_permutations)
my_statistic = my_mmd(X,Y)

plot_permutation_samples(statistics, my_statistic)

  0%|          | 0/200 [00:00<?, ?it/s]

(2000, 200, 1)





ValueError: Only input sequence arrays with ndim==2 or ndim==3 are supported.