In [None]:
import datetime
import inspect
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import random
import seaborn as sns
import signatory
import sys
import torch

sys.path.insert(1, '../src/')

In [None]:
from azran_ghahramani import delta_k_t
from azran_ghahramani import get_maximal_eigengap_information
from azran_ghahramani import make_W_D_from_distances
from azran_ghahramani import multiscale_k_prototypes
from azran_ghahramani import multiscale_k_prototypes_from_W_D
from azran_ghahramani import write_maximal_eigengaps
from kernels import make_laplacian_kernel
from metrics import make_mmd_metric
from similarities import inverse_squared
from similarities import make_gaussian_similarity
from similarities import make_gaussian_similarity_from_percentile
from utils import make_colours

In [None]:
# Load the collection of tickers that the data-building cell iterates over.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this against a trusted '../data/market_tickers.pkl'.
with open('../data/market_tickers.pkl', 'rb') as fp:
    tickers = pickle.load(fp)

In [None]:
# Date bounds, presumably in YYYYMMDD form — TODO confirm.
# NOTE(review): neither constant is referenced by any other visible cell.
START_DATE = '20000101'
END_DATE = '20200620'

In [None]:
# Pickled DataFrame; later cells read its 'yearday' column and look up one
# column per ticker.
# NOTE(review): read_pickle has the same trust requirements as pickle.load.
full_df = pd.read_pickle('../data/market_full_df.pkl')

In [None]:
# Distinct 'yearday' values; each one selects a single day's rows below.
daystrings = full_df['yearday'].unique()

In [None]:
# Build `data`: for every day, the normalised path of each ticker that has a
# complete set of observations.
#   data[daystr]['points'][ticker] -> list of 390 values, each divided by the
#   first value of the day (so every stored path starts at 1.0).
# Days without any complete ticker get no entry at all.
data = {}

for daystr in daystrings:
    # dropna() returns a new frame, so the boolean-filtered slice of full_df
    # is never mutated in place (this is what triggers pandas'
    # SettingWithCopyWarning when inplace=True is used on a slice).
    day_subset = full_df[full_df['yearday'] == daystr].dropna(axis='columns')

    for ticker in tickers:
        if ticker not in day_subset.columns:
            continue

        ticker_values = day_subset[ticker].values
        ticker_values = ticker_values[~np.isnan(ticker_values)]

        # Keep only tickers with exactly 390 observations for the day
        # (presumably one per minute of the session — TODO confirm).
        if len(ticker_values) != 390:
            continue

        scaled_ticker_values = ticker_values / ticker_values[0]
        day_entry = data.setdefault(daystr, {'points': {}})
        day_entry['points'][ticker] = scaled_ticker_values.tolist()

In [None]:
# Value passed as depth= to signatory.signature / signatory.logsignature in
# the feature cells below.
signature_depth = 4

In [None]:
# For every day, compute the signature and logsignature (depth
# `signature_depth`) of each index-augmented path and store them per ticker
# under data[daystr]['signatures'] / data[daystr]['logsignatures'].
for daystr in data:
    day_signatures = {}
    day_logsignatures = {}

    for ticker, path in data[daystr]['points'].items():
        # Pair every value with its position: (0, p0), (1, p1), ...
        indexed_path = list(enumerate(path))
        # Reshape to a batch holding one path with two channels.
        path_tensor = torch.tensor(indexed_path).reshape(
            (1, len(indexed_path), 2)
        )

        day_signatures[ticker] = signatory.signature(
            path_tensor, depth=signature_depth
        ).tolist()[0]
        day_logsignatures[ticker] = signatory.logsignature(
            path_tensor, depth=signature_depth
        ).tolist()[0]

    data[daystr]['signatures'] = day_signatures
    data[daystr]['logsignatures'] = day_logsignatures

In [None]:
def absolute_path(path):
    '''Return the running total of absolute increments of `path`.

    The result starts at path[0]; every later entry adds
    |path[n] - path[n-1]| to the previous entry, so the output has the same
    length as the input. An empty `path` raises IndexError.
    '''
    running_total = path[0]
    accumulated = [running_total]

    for previous, current in zip(path[:-1], path[1:]):
        running_total = running_total + abs(current - previous)
        accumulated.append(running_total)

    return accumulated

In [None]:
# Same feature extraction as for the raw paths, but applied to
# absolute_path(path): store the signature and logsignature per ticker under
# data[daystr]['absolute_signatures'] / data[daystr]['absolute_logsignatures'].
for daystr in data:
    day_abs_signatures = {}
    day_abs_logsignatures = {}

    for ticker, path in data[daystr]['points'].items():
        # Pair every accumulated value with its position: (0, a0), (1, a1), ...
        indexed_abs_path = list(enumerate(absolute_path(path)))
        # Reshape to a batch holding one path with two channels.
        path_tensor = torch.tensor(indexed_abs_path).reshape(
            (1, len(indexed_abs_path), 2)
        )

        day_abs_signatures[ticker] = signatory.signature(
            path_tensor, depth=signature_depth
        ).tolist()[0]
        day_abs_logsignatures[ticker] = signatory.logsignature(
            path_tensor, depth=signature_depth
        ).tolist()[0]

    data[daystr]['absolute_signatures'] = day_abs_signatures
    data[daystr]['absolute_logsignatures'] = day_abs_logsignatures

Make a plot of the paths

In [None]:
# Maximum number of paths per day kept under 'plotted_paths' for plotting.
paths_per_day = 2

In [None]:
# Record, per day, the first `paths_per_day` paths (all of them when the day
# has fewer) under data[daystr]['plotted_paths'].
for daystr in data:
    every_path = list(data[daystr]['points'].values())
    # Slicing already caps the list at paths_per_day entries.
    data[daystr]['plotted_paths'] = every_path[:paths_per_day]

In [None]:
# Apply seaborn's default styling to the matplotlib figures drawn below.
sns.set()

In [None]:
# Overlay the selected paths of every day on one figure.
plotted_paths = [
    path
    for daystr in data
    for path in data[daystr]['plotted_paths']
]

for path in plotted_paths:
    plt.plot(path, color='purple', lw=0.25)

plt.title('Daily Price Evolution of Population')
plt.xlabel('Minutes Elapsed')
plt.ylabel('Relative Price')
plt.show()

In [None]:
def make_wrapped_metric(metric, key, max_per_col=None):
    '''Adapt `metric` so that it can be called with two day strings.

    The returned function looks up data[daystr][key] for both days in the
    notebook-level `data` dict, turns the dict values into lists, optionally
    keeps only the first `max_per_col` entries of each list, and returns
    metric(list_for_day1, list_for_day2).
    '''
    def _collection(daystr):
        values = list(data[daystr][key].values())
        return values if max_per_col is None else values[:max_per_col]

    def wrapped_metric(daystr1, daystr2):
        return metric(_collection(daystr1), _collection(daystr2))

    return wrapped_metric

In [None]:
# Day-to-day metric built from a Laplacian kernel (sigma=0.5) over the
# unscaled 'logsignatures', using at most the first 100 entries per day.
# NOTE(review): `kernel`, `mmd` and `metric` are all rebound by a later cell
# before `metric` is first called in the visible notebook.
kernel = make_laplacian_kernel(sigma=0.5)
mmd = make_mmd_metric(kernel, kernel_repeated_arg_value=1)
metric = make_wrapped_metric(mmd, key='logsignatures', max_per_col = 100)

In [None]:
def make_distances(points, metric):
    '''Get distances between all points

    Returns a ragged list with len(points) - 1 rows: row i holds
    metric(points[i], points[j]) for every j > i, in increasing j.
    A progress line is printed on every fifth row.
    '''
    n_points = len(points)
    distances = []

    for i in range(n_points - 1):
        row_number = i + 1
        if row_number % 5 == 0:
            print(f'starting iteration {row_number} of {n_points}')

        distances.append([
            metric(points[i], points[j])
            for j in range(row_number, n_points)
        ])

    return distances

In [None]:
def get_eigengap_values(t_values, evalues, max_clusters=None, ignore_one_clustering=False):
    '''Tabulate delta_k_t for every candidate cluster count and step count.

    Returns {k: {t: delta_k_t(k-1, t, evalues) for t in t_values}} where k
    runs from 1 (or 2 when `ignore_one_clustering` is set) up to, but not
    including, the upper bound. The upper bound is len(evalues) - 1, further
    capped at `max_clusters` when that is given.
    '''
    upper_bound = len(evalues) - 1
    if max_clusters is not None:
        upper_bound = min(upper_bound, max_clusters)

    first_k = 2 if ignore_one_clustering else 1

    return {
        k: {t: delta_k_t(k-1, t, evalues) for t in t_values}
        for k in range(first_k, upper_bound)
    }


In [None]:
# Day strings in a fixed order; later cells refer to days by their integer
# position in this list.
point_keys = list(data.keys())

Investigate the impact of the choice of sigma on the vector of distances

In [None]:
def scale_vector(orig, min_vec, range_vec, skipped_indices=None):
    '''Shift and scale `orig` component-wise.

    Component n becomes (orig[n] - min_vec[n]) / range_vec[n]. Components
    whose index appears in `skipped_indices` are not scaled and are set to 0
    instead, which lets callers skip components with a zero range.

    All three vectors must have the same length (checked with an assert).
    Returns a new list; the inputs are not modified.
    '''
    assert len(orig) == len(min_vec) == len(range_vec)

    # A set makes each membership test O(1); the original list lookup made
    # the whole function quadratic in the vector length.
    skipped = set(skipped_indices) if skipped_indices is not None else set()

    return [
        0 if n in skipped else (entry - low) / span
        for n, (entry, low, span) in enumerate(zip(orig, min_vec, range_vec))
    ]

In [None]:
def scale_key(key, upper_percentile):
    '''Add a percentile-scaled copy of data[daystr][key] to every day.

    The vectors stored under `key` for all days in `point_keys` are pooled.
    For each component, the `upper_percentile` and (100 - upper_percentile)
    percentiles of the pool give an upper and a lower bound. Every vector is
    then passed through scale_vector with the lower bounds as offset and
    (upper - lower) as range; components whose range is close to zero are
    skipped, so scale_vector sets them to 0.

    The result is written to data[daystr][f'scaled_{key}'], keyed by ticker.
    Nothing is returned.
    '''
    pooled = []
    for daystr in point_keys:
        pooled.extend(data[daystr][key].values())

    lower_percentile = 100 - upper_percentile
    n_components = len(pooled[0])

    def component_percentiles(q):
        # One percentile per vector component, taken across the whole pool.
        return [
            np.percentile([vec[n] for vec in pooled], q)
            for n in range(n_components)
        ]

    upper_vector = component_percentiles(upper_percentile)
    lower_vector = component_percentiles(lower_percentile)
    range_vector = [hi - lo for hi, lo in zip(upper_vector, lower_vector)]

    # Components with (near-)zero spread cannot be divided by their range.
    skipped_indices = [
        n for n, spread in enumerate(range_vector) if np.isclose(spread, 0)
    ]

    for day_record in data.values():
        day_record[f'scaled_{key}'] = {
            ticker: scale_vector(vec, lower_vector, range_vector, skipped_indices)
            for ticker, vec in day_record[key].items()
        }

In [None]:
# Add a 'scaled_<key>' entry to every day for each feature family, using the
# 5th/95th percentiles as scaling bounds.
for feature_key in (
    'logsignatures',
    'signatures',
    'absolute_signatures',
    'absolute_logsignatures',
):
    scale_key(feature_key, upper_percentile=95)

In [None]:
def make_W_D_from_distances(n_points, similarity, distances, self_similarity = 1):
    '''Build the symmetric affinity matrix W and its degree matrix D.

    `distances` is ragged upper-triangular: distances[i][j-i-1] is the
    distance between points i and j for j > i. Off-diagonal entries of W are
    similarity(distance); diagonal entries are `self_similarity`. D is the
    diagonal matrix of the row sums of W.

    Raises AssertionError when W is not full rank.

    NOTE(review): this definition shadows the make_W_D_from_distances
    imported from azran_ghahramani at the top of the notebook.
    '''
    W = np.empty((n_points, n_points))

    for row in range(n_points):
        W[row, row] = self_similarity
        for col in range(row + 1, n_points):
            # Row `row` of `distances` starts at column row + 1.
            weight = similarity(distances[row][col - row - 1])
            W[row, col] = weight
            W[col, row] = weight

    # Degree matrix: row sums of W on the diagonal.
    D = np.diag([W[row].sum() for row in range(n_points)])

    # W is assumed to be full rank
    assert np.linalg.matrix_rank(W) == len(W), 'W is not full rank'
    return W, D

In [None]:
def get_spectrum(matr):
    '''Eigen-decompose `matr` and sort by eigenvalue, largest first.

    Returns (evalues, evectors) where evectors[n] is the eigenvector that
    belongs to evalues[n], i.e. eigenvectors are rows of the returned array.
    '''
    evalues, evectors = np.linalg.eig(matr)

    # Sort eigenvalues from largest to smallest; update eigenvectors
    descending = np.argsort(evalues)[::-1]

    # np.linalg.eig returns eigenvectors as columns, so transpose before
    # reordering to get one eigenvector per row.
    return evalues[descending], evectors.T[descending]

In [None]:
# Metric actually used for the distance computation below: Laplacian kernel
# with sigma=0.1 over 'scaled_absolute_logsignatures', using at most the
# first 100 entries per day. Rebinds `kernel`, `mmd` and `metric`.
kernel = make_laplacian_kernel(sigma=0.1)
mmd = make_mmd_metric(kernel, kernel_repeated_arg_value=1)
metric = make_wrapped_metric(
    mmd,
    key = 'scaled_absolute_logsignatures',
    max_per_col = 100
)

In [None]:
# Pairwise distances between days; `metric` is evaluated once for every
# unordered pair, so this cell's cost grows quadratically with the number
# of days.
# `distances` is ragged: row i holds the distances from day i to each later day.
distances = make_distances(point_keys, metric)
flat_distances = [el for ls in distances for el in ls]
largest_distance = max(flat_distances)

In [None]:
def similarity(x):
    '''Inverse fifth-power similarity of a distance: 1 / x**5.'''
    return 1 / x**5


# Diagonal of W: the similarity at the largest observed distance, which for
# positive distances is the smallest off-diagonal similarity.
self_similarity = similarity(largest_distance)

# `distances` has one row fewer than there are points (row i only holds the
# pairs (i, j) with j > i), so the matrix size must come from point_keys.
# Sizing by len(distances) silently dropped the last day from W and D.
W, D = make_W_D_from_distances(
    n_points = len(point_keys),
    similarity = similarity,
    distances = distances,
    self_similarity = self_similarity
)

# Row-normalise W by the degree matrix: D^-1 W.
transition_matrix = np.linalg.inv(D).dot(W)

evalues, evectors = get_spectrum(transition_matrix)

# Step counts: 1..999, followed by 1000 log-spaced values from 1e3 to 1e4.
t_values_first_segment = list(range(1, 1000))
t_values_second_segment = np.logspace(start=3, stop=4, num=1000).tolist()
t_values = t_values_first_segment + t_values_second_segment

eigengap_values = get_eigengap_values(t_values, evalues, ignore_one_clustering=True)
max_egap_vals, max_attained = get_maximal_eigengap_information(eigengap_values)

colours = make_colours(len(max_attained))

# One curve per cluster count in max_attained, with a dashed vertical line
# at the step count stored under 'n_steps' for that cluster count.
for idx, cluster in enumerate(max_attained):
    cluster_colour = colours[idx]

    cluster_egap_separation_values = list(eigengap_values[cluster].values())
    plt.plot(t_values, cluster_egap_separation_values, color=cluster_colour)
    maxima_location = max_attained[cluster]['n_steps']
    plt.axvline(
        x = maxima_location,
        color = cluster_colour,
        linestyle = '--',
        label = f'{cluster} Clusters Maxima'
    )

plt.xscale('log')
plt.ylim(0, 1.2)
plt.legend()
plt.xlabel('Steps')
plt.ylabel('Eigengap Separation')
plt.title('Maximal Eigengap Separation for Number of Steps')

# Cluster the days; `output` is read by get_partition in the cells below.
output = multiscale_k_prototypes_from_W_D(data, W, D)
print(json.dumps(output, indent=2))

In [None]:
def get_partition(output, n_clusters):
    '''Return the 'partition' of the first entry in `output` whose
    'n_clusters' equals `n_clusters`.

    Raises IndexError when no entry matches.
    '''
    matches = list(
        filter(lambda entry: entry['n_clusters'] == n_clusters, output)
    )
    return matches[0]['partition']

In [None]:
# Partition reported for the three-cluster solution; the cells below treat
# each cluster as a collection of positions into point_keys.
three_partition = get_partition(output, 3)

In [None]:
# Display the three-cluster partition.
three_partition

In [None]:
# One plot colour per cluster, looked up by the cluster's position in
# three_partition.
cluster_colours = ['red', 'green', 'blue']

In [None]:
# One figure per cluster, drawing the paths of every day in that cluster.
# The cluster at index 1 shows every path of its days; the other clusters
# show only the pre-selected 'plotted_paths'.
# (Alternative palette: cluster_colours = make_colours(len(three_partition)).)
for cluster_idx, cluster in enumerate(three_partition):
    cluster_colour = cluster_colours[cluster_idx]
    show_every_path = cluster_idx == 1

    for idx in cluster:
        day_record = data[point_keys[idx]]

        if show_every_path:
            cluster_paths = list(day_record['points'].values())
        else:
            cluster_paths = day_record['plotted_paths']

        for path in cluster_paths:
            plt.plot(path, color=cluster_colour, lw=0.2)

    plt.ylim(0.92, 1.10)
    plt.title(f'Market Data\nCluster {cluster_idx+1}')
    plt.xlabel('Minutes Elapsed')
    plt.ylabel('Relative Price')
    plt.show()

In [None]:
def get_daystrings(indexes):
    '''Map positions in the notebook-level `point_keys` list to their day
    strings, preserving the order of `indexes`.'''
    selected = []
    for position in indexes:
        selected.append(point_keys[position])
    return selected

In [None]:
# Display the day strings of the cluster at index 2.
get_daystrings(three_partition[2])

In [None]:
def quadratic_variation(path):
    '''Return the mean of the squared increments of `path`.

    Note that the squared differences between consecutive values are
    averaged with np.mean, not summed.
    '''
    squared_increments = [
        (current - previous) ** 2
        for previous, current in zip(path[:-1], path[1:])
    ]
    return np.mean(squared_increments)

In [None]:
# Summary statistics per cluster, pooled over every path of every day in the
# cluster: path count, mean final value, and mean quadratic_variation.
for idx, cluster in enumerate(three_partition):
    all_paths = [
        path
        for daystr in get_daystrings(cluster)
        for path in data[daystr]['points'].values()
    ]

    mean = np.mean([path[-1] for path in all_paths])
    average_qv = np.mean([quadratic_variation(path) for path in all_paths])

    print()
    print('cluster index: ', idx)
    print('number of paths: ', len(all_paths))
    print('average final value: ', mean)
    print('average QV: ', average_qv)

In [None]:
# Display the day strings of the cluster at index 1.
get_daystrings(three_partition[1])