# Non linear feature extraction of the stratosphere

## Set-up: 

### Imports:

In [6]:
from scipy.spatial import distance_matrix
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm
from time import sleep
import warnings
from sklearn.neighbors import NearestNeighbors
import scipy.sparse.linalg as linalg
import scipy.signal as signal
import seaborn as sns
import matplotlib.dates as mdates
from datetime import datetime
import os
import math 
warnings.filterwarnings('ignore')

from functions_takens import *

%load_ext jupyternotify
%load_ext autoreload
%autoreload 2

The jupyternotify extension is already loaded. To reload it, use:
  %reload_ext jupyternotify
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Constants:

In [7]:
# Percentage of nearest neighbours used for the Laplacian eigenmaps:
PERC_NEIGH = 10
# PERC_NEIGH is a percentage, so report it as one (the previous message
# printed the fraction 0.1 under the label "Percentage").
print(f'Percentage of nearest neighbours: {PERC_NEIGH}%')

# Number of eigenmaps to compute:
NUM_EIGENVALUES = 21
print(f'Number of eigenmaps: {NUM_EIGENVALUES}')

# Data set to consider: 'raw' or 'anomalies'
DATA = 'anomalies'
print(f'Data set considered: {DATA}')

# Path to input data:
INPUT_DATA = '../../../data/vandermeer/input_data/'
print(f'Path to input data: {INPUT_DATA}')

# Whether to do NLSA (True, uses the Takens embedding) or plain
# Laplacian eigenmaps (False):
USE_TAKENS = True
print(f'Using takens embedding: {USE_TAKENS}')

# Number of samples counted as one month when sizing tau.
# NOTE(review): 4 * 30 presumably means 4 samples per day over 30 days - confirm.
SAMPLES_PER_MONTH = 4 * 30

# Time-step in Takens embedding (two months of samples):
TAU = 2 * SAMPLES_PER_MONTH
print(f'tau = {TAU/SAMPLES_PER_MONTH} months')

# Years covered by the data; the notebook iterates range(BEGIN_YEAR, END_YEAR),
# so END_YEAR itself is excluded.
BEGIN_YEAR = 1979
END_YEAR = 2019


Percentage of nearest neighbours: 0.1
Number of eigenmaps: 21
Data set considered: anomalies
Path to input data: ../../../data/vandermeer/input_data/
Using takens embedding: True
tau = 2.0 months


In [8]:
# Font sizes shared by all figures of the notebook:
SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 14

# One entry per matplotlib rc group; keys are the rc parameters of that group.
_RC_FONT_SIZES = {
    'font': {'size': MEDIUM_SIZE},  # default text size
    'axes': {'titlesize': BIGGER_SIZE,  # axes title
             'labelsize': BIGGER_SIZE},  # x and y labels
    'xtick': {'labelsize': MEDIUM_SIZE},  # x tick labels
    'ytick': {'labelsize': MEDIUM_SIZE},  # y tick labels
    'legend': {'fontsize': BIGGER_SIZE},  # legend entries
    'figure': {'titlesize': BIGGER_SIZE},  # figure title
}
for rc_group, rc_values in _RC_FONT_SIZES.items():
    plt.rc(rc_group, **rc_values)

### Paths:

Global path to save data: 

In [9]:
# Root directory for the cached pickles, one sub-directory per data set.
# Fail early on an unknown data set: previously PATH was simply left
# undefined (or stale from an earlier run) in that case.
if DATA not in ('raw', 'anomalies'):
    raise ValueError(f"DATA must be 'raw' or 'anomalies', got {DATA!r}")
PATH = f'../../../data/vandermeer/pickles/{DATA}/'
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(PATH, exist_ok=True)
print(f'Global path: {PATH}')

Global path: ../../../data/vandermeer/pickles/anomalies/


Path to save data according to the percentage of neighbours used:

In [10]:
# Directory for everything that depends on the neighbour percentage.
PATH1 = f'{PATH}{PERC_NEIGH}perc/'
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(PATH1, exist_ok=True)
print(f'Precise path: {PATH1}')

Precise path: ../../../data/vandermeer/pickles/anomalies/10perc/


Path to save data when using simple kernel (binary kernel): 

In [11]:
# Directories for the binary ("simple") kernel results.
# Without Takens:
PATH_SIMPLE = PATH1 + 'simple_kernel/'
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(PATH_SIMPLE, exist_ok=True)
print(f'Path to simple kernel: {PATH_SIMPLE}')

# With Takens (sub-directory of the one above):
PATH_SIMPLE_TAKENS = PATH_SIMPLE + 'takens/'
os.makedirs(PATH_SIMPLE_TAKENS, exist_ok=True)
print(f'Path to simple kernel for NLSA with Takens embedding: {PATH_SIMPLE_TAKENS}')

Path to simple kernel: ../../../data/vandermeer/pickles/anomalies/10perc/simple_kernel/
Path to simple kernel for NLSA with Takens embedding: ../../../data/vandermeer/pickles/anomalies/10perc/simple_kernel/takens/


Path to save results of NLSA:

In [12]:
# Directory for the NLSA results; PATH1 already is PATH + '<PERC_NEIGH>perc/'.
PATH_TAKENS = PATH1 + 'takens/'
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(PATH_TAKENS, exist_ok=True)
print(f'Path to NLSA results: {PATH_TAKENS}')

Path to NLSA results: ../../../data/vandermeer/pickles/anomalies/10perc/takens/


### Load data:

Load input raw or anomalies data and basis coefficients

In [13]:
%%time
anomalies_cf = pd.read_csv(INPUT_DATA + 'anomalies_coefficients.csv', sep=',')
raw_cf = pd.read_csv(INPUT_DATA + 'raw_data_coefficients.csv', sep=',')
basis_cf = pd.read_csv(INPUT_DATA + 'basis_functions.csv', sep=',')

if DATA == 'raw':
    df = raw_cf
elif DATA == 'anomalies':
    df = anomalies_cf

# remove useless axes:
df = df.drop(['Unnamed: 0', 'Date'], axis=1)
print('Sample of data:')
print(f'Shape of {DATA} data: {df.shape}')
pd.DataFrame(df).head(3)

Sample of data:
Shape of anomalies data: (33960, 1001)
CPU times: user 21.3 s, sys: 1.37 s, total: 22.7 s
Wall time: 22.7 s


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X992,X993,X994,X995,X996,X997,X998,X999,X1000,X1001
0,0.087772,0.306971,-0.358278,-0.713848,0.431876,0.233288,0.079006,0.182841,0.420922,0.575587,...,0.010085,0.000343,0.013018,-0.008956,0.001028,-0.000141,0.009586,-0.004209,0.000906,0.011727
1,0.04579,0.298297,-0.37376,-0.684706,0.456666,0.171122,0.098944,0.153985,0.360332,0.543631,...,0.004224,-0.002052,0.002568,0.003769,0.000829,-0.001214,0.010168,-0.005202,0.007966,0.002296
2,0.044001,0.297177,-0.449389,-0.679847,0.513206,0.161827,0.08239,0.152011,0.281488,0.510341,...,0.009204,-0.010951,0.000831,-0.004194,-0.000835,0.0011,-0.000148,-0.005812,0.000562,-0.005325


## Takens through takens matrix:

Create the Takens embedding matrix directly from the data matrix. 

In [None]:
# Re-read the coefficient table selected by DATA (this copy keeps the 'Date' column).
csv_name = {'raw': 'raw_data_coefficients.csv',
            'anomalies': 'anomalies_coefficients.csv'}.get(DATA)
if csv_name is not None:
    df = pd.read_csv(INPUT_DATA + csv_name, sep=',')
print(f'Input data shape: {df.shape}')

### Create Takens matrix:

From the data matrix $X$ (raw or anomalies) we build the Takens embedding matrix $Y$ as follows:

$Y = X[1 : (T - m\tau);\ 1 : (D \times \tau)]$, where $Y[t; 1 : D] = X[t; 1 : D]$, $Y[t; (D + 1) : (2D)] = X[t + 1; 1 : D]$, and so on up to $X[t + \tau - 1; 1 : D]$.
Here $m$ is the number of entire winters, $T$ the number of time samples and $D$ the number of columns (the dimension) of the original data matrix.

In [None]:
# Build a frame indexed by date that holds only the coefficient columns.
time = pd.to_datetime(df['Date'])
df_ = df.drop(['Unnamed: 0', 'Date'], axis=1)
df_time = pd.concat([time, df_], axis=1).set_index('Date')

# Years in the data (END_YEAR is excluded):
years = [str(y) for y in range(BEGIN_YEAR, END_YEAR)]

# Count number of samples per year.
# Row selection by a year string must go through .loc: df_time['1979'] relied
# on DataFrame.__getitem__ falling back to row selection, which pandas
# deprecated and then removed.
counts = {y: len(df_time.loc[y]) for y in years}
# Total number of samples over all listed years.
sum_ = sum(counts.values())
print('For example, number of samples in 1979 and 1980: {}, {}'.format(counts['1979'], counts['1980']))

In [None]:
# Running total of the yearly sample counts, one entry per year.
cs = [counts[year] for year in years]
last_year_pos = list(np.cumsum(cs))

# Number of entire winters, i.e. winters that range
# from Dec - April (not like 1979 and 2019)
entire_winters = END_YEAR - BEGIN_YEAR - 1
print(f'Number of entire winters in data: {entire_winters}')
print(f'Last position of first and second year: {last_year_pos[0]}, {last_year_pos[1]}')

Get first and last position of each winter for each year. A winter stretches from Dec-April -->only for entire winters (not like 1979 and 2019).

In [None]:
# Last and first row position of every entire winter (helpers from functions_takens).
last_pos_winters = give_last_post_winters(last_year_pos, years, df_time)
first_pos_winters = give_first_post_winters(last_year_pos, years, df_time)
assert len(first_pos_winters) == len(last_pos_winters)

# Store both position lists next to the other cached results:
for stem, positions in (('last_pos_winters', last_pos_winters),
                        ('first_pos_winters', first_pos_winters)):
    np.save(PATH1 + stem + '.npy', positions)

print(f'First position of first and second winter: {first_pos_winters[0]}, {first_pos_winters[1]}')
print(f'Last position of first and second winter: {last_pos_winters[0]}, {last_pos_winters[1]}')

From the first and last position of each winter, get the indices of all points considered during the Takens embedding. We range from $first$ to $last - \tau$, where $\tau$ is the length of the embedding window (here two months). These indices are the only ones used when building the Takens matrix. Because $Y = X[1 : (T - \tau);\ 1 : (D \times \tau)]$, we stop at $last - \tau$ so that every row stays within its own winter.

In [None]:
# One range per winter: from its first position up to (last - TAU), inclusive.
indices_of_points = [
    range(first_pos_winters[w], last_pos_winters[w] - TAU + 1, 1)
    for w in range(len(first_pos_winters))
]
for winter_range in indices_of_points:
    # every range must span more than 600 positions, i.e. a full winter
    assert winter_range[-1] - winter_range[0] > 600
print('First three takens indicices:')
indices_of_points[:3]

Safety check to see everything is working all right:

In [None]:
# Number of points summed over all winter ranges:
le = np.sum([len(winter_range) for winter_range in indices_of_points])
# Dates of exactly those points, winter after winter:
time_nlsa = time[indices_of_points[0]]
for winter_range in indices_of_points[1:len(last_pos_winters)]:
    time_nlsa = pd.concat([time_nlsa, time[winter_range]], axis=0)
assert len(time_nlsa) == le

In [None]:
# Flatten the per-winter ranges into a single list of row indices:
indices_m = [position
             for winter_range in indices_of_points
             for position in winter_range]
assert le == len(indices_m)
print(f'First and last indice: {indices_m[0]}, {indices_m[-1]}')

Create the empty shell of the Takens matrix that we fill below. It has shape $(T - m\tau, D \times \tau)$, where $m$ is the number of entire winters, $T$ the number of time samples and $D$ the number of coefficients (columns) we keep from the original data matrix. Here we take $D = 150$, while the original data has 1001 coefficient columns (1003 columns in the CSV, including `Unnamed: 0` and `Date`). This is to speed up computations.

In [None]:
# Entire winters covered by the data:
num_years = (END_YEAR - BEGIN_YEAR) - 1
print(f'Number of entire winters: {num_years}')

# Coefficient values only (no counter / date columns):
X = df.drop(columns=['Unnamed: 0', 'Date']).values
# Number of coefficient columns kept per time step:
D = 150
print(f'Tau: {TAU}')
print(f'D: {D}')
# One row per retained point, TAU blocks of D coefficients per row:
m = le
n = TAU * D
takens_Y = np.zeros(shape=(m, n))
print(f'New Takens matrix shape: {takens_Y.shape}')

Fill the matrix by concatenating rows of the data matrix. Careful, don't concatenate with points of other years, that's why we extracted the indices of points before. This way we don't cross-over to another winter. 

Re-run this or load pre-saved matrix.

In [None]:
# Disabled cell: the code is wrapped in a string literal, so nothing below runs.
# To recompute the Takens matrix, remove the surrounding triple quotes.
# The disabled code fills each row i with TAU consecutive rows of X[:, :D],
# starting at indices_m[i], and pickles the result under PATH1.
"""
%%time
m, n = takens_Y.shape[0], takens_Y.shape[1]
print(f'Takens matrix being constructed of shape {takens_Y.shape}')

for i in tqdm(range(m)):
    indice_row = indices_m[i]
    # X[t, 1:D]
    row = X[indice_row, :D]
    for t in range(1, TAU):
        # X[t+1, 1:D]
        row = np.concatenate((row, X[indice_row + t, :D]))
    takens_Y[i, :len(row)] = row

takens_Y_df = pd.DataFrame(takens_Y)

# Save matrix:
print(f'Saving Takens matrix at {PATH1}')
takens_Y_df.to_pickle(PATH1 + 'takens_Y_df_{}.pkl'.format(D))"""

Read matrix:

In [None]:
# Load the cached Takens matrix for the current number of coefficients D.
takens_pickle = f'{PATH1}takens_Y_df_{D}.pkl'
takens_Y_df = pd.read_pickle(takens_pickle)
print(takens_Y_df.shape)
takens_Y_df.head()

In [None]:
%%time
# Show the Takens matrix as an image, then drop the frame to free memory.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(takens_Y_df.values)
del takens_Y_df

### Create distance matrix:

From this Takens matrix calculate the distance between all points. This will be used later when creating the Laplacian. 

In [None]:
%%time
D = 150

if DATA == 'raw':
    df = pd.read_csv(INPUT_DATA + 'raw_data_coefficients.csv', sep=',')
elif DATA == 'anomalies': 
    df = pd.read_csv(INPUT_DATA + 'anomalies_coefficients.csv', sep=',')
print(f'Input data shape: {df.shape}')

Note: takes a lot of time to compute. Uncomment below if you want to re-compute it. Otherwise can just load a pre-saved one. 

In [None]:
# Disabled cell: the code is wrapped in a string literal, so nothing below runs.
# The disabled code computes all pairwise distances between the rows of X
# (first D coefficient columns) and pickles them under PATH1.
# NOTE(review): it calls the scipy function `distance_matrix`; other cells of
# this notebook rebind that name to an array, after which this call would fail.
"""
%%time
X = df.drop(['Unnamed: 0', 'Date'], axis = 1).values[:,:D]

print(f'Distance matrix for {DATA}')
distance_df = pd.DataFrame(distance_matrix(X, X),
                           index= df.drop(['Unnamed: 0', 'Date'], axis = 1).index,
                           columns= df.drop(['Unnamed: 0', 'Date'], axis = 1).index)
distance_df.to_pickle(PATH1 + 'distance_matrix_{}.pkl'.format(D))"""

In [None]:
%%time
print(f'Distance matrix read from {PATH1}')
distance_matrix = pd.read_pickle(PATH1 + 'distance_matrix_{}.pkl'.format(D)).values

#check distance matrix is symmetric:
assert(np.all(distance_matrix.T == distance_matrix))

print(f'Distance matrix shape: {distance_matrix.shape}')
print('Sample of distance matrix:')
pd.DataFrame(distance_matrix).head(3)

## Takens embedding through distance matrix:

A second method to get the distance matrix of the Takens embedding is to apply another formula:  

$dist(Y(t_1), Y(t_2))=\sqrt{\sum_{j=0}^{\tau-1}dist(X(t_1 +j),X(t_2+j))^2}$ 

Where $\tau$ is the size of the embedding. Here we build the distance matrix directly from the distance matrix on the original data, without needing to build the Takens matrix first. 

In [None]:
%%time
# Read the original distance matrix:
D = 150
distance_matrix = pd.read_pickle(PATH1 + 'distance_matrix_{}.pkl'.format(D)).values
print(f'Distance matrix shape: {distance_matrix.shape}')

In [None]:
# Re-read the coefficient table selected by DATA (keeps the 'Date' column).
csv_name = {'raw': 'raw_data_coefficients.csv',
            'anomalies': 'anomalies_coefficients.csv'}.get(DATA)
if csv_name is not None:
    df = pd.read_csv(INPUT_DATA + csv_name, sep=',')
print(f'Input data shape: {df.shape}')

Get time information and the number of measures per year: 

In [None]:
# Build a frame indexed by date that holds only the coefficient columns.
time = pd.to_datetime(df['Date'])
df_ = df.drop(['Unnamed: 0', 'Date'], axis=1)
df_time = pd.concat([time, df_], axis=1).set_index('Date')

# Years in the data (END_YEAR is excluded):
years = [str(y) for y in range(BEGIN_YEAR, END_YEAR)]

# Count number of samples per year.
# Row selection by a year string must go through .loc: df_time['1979'] relied
# on DataFrame.__getitem__ falling back to row selection, which pandas
# deprecated and then removed.
counts = {y: len(df_time.loc[y]) for y in years}
# Total number of samples over all listed years.
sum_ = sum(counts.values())
print('For example, number of samples in 1979 and 1980: {}, {}'.format(counts['1979'], counts['1980']))

In [None]:
# Running total of the yearly sample counts, one entry per year.
cs = [counts[year] for year in years]
last_year_pos = list(np.cumsum(cs))

# Number of entire winters, i.e. winters that range
# from Dec - April (not like 1979 and 2019)
entire_winters = END_YEAR - BEGIN_YEAR - 1
print(f'Number of entire winters in data: {entire_winters}')
print(f'Last position of first and second year: {last_year_pos[0]}, {last_year_pos[1]}')

Get first and last position of each winter for each year. A winter stretches from Dec-April -->only for entire winters (not like 1979 and 2019).


In [None]:
# Last and first row position of every entire winter (helpers from functions_takens).
last_pos_winters = give_last_post_winters(last_year_pos, years, df_time)
first_pos_winters = give_first_post_winters(last_year_pos, years, df_time)
assert len(first_pos_winters) == len(last_pos_winters)

# Store both position lists next to the other cached results:
for stem, positions in (('last_pos_winters', last_pos_winters),
                        ('first_pos_winters', first_pos_winters)):
    np.save(PATH1 + stem + '.npy', positions)

print(f'First position of first and second winter: {first_pos_winters[0]}, {first_pos_winters[1]}')
print(f'Last position of first and second winter: {last_pos_winters[0]}, {last_pos_winters[1]}')

Because we build from the past i.e., from one point $i$ look back till $i-\tau$ we need to keep only the indices of points that have at least $\tau$ points behind them in their year. So go from $first+\tau$ to $last$.

In [None]:
# One range per winter: from (first + TAU) up to the last position, inclusive,
# so that every kept point has TAU points of the same winter before it.
indices_of_points = [
    range(first_pos_winters[w] + TAU, last_pos_winters[w] + 1, 1)
    for w in range(len(first_pos_winters))
]
for winter_range in indices_of_points:
    assert winter_range[-1] - winter_range[0] > 600
print('Range of first three winters:')
indices_of_points[:3]

In [None]:
# Sanity check: the selected dates must be as many as the selected indices.
# Number of points summed over all winter ranges:
le = np.sum([len(winter_range) for winter_range in indices_of_points])
# Dates of exactly those points, winter after winter:
time_nlsa = time[indices_of_points[0]]
for winter_range in indices_of_points[1:len(last_pos_winters)]:
    time_nlsa = pd.concat([time_nlsa, time[winter_range]], axis=0)
assert len(time_nlsa) == le

Unwrap the indices of all ranges of `indices_of_points` to get one list of all indices we need to go over. 

In [None]:
# Flatten the per-winter ranges into a single list of row indices:
indices_m = [position
             for winter_range in indices_of_points
             for position in winter_range]
assert le == len(indices_m)
print(f'First and last indice: {indices_m[0]}, {indices_m[-1]}')

Compute the new Takens distance matrix:

In [None]:
# Entire winters covered by the data:
num_years = (END_YEAR - BEGIN_YEAR) - 1
print(f'Number of entire winters: {num_years}')
# Square matrix with one row and one column per retained point:
m = n = le
dist_Y = np.zeros(shape=(m, n))
print(f'New distance matrix shape: {dist_Y.shape}')

Uncomment below to recompute it or just load a pre-saved one.

In [None]:
# Disabled cell: the code is wrapped in a string literal, so nothing below runs.
# The disabled code calls apply_takens (from functions_takens), adds the result
# to its transpose to make it symmetric, and pickles it under PATH1.
# NOTE(review): it saves 'distance_matrix_takens_final.pkl', whereas the cells
# that load the Takens distances read 'distance_matrix_takens_{D}.pkl' - confirm
# which file name is intended before re-running.
"""
%%time
dist_Y = apply_takens(dist_Y, distance_matrix, indices_m, TAU, PATH1)

#print(f'Reading upper triangle matrix at {PATH1}:')
#dist_Y = pd.read_pickle(PATH1+'dist_Y_takens_final.pkl').values

# Make matrix symetric:
takens_matrix = dist_Y + dist_Y.T

# Save NLSA distance matrix matrix:
print(f'Save NLSA distance matrix at {PATH1}')
takens_df = pd.DataFrame(takens_matrix)
takens_df.to_pickle(PATH1 + 'distance_matrix_takens_final.pkl')

del dist_Y
"""

Read NLSA distance matrix from memory:

In [None]:
%%time
print('Reading Takens matrix:')
D = 150
takens_matrix = pd.read_pickle(
    PATH1 + 'distance_matrix_takens_{}.pkl'.format(D)).values

#check distance matrix is symmetric:
assert (np.all(takens_matrix.T == takens_matrix))

In [None]:
# Display the Takens distance matrix as a table.
pd.DataFrame(takens_matrix)

In [None]:
%%time
# Show the Takens distance matrix as an image, then drop it to free memory.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(takens_matrix)
del takens_matrix

### Nearest neighbours:
Calculate the matrix of neighbouring vertices, i.e. the entry for $x_i, x_j$ is 1 if $x_i$ or $x_j$ selected the other as one of its closest neighbours and 0 otherwise. This binary weight matrix also serves as the simple kernel.

In [None]:
%%time
# Percentage of neighbours
perc = PERC_NEIGH / 100
D = 150
print(f'Starting nearest neighbours for {perc*100}% nearest neihbours ')

# Reading input data:
if DATA == 'raw':
    print(f'Reading raw input data from: {INPUT_DATA}')
    df = pd.read_csv(INPUT_DATA + 'raw_data_coefficients.csv', sep=',')
elif DATA == 'anomalies': 
    print(f'Reading anomalies input data from: {INPUT_DATA}')
    df = pd.read_csv(INPUT_DATA + 'anomalies_coefficients.csv', sep=',')

print(f'Reading distance matrix')

if USE_TAKENS == True:
    print(f'Using takens embedding, reading from {PATH1}')
    distance_matrix = pd.read_pickle(PATH1 + 'distance_matrix_takens_{}.pkl'.format(D)).values
    print(f'Distance matrix shape: {distance_matrix.shape}')
else:
    print(f'Using normal distance matrix, reading from {PATH1}')
    distance_matrix = pd.read_pickle(PATH1+'distance_matrix_{}.pkl'.format(D)).values
    print(f'Distance matrix shape: {distance_matrix.shape}')

# K-nearest neighbours:
print(f'Look for {perc*100}% nearest neihbours:')
K = int(len(df.values) * perc)
N = len(distance_matrix)
weight_matrix = np.zeros((N, N))

for i in tqdm(range(N)):
    # select K closest neihbours:
    indices = np.argsort(distance_matrix[i])[:K]
    for j in indices:
        if i != j and weight_matrix[i, j] == 0:
            weight_matrix[i, j] += 1
weight_matrix = weight_matrix.T

# Make weight matrix is symmetric:
for i in tqdm(range(N)):
    indices = np.argsort(distance_matrix.T[i])[:K]
    for j in indices:
        if i != j and weight_matrix[i, j] == 0:
            weight_matrix[i, j] += 1
weight_df = pd.DataFrame(weight_matrix)

# Save weight matrix:
if USE_TAKENS:
    PATH_SAVE = PATH_SIMPLE_TAKENS+'weight_matrix_takens_{}.pkl'.format(PERC_NEIGH)
    print('Saving weight matrix at {}'.format(PATH_SAVE))
    weight_df.to_pickle(PATH_SAVE)
else:
    PATH_SAVE = PATH_SIMPLE+'weight_matrix_{}.pkl'.format(PERC_NEIGH)
    print('Saving weight matrix at {}'.format(PATH_SAVE))
    weight_df.to_pickle(PATH_SAVE)

del weight_df, weight_matrix, distance_matrix
print('Done')

In [None]:
%%time
# Read weight matrix:
if USE_TAKENS:
    print(f'Reading binary weight matrix from {PATH_SIMPLE_TAKENS}')
    weight_m = pd.read_pickle(PATH_SIMPLE_TAKENS +
                          'weight_matrix_takens_{}.pkl'.format(PERC_NEIGH)).values
else:
    print(f'Reading binary weight matrix from {PATH_SIMPLE}')
    weight_m = pd.read_pickle(PATH_SIMPLE +
                          'weight_matrix_{}.pkl'.format(PERC_NEIGH)).values
# Test if matrix is symetric:
assert (np.all(weight_m.T == weight_m))
print(f'Weight matrix shape: {weight_m.shape}')

In [None]:
# Display the first five rows of the binary weight matrix.
pd.DataFrame(weight_m).head(5)

Look at image of weight matrix:

In [None]:
%%time
# Show the binary weight matrix as an image, then drop it to free memory.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(weight_m)
del weight_m

### Binary kernel: 

Compute Laplacian eigenmaps for the binary kernel (0/1). 

#### Diagonal matrix:
Diagonal (degree) matrix from the binary weight matrix, computed as $D_{ii} = \sum_{j}W_{ij}$

In [None]:
%%time

if USE_TAKENS:
    weight_m = pd.read_pickle(PATH_SIMPLE_TAKENS +
                          'weight_matrix_takens_{}.pkl'.format(PERC_NEIGH)).values
else:
    weight_m = pd.read_pickle(PATH_SIMPLE +
                          'weight_matrix_{}.pkl'.format(PERC_NEIGH)).values

# Compute diagonal matrix:
D = np.zeros((len(weight_m), len(weight_m)))
print('Computing diagonal matrix:')
for i in tqdm(range(len(weight_m))):
    D[i, i] = np.sum(weight_m[i])
D_df = pd.DataFrame(D)


if USE_TAKENS:
    print(f'Saving diagonal matrix at: {PATH_SIMPLE_TAKENS}')
    D_df.to_pickle(PATH_SIMPLE_TAKENS+'diagonal_matrix_takens_{}.pkl'.format(PERC_NEIGH))
else:
    print(f'Saving diagonal matrix at: {PATH_SIMPLE}')
    D_df.to_pickle(PATH_SIMPLE+'diagonal_matrix_{}.pkl'.format(PERC_NEIGH))
print('Done!')

#### Laplacian matrix:

Laplacian matrix computed as $L = D-W$

In [None]:
%%time
print('Computing Laplacian matrix:')

L = np.subtract(D, weight_m)
L_df = pd.DataFrame(L)

if USE_TAKENS: 
    print(f'Saving laplacian matrix at: {PATH_SIMPLE_TAKENS}')
    L_df.to_pickle(PATH_SIMPLE_TAKENS + 'laplacian_simple_matrix_takens_{}.pkl'.format(PERC_NEIGH))
else:
    print(f'Saving laplacian matrix at: {PATH_SIMPLE}')
    L_df.to_pickle(PATH_SIMPLE + 'laplacian_simple_matrix_{}.pkl'.format(PERC_NEIGH))

del L, L_df, D, weight_m
print('Done!')

#### Eigenvalues:
Eigenvalues and eigenvectors for binary kernel. 

In [None]:
%%time

if USE_TAKENS:
    print(f'Reading D and L from: {PATH_SIMPLE_TAKENS}')
    D = pd.read_pickle(
        PATH_SIMPLE_TAKENS +
        'diagonal_matrix_takens_{}.pkl'.format(PERC_NEIGH)).values
    print(f'Diagonal matrix shape: {D.shape}')
    L = pd.read_pickle(
        PATH_SIMPLE_TAKENS +
        'laplacian_simple_matrix_takens_{}.pkl'.format(PERC_NEIGH)).values
    print(f'Laplacian matrix shape: {L.shape}')

else:
    print(f'Reading D and L from: {PATH_SIMPLE}')
    D = pd.read_pickle(PATH_SIMPLE +
                       'diagonal_matrix_{}.pkl'.format(PERC_NEIGH)).values
    L = pd.read_pickle(
        PATH_SIMPLE +
        'laplacian_simple_matrix_{}.pkl'.format(PERC_NEIGH)).values

# Compute first eigenvalues
print(f'Computing {NUM_EIGENVALUES} eigenvalues:')
w, eigv = linalg.eigs(L, k=NUM_EIGENVALUES, M=D, which='SM')

# Save values:
if USE_TAKENS:
    print(f'Saving eigenvalues and eigenvectors to: {PATH_SIMPLE_TAKENS}')
    pd.DataFrame(w).to_pickle(PATH_SIMPLE_TAKENS +
                              'eigenvalues_takens_{}.pkl'.format(PERC_NEIGH))
    pd.DataFrame(eigv).to_pickle(
        PATH_SIMPLE_TAKENS + 'eigenvectors_takens_{}.pkl'.format(PERC_NEIGH))
    print(f'Eigenvector shape: {eigv.shape}')
else:
    print(f'Saving eigenvalues and eigenvectors to: {PATH_SIMPLE}')
    pd.DataFrame(w).to_pickle(PATH_SIMPLE +
                              'eigenvalues_{}.pkl'.format(PERC_NEIGH))
    pd.DataFrame(eigv).to_pickle(PATH_SIMPLE +
                                 'eigenvectors_{}.pkl'.format(PERC_NEIGH))
del L, D, w, eigv
print('Done!')

### Heat kernel: 
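The heat kernel is computed as $W_{ij} = \exp\left(-\frac{\|x_j-x_i\|^2_2}{t}\right)$ for connected neighbours and 0 otherwise. Note that the code below divides the squared distance by $t^2$ (the bandwidth $t$ is chosen as a distance), i.e. it computes $\exp\left(-\|x_j-x_i\|^2_2 / t^2\right)$.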
Source:[paper](https://www2.imm.dtu.dk/projects/manifold/Papers/Laplacian.pdf)

#### Choice of bandwidth: 
Compute candidate values for the bandwidth $t$ of the heat kernel. Try the mean, median and maximum over the distances between neighbouring vertices.

In [None]:
%%time
print(f'Reading distance and weight matrix from: {PATH1}')

if USE_TAKENS: 
    print('Using takens:')
    distance_matrix = pd.read_pickle(PATH1 + 'distance_matrix_takens.pkl').values
    weight_m = pd.read_pickle(PATH_SIMPLE_TAKENS +'weight_matrix_takens_{}.pkl'.format(PERC_NEIGH)).values
else:
    distance_matrix = pd.read_pickle(PATH1+'distance_matrix.pkl').values
    weight_m = pd.read_pickle(PATH_SIMPLE +'weight_matrix_{}.pkl'.format(PERC_NEIGH)).values

In [None]:
%%time
# Select only distances that are chosen in the nearest neighbours step:
mult_df = np.multiply(distance_matrix, weight_m)
non_zero_mult = np.extract(mult_df > 0, mult_df)

In [None]:
# Mean, median and maximum of the distances between linked neighbours:
mean_distances, median_dist, max_distances = (
    statistic(non_zero_mult) for statistic in (np.mean, np.median, np.max))

# Candidate bandwidths, in the order mean / median / max:
t = [mean_distances, median_dist, max_distances]

# Summary for display; the last two entries are taken over all distances.
ts = dict(
    mean_distances=mean_distances,
    median_dist=median_dist,
    max_distances=max_distances,
    mean_dist_all=np.mean(distance_matrix),
    max_dist_all=np.max(distance_matrix),
)
# Store the candidates under the name of the kernel variant in use:
np.save(PATH1 + ('t_takens_.npy' if USE_TAKENS else 't.npy'), np.array(t))
print(ts)

In [None]:
# Histogram of the neighbour distances with the three candidate bandwidths.
fig, axs = plt.subplots(1, 1, figsize=(8, 5))
axs.hist(non_zero_mult)
#plt.title('Non zero distances between neighbours:')
for bandwidth, colour, name in zip(t,
                                   ('green', 'orange', 'red'),
                                   ('mean', 'median', 'max')):
    axs.axvline(bandwidth, color=colour, label=name)
axs.set_xlabel('Distance between neighbours')
axs.set_ylabel('Count')
plt.legend()

In [None]:
# Drop the large arrays used for the bandwidth selection; the heat-matrix cell
# below reads the distance and weight matrices from disk again.
del distance_matrix, weight_m, mult_df, non_zero_mult

#### Heat matrix: 

Computing heat kernels:

In [None]:
%%time

def _heat_matrix(distances, weights, bandwidth):
    """Return the heat-kernel weights restricted to the neighbour graph.

    Entry (i, j) of the result is
    ``exp(-distances[i, j]**2 / bandwidth**2) * weights[i, j]``.

    Parameters
    ----------
    distances : np.ndarray
        Pairwise distance matrix.
    weights : np.ndarray
        Neighbour weight matrix of the same shape, used as a multiplicative mask.
    bandwidth : float
        Kernel bandwidth (squared inside the exponent).

    Returns
    -------
    pd.DataFrame
        Heat matrix with a default integer index and columns.
    """
    # One vectorised call instead of a column-by-column DataFrame.apply.
    kernel = np.exp(-(distances ** 2) / (bandwidth ** 2))
    return pd.DataFrame(np.multiply(weights, kernel))


# Select inputs, bandwidth labels and output naming for the chosen embedding.
if USE_TAKENS:
    t = np.load(PATH1 + 't_takens_.npy')
    # Only the mean bandwidth is used with the Takens embedding.
    corr_t = ['mean']
    out_subdir, file_suffix = 'takens/', '_takens_.pkl'

    print(f'Reading distance and weight matrix from: {PATH1}')
    distance_matrix = pd.read_pickle(PATH1 + 'distance_matrix_takens.pkl').values
    weight_m = pd.read_pickle(
        PATH_SIMPLE_TAKENS + 'weight_matrix_takens_{}.pkl'.format(PERC_NEIGH)).values
else:
    t = np.load(PATH1 + 't.npy')
    corr_t = ['mean', 'med', 'max']
    out_subdir, file_suffix = '', '_.pkl'

    print(f'Reading distance and weight matrix from: {PATH1}')
    distance_matrix = pd.read_pickle(PATH1 + 'distance_matrix.pkl').values
    weight_m = pd.read_pickle(
        PATH_SIMPLE + 'weight_matrix_{}.pkl'.format(PERC_NEIGH)).values

# `t` is stored as [mean, median, max]; zip stops at the last requested label.
for label, chosen_t in zip(corr_t, t):
    print('Computing heat matrix for bandwitdh {}: {}'.format(label, chosen_t))

    PATH2 = PATH1 + 't_' + label + '/' + out_subdir
    os.makedirs(PATH2, exist_ok=True)

    heat_matrix_df = _heat_matrix(distance_matrix, weight_m, chosen_t)
    print(f'Saving heat matrix at {PATH2}')
    heat_matrix_df.to_pickle(PATH2 + 'heat_matrix_' + 't_' + label + file_suffix)

# Free the large arrays before the next step.
del heat_matrix_df, distance_matrix, weight_m

Load pre-computed heat kernels for 10% neighbours:

In [None]:
%%time
corr_t = ['mean', 'med', 'max']

# Both embeddings read from the same hard-coded 10 % neighbour directory.
PATH_READ = '../../../data/vandermeer/pickles/{}/10perc/'.format(DATA)
print(f'Path to heat matrices:{PATH_READ}')
print(f'Reading heat matrix for t = {corr_t[0]}:')

if USE_TAKENS:
    heat_matrix_mean = pd.read_pickle(
        f'{PATH_READ}t_{corr_t[0]}/takens/heat_matrix_t_{corr_t[0]}_takens_.pkl'
    ).values
    print(f'Heat matrix shape: {heat_matrix_mean.shape}')
else:
    # {0} is the base directory, {1} the bandwidth label.
    heat_file = '{0}t_{1}/heat_matrix_t_{1}_.pkl'
    heat_matrix_mean = pd.read_pickle(heat_file.format(PATH_READ, corr_t[0])).values

    # NOTE: the two names below are swapped relative to the files they hold:
    # `heat_matrix_max` receives the 'med' kernel and `heat_matrix_med` the
    # 'max' kernel. The plotting cells list them in this same order.
    print(f'Reading heat matrix for t = {corr_t[1]}:')
    heat_matrix_max = pd.read_pickle(heat_file.format(PATH_READ, corr_t[1])).values
    print(f'Reading heat matrix for t = {corr_t[2]}:')
    heat_matrix_med = pd.read_pickle(heat_file.format(PATH_READ, corr_t[2])).values

Load pre-computed heat kernels for 20% neighbours if you want to compare to those:

In [None]:
%%time
if not USE_TAKENS:
    corr_t = ['mean', 'med', 'max']
    PATH_READ = '../../../data/vandermeer/pickles/{}/20perc/'.format(DATA)
    print(f'Path to heat matrices:{PATH_READ}')

    # Read the kernels in corr_t order, announcing each one before loading it.
    kernels_20 = []
    for label in corr_t:
        print(f'Reading heat matrix for t = {label}:')
        kernels_20.append(
            pd.read_pickle(f'{PATH_READ}t_{label}/heat_matrix_t_{label}_.pkl').values
        )

    # NOTE: `heat_matrix_max2` receives the 'med' kernel and `heat_matrix_med2`
    # the 'max' kernel (same naming as the 10 % variables).
    heat_matrix_mean2, heat_matrix_max2, heat_matrix_med2 = kernels_20
    del kernels_20

Plot sample of heat kernels as image:

In [None]:
if not USE_TAKENS:
    corr_t = ['mean', 'med', 'max']
    # Top-left 10x10 corner of each loaded kernel.
    matrices = [
        kernel[:10, :10]
        for kernel in (heat_matrix_mean, heat_matrix_max, heat_matrix_med)
    ]
    fig, axs = plt.subplots(1, 3, figsize=(10, 10))
    # One panel per bandwidth label, titled with that label.
    for i, chosen_t in enumerate(corr_t):
        axs[i].imshow(matrices[i])
        axs[i].set_title(chosen_t)

Plot histogram of non zero values of heat kernels:

In [None]:
%%time
if USE_TAKENS:
    corr_t = ['mean']
    # Only the strictly positive kernel entries are shown.
    non_zero_heat = heat_matrix_mean[heat_matrix_mean > 0]
    fig, axs = plt.subplots(1, 1, figsize=(5, 5))
    axs.hist(non_zero_heat, label='10 perc')
    axs.set_title('Heat kernel, t = {} distance'.format(corr_t[0]))

In [None]:
%%time

if not USE_TAKENS:
    # One histogram of the strictly positive kernel entries per bandwidth.
    corr_t = ['mean', 'med', 'max']
    matrices = [heat_matrix_mean, heat_matrix_max, heat_matrix_med]
    fig, axs = plt.subplots(1, 3, figsize=(15, 5))
    # Iterate panels, labels and kernels together; no manual counter needed.
    for ax, label, heat_matrix in zip(axs, corr_t, matrices):
        non_zero_heat = np.extract(heat_matrix > 0, heat_matrix)
        ax.set_title('Heat kernel, t = {} distance'.format(label))
        ax.hist(non_zero_heat, label='10 perc')

Plot overlapping histogram of non zero values of heat kernels for 10% and 20%:

In [None]:
%%time
if not USE_TAKENS:
    # Overlay the 10 % and 20 % neighbour kernels, one panel per bandwidth.
    corr_t = ['mean', 'med', 'max']
    matrices = [heat_matrix_mean, heat_matrix_max, heat_matrix_med]
    matrices2 = [heat_matrix_mean2, heat_matrix_max2, heat_matrix_med2]
    fig, axs = plt.subplots(1, 3, figsize=(15, 5))
    for ax, label, heat_matrix, heat_matrix2 in zip(axs, corr_t, matrices, matrices2):
        # Only the strictly positive kernel entries are shown.
        non_zero_heat = np.extract(heat_matrix > 0, heat_matrix)
        non_zero_heat2 = np.extract(heat_matrix2 > 0, heat_matrix2)
        ax.set_title('Heat kernel, t = {} distance'.format(label))
        ax.hist(non_zero_heat, label='10 perc', alpha=0.5)
        ax.hist(non_zero_heat2, label='20 perc', alpha=0.5)
        # Legend on every panel; plt.legend() would only label the last one.
        ax.legend()
    # Free the kernels, including the loop variables that still reference the
    # last pair of matrices.
    del heat_matrix, heat_matrix2
    del matrices, matrices2, heat_matrix_mean, heat_matrix_max, heat_matrix_med, heat_matrix_mean2, heat_matrix_max2, heat_matrix_med2

#### Diagonal weight matrix: 
Compute diagonal matrix as $D_{ii} = \sum_j W_{ij}$

In [None]:
def _degree_matrix(weights):
    """Return the dense diagonal matrix D with ``D[i, i] = sum_j weights[i, j]``.

    Parameters
    ----------
    weights : np.ndarray
        Square weight (heat) matrix.

    Returns
    -------
    np.ndarray
        float64 array of shape ``(len(weights), len(weights))`` that is zero
        off the diagonal.
    """
    degree = np.zeros((len(weights), len(weights)))
    # Vectorised row sums instead of a Python loop over the rows.
    np.fill_diagonal(degree, weights.sum(axis=1))
    return degree


# Bandwidth labels and file naming for the chosen embedding.
if USE_TAKENS:
    corr_t = ['mean']
    out_subdir, file_suffix = 'takens/', '_takens_.pkl'
else:
    corr_t = ['mean', 'med', 'max']
    out_subdir, file_suffix = '', '_.pkl'

for label in corr_t:
    print(f'Computing diagonal matrix for heat kernel with t: {label}')
    # The heat matrix is read from this directory, so it must already exist.
    PATH2 = PATH1 + 't_' + label + '/' + out_subdir
    wm = pd.read_pickle(PATH2 + 'heat_matrix_t_' + label + file_suffix).values
    D = _degree_matrix(wm)
    D_df = pd.DataFrame(D)
    print(f'Writing diagonal matrix to {PATH2}')
    D_df.to_pickle(PATH2 + 'diagonal_heat_matrix_t_' + label + file_suffix)

# Free the large arrays before the next step.
del D_df, D, wm

Load pre-computed diagonal matrix:

In [None]:
%%time
# Re-load the stored diagonal matrix for the mean bandwidth and preview it.
diagonal_file = (
    't_mean/takens/diagonal_heat_matrix_t_mean_takens_.pkl'
    if USE_TAKENS
    else 't_mean/diagonal_heat_matrix_t_mean_.pkl'
)
D = pd.read_pickle(PATH1 + diagonal_file).values
print(f'Shape of diagonal matrix: {D.shape}')
pd.DataFrame(D).head(5)

In [None]:
# Release the diagonal matrix loaded for the preview above.
del D

#### Laplacian matrix: 

Compute Laplacian matrix as $L = D- W$

In [None]:
%%time
# Bandwidth labels and file naming for the chosen embedding.
if USE_TAKENS:
    corr_t = ['mean']
    out_subdir, file_suffix = 'takens/', '_takens_.pkl'
else:
    corr_t = ['mean', 'med', 'max']
    out_subdir, file_suffix = '', '_.pkl'

for label in corr_t:
    print(f'Computing Laplacian matrix for heat kernel with t: {label}')
    PATH2 = PATH1 + 't_' + label + '/' + out_subdir
    if not os.path.exists(PATH2):
        os.makedirs(PATH2)

    print('Reading heat matrix:')
    wm = pd.read_pickle(PATH2 + 'heat_matrix_t_' + label + file_suffix).values
    print('Reading diagonal matrix:')
    D = pd.read_pickle(PATH2 + 'diagonal_heat_matrix_t_' + label + file_suffix).values

    # Graph Laplacian L = D - W.
    print('Calculating Laplacian:')
    L = D - wm
    L_df = pd.DataFrame(L)
    if USE_TAKENS:
        print(f'Writing Laplacian to {PATH2}')
    L_df.to_pickle(PATH2 + 'laplacian_heat_matrix_t_' + label + file_suffix)

# NOTE: the clean-up only runs for the non-Takens case; with USE_TAKENS the
# names L_df, L, D and wm stay bound after this cell.
if not USE_TAKENS:
    del L_df, L, D, wm
print('Done!')

Load pre-computed laplacian matrix:

In [None]:
%%time

# Re-load the stored Laplacian for the mean bandwidth and preview it.
# The matrix is bound to `L`, the name the shape print and the preview use
# (binding it to `D` would raise a NameError or display a stale `L`).
if USE_TAKENS:
    L = pd.read_pickle(PATH1+'t_mean/takens/laplacian_heat_matrix_t_mean_takens_.pkl').values
else:
    L = pd.read_pickle(PATH1+'t_mean/laplacian_heat_matrix_t_mean_.pkl').values
print(f'Shape of Laplacian matrix: {L.shape}')
pd.DataFrame(L).head(5)

In [None]:
# Release the Laplacian; the eigendecomposition cell reads it from disk again.
del L

#### Eigenvalues: 

Eigendecomposition of the generalized problem $Lf = \lambda Df$, where $f$ are the eigenvector solutions ordered according to their increasing eigenvalue $\lambda_0 = 0 < \lambda_1 < \dots$

Computing eigendecomposition:

In [None]:
%%time
# Generalized eigendecomposition L f = lambda D f for the heat kernel with the
# chosen bandwidth; the NUM_EIGENVALUES smallest-magnitude eigenpairs are
# computed and pickled next to the input matrices.
chosen_t = 'mean'
print(f'Computing eigenvalues for heat kernel with t={chosen_t}')

# Number embedded in the Takens output file names. It is hard-coded in the
# original names; its meaning is not visible in this cell -- TODO confirm.
TAKENS_FILE_TAG = 150

# Resolve the working directory and file naming BEFORE touching the file
# system, so the directory check acts on this cell's PATH2 and not on a value
# left over from an earlier cell.
if USE_TAKENS:
    PATH2 = PATH1 + 't_{}/takens/'.format(chosen_t)
    input_suffix = '_takens_.pkl'
else:
    PATH2 = PATH1 + 't_{}/'.format(chosen_t)
    input_suffix = '_.pkl'
os.makedirs(PATH2, exist_ok=True)

print(f'Loading L from:{PATH2}')
L = pd.read_pickle(PATH2 + 'laplacian_heat_matrix_t_' + chosen_t + input_suffix).values

print(f'Loading D from:{PATH2}')
D = pd.read_pickle(PATH2 + 'diagonal_heat_matrix_t_' + chosen_t + input_suffix).values

print(f'Computing {NUM_EIGENVALUES} eigenvalues:')
w, eigv = linalg.eigs(L, k=NUM_EIGENVALUES, M=D, which='SM')

print('Saving eigendecomposition:')
if USE_TAKENS:
    pd.DataFrame(w).to_pickle(
        PATH2 + 'eigenval_t_' + chosen_t + f'_takens_{TAKENS_FILE_TAG}.pkl')
    pd.DataFrame(eigv).to_pickle(
        PATH2 + 'eigenvec_t_' + chosen_t + f'_takens_{TAKENS_FILE_TAG}.pkl')
    print(f'Shape of eigenvectors: {eigv.shape}')
else:
    pd.DataFrame(w).to_pickle(PATH2 + 'eigenvalues_heat_matrix_t_' + chosen_t + '_.pkl')
    pd.DataFrame(eigv).to_pickle(PATH2 + 'eigenvectors_heat_matrix_t_' + chosen_t + '_.pkl')

# Free the large matrices and the decomposition once it is on disk.
del L, D, w, eigv