<h1>
<center>Graph Construction </center>
</h1>

### Generals 

Dependencies

In [1]:
#Data
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
import shutil

#Graph Counstruction
import torch
from torch_geometric_temporal.signal import StaticGraphTemporalSignal
from statsmodels.tsa.stattools import grangercausalitytests
import matplotlib.pyplot as plt


import torch
import torch.nn.functional as F
from torch_geometric_temporal.nn.recurrent import A3TGCN2
from torch.nn import Linear
from torch.nn import ReLU
import torch.nn as nn
from torch.nn.init import kaiming_uniform_

Envariables

In [2]:
train_path = 'io/input/base_data/trainset.csv'
test_path = 'io/input/base_data/testset.csv'
val_path ='io/input/base_data/valset.csv'

data_intermediate_path ='io/input/data_intermediate/'

train_node_features_path = data_intermediate_path + 'node_features/train/'
train_node_labels_path = data_intermediate_path + 'node_labels/train/'
train_edges_path = data_intermediate_path + 'edges/train/'
train_edge_weights_path = data_intermediate_path + 'edge_weights/train/'

val_node_labels_path = data_intermediate_path + 'node_labels/val/'
val_node_features_path = data_intermediate_path + 'node_features/val/'
val_edges_path = data_intermediate_path + 'edges/val/'
val_edge_weights_path = data_intermediate_path + 'edge_weights/val/'

test_node_labels_path = data_intermediate_path + 'node_labels/test/'
test_node_features_path = data_intermediate_path + 'node_features/test/'
test_edges_path = data_intermediate_path + 'edges/test/'
test_edge_weights_path = data_intermediate_path + 'edge_weights/test/'

plot_export_path = 'io/output/exports/analysis_plots/'
chunk_size = 10000
train_timesteps = 200
val_timesteps = 50
test_timesteps = 50

### Core Functionality

Save numpy arrays

In [3]:
def save_object(arr,chunk_size,path):
    num_chunks = arr.shape[0] // chunk_size + 1
    shutil.rmtree(path)
    os.makedirs(path)
    for i in tqdm(range(num_chunks)):
        chunk = arr[i*chunk_size:(i+1)*chunk_size, :]
        filename = f"{path}chunk_{i}.npy"
        np.save(filename, chunk)

Load numpy arrays

In [4]:
def load_object(path):
    num_chunks = len([f for f in os.listdir(path) if f.startswith('chunk_') and f.endswith('.npy')])
    # Load array from chunks
    chunks = []
    for i in range(num_chunks):
        filename = f"{path}chunk_{i}.npy"
        print()
        chunk = np.load(filename,allow_pickle=True)
        chunks.append(chunk)
    arr_reconstructed = np.concatenate(chunks, axis=0)
    return arr_reconstructed

Load csv files

In [5]:
def load_data(train_path,test_path,val_path):
    trainset = pd.read_csv(train_path,index_col=0)
    testset = pd.read_csv(test_path,index_col=0)
    valset = pd.read_csv(val_path,index_col=0)
    return trainset,testset,valset

### Graph info construction

Node features

In [6]:
def node_features_construction(df,timesteps):
    feature_names = df.columns
    # Convert the 'Date' column to a pandas datetime object
    df['Date'] = pd.to_datetime(df['Date'])
    
    # Get a list of all unique asset IDs in the dataset
    asset_ids = df['Asset_ID'].unique()
    Dates = df['Date'].unique()

    # Get a list of all features in the dataset
    features = list(df.columns)
    features.remove('Date')
    features.remove('Asset_ID')
    results=[]
    results=[0 for i in range(len(Dates))]

    for i in tqdm( range (0,len(Dates))):
        results[i] = df[df["Date"]==Dates[i]]
        results[i] = results[i].drop(['Date'], axis=1)
        results[i] = results[i].drop(['Asset_ID'], axis=1)
    
    
    results = results[:timesteps]
    np_result = np.array(results)
    np_result = np.reshape(np_result, (len(results),len(asset_ids),len(features),1))
    return np_result

Nodel Labels

In [7]:
def node_labels_construction(df,timsteps):
    asset_ids = df['Asset_ID'].unique()
    timestamps = df['Date'].unique()
    timestamps = timestamps[:timsteps]
    df = df.loc[df['Date'].isin(timestamps)].reset_index(drop=True)
    result = df.pivot(index='Date', columns='Asset_ID', values='Close')
    result = np.array(result)
    result = np.reshape(result, (len(result), len(asset_ids),1))
    return result

Edge caclulation using Grager Casuality test

In [8]:
def adjency_matrixes_granger_test(df,timsteps):
    df_pivot = df.pivot_table(index='Date', columns='Asset_ID', values='Close')
    df_pivot = df_pivot.head(timsteps)
    date_list = df_pivot.index
    asset_ids = df_pivot.columns
    # Create an empty list to store the adjacency matrices
    adj_matrices = []

    # Loop through each timestamp
    for i in tqdm (range (len(df_pivot))):

        # Select the data up to the current timestamp
        data = df_pivot.iloc[:i+1,:]
        
        # Create an empty adjacency matrix for the current timestamp
        adj_matrix = np.zeros((len(df_pivot.columns), len(df_pivot.columns)))

        # Loop through each combination of asset IDs
        for j in range(len(df_pivot.columns)):
            for k in range(len(df_pivot.columns)):

                # Skip if j and k are the same asset ID
                if j == k:
                    continue

                # Select the data for the current pair of asset IDs
                data_jk = data.iloc[:,[j,k]].dropna()

                # Skip if there is no data for the current pair of asset IDs
                if len(data_jk) == 0:
                    continue

                # Calculate Granger causality in both directions between j and k
                try:
                    result1 = grangercausalitytests(data_jk, maxlag=2, verbose=False)
                    max_F_statistic1 = max(result1[lag][0]['params_ftest'][0] for lag in result1.keys())
                    result2 = grangercausalitytests(data_jk.iloc[:,::-1], maxlag=2, verbose=False)
                    max_F_statistic2 = max(result2[lag][0]['params_ftest'][0] for lag in result2.keys())
                except:
                    max_F_statistic1 = None
                    max_F_statistic2 = None

                # Add the Granger causality values to the adjacency matrix
                if max_F_statistic1 is not None and max_F_statistic2 is not None:
                    adj_matrix[j,k] = max_F_statistic1
                    adj_matrix[k,j] = max_F_statistic2

        # Add the current adjacency matrix to the list of adjacency matrices
        adj_matrices.append(adj_matrix)
    return adj_matrices,asset_ids,date_list

Nomalize edge weights to range 0.1-0.9

In [9]:
def normalize_adj(adjency_matrixes):
    norm_adjency_matrixes = []
    for adj in tqdm(adjency_matrixes):
        # Create a copy of the input matrix
        adj_norm = np.copy(adj)
        # Get the diagonal indices
        diag_indices = np.diag_indices(adj.shape[0])
        # Set the diagonal elements to 1
        adj_norm[diag_indices] = 1
        # Normalize the non-diagonal elements on the range 0.1-0.9
        non_diag_mask = np.ones(adj.shape, dtype=bool)
        non_diag_mask[diag_indices] = False
        adj_norm[non_diag_mask] = 0.1 + 0.8 * (adj_norm[non_diag_mask] - np.min(adj_norm[non_diag_mask])) / (np.max(adj_norm[non_diag_mask]) - np.min(adj_norm[non_diag_mask]))
        norm_adjency_matrixes.append(adj_norm)
    norm_adjency_matrixes = np.array(norm_adjency_matrixes)
    return norm_adjency_matrixes

Convert edges and edge-weights to the final format

In [10]:
def adj_to_edge_lists(norm_adjency_matrixes):
    undirected_edges_list = []
    undirected_weights_list = [] 
    
    for adj_matrix in tqdm(norm_adjency_matrixes):
        edges = np.transpose(np.where(adj_matrix != 0))
        edge_weights = adj_matrix[edges[:, 0], edges[:, 1]]
        edges = [list(edges[:, 0] + 1), list(edges[:, 1] + 1)]
        edg_weights = list(edge_weights)
        undirected_edges_list.append(edges)
        undirected_weights_list.append(edge_weights)
    
    undirected_edges_list = np.array(undirected_edges_list)
    undirected_weights_list = np.array(undirected_weights_list)
    return undirected_edges_list,undirected_weights_list

Plot adjency matrixes of the edges

In [11]:
def plot_adjency_matrixes(adj_matrix1,adj_matrix2,labels,date1,date2,plot_export_path):
    # Create two random adjacency matrices
    labels = asset_ids
    # Set up the figure and two subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
    # Plot the first adjacency matrix
    im1 = ax1.imshow(adj_matrix1, cmap='coolwarm', interpolation='None')
    ax1.set_xticks(np.arange(len(labels)))
    ax1.set_yticks(np.arange(len(labels)))
    ax1.set_xticklabels(labels,rotation=90)
    ax1.set_yticklabels(labels)
    ax1.tick_params(axis='both', labelsize=12)
    ax1.set_title(timestamp1)
    # Plot the second adjacency matrix
    im2 = ax2.imshow(adj_matrix2, cmap='coolwarm', interpolation='None')
    ax2.set_xticks(np.arange(len(labels)))
    ax2.set_yticks(np.arange(len(labels)))
    ax2.set_xticklabels(labels,rotation=90)
    ax2.set_yticklabels(labels)
    ax2.tick_params(axis='both', labelsize=12)
    ax2.set_title(timestamp2)
    # Add the matrix values as text to the plots
    for ax, adj_matrix in zip([ax1, ax2], [adj_matrix1, adj_matrix2]):
        for i in range(adj_matrix.shape[0]):
            for j in range(adj_matrix.shape[1]):
                text = ax.text(j, i, round(adj_matrix[i, j],1),
                               ha="center", va="center", color="w", fontsize=12)
    # Set the title of the figure
    fig.suptitle("Dynamic Graph Edge Info", fontsize=18)
    # Adjust the spacing between subplots
    fig.tight_layout(pad=2)
    # Display the plot
    plt.savefig(plot_export_path + 'Adjency_Matrixes.pdf')
    plt.show()

### Create and saved node features and nodel labels

In [12]:
trainset,testset,valset = load_data(train_path,test_path,val_path)

# Train-set
print('Construction: train node features')
train_node_features = node_features_construction(trainset,train_timesteps)
save_object(train_node_features,chunk_size,train_node_features_path)

print('Construction: train node labels')
train_node_labels = node_labels_construction(trainset,train_timesteps)
save_object(train_node_labels,chunk_size,train_node_labels_path)

# Validation-set
print('Construction: val node features')
val_node_features = node_features_construction(valset,val_timesteps)
save_object(val_node_features,chunk_size,val_node_features_path)

print('Construction: val node labels')
val_node_labels = node_labels_construction(valset,val_timesteps)
save_object(val_node_labels,chunk_size,val_node_labels_path)

# Test-set
print('Construction: test node features')
test_node_features = node_features_construction(testset,test_timesteps)
save_object(test_node_features,chunk_size,test_node_features_path)

print('Construction: test node labels')
test_node_labels = node_labels_construction(testset,test_timesteps)
save_object(test_node_labels,chunk_size,test_node_labels_path)

Construction: train node features


100%|███████████████████████████████████████| 887/887 [00:00<00:00, 2816.48it/s]
100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 1236.16it/s]


Construction: train node labels


100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 1744.72it/s]


Construction: val node features


100%|███████████████████████████████████████| 887/887 [00:00<00:00, 2900.46it/s]
100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 1666.39it/s]


Construction: val node labels


100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 2074.33it/s]


Construction: test node features


100%|███████████████████████████████████████| 887/887 [00:00<00:00, 2968.37it/s]
100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 1623.81it/s]


Construction: test node labels


100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 1489.45it/s]


### Create and saved edges and edge-weights 

In [13]:
trainset,testset,valset = load_data(train_path,test_path,val_path)

# Train-set
print('Construction: train edge info')
adjency_matrixes,asset_ids,dates = adjency_matrixes_granger_test(trainset,train_timesteps)
norm_adjency_matrixes = normalize_adj(adjency_matrixes)
train_undirected_edges,train_undirected_weights = adj_to_edge_lists(norm_adjency_matrixes)
save_object(train_undirected_edges,chunk_size,train_edges_path)
save_object(train_undirected_weights,chunk_size,train_edge_weights_path)

# Validation-set
print('Construction: val edge info')
adjency_matrixes,asset_ids,dates = adjency_matrixes_granger_test(valset,val_timesteps)
norm_adjency_matrixes = normalize_adj(adjency_matrixes)
val_undirected_edges_train,val_undirected_weights_train = adj_to_edge_lists(norm_adjency_matrixes)
save_object(val_undirected_edges_train,chunk_size,val_edges_path)
save_object(val_undirected_weights_train,chunk_size,val_edge_weights_path)

# Test-set
print('Construction: test edge info')
adjency_matrixes,asset_ids,dates = adjency_matrixes_granger_test(testset,test_timesteps)
norm_adjency_matrixes = normalize_adj(adjency_matrixes)
test_undirected_edges_train,test_undirected_weights_train = adj_to_edge_lists(norm_adjency_matrixes)
save_object(test_undirected_edges_train,chunk_size,test_edges_path)
save_object(test_undirected_weights_train,chunk_size,test_edge_weights_path)

Construction: train edge info


100%|█████████████████████████████████████████| 200/200 [01:28<00:00,  2.26it/s]
  adj_norm[non_diag_mask] = 0.1 + 0.8 * (adj_norm[non_diag_mask] - np.min(adj_norm[non_diag_mask])) / (np.max(adj_norm[non_diag_mask]) - np.min(adj_norm[non_diag_mask]))
100%|██████████████████████████████████████| 200/200 [00:00<00:00, 53261.00it/s]
100%|██████████████████████████████████████| 200/200 [00:00<00:00, 37342.45it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 586.45it/s]
100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 1109.02it/s]


Construction: val edge info


100%|███████████████████████████████████████████| 50/50 [00:19<00:00,  2.57it/s]
  adj_norm[non_diag_mask] = 0.1 + 0.8 * (adj_norm[non_diag_mask] - np.min(adj_norm[non_diag_mask])) / (np.max(adj_norm[non_diag_mask]) - np.min(adj_norm[non_diag_mask]))
100%|████████████████████████████████████████| 50/50 [00:00<00:00, 47222.52it/s]
100%|████████████████████████████████████████| 50/50 [00:00<00:00, 31765.40it/s]
100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 1490.51it/s]
100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 2236.96it/s]


Construction: test edge info


100%|███████████████████████████████████████████| 50/50 [00:19<00:00,  2.57it/s]
  adj_norm[non_diag_mask] = 0.1 + 0.8 * (adj_norm[non_diag_mask] - np.min(adj_norm[non_diag_mask])) / (np.max(adj_norm[non_diag_mask]) - np.min(adj_norm[non_diag_mask]))
100%|████████████████████████████████████████| 50/50 [00:00<00:00, 48827.75it/s]
100%|████████████████████████████████████████| 50/50 [00:00<00:00, 38227.34it/s]
100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 1565.04it/s]
100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 2666.44it/s]


In [14]:
# plot_adjency_matrixes(norm_adjency_matrixes[5],norm_adjency_matrixes[9],
#                       asset_ids,dates[5],dates[9],plot_export_path)