In [1]:
import os

import networkx as nx
import numpy as np
import pandas as pd
import torch
from networkx.convert_matrix import from_numpy_array
from torch_geometric.data import InMemoryDataset
from torch_geometric.utils import from_networkx

# Additional imports to deal with Matrix Profiling
import matrixprofile as mp
import stumpy
from stumpy import stumped



In [7]:
# Restore the variables persisted with %store in Graph_Building.ipynb (that notebook must have stored them beforehand)
%store -r full_corr_path_lists
%store -r diagnostic_label
%store -r ages
%store -r min_age
%store -r max_age
%store -r sex

In [5]:
# Defining helper functions to simplify the code in the class Raw_to_Graph_MatrixProfile, like in Graph_Building.ipynb

# Convert the values of a dictionary into a NumPy array
def dict_to_array(dict):
    """Return the values of ``dict`` as a NumPy array, in iteration order.

    The parameter name shadows the built-in ``dict``; it is kept unchanged
    so that existing callers are not affected.
    """
    values = [value for value in dict.values()]
    return np.array(values)

# To normalize an array (z-score)
def normalize_array(array):
    """Standardize ``array`` to zero mean and unit standard deviation.

    Parameters
    ----------
    array : array-like
        Values to standardize.

    Returns
    -------
    numpy.ndarray
        ``(array - mean) / std``. If the standard deviation is zero (all
        values identical) an array of zeros is returned instead of NaN, and
        an empty input yields an empty float array.
    """
    values = np.asarray(array)
    if values.size == 0:
        # Nothing to standardize; avoids the mean/std warnings on empty input.
        return values.astype(float)

    centered = values - np.mean(values)
    std = np.std(values)
    if std == 0:
        # A constant vector carries no information: every entry equals the
        # mean, so the standardized value is 0 rather than 0/0 = NaN.
        return np.zeros_like(centered)
    return centered / std

In [None]:
def dataset_features_and_stats(dataset):
    """Print a summary of ``dataset`` and statistics of its first graph.

    The dataset-level part reports the number of graphs, the ``weight``,
    ``threshold`` and ``method`` attributes, the number of features and the
    number of distinct values in the notebook-level ``diagnostic_label``.
    The graph-level part describes ``dataset[0]``.
    """

    def report_lines():
        # Lines are produced lazily, so each value is read right before it is printed.
        yield ''
        yield f'Dataset: {dataset}:'
        yield '===================='
        yield f'Number of graphs: {len(dataset)}'
        yield f'Weighted: {dataset.weight}'
        yield f'Threshold: {dataset.threshold}'
        yield f'Correlation Method: {dataset.method}'
        yield f'Number of features: {dataset.num_features}'
        yield f'Number of classes: {len(np.unique(diagnostic_label))}'

        # First graph object of the dataset.
        first_graph = dataset[0]

        yield ''
        yield first_graph
        yield '============================================================='

        # Statistics about that first graph.
        yield f'Number of nodes: {first_graph.num_nodes}'
        yield f'Number of edges: {first_graph.num_edges}'
        yield f'Average node degree: {first_graph.num_edges / first_graph.num_nodes:.2f}'
        yield f'Has isolated nodes: {first_graph.has_isolated_nodes()}'
        yield f'Has self-loops: {first_graph.has_self_loops()}'
        yield f'Is undirected: {first_graph.is_undirected()}'

    for line in report_lines():
        print(line)

In [8]:
# Defining a class to preprocess raw data into a format suitable for training Graph Neural Networks (GNNs).
## With time series features and with the possibility of assigning weight to edges.

class Raw_to_Graph_MatrixProfile(InMemoryDataset):
    """PyTorch Geometric in-memory dataset built from per-patient correlation matrices.

    For every correlation matrix listed in ``full_corr_path_lists[method]`` one
    graph is built: an edge is kept where the absolute correlation reaches
    ``threshold``, and every node (ROI) receives nine features -- normalized
    degree, betweenness centrality, local efficiency, clustering coefficient
    and ratio of local to global efficiency, the matrix-profile motif and
    discord indices of its row, and the patient's normalized age and sex.

    Parameters
    ----------
    root : str
        Dataset root directory, forwarded to ``InMemoryDataset``.
    threshold : float
        Minimum absolute correlation for an edge to be kept.
    method : str
        Key into ``full_corr_path_lists``; also part of the CSV directory name.
    weight : bool
        If true, kept edges carry the absolute correlation as weight;
        otherwise every kept edge has weight 1.
    transform, pre_transform : callable, optional
        Forwarded unchanged to ``InMemoryDataset``.

    Notes
    -----
    ``process`` reads the notebook-level variables ``full_corr_path_lists``,
    ``ages``, ``min_age``, ``max_age``, ``sex`` and ``diagnostic_label``; they
    must exist in the namespace before the dataset is built.
    """

    def __init__(self, root, threshold, method, weight, transform=None, pre_transform=None):
        # These attributes are read by process(), so they are set before the
        # base-class constructor runs.
        self.threshold = threshold
        self.method = method
        self.weight = weight
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def processed_file_names(self):
        """Name of the file in which the collated graphs are stored."""
        return ['data.pt']

    def compute_matrix_profile(self, time_series, m=3):
        """Compute the matrix profile of a 1-D series with STUMPY.

        Parameters
        ----------
        time_series : array-like
            One-dimensional sequence of values.
        m : int, optional
            Subsequence (window) length, 3 by default.

        Returns
        -------
        tuple of numpy.ndarray
            ``(profile, indices)``: the matrix-profile distances and, for each
            subsequence, the index of its nearest neighbour.
        """
        series = np.asarray(time_series, dtype=np.float64)
        # stumpy.stump is the single-machine routine; stumped is the
        # distributed variant and needs a client as its first argument.
        # The result is one 2-D array whose first two columns are the
        # profile and the nearest-neighbour indices.
        result = stumpy.stump(series, m=m)
        profile = result[:, 0].astype(np.float64)
        indices = result[:, 1].astype(np.int64)
        return profile, indices

    def find_motif_discord(self, matrix_profile, idx):
        """Look up ``idx`` at the smallest and largest matrix-profile values.

        Returns
        -------
        tuple
            ``(motif, discord)`` where ``motif`` is ``idx`` at the position of
            the minimum of ``matrix_profile`` and ``discord`` is ``idx`` at the
            position of its maximum.
        """
        motif_idx = np.argmin(matrix_profile)
        discord_idx = np.argmax(matrix_profile)
        motif = idx[motif_idx]
        discord = idx[discord_idx]
        return motif, discord

    def _build_edge_matrix(self, corr_matrix):
        """Threshold ``corr_matrix`` into an adjacency / edge-weight matrix."""
        # The correlation coefficient lies in [-1, 1], so its absolute value
        # is compared with the threshold.
        abs_corr = np.abs(corr_matrix)
        # Written as "not below" so that entries failing the comparison (NaN)
        # are treated exactly like the element-wise if/else formulation.
        keep = ~(abs_corr < self.threshold)
        if self.weight:
            # The absolute value of the correlation coefficient is the edge weight.
            strengths = abs_corr
        else:
            # Every kept edge gets weight 1, regardless of the correlation strength.
            strengths = np.ones_like(abs_corr, dtype=float)
        # NOTE(review): diagonal entries are thresholded like any other entry,
        # so a matrix with a unit diagonal produces self-loops -- confirm this
        # is intended.
        return np.where(keep, strengths, 0.0)

    @staticmethod
    def _local_efficiency(graph):
        """Global efficiency of each node's neighbourhood subgraph, keyed by node."""
        local_eff = {}
        for node in graph.nodes():
            neighbourhood = graph.subgraph(graph.neighbors(node))
            if neighbourhood.number_of_nodes() > 1:
                local_eff[node] = nx.global_efficiency(neighbourhood)
            else:
                local_eff[node] = 0.0
        return local_eff

    def process(self):
        """Turn every correlation matrix into a graph and save the collated set.

        Reads each CSV listed in ``full_corr_path_lists[self.method]``, builds
        the thresholded graph and its node features, attaches the diagnostic
        label, and stores the collated result in ``processed_paths[0]``.
        """
        graphs = []
        corr_matrices = full_corr_path_lists[self.method]
        for patient_idx, patient_matrix in enumerate(corr_matrices):
            path = f'ADNI_full/corr_matrices/corr_matrix_{self.method}/{patient_matrix}'
            corr_matrix = pd.read_csv(path, header=None).values

            # ROIs stands for Regions of Interest: one node per row of the matrix.
            nbr_ROIs = corr_matrix.shape[0]
            edge_matrix = self._build_edge_matrix(corr_matrix)

            # Matrix profile of each row of the correlation matrix, reduced to
            # its motif and discord index.
            motifs = []
            discords = []
            for row in corr_matrix:
                profile, indices = self.compute_matrix_profile(row)
                motif, discord = self.find_motif_discord(profile, indices)
                motifs.append(motif)
                discords.append(discord)
            motifs_array = np.array(motifs)
            discords_array = np.array(discords)

            # Create a NetworkX graph from the edge matrix
            NetworkX_graph = from_numpy_array(edge_matrix)

            # Degree, betweenness centrality, clustering coefficient and local
            # efficiency of each node, plus the global efficiency of the graph.
            degree_dict = dict(NetworkX_graph.degree())
            between_central_dict = nx.betweenness_centrality(NetworkX_graph)
            cluster_coeff_dict = nx.clustering(NetworkX_graph)
            global_eff = nx.global_efficiency(NetworkX_graph)
            local_eff_dict = self._local_efficiency(NetworkX_graph)

            # Convert the per-node dictionaries to NumPy arrays, then normalize them.
            degree_array_norm = normalize_array(dict_to_array(degree_dict))
            between_central_array_norm = normalize_array(dict_to_array(between_central_dict))

            local_efficiency_array = dict_to_array(local_eff_dict)
            local_eff_array_norm = normalize_array(local_efficiency_array)

            if global_eff > 0:
                ratio_local_global_array = local_efficiency_array / global_eff
            else:
                # No pair of nodes is connected: the ratio is undefined, use 0
                # instead of dividing by zero.
                ratio_local_global_array = np.zeros_like(local_efficiency_array, dtype=float)
            ratio_local_global_array_norm = normalize_array(ratio_local_global_array)

            cluster_coeff_array_norm = normalize_array(dict_to_array(cluster_coeff_dict))

            # Age (min-max scaled) and sex of the patient, repeated for every node.
            patient_age = ages[patient_idx]
            age_norm = (patient_age - min_age) / (max_age - min_age)
            patient_sex = int(sex[patient_idx])
            age_array = np.full((nbr_ROIs,), age_norm)
            sex_array = np.full((nbr_ROIs,), patient_sex)

            # One column per feature, one row per node.
            features = np.column_stack((
                degree_array_norm,
                between_central_array_norm,
                local_eff_array_norm,
                cluster_coeff_array_norm,
                ratio_local_global_array_norm,
                motifs_array,
                discords_array,
                age_array,
                sex_array,
            ))
            x = torch.tensor(features, dtype=torch.float)

            # Create a PyTorch Geometric Data object from the NetworkX graph
            graph_data = from_networkx(NetworkX_graph)
            ## The node feature matrix
            graph_data.x = x
            ## The target/output variable that we want to predict is the diagnostic label of the patient
            # NOTE(review): the label is stored as-is; confirm its type is one
            # that collate() turns into a tensor.
            graph_data.y = diagnostic_label[patient_idx]
            graphs.append(graph_data)

        data, slices = self.collate(graphs)
        torch.save((data, slices), self.processed_paths[0])

In [None]:
# Instantiate the dataset for one configuration and print its summary.
method = 'pearson'
weight = False
threshold = 0.4

root = f'Raw_to_graph_MatrixProfile/ADNI_T_{threshold}_W_{weight}_M_{method}'
dataset = Raw_to_Graph_MatrixProfile(root, threshold, method, weight)
dataset_features_and_stats(dataset)

In [None]:
# Instantiate the dataset for every (weight, method, threshold) combination and print each summary.
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
weights = [False, True]
methods = ['pearson', 'spearman', 'kendall', 'partial']

# Same iteration order as three nested loops: weight outermost, threshold innermost.
for weight, method, threshold in ((w, m, t) for w in weights for m in methods for t in thresholds):
    root = f'Raw_to_graph_MatrixProfile/ADNI_T_{threshold}_W_{weight}_M_{method}'
    dataset = Raw_to_Graph_MatrixProfile(root=root, threshold=threshold, method=method, weight=weight)
    dataset_features_and_stats(dataset)