In [1]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.5.3


In [2]:
!pip install stumpy

Collecting stumpy
  Downloading stumpy-1.12.0-py3-none-any.whl (169 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.1/169.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: stumpy
Successfully installed stumpy-1.12.0


In [3]:
import os
import numpy as np
import pandas as pd
import pickle as pkl
import pdb

import torch
from torch_geometric.data import InMemoryDataset
from torch_geometric.utils import from_networkx

import networkx as nx
from networkx.convert_matrix import from_numpy_array

# Additional imports to deal with Matrix Profiling. Only needed for the matrix profiling part of the project that is currently commented out
# import matrixprofile as mp
import stumpy
from stumpy import stump

In [22]:
# Mounting my drive to have access to the other files
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
!git clone https://github.com/mathilde-cros/Bachelor_Thesis_Alzheimers_HGNN.git

Cloning into 'Bachelor_Thesis_Alzheimers_HGNN'...
remote: Enumerating objects: 19935, done.[K
remote: Counting objects: 100% (1987/1987), done.[K
remote: Compressing objects: 100% (1908/1908), done.[K
remote: Total 19935 (delta 69), reused 1956 (delta 54), pack-reused 17948[K
Receiving objects: 100% (19935/19935), 5.15 GiB | 27.51 MiB/s, done.
Resolving deltas: 100% (3560/3560), done.
Updating files: 100% (4703/4703), done.


In [23]:
# Creating a dictionary of lists of paths to the correlation matrices for each method. Each list in the dictionary represents a different method.
methods = ['pearson']
full_corr_path_lists = {}
for method in methods:
    method_dir = f'/content/drive/MyDrive/Bachelor-Thesis-Mcros/Bachelor_Thesis_Alzheimers_HGNN/ADNI_full/corr_matrices_selected_regions/corr_matrix_{method}/'
    full_corr_path_lists[method] = []
    for file in os.listdir(method_dir):
        full_corr_path_lists[method].append(file)

In [24]:
# Generating the diagnostic file from the diagnostic_label.csv file
diagnostic_label = np.loadtxt('/content/drive/MyDrive/Bachelor-Thesis-Mcros/Bachelor_Thesis_Alzheimers_HGNN/ADNI_full/diagnostic_label.csv', dtype=str, delimiter=',')

# Combining the 'EMCI', 'LMCI' and 'MCI' diagnostics into a single 'MCI' label for simplicity, then one-hot encoding the diagnostics
for patient in range(len(diagnostic_label)):
    if diagnostic_label[patient] == 'CN':
        diagnostic_label[patient] = 0
    elif diagnostic_label[patient] == 'SMC':
        diagnostic_label[patient] = 1
    elif diagnostic_label[patient] == 'EMCI' or diagnostic_label[patient] == 'LMCI' or diagnostic_label[patient] == 'MCI':
        diagnostic_label[patient] = 2
    elif diagnostic_label[patient] == 'AD':
        diagnostic_label[patient] = 3
    else:
        print('Error: Diagnostic label not recognised')
        break

In [25]:
# Defining functions to simplify the code in the class Raw_to_Graph.

# To convert a dictionnary into a numpy array
def dict_to_array(dict):
    array = np.array(list(dict.values()))
    return array

# To normalize an array
def normalize_array(array):
    norm_array = (array - np.mean(array)) / np.std(array)
    return norm_array

In [26]:
# Functions to implement the matrix profile algorithm using STUMPY++.
# This is commented out as my local kernel dies when running this cell because of memory issues with matrix profiling. I acquired the data by running it on colab and saving it in the repo here. Only uncomment if needed.

# Computing the matrix profile using STUMPY++
def compute_matrix_profile(time_series, m=3):
    # Look into 3, 10, 20 m
    # Here the m parameter is the window size, i.e. the length of the subsequence to compute the matrix profile for.
    # Are we looking for short-lived neural activations (then small m), or are we interested in detecting longer-term changes or anomalies in brain activity (large m).
    # pdb.set_trace()
    matrix_profile = stump(time_series, m)
    return matrix_profile

# Finding the motif and discord in the matrix profile
def find_motif_discord(matrix_profile):
    motif_idx = motif_idx = np.argsort(matrix_profile[:, 0])[0]
    discord_idx = discord_idx = np.argsort(matrix_profile[:, 0])[-1]
    return motif_idx, discord_idx

# Function to compute the matrix profile, motifs and discords for a correlation matrix
def matrix_profile(corr_matrix):
    matrix_profiles = []
    motifs = []
    discords = []
    for row in corr_matrix:
        mp = compute_matrix_profile(row)
        matrix_profiles.append(mp)
        motif_idx, discord_idx = find_motif_discord(mp)
        motifs.append(motif_idx)
        discords.append(discord_idx)
    motifs_array = np.array(motifs)
    discords_array = np.array(discords)
    return matrix_profiles, motifs_array, discords_array

# Computing the matrix profiles for all the correlation matrices we have and saving them in a folder
for method in methods:
    method_dir = f'/content/drive/MyDrive/Bachelor-Thesis-Mcros/Bachelor_Thesis_Alzheimers_HGNN/ADNI_full/corr_matrices_selected_regions/corr_matrix_{method}/'
    for file in os.listdir(method_dir):
        corr_matrix = np.loadtxt(method_dir + file, delimiter=',')
        matrix_profiles, motifs, discords = matrix_profile(corr_matrix)
        path_mp = f'/content/drive/MyDrive/Bachelor-Thesis-Mcros/Bachelor_Thesis_Alzheimers_HGNN/ADNI_full/matrix_profiles_selected_regions/matrix_profile_{method}/'
        if not os.path.exists(path_mp):
          os.mkdir(path_mp)
        profile_dict = {}
        profile_dict['mp'] = matrix_profiles
        profile_dict['motifs'] = motifs
        profile_dict['discords'] = discords
        with open(f'{path_mp}/{file}', "wb") as fl:
          pkl.dump(profile_dict, fl)
        print("Done writing dict into .txt file")

Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writing dict into .txt file
Done writi

In [27]:
with open('/content/drive/MyDrive/Bachelor-Thesis-Mcros/Bachelor_Thesis_Alzheimers_HGNN/ADNI_full/matrix_profiles_selected_regions/matrix_profile_pearson/patient_002_S_0295.csv', "rb") as fl:
  patient_dict = pkl.load(fl)

In [28]:
np.array(patient_dict['mp']).shape

(40, 38, 4)

In [29]:
# Defining a class to preprocess raw data into a format suitable for training Graph Neural Networks (GNNs).
## With the possibility of assigning weight to edges, adding the age feature, sex feature, and matrixe profiling.

class Raw_to_Graph_reduced_reg(InMemoryDataset):
    def __init__(self, root, threshold, method, weight=False, age=False, sex=False, matrixprofile=False, transform=None, pre_transform=None):
        self.threshold = threshold
        self.method = method
        self.weight = weight
        self.age = age
        self.sex = sex
        self.matrixprofile = matrixprofile
        super().__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def processed_file_names(self):
        return ['data.pt']

    # This function is used to process the raw data into a format suitable for GNNs, by constructing graphs out of the connectivity matrices.
    def process(self):
        graphs=[]
        corr_matrices = full_corr_path_lists[self.method]
        for patient_idx, patient_matrix in enumerate(corr_matrices):
            path = f'/content/drive/MyDrive/Bachelor-Thesis-Mcros/Bachelor_Thesis_Alzheimers_HGNN/ADNI_full/corr_matrices_selected_regions/corr_matrix_{self.method}/{patient_matrix}'
            corr_matrix = pd.read_csv(path, header=None).values
            # Here ROIs stands for Regions of Interest
            nbr_ROIs = corr_matrix.shape[0]
            edge_matrix = np.zeros((nbr_ROIs,nbr_ROIs))
            for j in range(nbr_ROIs):
                for k in range(nbr_ROIs):
                    # Here we are using the absolute value of each element of the correlation matrix, as the corr coeff is in the range [-1,1].
                    if np.abs(corr_matrix[j,k]) < self.threshold:
                        edge_matrix[j,k] = 0
                    else:
                        if self.weight:
                            # Here we assign the absolute value of the correlation coefficient as the edge weight.
                            edge_matrix[j,k] = corr_matrix[j,k]
                        else:
                            # Here we assign 1 as the edge weight, i.e. regardless of the the absolute value of the correlation coefficient.
                            edge_matrix[j,k] = 1
            # Create a NetworkX graph from the edge matrix
            NetworkX_graph = from_numpy_array(edge_matrix)

            # Compute the degree, betweenness centrality, clustering coefficient, local efficiency for each node of the graph and the global efficiency of the graph
            degree_dict = dict(NetworkX_graph.degree())
            between_central_dict = nx.betweenness_centrality(NetworkX_graph)
            cluster_coeff_dict = nx.clustering(NetworkX_graph)
            global_eff = nx.global_efficiency(NetworkX_graph)
            local_eff_dict = {}
            for node in NetworkX_graph.nodes():
                subgraph_neighb = NetworkX_graph.subgraph(NetworkX_graph.neighbors(node))
                if subgraph_neighb.number_of_nodes() > 1:
                    efficiency = nx.global_efficiency(subgraph_neighb)
                else:
                    efficiency = 0.0
                local_eff_dict[node] = efficiency

            # Convert the degree, betweenness centrality, local efficiency, clustering coefficient and ratio of local to global efficiency dictionaries to NumPy arrays then normalize them
            degree_array = dict_to_array(degree_dict)
            degree_array_norm = normalize_array(degree_array)

            between_central_array = dict_to_array(between_central_dict)
            between_central_array_norm = normalize_array(between_central_array)

            local_efficiency_array = dict_to_array(local_eff_dict)
            local_eff_array_norm = normalize_array(local_efficiency_array)

            ratio_local_global_array = dict_to_array(local_eff_dict) / global_eff
            ratio_local_global_array_norm = normalize_array(ratio_local_global_array)

            cluster_coeff_array = dict_to_array(cluster_coeff_dict)
            cluster_coeff_array_norm = normalize_array(cluster_coeff_array)

            # Initializing an array for the graph features
            x_array = np.stack([degree_array_norm, between_central_array_norm, local_eff_array_norm, cluster_coeff_array_norm, ratio_local_global_array_norm], axis=-1)
            x_array = x_array.astype(np.float32)

            if self.matrixprofile:
                path = f'/content/drive/MyDrive/Bachelor-Thesis-Mcros/Bachelor_Thesis_Alzheimers_HGNN/ADNI_full/matrix_profiles_selected_regions/matrix_profile_{method}/{patient_matrix}'
                with open(path, "rb") as fl:
                  patient_dict = pkl.load(fl)
                # combine dimensions
                features = np.array(patient_dict['mp']).reshape(len(patient_dict['mp']),-1)
                features = features.astype(np.float32)
                x_array = np.concatenate((x_array, features), axis=-1)

            # Concatenate the degree, participation coefficient, betweenness centrality, local efficiency, and ratio of local to global efficiency arrays to form a single feature vector
            x = torch.tensor(x_array, dtype=torch.float)

            # Create a Pytorch Geometric Data object from the NetworkX
            graph_data = from_networkx(NetworkX_graph)
            ## The feature matrix of the graph is the degree, betweenness centrality, local efficiency, clustering coefficient and ratio of local to global efficiency of each node
            graph_data.x = x
            ## The target/output variable that we want to predict is the diagnostic label of the patient
            graph_data.y = float(diagnostic_label[patient_idx])
            graphs.append(graph_data)

        data, slices = self.collate(graphs)
        torch.save((data, slices), self.processed_paths[0])

In [30]:
# Defining a function to display some statistics and features about the dataset.
def dataset_features_and_stats(dataset):
    print()
    print(f'Dataset: {dataset}:')
    print('====================')
    print(f'Number of graphs: {len(dataset)}')
    print(f'Weighted: {dataset.weight}')
    print(f'Threshold: {dataset.threshold}')
    print(f'Correlation Method: {dataset.method}')
    print(f'Number of features: {dataset.num_features}')
    print(f'Number of classes: {len(np.unique(diagnostic_label))}')

    # Getting the first graph object in the dataset.
    data = dataset[0]

    print()
    print(data)
    print('=============================================================')

    # Some statistics about the first graph.
    print(f'Number of nodes: {data.num_nodes}')
    print(f'Number of edges: {data.num_edges}')
    print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
    print(f'Has isolated nodes: {data.has_isolated_nodes()}')
    print(f'Has self-loops: {data.has_self_loops()}')
    print(f'Is undirected: {data.is_undirected()}')

In [31]:
# Testing the class Raw_to_Graph_reduced_reg with one example and saving it
threshold = 0.4
weight = False
age = False
sex = False
matrixprofile = True
method = 'pearson'

root = f'/content/drive/MyDrive/Bachelor-Thesis-Mcros/Bachelor_Thesis_Alzheimers_HGNN/Raw_to_graph_reduced_reg/ADNI_T_{threshold}_M_{method}_W{weight}_A{age}_S{sex}_MP{matrixprofile}'
dataset = Raw_to_Graph_reduced_reg(root=root, threshold=threshold, method=method, weight=weight, age=age, sex=sex, matrixprofile=matrixprofile)
dataset_features_and_stats(dataset)

Processing...



Dataset: Raw_to_Graph_reduced_reg(197):
Number of graphs: 197
Weighted: False
Threshold: 0.4
Correlation Method: pearson
Number of features: 157
Number of classes: 4

Data(edge_index=[2, 352], weight=[352], x=[40, 157], y=[1], num_nodes=40)
Number of nodes: 40
Number of edges: 352
Average node degree: 8.80
Has isolated nodes: False
Has self-loops: True
Is undirected: True


Done!


In [32]:
# Testing the class Raw_to_Graph_reduced_reg with different thresholds, weights and methods and saving it
# thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
thresholds = [0.4, 0.5, 0.6, 0.7, 0.8]
# weights = [True, False]
weights = [True]
age = False
sex = False
matrixprofile = True
# methods = ['pearson', 'spearman', 'kendall', 'partial']
methods = ['pearson']

for weight in weights:
    for method in methods:
        for threshold in thresholds:
            root = f'/content/drive/MyDrive/Bachelor-Thesis-Mcros/Bachelor_Thesis_Alzheimers_HGNN/Raw_to_graph_reduced_reg/ADNI_T_{threshold}_M_{method}_W{weight}_A{age}_S{sex}_MP{matrixprofile}'
            dataset = Raw_to_Graph_reduced_reg(root=root, threshold=threshold, method=method, weight=weight, age=age, sex=sex, matrixprofile=matrixprofile)
            dataset_features_and_stats(dataset)

Processing...
Done!
Processing...



Dataset: Raw_to_Graph_reduced_reg(197):
Number of graphs: 197
Weighted: True
Threshold: 0.4
Correlation Method: pearson
Number of features: 157
Number of classes: 4

Data(edge_index=[2, 352], weight=[352], x=[40, 157], y=[1], num_nodes=40)
Number of nodes: 40
Number of edges: 352
Average node degree: 8.80
Has isolated nodes: False
Has self-loops: True
Is undirected: True


  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
Done!
Processing...



Dataset: Raw_to_Graph_reduced_reg(197):
Number of graphs: 197
Weighted: True
Threshold: 0.5
Correlation Method: pearson
Number of features: 157
Number of classes: 4

Data(edge_index=[2, 218], weight=[218], x=[40, 157], y=[1], num_nodes=40)
Number of nodes: 40
Number of edges: 218
Average node degree: 5.45
Has isolated nodes: False
Has self-loops: True
Is undirected: True


  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
Done!
Processing...



Dataset: Raw_to_Graph_reduced_reg(197):
Number of graphs: 197
Weighted: True
Threshold: 0.6
Correlation Method: pearson
Number of features: 157
Number of classes: 4

Data(edge_index=[2, 136], weight=[136], x=[40, 157], y=[1], num_nodes=40)
Number of nodes: 40
Number of edges: 136
Average node degree: 3.40
Has isolated nodes: True
Has self-loops: True
Is undirected: True


  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std


Dataset: Raw_to_Graph_reduced_reg(197):
Number of graphs: 197
Weighted: True
Threshold: 0.7
Correlation Method: pearson
Number of features: 157
Number of classes: 4

Data(edge_index=[2, 86], weight=[86], x=[40, 157], y=[1], num_nodes=40)
Number of nodes: 40
Number of edges: 86
Average node degree: 2.15
Has isolated nodes: True
Has self-loops: True
Is undirected: True


  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std(array)
  norm_array = (array - np.mean(array)) / np.std


Dataset: Raw_to_Graph_reduced_reg(197):
Number of graphs: 197
Weighted: True
Threshold: 0.8
Correlation Method: pearson
Number of features: 157
Number of classes: 4

Data(edge_index=[2, 64], weight=[64], x=[40, 157], y=[1], num_nodes=40)
Number of nodes: 40
Number of edges: 64
Average node degree: 1.60
Has isolated nodes: True
Has self-loops: True
Is undirected: True


Done!
