In [1]:
import os
import pickle as pkl
import subprocess

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd

## Data preparation

## Task 2a
We construct the networks for each embedding algorithm separately, because they expect different input formats.

### Deepwalk

In [29]:
def csv_to_graph(path, threshold=0.3):
    """Build an unweighted graph from a CSV matrix by thresholding its entries.

    Parameters
    ----------
    path : str
        CSV file without a header row; expected to hold a square numeric matrix.
    threshold : float, default 0.3
        Entries strictly greater than this value become edges.

    Returns
    -------
    G : networkx.Graph
        Graph built from ``A`` with ``nx.from_pandas_adjacency``.
    A : pandas.DataFrame
        0/1 adjacency matrix (float dtype) whose diagonal is always zero.
    """
    # load in csv as dataframe
    df = pd.read_csv(path, header=None)

    # threshold the matrix, then clear the diagonal explicitly: subtracting the
    # identity would leave -1 (a spurious self-loop) wherever the diagonal entry
    # does not exceed the threshold
    adjacency = (df.to_numpy() > threshold).astype(float)
    np.fill_diagonal(adjacency, 0.0)
    A = pd.DataFrame(adjacency, index=df.index, columns=df.columns)

    # convert to graph
    G = nx.from_pandas_adjacency(A)

    return G, A



def save_graphs_as_adjlist(folder, output_folder="embeddings/deepwalk/adjlists", n_patients=60):
    """Threshold every patient matrix in ``folder`` and save it as an adjacency list.

    For each patient ``i`` in ``1..n_patients`` and each visit ``v`` in ``(1, 2)``
    the file ``{folder}/p{i:03}_{v}.csv`` is converted with ``csv_to_graph`` and
    written to ``{output_folder}/p{i:03}_{v}.adjlist``.

    Parameters
    ----------
    folder : str
        Directory holding the input CSV files.
    output_folder : str, optional
        Directory for the ``.adjlist`` files; created if it does not exist.
    n_patients : int, default 60
        Number of patients to process.

    Returns
    -------
    G1s, G2s : list of networkx.Graph
        Graphs of visit 1 and visit 2, in patient order.
    """
    # write_adjlist does not create missing directories
    os.makedirs(output_folder, exist_ok=True)

    graphs_by_visit = {1: [], 2: []}
    for i in range(1, n_patients + 1):
        for visit in (1, 2):
            stem = f"p{i:03}_{visit}"
            G, _ = csv_to_graph(f"{folder}/{stem}.csv")
            nx.write_adjlist(G, f"{output_folder}/{stem}.adjlist")
            graphs_by_visit[visit].append(G)

    return graphs_by_visit[1], graphs_by_visit[2]


# Convert every matrix in "FC" to a graph and write one .adjlist file per
# patient and visit (the input format used for DeepWalk below).
G1s, G2s = save_graphs_as_adjlist("FC")

### DMGI

In [4]:
def load_graphs(folder, n_patients=60):
    """Load the thresholded graphs and adjacency matrices of all patients.

    For each patient ``i`` in ``1..n_patients`` and each visit ``v`` in ``(1, 2)``
    the file ``{folder}/p{i:03}_{v}.csv`` is converted with ``csv_to_graph``.

    Parameters
    ----------
    folder : str
        Directory holding the input CSV files.
    n_patients : int, default 60
        Number of patients to load.

    Returns
    -------
    G1s, G2s : list of networkx.Graph
        Graphs of visit 1 and visit 2, in patient order.
    graphs_info : dict
        ``{'G1': {'As': [...], 'features': None}, 'G2': {...}}`` where ``As`` is
        the list of adjacency DataFrames of that visit. ``features`` is left as
        ``None`` here and is filled in by ``load_node_info``.
    """
    graphs_by_visit = {1: [], 2: []}
    adjacency_by_visit = {1: [], 2: []}

    for i in range(1, n_patients + 1):
        for visit in (1, 2):
            G, A = csv_to_graph(f'{folder}/p{i:03}_{visit}.csv')
            graphs_by_visit[visit].append(G)
            adjacency_by_visit[visit].append(A)

    graphs_info = {
        'G1': {'As': adjacency_by_visit[1], 'features': None},
        'G2': {'As': adjacency_by_visit[2], 'features': None},
    }

    return graphs_by_visit[1], graphs_by_visit[2], graphs_info


def load_node_info(folder, graph_info, n_patients=60, n_columns=240):
    """Attach per-visit mean feature matrices to ``graph_info``.

    For each visit ``v`` in ``(1, 2)`` the files
    ``{folder}/{name}_p{i:03}_{v}.csv`` (``name`` being the last component of
    ``folder``) are read for ``i`` in ``1..n_patients``, truncated to their first
    ``n_columns`` columns and averaged element-wise over patients.

    Parameters
    ----------
    folder : str
        Directory holding the CSV files; its base name is also the filename prefix.
    graph_info : dict
        Dictionary with keys ``'G1'`` and ``'G2'``; modified in place.
    n_patients : int, default 60
        Number of patients to average over.
    n_columns : int, default 240
        Number of leading columns kept from every file.

    Returns
    -------
    dict
        The same ``graph_info`` object with ``['G1']['features']`` and
        ``['G2']['features']`` set to the mean arrays.

    Raises
    ------
    ValueError
        If the truncated matrices of one visit do not all have the same shape.
    """
    prefix = os.path.basename(os.path.normpath(folder))

    for visit in (1, 2):
        matrices = []
        for i in range(1, n_patients + 1):
            filename = os.path.join(folder, f"{prefix}_p{i:03}_{visit}.csv")
            values = pd.read_csv(filename, header=None).to_numpy()
            # truncate before accumulating so files with extra columns are accepted
            matrices.append(values[:, :n_columns])

        shapes = {m.shape for m in matrices}
        if len(shapes) != 1:
            raise ValueError(
                f"visit {visit}: inconsistent matrix shapes after truncation: {sorted(shapes)}"
            )

        # divide by the number of matrices actually read, not a fixed constant
        graph_info[f"G{visit}"]["features"] = np.mean(matrices, axis=0)

    return graph_info

In [63]:
# Load the graphs again, this time also keeping the adjacency matrices
# (graphs_info) that are exported for DMGI below.
G1s, G2s, graphs_info = load_graphs("FC")

# Number of graphs per visit, and the node count taken from the first visit-1 graph.
nb_graphs = len(G1s)
nb_nodes = G1s[0].number_of_nodes()

The DMGI embedding algorithm takes node attributes into account. In this case the nodes are brain regions, and to include additional information we used a further folder from the database, "AAL_timeseries". It contains, for each patient, the brain activity of each brain region over a specific time period. For each visit, we combine the brain activity of all 60 patients and compute, for every node, the mean brain activity across all patients.

In [67]:
# Fill graphs_info['G1']['features'] and graphs_info['G2']['features'] with the
# patient-averaged matrices read from the "AAL_timeseries" folder.
graphs_info = load_node_info("AAL_timeseries", graphs_info)

In [68]:
# Write one pickle per visit for DMGI. Each pickle holds the shared node
# features, one adjacency matrix per patient under the keys A_0, A_1, ..., and
# index/label entries in which every node appears in train, val and test with a
# constant label of 1 (presumably placeholders the DMGI loader requires - TODO confirm).
graphs = ['G1', 'G2']
for graph in graphs:
    output = {
        'feature': graphs_info[graph]['features'],
        'train_idx': np.arange(nb_nodes),
        'val_idx': np.arange(nb_nodes),
        'test_idx': np.arange(nb_nodes),
        'label': np.expand_dims(np.ones(nb_nodes), axis=1),
    }
    for i, A in enumerate(graphs_info[graph]['As']):
        output[f"A_{i}"] = A

    # context manager guarantees the file is flushed and closed
    with open('../DMGI/data/{}.pkl'.format(graph), 'wb') as pkl_file:
        pkl.dump(output, pkl_file, protocol=pkl.HIGHEST_PROTOCOL)

The two graphs are written to `../DMGI/data/G1.pkl` and `../DMGI/data/G2.pkl`, i.e. into the 'DMGI/data/' folder.

### Embeddings

## Task 2b

For the single-graph node embedding we chose DeepWalk. It belongs to the family of graph embedding techniques that use walks, a concept from graph theory in which a graph is traversed by moving from one node to another, as long as the two nodes are connected by an edge. One advantage of this algorithm is its conceptual simplicity, and because the node embeddings are independent of each other, it is easily parallelizable. It is a good benchmark for comparing with other embeddings.

For the multi-relation graph node embedding, we chose DMGI, which can jointly
integrate the embeddings from multiple types of relations between nodes through a consensus regularization framework and a universal discriminator. It can also take node attributes into account when computing the embeddings, which we found useful for this analysis because it allowed us to use additional data from the given database and thereby include additional information about the nodes.

Here we apply the implementations of two algorithms, Deepwalk and DMGI.

### Deepwalk

* Here we compute deepwalk embeddings for all 120 graphs
* This takes around 40 mins to run on my machine
* The peer reviewer can skip running this section if they wish.

In [28]:
def compute_deepwalk_embeddings(input_file, output_file):
    """Run the DeepWalk command-line tool on one adjacency-list file.

    ``python main.py --input <input_file> --output <output_file>`` is executed
    with ``../deepwalk`` as its working directory, so relative paths in both
    arguments are resolved from that folder. The notebook's own working
    directory is left untouched.

    Parameters
    ----------
    input_file : str
        Path of the ``.adjlist`` file to embed.
    output_file : str
        Path the embedding is written to.

    Raises
    ------
    subprocess.CalledProcessError
        If the DeepWalk process exits with a non-zero status.
    """
    # An argument list (no shell) avoids quoting problems; cwd= replaces the
    # chdir round trip, which used Windows-only paths and was not undone on error.
    subprocess.run(
        ["python", "main.py", "--input", input_file, "--output", output_file],
        cwd=os.path.join("..", "deepwalk"),
        check=True,
    )

    # only reached when the command succeeded
    print("Wrote embedding to " + output_file)
    return


# Both folders are written relative to the deepwalk directory (the working
# directory of the DeepWalk process); "../Task2" also resolves from this
# notebook's own folder.
input_folder = "../Task2/embeddings/deepwalk/adjlists"
output_folder = "../Task2/embeddings/deepwalk/graphs"
os.makedirs(output_folder, exist_ok=True)

# loop over all graphs, in a deterministic order, skipping anything that is not
# an adjacency list (e.g. checkpoint folders)
for file in sorted(os.listdir(input_folder)):
    stem, extension = os.path.splitext(file)
    if extension != ".adjlist":
        continue

    input_file = input_folder + "/" + file
    output_file = output_folder + "/" + stem + ".embeddings"

    print(f"Computing deepwalk embeddings for {file}...")
    # run deepwalk code with default settings
    compute_deepwalk_embeddings(input_file, output_file)

Computing deepwalk embeddings for p001_1.adjlist...
Wrote embedding to ../Task2/embeddings/deepwalk/graphs/p001_1.embeddings
Computing deepwalk embeddings for p001_2.adjlist...
Wrote embedding to ../Task2/embeddings/deepwalk/graphs/p001_2.embeddings
Computing deepwalk embeddings for p002_1.adjlist...
Wrote embedding to ../Task2/embeddings/deepwalk/graphs/p002_1.embeddings
Computing deepwalk embeddings for p002_2.adjlist...
Wrote embedding to ../Task2/embeddings/deepwalk/graphs/p002_2.embeddings
Computing deepwalk embeddings for p003_1.adjlist...
Wrote embedding to ../Task2/embeddings/deepwalk/graphs/p003_1.embeddings
Computing deepwalk embeddings for p003_2.adjlist...
Wrote embedding to ../Task2/embeddings/deepwalk/graphs/p003_2.embeddings
Computing deepwalk embeddings for p004_1.adjlist...
Wrote embedding to ../Task2/embeddings/deepwalk/graphs/p004_1.embeddings
Computing deepwalk embeddings for p004_2.adjlist...
Wrote embedding to ../Task2/embeddings/deepwalk/graphs/p004_2.embeddings


Wrote embedding to ../Task2/embeddings/deepwalk/graphs/p034_1.embeddings
Computing deepwalk embeddings for p034_2.adjlist...
Wrote embedding to ../Task2/embeddings/deepwalk/graphs/p034_2.embeddings
Computing deepwalk embeddings for p035_1.adjlist...
Wrote embedding to ../Task2/embeddings/deepwalk/graphs/p035_1.embeddings
Computing deepwalk embeddings for p035_2.adjlist...
Wrote embedding to ../Task2/embeddings/deepwalk/graphs/p035_2.embeddings
Computing deepwalk embeddings for p036_1.adjlist...
Wrote embedding to ../Task2/embeddings/deepwalk/graphs/p036_1.embeddings
Computing deepwalk embeddings for p036_2.adjlist...
Wrote embedding to ../Task2/embeddings/deepwalk/graphs/p036_2.embeddings
Computing deepwalk embeddings for p037_1.adjlist...
Wrote embedding to ../Task2/embeddings/deepwalk/graphs/p037_1.embeddings
Computing deepwalk embeddings for p037_2.adjlist...
Wrote embedding to ../Task2/embeddings/deepwalk/graphs/p037_2.embeddings
Computing deepwalk embeddings for p038_1.adjlist...


In [30]:
#average over pre and post treatment patients to get two embeddings
def average_deepwalk_embeddings(input_folder, output_folder, G, dimensions=64):
    """Average the DeepWalk embeddings of all patients for one visit.

    Every file in ``input_folder`` whose name ends in ``_{G}.embeddings`` is
    read (first line skipped, space-separated, first column is the node id),
    indexed by node id and averaged element-wise. The result is written to
    ``{output_folder}/G{G}.csv``.

    Parameters
    ----------
    input_folder : str
        Directory holding the ``.embeddings`` files.
    output_folder : str
        Directory for the averaged CSV; created if it does not exist.
    G : int or str
        Visit identifier used in the file names (1 or 2 in this notebook).
    dimensions : int, default 64
        Number of embedding columns per node in the input files.

    Returns
    -------
    pandas.DataFrame
        Mean embedding, indexed by ``node_id`` with columns ``'0'..'dimensions-1'``.

    Raises
    ------
    ValueError
        If no embedding file for visit ``G`` is found.
    """
    columns = ["node_id"] + [str(i) for i in range(dimensions)]

    # match on the full suffix instead of a character position, so other files
    # in the folder can never be mistaken for an embedding
    suffix = f"_{G}.embeddings"
    files = sorted(name for name in os.listdir(input_folder) if name.endswith(suffix))
    if not files:
        raise ValueError(f"no '*{suffix}' files found in {input_folder!r}")

    total_embedding = None
    for file in files:
        embedding = (
            pd.read_csv(
                os.path.join(input_folder, file),
                delimiter=" ",
                skiprows=1,
                names=columns,
            )
            .set_index("node_id")
            .sort_index()
        )
        total_embedding = embedding if total_embedding is None else total_embedding + embedding

    total_embedding = total_embedding / len(files)

    os.makedirs(output_folder, exist_ok=True)
    total_embedding.to_csv(os.path.join(output_folder, "G" + str(G) + ".csv"))

    return total_embedding

# Mean DeepWalk embedding per visit; each call also writes G1.csv / G2.csv
# into embeddings/deepwalk.
G1 = average_deepwalk_embeddings("embeddings/deepwalk/graphs","embeddings/deepwalk",1)
G2 = average_deepwalk_embeddings("embeddings/deepwalk/graphs","embeddings/deepwalk",2)


### DMGI

In [None]:
# create embeddings for G1
# One metapath per patient adjacency matrix stored in the pickle: A_0 ... A_59.
dmgi_metapaths = ",".join(f"A_{i}" for i in range(60))
# Run DMGI from its own folder via cwd= so the notebook's working directory is
# never changed (the previous chdir back relied on an undefined `cwd`).
subprocess.run(
    ["python", "main.py", "--embedder", "DMGI", "--dataset", "G1", "--metapaths", dmgi_metapaths],
    cwd=os.path.join("..", "DMGI"),
    check=True,
)

In [None]:
# create embeddings for G2
# One metapath per patient adjacency matrix stored in the pickle: A_0 ... A_59.
dmgi_metapaths = ",".join(f"A_{i}" for i in range(60))
# Run DMGI from its own folder via cwd= so the notebook's working directory is
# never changed (the previous chdir back relied on an undefined `cwd`).
subprocess.run(
    ["python", "main.py", "--embedder", "DMGI", "--dataset", "G2", "--metapaths", dmgi_metapaths],
    cwd=os.path.join("..", "DMGI"),
    check=True,
)

The model embeddings should be saved in 'DMGI/saved_model' folder.