# Task 2c

In [193]:
import torch
import os
import networkx as nx
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from networkx import normalized_cut_size
import ncut

### Loading the graphs

Graph loading functions. These could be changed to load the multigraph directly from file.

In [5]:
def csv_to_graph(path, threshold=0.3):
    """Build a graph from a CSV file holding a square matrix without header.

    An edge is created between two nodes whenever the corresponding matrix
    entry is strictly greater than ``threshold``. Diagonal entries never
    produce an edge, whatever their value.

    Parameters
    ----------
    path : str
        Path of the CSV file (read with ``header=None``).
    threshold : float, default 0.3
        Entries strictly above this value become edges.

    Returns
    -------
    networkx.Graph
        Graph built by ``nx.from_pandas_adjacency`` from the thresholded matrix.
    """
    # load in csv as dataframe
    df = pd.read_csv(path, header=None)

    # Mask that is 0 on the diagonal and 1 elsewhere. Multiplying by it removes
    # the diagonal regardless of its value; subtracting the identity instead
    # would leave -1 (i.e. a self-loop) wherever the diagonal is <= threshold.
    off_diagonal = 1 - np.identity(df.shape[0])

    # threshold dataframe and remove diagonal
    A = (df > threshold).astype(int) * off_diagonal

    # convert to graph
    G = nx.from_pandas_adjacency(A)

    return G

def load_graphs(folder, n_subjects=60):
    """Load the two graphs of every subject found in ``folder``.

    For each subject index ``i`` in ``1..n_subjects`` the files
    ``{folder}/p{i:03}_1.csv`` and ``{folder}/p{i:03}_2.csv`` are converted
    to graphs with ``csv_to_graph`` (default threshold).

    Parameters
    ----------
    folder : str
        Directory containing the per-subject CSV files.
    n_subjects : int, default 60
        Number of subjects to load; indices are 1-based and zero-padded to
        three digits in the file names.

    Returns
    -------
    tuple[list, list]
        ``(G1s, G2s)``: the graphs read from the ``_1`` files and from the
        ``_2`` files, both ordered by subject index.
    """
    G1s, G2s = [], []

    for i in range(1, n_subjects + 1):
        G1s.append(csv_to_graph(f'{folder}/p{i:03}_1.csv'))
        G2s.append(csv_to_graph(f'{folder}/p{i:03}_2.csv'))

    return G1s, G2s

# Load the per-subject graphs from the "FC" folder: G1s from the `_1` files, G2s from the `_2` files.
G1s, G2s = load_graphs("FC")

In [63]:
def graph_list_to_multigraph(graphs):
    """Stack a sequence of graphs into a single ``nx.MultiGraph``.

    The nodes of every input graph are added to the result, and each edge of
    each input graph is added as its own edge, so an edge present in several
    inputs shows up as several parallel edges.

    Parameters
    ----------
    graphs : iterable of networkx graphs
        Graphs whose nodes and edges are merged.

    Returns
    -------
    networkx.MultiGraph
        The merged multigraph (edge attributes of the inputs are not copied).
    """
    merged = nx.MultiGraph()
    for layer in graphs:
        # Nodes first, so nodes without any edge in this layer are kept too.
        merged.add_nodes_from(layer.nodes)
        merged.add_edges_from(layer.edges)
    return merged

In [84]:
# Merge each list of per-subject graphs into one multigraph (G1 from G1s, G2 from G2s).
G1 = graph_list_to_multigraph(G1s)
G2 = graph_list_to_multigraph(G2s)

### Loading the embeddings

In [7]:
# Directory that holds the saved embedding files.
EMBEDDING_DIR = "embedding_pickles"
# File names (relative to EMBEDDING_DIR) of the saved objects used for G1 and G2.
G1_PATH = "best_G1_DMGI.pkl"
G2_PATH = "best_G2_DMGI.pkl"

In [8]:
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code -- only load files from a trusted source.
# NOTE(review): the loaded object is indexed with 'H'; presumably a dict whose
# 'H' entry is the node-embedding tensor -- confirm against the training code.
g1_model = torch.load(os.path.join(EMBEDDING_DIR, G1_PATH))
# squeeze() drops all size-1 dimensions of the stored tensor.
g1_embedding = g1_model['H'].squeeze()

g2_model = torch.load(os.path.join(EMBEDDING_DIR, G2_PATH))
g2_embedding = g2_model['H'].squeeze()

### Loading additional clinical data

In [91]:
# Read the clinical table from "clinical.csv" and show it as the cell output.
clinical_csv = pd.read_csv("clinical.csv")
clinical_csv

60


Unnamed: 0,ID,SEX,MS_TYPE,AGE,EDSS,DATASET,DIAG_YEARS,BMI,THERAPY,outliers_V1,outliers_V2
0,p001,M,primary_progressive,40,7,2,11,19.918,Functional_electric_stimulation,0,14
1,p002,M,secondary_progressive,44,7,2,17,20.529,Motor_Program_Activating_Therapy,4,0
2,p003,M,primary_progressive,51,6,2,21,25.249,Functional_electric_stimulation,4,0
3,p004,M,relapsing_remitting,29,3,2,9,21.5,Functional_electric_stimulation,4,2
4,p005,F,secondary_progressive,60,6,2,21,21.411,Functional_electric_stimulation,0,4
5,p006,M,secondary_progressive,68,4,2,7,21.605,Motor_Program_Activating_Therapy,2,3
6,p007,F,relapsing_remitting,36,4,2,9,19.37,Functional_electric_stimulation,6,7
7,p008,F,relapsing_remitting,32,4,2,4,19.141,Functional_electric_stimulation,5,7
8,p009,F,relapsing_remitting,35,4,1,20,16.436,Motor_Program_Activating_Therapy,3,0
9,p010,M,secondary_progressive,54,6,2,12,23.148,Motor_Program_Activating_Therapy,20,6


## Validation of our NCut implementation
Validates our implementation of NCut against networkx by comparing the NCut values of random binary graph partitions.

In [183]:
# binary partitions validation
#
# For a number of random binary node partitions, compare our NCut
# implementations (binary and k-way) against networkx's normalized_cut_size
# on every graph in `graphs`.

identical = True
graphs = [G1, G2]

n_checks = 10
precision = 1e-10  # maximum tolerated absolute difference to the networkx value

for i in range(n_checks):
    print(f"Checking random partition no. {i}")

    # One random 0/1 label per node; the node count is taken from G1.
    # NOTE(review): this assumes G1 and G2 share the node labels 0..n-1 -- confirm.
    clustering = np.random.randint(0, 2, size=len(G1.nodes))
    part_zero = np.where(clustering == 0)[0]
    part_one = np.where(clustering == 1)[0]

    for graph in graphs:
        # Validate the graph of the current iteration (previously G2 was
        # hard-coded here, so G1 was never checked).
        nx_result = normalized_cut_size(graph, part_zero, part_one)
        our_result = ncut.ncut_multigraph(graph, part_zero, part_one)
        # dtype=object: the two parts usually differ in length, and NumPy
        # refuses to build such a ragged array without an explicit object dtype.
        our_k_result = ncut.k_ncut_multigraph(
            graph, np.asarray([part_zero, part_one], dtype=object)
        )

        if abs(our_result - nx_result) > precision or abs(our_k_result - nx_result) > precision:
            identical = False
            break

    # Stop at the first mismatch; no need to test further partitions.
    if not identical:
        break

print(f"\nOur results were{' ' if identical else ' not '}identical to those of networkx for {n_checks} random partitions.")

Checking random partition no. 0
Checking random partition no. 1
Checking random partition no. 2
Checking random partition no. 3
Checking random partition no. 4
Checking random partition no. 5
Checking random partition no. 6
Checking random partition no. 7
Checking random partition no. 8
Checking random partition no. 9

Our results were identical to those of networkx for 10 random partitions.


In [None]:
# TODO: can k-way n-cut be validated?

## Clustering the data

In [194]:
def ncut_by_k_partition(graph, partition):
    """Evaluate ``ncut.k_ncut_multigraph`` for a partition given as one label per node.

    The labels are grouped into one index array per distinct label value
    (positions where ``partition`` equals that value), and the collection of
    index arrays is handed to ``ncut.k_ncut_multigraph``.

    Parameters
    ----------
    graph
        Graph passed through unchanged to ``ncut.k_ncut_multigraph``.
    partition : array-like
        One label per position; positions sharing a label form one part.

    Returns
    -------
    Whatever ``ncut.k_ncut_multigraph`` returns for this grouping.
    """
    distinct_labels = np.unique(partition)
    members_per_label = [np.where(partition == label)[0] for label in distinct_labels]
    # Object dtype because the parts generally have different sizes.
    parts = np.asarray(members_per_label, dtype=object)
    return ncut.k_ncut_multigraph(graph, parts)

In [163]:
# Sweep over the number of KMeans clusters and record the k-way NCut of the
# resulting partition on both multigraphs.
# (A leftover debugging override that reduced the sweep to k=2 was removed so
# that the sweep matches the k values the later cells expect.)
n_clusters = range(2, 20)

kmeans_ncut_results = []

for n in n_clusters:
    # Fixed random_state and explicit n_init make the clustering (and hence
    # the reported NCut values) reproducible across runs and sklearn versions.
    kmeans = KMeans(n_clusters=n, n_init=10, random_state=0)
    g1_ncut = ncut_by_k_partition(G1, kmeans.fit_predict(g1_embedding))
    g2_ncut = ncut_by_k_partition(G2, kmeans.fit_predict(g2_embedding))
    # Each entry: (label, NCut on G1, NCut on G2)
    kmeans_ncut_results.append((f"KMeans n_clusters={n}", g1_ncut, g2_ncut))



In [161]:
# Print one (label, NCut on G1, NCut on G2) tuple per line.
for result in kmeans_ncut_results:
    print(result)

('KMeans n_clusters=2', 0.4485409889722306, 0.498379984310383)
('KMeans n_clusters=3', 1.3399197916131274, 1.2944462057863382)
('KMeans n_clusters=4', 2.149554661477875, 2.1197873849831925)
('KMeans n_clusters=5', 2.984142938124949, 3.0425825546755423)
('KMeans n_clusters=6', 3.6089015018011126, 3.857967382853049)
('KMeans n_clusters=7', 4.632372513353431, 4.701274601258655)
('KMeans n_clusters=8', 5.517697251901609, 5.527931816535309)
('KMeans n_clusters=9', 6.227619969061243, 6.283002440669202)
('KMeans n_clusters=10', 7.169181946451304, 7.011255537770586)
('KMeans n_clusters=11', 8.111500756031978, 7.921641872525767)
('KMeans n_clusters=12', 9.034166858363342, 8.608744943754676)
('KMeans n_clusters=13', 9.872984410018116, 9.606619766314731)
('KMeans n_clusters=14', 10.871431002335791, 10.686916880708205)
('KMeans n_clusters=15', 11.837992767696251, 11.783916111641672)
('KMeans n_clusters=16', 12.601366193881422, 12.824139699109535)
('KMeans n_clusters=17', 13.537361821607655, 13.842

In [162]:
# trying to normalize the k-way n-cut values but unfortunately they are still dependent on k
# Pair every result with the k it was actually computed for (n_clusters)
# instead of a hard-coded range, which mislabels k whenever the sweep changes.
for k, result in zip(n_clusters, kmeans_ncut_results):
    print(result[0], result[1]/k, result[2]/k)

KMeans n_clusters=2 0.2242704944861153 0.2491899921551915
KMeans n_clusters=3 0.4466399305377091 0.4314820685954461
KMeans n_clusters=4 0.5373886653694687 0.5299468462457981
KMeans n_clusters=5 0.5968285876249898 0.6085165109351085
KMeans n_clusters=6 0.6014835836335187 0.6429945638088416
KMeans n_clusters=7 0.661767501907633 0.671610657322665
KMeans n_clusters=8 0.6897121564877011 0.6909914770669137
KMeans n_clusters=9 0.691957774340138 0.698111382296578
KMeans n_clusters=10 0.7169181946451304 0.7011255537770585
KMeans n_clusters=11 0.7374091596392707 0.7201492611387061
KMeans n_clusters=12 0.7528472381969452 0.7173954119795564
KMeans n_clusters=13 0.7594603392321628 0.7389707512549794
KMeans n_clusters=14 0.776530785881128 0.7633512057648718
KMeans n_clusters=15 0.7891995178464167 0.7855944074427781
KMeans n_clusters=16 0.7875853871175889 0.801508731194346
KMeans n_clusters=17 0.7963154012710385 0.814286171110786
KMeans n_clusters=18 0.8068078809822304 0.8143784794905459
KMeans n_clu

## Partitions according to clinical.csv

In [197]:
def ncut_by_binary_partition(graph, partition):
    """Evaluate ``ncut.ncut_multigraph`` for a two-way partition given as 0/1 labels.

    Parameters
    ----------
    graph
        Graph passed through unchanged to ``ncut.ncut_multigraph``.
    partition : array-like
        One value per position; positions equal to 0 form the first part and
        positions equal to 1 the second (booleans work, since False == 0 and
        True == 1).

    Returns
    -------
    Whatever ``ncut.ncut_multigraph`` returns for the two index arrays.
    """
    zeros_side = np.where(partition == 0)[0]
    ones_side = np.where(partition == 1)[0]
    return ncut.ncut_multigraph(graph, zeros_side, ones_side)

In [198]:
# NCut of partitions derived from clinical attributes, on both multigraphs.
# Each entry appended below is (label, NCut on G1, NCut on G2).
# NOTE(review): the partitions are indexed by row position of clinical_csv and
# those positions are used as graph node indices -- confirm that rows and
# graph nodes actually correspond.
clinical_ncut_results = []

# Two-way splits: boolean masks evaluated with the binary NCut helper.
for label, partition in (
    ('sex', clinical_csv['SEX'] == 'M'),
    ('age over 50', clinical_csv['AGE'] > 50),
    ('BMI over 25', clinical_csv['BMI'] > 25),
    ('more than 0 outliers_V1', clinical_csv['outliers_V1'] > 0),
):
    g1_value = ncut_by_binary_partition(G1, partition)
    g2_value = ncut_by_binary_partition(G2, partition)
    clinical_ncut_results.append((label, g1_value, g2_value))

# Multi-way splits: categorical columns evaluated with the k-way NCut helper.
for label, partition in (
    ('MS type', clinical_csv['MS_TYPE']),
    ('therapy type', clinical_csv['THERAPY']),
):
    g1_value = ncut_by_k_partition(G1, partition)
    g2_value = ncut_by_k_partition(G2, partition)
    clinical_ncut_results.append((label, g1_value, g2_value))

In [199]:
# Print one (label, NCut on G1, NCut on G2) tuple per line.
for result in clinical_ncut_results:
    print(result)

('sex', 0.6869285342620781, 0.6929509636280847)
('age over 50', 0.5970106339119701, 0.6075459547198837)
('BMI over 25', 0.6676645210478376, 0.6708754770329284)
('more than 0 outliers_V1', 0.6413116994505805, 0.6498990228155529)
('MS type', 1.3967519328548115, 1.4061308104810528)
('therapy type', 1.2446418648225488, 1.2605377309546069)


# Task 2d