# Task 2c

In [1]:
import torch
import os
import networkx as nx
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from scipy.stats import describe
import ncut
from itertools import product

### Loading the graphs

In [2]:
def csv_to_graph(path, threshold=0.3):
    """Load a connectivity matrix from a header-less CSV and threshold it into a graph.

    Parameters
    ----------
    path : str or path-like
        CSV file containing the matrix; it is read without a header row.
    threshold : float, default 0.3
        Entries strictly greater than this value become edges.

    Returns
    -------
    networkx.Graph
        Graph built from the thresholded 0/1 adjacency matrix, with the
        diagonal (self-connections) removed.
    """
    # load in csv as dataframe
    df = pd.read_csv(path, header=None)

    # threshold into a 0/1 matrix; float dtype matches what the former
    # "int frame minus identity" arithmetic produced
    adjacency = (df.to_numpy() > threshold).astype(float)

    # Remove the diagonal explicitly. Subtracting the identity matrix is only
    # correct when every diagonal entry exceeds the threshold; any diagonal
    # entry at or below it would become -1, and non-zero entries are turned
    # into edges (here: self-loops) by from_pandas_adjacency.
    np.fill_diagonal(adjacency, 0.0)

    # convert to graph, keeping the frame's row/column labels as node names
    A = pd.DataFrame(adjacency, index=df.index, columns=df.columns)
    G = nx.from_pandas_adjacency(A)

    return G

def load_graphs(folder, n_patients=60):
    """Load both graphs of every patient stored in ``folder``.

    For each patient number ``i`` in ``1..n_patients`` the files
    ``p{i:03}_1.csv`` and ``p{i:03}_2.csv`` are converted with
    ``csv_to_graph`` (default threshold).

    Parameters
    ----------
    folder : str
        Directory containing the per-patient CSV files.
    n_patients : int, default 60
        Number of patients to load; the default keeps the previous
        hard-coded behaviour.

    Returns
    -------
    tuple of (list, list)
        ``(G1s, G2s)``: the graphs built from the ``_1`` files and from the
        ``_2`` files, both ordered by patient number.
    """
    G1s, G2s = [], []

    for i in range(1, n_patients + 1):
        patient_prefix = f'{folder}/p{i:03}'
        G1s.append(csv_to_graph(f'{patient_prefix}_1.csv'))
        G2s.append(csv_to_graph(f'{patient_prefix}_2.csv'))

    return G1s, G2s

# Per-patient graph lists built from the "_1" and "_2" files in the "FC" folder.
G1s, G2s = load_graphs("FC")

In [3]:
def graph_list_to_multigraph(graphs):
    """Merge a sequence of graphs into one ``nx.MultiGraph``.

    Each input graph contributes all of its nodes and all of its edges, so an
    edge that occurs in several inputs appears as several parallel edges in
    the result.
    """
    merged = nx.MultiGraph()
    for layer in graphs:
        merged.add_nodes_from(layer.nodes)
        merged.add_edges_from(layer.edges)
    return merged

In [4]:
# One multigraph per graph list: every patient graph becomes a layer of parallel edges.
G1 = graph_list_to_multigraph(G1s)
G2 = graph_list_to_multigraph(G2s)

### Loading the embeddings

In [5]:
# Directory and file names of the saved DMGI results for G1 and G2 (loaded with torch.load below).
EMBEDDING_DIR = "embeddings"
G1_PATH = "best_G1_DMGI.pkl"
G2_PATH = "best_G2_DMGI.pkl"


In [17]:
# Directory and file names of the second set of embeddings (CSV files, read with pandas below).
EMBEDDING_DIR2 = "deepwalk_avg_embeddings"
G1_PATH2 = "G1.csv"
G2_PATH2 = "G2.csv"

In [7]:
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
# map_location="cpu" makes checkpoints that were saved on a GPU loadable on
# CPU-only machines; the embeddings are consumed on the CPU (KMeans) anyway.
g1_model = torch.load(os.path.join(EMBEDDING_DIR, G1_PATH), map_location="cpu")
# 'H' holds the embedding; squeeze() drops its singleton dimensions
g1_embedding = g1_model['H'].squeeze()

g2_model = torch.load(os.path.join(EMBEDDING_DIR, G2_PATH), map_location="cpu")
g2_embedding = g2_model['H'].squeeze()

In [18]:
# NOTE(review): pd.read_csv defaults to header=0, so the first line of each file
# is consumed as column names. If the files have no header row (or carry an
# index column), the frame will not hold exactly one embedding row per node —
# TODO confirm the file layout.
g1_embedding2 = pd.read_csv(os.path.join(EMBEDDING_DIR2, G1_PATH2))
g2_embedding2 = pd.read_csv(os.path.join(EMBEDDING_DIR2, G2_PATH2))

### Loading additional clinical data

In [9]:
# Per-patient clinical attributes; the bare expression on the last line displays the frame.
clinical_csv = pd.read_csv("clinical.csv")
clinical_csv

Unnamed: 0,ID,SEX,MS_TYPE,AGE,EDSS,DATASET,DIAG_YEARS,BMI,THERAPY,outliers_V1,outliers_V2
0,p001,M,primary_progressive,40,7,2,11,19.918,Functional_electric_stimulation,0,14
1,p002,M,secondary_progressive,44,7,2,17,20.529,Motor_Program_Activating_Therapy,4,0
2,p003,M,primary_progressive,51,6,2,21,25.249,Functional_electric_stimulation,4,0
3,p004,M,relapsing_remitting,29,3,2,9,21.5,Functional_electric_stimulation,4,2
4,p005,F,secondary_progressive,60,6,2,21,21.411,Functional_electric_stimulation,0,4
5,p006,M,secondary_progressive,68,4,2,7,21.605,Motor_Program_Activating_Therapy,2,3
6,p007,F,relapsing_remitting,36,4,2,9,19.37,Functional_electric_stimulation,6,7
7,p008,F,relapsing_remitting,32,4,2,4,19.141,Functional_electric_stimulation,5,7
8,p009,F,relapsing_remitting,35,4,1,20,16.436,Motor_Program_Activating_Therapy,3,0
9,p010,M,secondary_progressive,54,6,2,12,23.148,Motor_Program_Activating_Therapy,20,6


## Validation of our NCut implementation
We validate our implementation of NCut against networkx by comparing the NCut values of random graph partitions. The reason for restricting ourselves to binary partitions is that the networkx function normalized_cut_size is only applicable to binary partitions.

In [12]:
# Validation on the two multigraphs: compare our NCut implementations with
# networkx on random binary partitions and stop at the first disagreement.

identical = True
graphs = [G1, G2]

n_checks = 10
precision = 1e-10

for i in range(n_checks):

    print(f"Checking random partition no. {i}")
    clustering = np.random.randint(0, 2, size=len(G1.nodes))

    # node positions of the two sides of the random partition
    nodes1 = np.where(clustering == 0)[0]
    nodes2 = np.where(clustering == 1)[0]

    for graph in graphs:
        nx_result = nx.normalized_cut_size(graph, nodes1, nodes2)
        our_result = ncut.ncut_multigraph(graph, nodes1, nodes2)
        our_k_result = ncut.k_ncut_multigraph(graph, np.asarray([nodes1, nodes2], dtype=object))

        # absolute deviation of each of our two implementations from networkx
        deviations = (abs(our_result - nx_result), abs(our_k_result - nx_result))
        if any(deviation > precision for deviation in deviations):
            identical = False
            break

    # a mismatch on any graph ends the whole validation
    if not identical:
        break

print(f"\nOur results were{' ' if identical else ' not '}identical to those of networkx for {n_checks} random partitions.")

Checking random partition no. 0
Checking random partition no. 1
Checking random partition no. 2
Checking random partition no. 3
Checking random partition no. 4
Checking random partition no. 5
Checking random partition no. 6
Checking random partition no. 7
Checking random partition no. 8
Checking random partition no. 9

Our results were identical to those of networkx for 10 random partitions.


After validating our NCut implementation on multigraphs, we do the same for the individual patient graphs.

In [40]:
# Validation on the individual patient graphs: compare our NCut
# implementations with networkx on random binary partitions.

identical = True
graphs = [G1s, G2s]

n_checks = 10
precision = 1e-10

for i in range(n_checks):

    print(f"Checking random partition no. {i}")
    clustering = np.random.randint(0, 2, size=len(G1.nodes))

    # The partition is the same for every patient graph of this check, so the
    # two node lists are built once instead of once per graph.
    nodes1 = np.where(clustering == 0)[0]
    nodes2 = np.where(clustering == 1)[0]

    # Walk over every patient graph of both lists in a single loop so that one
    # `break` really ends the check. With the former nested loops the `break`
    # only left the innermost loop and the early exit was never reached.
    for subgraph in (patient for graph in graphs for patient in graph):

        nx_result = nx.normalized_cut_size(subgraph, nodes1, nodes2)
        our_result = ncut.ncut_multigraph(subgraph, nodes1, nodes2)
        our_k_result = ncut.k_ncut_multigraph(subgraph, np.asarray([nodes1, nodes2], dtype=object))

        if abs(our_result - nx_result) > precision or \
            abs(our_k_result - nx_result) > precision:

            identical = False
            break

    # stop the validation at the first mismatch
    if not identical:
        break

print(f"\nOur results were{' ' if identical else ' not '}identical to those of networkx for {n_checks} random partitions.")

Checking random partition no. 0
Checking random partition no. 1
Checking random partition no. 2
Checking random partition no. 3
Checking random partition no. 4
Checking random partition no. 5
Checking random partition no. 6
Checking random partition no. 7
Checking random partition no. 8
Checking random partition no. 9

Our results were identical to those of networkx for 10 random partitions.


## Clustering the data
We cluster the embeddings with KMeans for every number of clusters (the `n_clusters` parameter) from 2 to 19 and compare the resulting NCut values. To compute the NCut value for more than two partitions, we implement the k-NCut, which extends the NCut for two partitions. Comparing the different cluster counts, higher values of `n_clusters` consistently lead to higher NCut values. This is intuitive: with more clusters, a larger share of the edges runs between clusters rather than within them. \
\
When we try to use the k_Ncut value to compare the clusterings, it appears that the best cluster value is 2, since it has the lowest amount of inter-cluster connectivity divided by intra-cluster connectivity.

In [10]:
def ncut_by_k_partition(graph, partition):
    """Evaluate ``ncut.k_ncut_multigraph`` for a flat array of cluster labels.

    ``partition`` holds one label per position. For every distinct label the
    positions carrying it are collected, and the resulting groups are passed
    on as an object array, which is the input structure the ncut helper takes.
    """
    groups = []
    for label in np.unique(partition):
        members = np.where(partition == label)[0]
        groups.append(members)
    return ncut.k_ncut_multigraph(graph, np.asarray(groups, dtype=object))

### DMGI embeddings
For all cluster values, it seems that the NCut results are slightly lower for the second graph: This is surprising, as we expect the patients' functional connectivity between different brain regions to be increased after the intervention, while the NCut-value seems to show the opposite. However, it needs to be noted that the clusterings are not identical between the two graphs (for a comparison, see the section about NMIs) and this might also play a role.

In [11]:
# numbers of clusters to try: 2, 3, ..., 19
n_clusters = range(2, 20)

# one entry per k: (description, NCut on G1, NCut on G2, labels for G1, labels for G2)
kmeans_ncut_results = []

for n in n_clusters:
    # A fixed random_state makes the clusterings, and every number derived from
    # them further down, reproducible across kernel restarts.
    kmeans = KMeans(n_clusters=n, n_init=10, random_state=0)
    # the same estimator is refitted for the second embedding
    clustering1 = kmeans.fit_predict(g1_embedding)
    clustering2 = kmeans.fit_predict(g2_embedding)
    g1_ncut = ncut_by_k_partition(G1, clustering1)
    g2_ncut = ncut_by_k_partition(G2, clustering2)
    kmeans_ncut_results.append((f"KMeans n_clusters={n}", g1_ncut, g2_ncut, clustering1, clustering2))

In [12]:
# Show only the description and the two NCut values of each entry; the
# cluster label arrays stored after them are left out.
for entry in kmeans_ncut_results:
    summary = entry[:3]
    print(summary)

('KMeans n_clusters=2', 1.024880406778286, 0.6495013698487705)
('KMeans n_clusters=3', 1.9580156474352046, 1.519939476871253)
('KMeans n_clusters=4', 3.0065981141918745, 2.5559301316819623)
('KMeans n_clusters=5', 4.018580585153811, 3.638231279504795)
('KMeans n_clusters=6', 4.9847158876454865, 4.620613493407045)
('KMeans n_clusters=7', 6.058132478190282, 5.654989180923791)
('KMeans n_clusters=8', 7.065467506137667, 6.733883586836054)
('KMeans n_clusters=9', 8.05968786144697, 7.7965225017960345)
('KMeans n_clusters=10', 9.022788650045973, 8.661385107042365)
('KMeans n_clusters=11', 10.065214524559195, 9.849303608131923)
('KMeans n_clusters=12', 10.997347383376189, 10.782902996043434)
('KMeans n_clusters=13', 12.039823455419073, 11.708529314413722)
('KMeans n_clusters=14', 12.983167324670026, 12.833004467458428)
('KMeans n_clusters=15', 14.104219928117955, 13.884194788011358)
('KMeans n_clusters=16', 15.086816569062066, 14.802082457710556)
('KMeans n_clusters=17', 16.02026035255005, 15.

#### Hemisphere cluster correspondence

For the binary clustering, we checked whether the clusters correspond to the hemispheres of the brain. As stated in the file AAL_labels.csv, odd indices correspond to left hemisphere regions and even indices to right hemisphere regions (counting from 1, so the left hemisphere sits at the even 0-based positions used in the code below), apart from the Cerebellar Vermis. For the first graph there is no clear connection between the cluster labels and the hemispheres. In the output shown below, the same holds for the second graph: about 74% of the left hemisphere and about 72% of the right hemisphere belong to cluster 2, so both hemispheres are split in almost the same way. The cerebellar vermis was assigned to cluster 1 in its entirety for the second graph. In conclusion, the displayed results do not show a hemisphere-specific split of the DMGI embedding after the treatment; the only clear regional pattern is the uniform assignment of the cerebellar vermis.

In [61]:
# labels of the binary (first) KMeans entry for G1
clustering_DMGI_G1 = kmeans_ncut_results[0][3]

# Label slices of three node groups: 0-based positions 0, 2, ..., 106;
# 1, 3, ..., 107; and 108 onwards.
# NOTE(review): the region names printed for these slices assume the node
# order of AAL_labels.csv — confirm.
even_indices = clustering_DMGI_G1[:108:2]
odd_indices = clustering_DMGI_G1[1:108:2]
cerebellum_indices = clustering_DMGI_G1[108::]

region_labels = (
    ("left hemisphere", even_indices),
    ("right hemisphere", odd_indices),
    ("cerebellar vermis", cerebellum_indices),
)

for position, (region, labels) in enumerate(region_labels):
    # blank line between regions, none after the last one
    if position > 0:
        print()
    for cluster in (0, 1):
        print(f"Fraction of the {region} which belongs to cluster {cluster + 1}: {sum(labels==cluster)/len(labels)}")

Fraction of the left hemisphere which belongs to cluster 1: 0.5370370370370371
Fraction of the left hemisphere which belongs to cluster 2: 0.46296296296296297

Fraction of the right hemisphere which belongs to cluster 1: 0.4074074074074074
Fraction of the right hemisphere which belongs to cluster 2: 0.5925925925925926

Fraction of the cerebellar vermis which belongs to cluster 1: 0.25
Fraction of the cerebellar vermis which belongs to cluster 2: 0.75


In [62]:
# labels of the binary (first) KMeans entry for G2
clustering_DMGI_G2 = kmeans_ncut_results[0][4]

# Label slices of three node groups: 0-based positions 0, 2, ..., 106;
# 1, 3, ..., 107; and 108 onwards.
# NOTE(review): the region names printed for these slices assume the node
# order of AAL_labels.csv — confirm.
even_indices = clustering_DMGI_G2[:108:2]
odd_indices = clustering_DMGI_G2[1:108:2]
cerebellum_indices = clustering_DMGI_G2[108::]

region_labels = (
    ("left hemisphere", even_indices),
    ("right hemisphere", odd_indices),
    ("cerebellar vermis", cerebellum_indices),
)

for position, (region, labels) in enumerate(region_labels):
    # blank line between regions, none after the last one
    if position > 0:
        print()
    for cluster in (0, 1):
        print(f"Fraction of the {region} which belongs to cluster {cluster + 1}: {sum(labels==cluster)/len(labels)}")

Fraction of the left hemisphere which belongs to cluster 1: 0.25925925925925924
Fraction of the left hemisphere which belongs to cluster 2: 0.7407407407407407

Fraction of the right hemisphere which belongs to cluster 1: 0.2777777777777778
Fraction of the right hemisphere which belongs to cluster 2: 0.7222222222222222

Fraction of the cerebellar vermis which belongs to cluster 1: 1.0
Fraction of the cerebellar vermis which belongs to cluster 2: 0.0


### Deepwalk_embeddings

In [19]:
# one entry per k: (description, NCut on G1, NCut on G2, labels for G1, labels for G2)
kmeans_ncut_results2 = []

for n in n_clusters:
    # A fixed random_state makes the clusterings, and every number derived from
    # them further down, reproducible across kernel restarts.
    kmeans = KMeans(n_clusters=n, n_init=10, random_state=0)
    # the same estimator is refitted for the second embedding
    clustering1 = kmeans.fit_predict(g1_embedding2)
    clustering2 = kmeans.fit_predict(g2_embedding2)
    g1_ncut = ncut_by_k_partition(G1, clustering1)
    g2_ncut = ncut_by_k_partition(G2, clustering2)
    kmeans_ncut_results2.append((f"KMeans n_clusters={n}", g1_ncut, g2_ncut, clustering1, clustering2))

In [22]:
# Show only the description and the two NCut values of each entry; the
# cluster label arrays stored after them are left out.
for entry in kmeans_ncut_results2:
    summary = entry[:3]
    print(summary)

('KMeans n_clusters=2', 0.820246617737858, 0.8042759676805062)
('KMeans n_clusters=3', 1.4400629044966995, 1.4092902004836843)
('KMeans n_clusters=4', 2.1516882610944132, 2.0835758753989575)
('KMeans n_clusters=5', 3.0348326614164005, 3.0312564616456927)
('KMeans n_clusters=6', 3.8418771331593153, 3.8648791537045035)
('KMeans n_clusters=7', 4.788028739182342, 4.751821079418336)
('KMeans n_clusters=8', 5.56942732852477, 5.586115166186394)
('KMeans n_clusters=9', 6.559653012549149, 6.598872719793632)
('KMeans n_clusters=10', 7.663557774351519, 7.3516615096505955)
('KMeans n_clusters=11', 8.415235710964383, 8.355696514443885)
('KMeans n_clusters=12', 9.443606057812932, 9.102016119018064)
('KMeans n_clusters=13', 10.218559154096031, 10.170828701315088)
('KMeans n_clusters=14', 11.21989353405305, 11.116672962094908)
('KMeans n_clusters=15', 12.071271051457352, 11.642394123267648)
('KMeans n_clusters=16', 13.072848983417073, 12.625045383503572)
('KMeans n_clusters=17', 14.051196777159518, 13

#### Hemisphere cluster correspondence
It appears that there is no consistent correspondence between the cluster labels and the hemispheres for the deepwalk embeddings. Since the clusterings are identical for n_clusters=2 (see the NMI section), the hemisphere/cluster correspondences are also identical for the two graphs.

In [64]:
# labels of the binary (first) KMeans entry for G1
clustering_deepwalk_G1 = kmeans_ncut_results2[0][3]

# Label slices of three node groups: 0-based positions 0, 2, ..., 106;
# 1, 3, ..., 107; and 108 onwards.
# NOTE(review): the region names printed for these slices assume the node
# order of AAL_labels.csv — confirm.
even_indices = clustering_deepwalk_G1[:108:2]
odd_indices = clustering_deepwalk_G1[1:108:2]
cerebellum_indices = clustering_deepwalk_G1[108::]

region_labels = (
    ("left hemisphere", even_indices),
    ("right hemisphere", odd_indices),
    ("cerebellar vermis", cerebellum_indices),
)

for position, (region, labels) in enumerate(region_labels):
    # blank line between regions, none after the last one
    if position > 0:
        print()
    for cluster in (0, 1):
        print(f"Fraction of the {region} which belongs to cluster {cluster + 1}: {sum(labels==cluster)/len(labels)}")

Fraction of the left hemisphere which belongs to cluster 1: 0.46296296296296297
Fraction of the left hemisphere which belongs to cluster 2: 0.5370370370370371

Fraction of the right hemisphere which belongs to cluster 1: 0.46296296296296297
Fraction of the right hemisphere which belongs to cluster 2: 0.5370370370370371

Fraction of the cerebellar vermis which belongs to cluster 1: 1.0
Fraction of the cerebellar vermis which belongs to cluster 2: 0.0


In [65]:
# labels of the binary (first) KMeans entry for G2
clustering_deepwalk_G2 = kmeans_ncut_results2[0][4]

# Label slices of three node groups: 0-based positions 0, 2, ..., 106;
# 1, 3, ..., 107; and 108 onwards.
# NOTE(review): the region names printed for these slices assume the node
# order of AAL_labels.csv — confirm.
even_indices = clustering_deepwalk_G2[:108:2]
odd_indices = clustering_deepwalk_G2[1:108:2]
cerebellum_indices = clustering_deepwalk_G2[108::]

region_labels = (
    ("left hemisphere", even_indices),
    ("right hemisphere", odd_indices),
    ("cerebellar vermis", cerebellum_indices),
)

for position, (region, labels) in enumerate(region_labels):
    # blank line between regions, none after the last one
    if position > 0:
        print()
    for cluster in (0, 1):
        print(f"Fraction of the {region} which belongs to cluster {cluster + 1}: {sum(labels==cluster)/len(labels)}")

Fraction of the left hemisphere which belongs to cluster 1: 0.46296296296296297
Fraction of the left hemisphere which belongs to cluster 2: 0.5370370370370371

Fraction of the right hemisphere which belongs to cluster 1: 0.46296296296296297
Fraction of the right hemisphere which belongs to cluster 2: 0.5370370370370371

Fraction of the cerebellar vermis which belongs to cluster 1: 1.0
Fraction of the cerebellar vermis which belongs to cluster 2: 0.0


## Comparison of partitions

### Comparison of NCut-values
For both embeddings, the NCut-values of the clusterings are slightly higher for the first graph.

In [29]:
# DMGI embeddings: difference NCut(G1) - NCut(G2) for every number of clusters
kmeans_ncut_results = list(map(list, kmeans_ncut_results))
result_diffs = np.asarray(
    [g1_value - g2_value for _, g1_value, g2_value, *_ in kmeans_ncut_results]
)
describe(result_diffs)

DescribeResult(nobs=18, minmax=(-0.0037201683041381273, 0.45066798250991225), mean=0.29926447241467735, variance=0.012841906122102656, skewness=-0.9969422040326614, kurtosis=0.8565042692840699)

In [30]:
# Deepwalk embeddings: difference NCut(G1) - NCut(G2) for every number of clusters
kmeans_ncut_results2 = list(map(list, kmeans_ncut_results2))
result_diffs = np.asarray(
    [g1_value - g2_value for _, g1_value, g2_value, *_ in kmeans_ncut_results2]
)
describe(result_diffs)

DescribeResult(nobs=18, minmax=(-0.17774110252373987, 0.536434339656962), mean=0.12714737881275595, variance=0.03915712589689944, skewness=0.780559951787912, kurtosis=-0.5156534503959724)

### NMI scores
We calculate the normalized mutual information (NMI) scores of our clustering results to find out how similar the KMeans clusterings of the first graph are to those of the second graph. The DMGI embedding clusterings are very different from each other but become more similar as the number of clusters increases. The opposite effect occurs for the deepwalk embeddings, where the NMI scores fall as the number of clusters increases, although the deepwalk clusterings remain very similar in general.

In [31]:
# DMGI embeddings: NMI between the G1 and the G2 clustering for every k
descriptions = []
kmeans_nmis = []
for entry in kmeans_ncut_results:
    descriptions.append(entry[0])
    # entry[3] / entry[4] are the label arrays of G1 / G2
    kmeans_nmis.append(normalized_mutual_info_score(entry[3], entry[4]))

for description, score in zip(descriptions, kmeans_nmis):
    print(f"{description} NMI: {score}")

KMeans n_clusters=2 NMI: 9.422748854569653e-06
KMeans n_clusters=3 NMI: 0.032391744409827526
KMeans n_clusters=4 NMI: 0.19481148712949176
KMeans n_clusters=5 NMI: 0.109158657088973
KMeans n_clusters=6 NMI: 0.14491995785596387
KMeans n_clusters=7 NMI: 0.21118830546009346
KMeans n_clusters=8 NMI: 0.17900473981915177
KMeans n_clusters=9 NMI: 0.24402673833914307
KMeans n_clusters=10 NMI: 0.28194824170958543
KMeans n_clusters=11 NMI: 0.3034865423877481
KMeans n_clusters=12 NMI: 0.3177710082020356
KMeans n_clusters=13 NMI: 0.2943245095235929
KMeans n_clusters=14 NMI: 0.37885843121235946
KMeans n_clusters=15 NMI: 0.35268584261404695
KMeans n_clusters=16 NMI: 0.5102443308185651
KMeans n_clusters=17 NMI: 0.4075450237562027
KMeans n_clusters=18 NMI: 0.45681089097247274
KMeans n_clusters=19 NMI: 0.47815599222348676


In [32]:
# Deepwalk embeddings: NMI between the G1 and the G2 clustering for every k
descriptions = []
kmeans_nmis = []
for entry in kmeans_ncut_results2:
    descriptions.append(entry[0])
    # entry[3] / entry[4] are the label arrays of G1 / G2
    kmeans_nmis.append(normalized_mutual_info_score(entry[3], entry[4]))

for description, score in zip(descriptions, kmeans_nmis):
    print(f"{description} NMI: {score}")

KMeans n_clusters=2 NMI: 1.0
KMeans n_clusters=3 NMI: 0.9014884656713177
KMeans n_clusters=4 NMI: 0.9727300607485628
KMeans n_clusters=5 NMI: 0.9192781466225672
KMeans n_clusters=6 NMI: 0.8205210633900838
KMeans n_clusters=7 NMI: 0.7935904350274092
KMeans n_clusters=8 NMI: 0.8687171305710435
KMeans n_clusters=9 NMI: 0.8393766982496269
KMeans n_clusters=10 NMI: 0.7859333826730857
KMeans n_clusters=11 NMI: 0.8570674608083465
KMeans n_clusters=12 NMI: 0.8314009475854333
KMeans n_clusters=13 NMI: 0.829817107193702
KMeans n_clusters=14 NMI: 0.7849568476940962
KMeans n_clusters=15 NMI: 0.8361091882621564
KMeans n_clusters=16 NMI: 0.8493950213814582
KMeans n_clusters=17 NMI: 0.790304392695595
KMeans n_clusters=18 NMI: 0.8325079369991409
KMeans n_clusters=19 NMI: 0.8268504636781306


### Comparison by attributes in clinical.csv
In this section we investigate whether the attributes in clinical.csv (such as the age) have an influence on the NCut values of the partitions. To do that, we interpret the column values of clinical.csv as cluster labels, and compute the NCut values of the graph layers (i.e. the patients) individually. Afterwards, we group the NCut values by cluster labels and compare descriptive statistics between the groups.

In [33]:
# [description, per-patient label Series] pairs used to group the patients.
# AGE and BMI are turned into boolean labels with a fixed cut-off; the other
# columns are used as they are.
# NOTE(review): the labels are later matched to G1s/G2s by row position, which
# assumes clinical.csv lists the patients in the same order as the graph
# files (p001, p002, ...) — confirm.
comparison_attributes = [
    ['sex', clinical_csv['SEX']],
    ['age over 50', clinical_csv['AGE'] > 50],
    ['BMI over 25', clinical_csv['BMI'] > 25],
    ['MS type', clinical_csv['MS_TYPE']],
    ['therapy', clinical_csv['THERAPY']]
]

#### DMGI Embeddings

In [34]:
# takes a while to run

# one entry per k: [description, NCut of every G1 patient graph, NCut of every G2 patient graph]
individual_ncuts = []

for res in kmeans_ncut_results:

    # label arrays of the DMGI clustering for G1 and G2
    G1_partition, G2_partition = res[3], res[4]

    # apply the multigraph-level partition to each patient graph separately
    G1_ncuts = [ncut_by_k_partition(patient, G1_partition) for patient in G1s]
    G2_ncuts = [ncut_by_k_partition(patient, G2_partition) for patient in G2s]

    individual_ncuts.append([res[0], G1_ncuts, G2_ncuts])

In [35]:
# one entry per (partition, attribute): mean/variance of the patient NCuts per attribute value
attribute_statistics = []

for part_description, G1_ncuts, G2_ncuts in individual_ncuts:
    # array views of the per-patient NCuts so they can be indexed by position
    g1_values = np.asarray(G1_ncuts)
    g2_values = np.asarray(G2_ncuts)

    for comp_description, labels in comparison_attributes:

        label_ncuts_g1, label_ncuts_g2 = {}, {}

        for label in np.unique(labels):
            # positions of the patients carrying this attribute value
            indices = np.where(labels == label)[0]

            # note: if desired, more descriptive statistics could be saved here
            for values, target in ((g1_values, label_ncuts_g1), (g2_values, label_ncuts_g2)):
                stats = describe(values[indices])
                target[label] = {'mean': stats.mean, 'variance': stats.variance}

        attribute_statistics.append({
            'partition_name' : part_description,
            'attribute_name' : comp_description,
            'G1' : label_ncuts_g1,
            'G2' : label_ncuts_g2,
        })

In [36]:
# Report the mean NCut per attribute value, G1 next to G2, for every
# (partition, attribute) combination.
for entry in attribute_statistics:

    print("Partition:", entry['partition_name'])
    print("Attribute:", entry['attribute_name'], "\n")

    for label, g1_label_stats in entry['G1'].items():
        print(label)
        print("G1 mean ncut:", g1_label_stats['mean'])
        print("G2 mean ncut:", entry['G2'][label]['mean'])
        print()

    print("-------------------")

Partition: KMeans n_clusters=2
Attribute: sex 

F
G1 mean ncut: 1.0257011217250185
G2 mean ncut: 0.6628454753436009

M
G1 mean ncut: 1.0241832793645904
G2 mean ncut: 0.6294301691180958

-------------------
Partition: KMeans n_clusters=2
Attribute: age over 50 

False
G1 mean ncut: 1.0282448260194192
G2 mean ncut: 0.6464230503297234

True
G1 mean ncut: 1.0210320324827311
G2 mean ncut: 0.6547612602391861

-------------------
Partition: KMeans n_clusters=2
Attribute: BMI over 25 

False
G1 mean ncut: 1.0240270117339338
G2 mean ncut: 0.6535553762916585

True
G1 mean ncut: 1.027005931060081
G2 mean ncut: 0.6439578262884734

-------------------
Partition: KMeans n_clusters=2
Attribute: MS type 

primary_progressive
G1 mean ncut: 1.0245643917606044
G2 mean ncut: 0.607378169681908

relapsing_remitting
G1 mean ncut: 1.0254146902517212
G2 mean ncut: 0.6533056111954246

secondary_progressive
G1 mean ncut: 1.0247835555044555
G2 mean ncut: 0.6572758339346775

-------------------
Partition: KMeans n

Comparing the mean NCuts for n_clusters=2 between patients of different ages, it seems that patients aged 50 or younger have a slightly higher average NCut value than those over 50 in graph 1, but a slightly lower average NCut value in graph 2. A higher NCut value means that the partitions are more strongly connected to each other, i.e. less well separated. Based on this result, we can speculate that ???\
**TODO: insert more analysis here**

#### Deepwalk embeddings

In [37]:
# takes a while to run

# one entry per k: [description, NCut of every G1 patient graph, NCut of every G2 patient graph]
individual_ncuts = []

for res in kmeans_ncut_results2:

    # label arrays of the deepwalk clustering for G1 and G2
    G1_partition, G2_partition = res[3], res[4]

    # apply the multigraph-level partition to each patient graph separately
    G1_ncuts = [ncut_by_k_partition(patient, G1_partition) for patient in G1s]
    G2_ncuts = [ncut_by_k_partition(patient, G2_partition) for patient in G2s]

    individual_ncuts.append([res[0], G1_ncuts, G2_ncuts])

In [38]:
# one entry per (partition, attribute): mean/variance of the patient NCuts per attribute value
attribute_statistics = []

for part_description, G1_ncuts, G2_ncuts in individual_ncuts:
    # array views of the per-patient NCuts so they can be indexed by position
    g1_values = np.asarray(G1_ncuts)
    g2_values = np.asarray(G2_ncuts)

    for comp_description, labels in comparison_attributes:

        label_ncuts_g1, label_ncuts_g2 = {}, {}

        for label in np.unique(labels):
            # positions of the patients carrying this attribute value
            indices = np.where(labels == label)[0]

            # note: if desired, more descriptive statistics could be saved here
            for values, target in ((g1_values, label_ncuts_g1), (g2_values, label_ncuts_g2)):
                stats = describe(values[indices])
                target[label] = {'mean': stats.mean, 'variance': stats.variance}

        attribute_statistics.append({
            'partition_name' : part_description,
            'attribute_name' : comp_description,
            'G1' : label_ncuts_g1,
            'G2' : label_ncuts_g2,
        })

In [39]:
# Report the mean NCut per attribute value, G1 next to G2, for every
# (partition, attribute) combination.
for entry in attribute_statistics:

    print("Partition:", entry['partition_name'])
    print("Attribute:", entry['attribute_name'], "\n")

    for label, g1_label_stats in entry['G1'].items():
        print(label)
        print("G1 mean ncut:", g1_label_stats['mean'])
        print("G2 mean ncut:", entry['G2'][label]['mean'])
        print()

    print("-------------------")

Partition: KMeans n_clusters=2
Attribute: sex 

F
G1 mean ncut: 0.8134142484146929
G2 mean ncut: 0.8125184923616151

M
G1 mean ncut: 0.8099525424292431
G2 mean ncut: 0.781011784832379

-------------------
Partition: KMeans n_clusters=2
Attribute: age over 50 

False
G1 mean ncut: 0.8177160266295395
G2 mean ncut: 0.7941169871725611

True
G1 mean ncut: 0.8047265677619954
G2 mean ncut: 0.8087106809483614

-------------------
Partition: KMeans n_clusters=2
Attribute: BMI over 25 

False
G1 mean ncut: 0.8037169308100777
G2 mean ncut: 0.799014913618519

True
G1 mean ncut: 0.8265451043833305
G2 mean ncut: 0.8029040250463978

-------------------
Partition: KMeans n_clusters=2
Attribute: MS type 

primary_progressive
G1 mean ncut: 0.8471059828231148
G2 mean ncut: 0.7829114215973777

relapsing_remitting
G1 mean ncut: 0.8173212333340683
G2 mean ncut: 0.8003662368936623

secondary_progressive
G1 mean ncut: 0.7926838918459606
G2 mean ncut: 0.8058267342277844

-------------------
Partition: KMeans n

# Task 2d

In [None]:
# networkx cut metrics that take a graph and the two sides of a binary partition
functions = (nx.conductance, nx.cut_size, nx.edge_expansion, nx.mixing_expansion)

# only compare k=2 at position 0: the last two items of that entry are the
# KMeans label arrays for G1 and G2
clustering1, clustering2 = kmeans_ncut_results[0][-2:]

# (graph, nodes of cluster 0, nodes of cluster 1) for the first graph ...
params1 = (G1, *[list(np.where(clustering1==side)[0]) for side in (0, 1)])

# ... and for the second graph
params2 = (G2, *[list(np.where(clustering2==side)[0]) for side in (0, 1)])

# one column per metric, one row per graph
columns = {'graph': ['G1', 'G2']}
for func in functions:
    columns[func.__name__] = [func(*params1), func(*params2)]

results = pd.DataFrame(columns)

results

Unnamed: 0,graph,conductance,cut_size,edge_expansion,mixing_expansion
0,G1,0.371081,6842,207.333333,0.064083
1,G2,0.387704,9037,220.414634,0.086098


In [None]:
# networkx cut metrics that take a graph and the two sides of a binary partition
functions = (nx.conductance, nx.cut_size, nx.edge_expansion, nx.mixing_expansion)

# only compare k=2 at position 0: the last two items of that entry are the
# KMeans label arrays for G1 and G2
clustering1, clustering2 = kmeans_ncut_results2[0][-2:]

# (graph, nodes of cluster 0, nodes of cluster 1) for the first graph ...
params1 = (G1, *[list(np.where(clustering1==side)[0]) for side in (0, 1)])

# ... and for the second graph
params2 = (G2, *[list(np.where(clustering2==side)[0]) for side in (0, 1)])

# one column per metric, one row per graph
columns = {'graph': ['G1', 'G2']}
for func in functions:
    columns[func.__name__] = [func(*params1), func(*params2)]

results = pd.DataFrame(columns)

results

In [None]:
# networkx metrics that take a graph and a single node set
functions = (
    nx.boundary_expansion,
    nx.node_expansion,
    nx.volume,
)
metric_names = [f.__name__ for f in functions]

# One DataFrame per entry of n_clusters, holding the metrics of every cluster
# of the DMGI clusterings.
# NOTE: the following cell binds the name `results` as well.
results = []

for i, n in enumerate(n_clusters):
    # the last two items of each entry are the label arrays for G1 and G2
    graph_clusterings = (
        ('G1', G1, kmeans_ncut_results[i][-2]),
        ('G2', G2, kmeans_ncut_results[i][-1]),
    )

    # Collect the rows first and build the frame once, instead of growing it
    # row by row with df.loc[len(df.index)].
    records = []
    for (G_str, G, clustering), cluster in product(graph_clusterings, range(n)):
        members = np.where(clustering==cluster)[0]
        records.append([G_str, cluster, *[f(G, members) for f in functions]])

    df = pd.DataFrame(records, columns=['graph', 'partition', *metric_names])
    results.append(df)

    

In [None]:
# networkx metrics that take a graph and a single node set
functions = (
    nx.boundary_expansion,
    nx.node_expansion,
    nx.volume,
)
metric_names = [f.__name__ for f in functions]

# One DataFrame per entry of n_clusters, holding the metrics of every cluster
# of the deepwalk clusterings.
# NOTE: the preceding cell binds the name `results` as well; running this cell
# replaces its value.
results = []

for i, n in enumerate(n_clusters):
    # the last two items of each entry are the label arrays for G1 and G2
    graph_clusterings = (
        ('G1', G1, kmeans_ncut_results2[i][-2]),
        ('G2', G2, kmeans_ncut_results2[i][-1]),
    )

    # Collect the rows first and build the frame once, instead of growing it
    # row by row with df.loc[len(df.index)].
    records = []
    for (G_str, G, clustering), cluster in product(graph_clusterings, range(n)):
        members = np.where(clustering==cluster)[0]
        records.append([G_str, cluster, *[f(G, members) for f in functions]])

    df = pd.DataFrame(records, columns=['graph', 'partition', *metric_names])
    results.append(df)

    

In [None]:
# Third table in `results`, i.e. the one for the third value of n_clusters.
# NOTE(review): both preceding cells bind `results`, so which embedding this
# shows depends on which of them ran last — confirm.
results[2]

Unnamed: 0,graph,partition,boundary_expansion,node_expansion,volume
0,G1,0,3.461538,4.461538,15698
1,G1,1,0.757576,1.757576,74935
2,G1,2,4.272727,5.272727,15587
3,G1,3,48.5,49.5,548
4,G2,0,2.222222,3.222222,34461
5,G2,1,3.142857,4.142857,15928
6,G2,2,2.052632,3.052632,46400
7,G2,3,7.285714,8.285714,8173
