# Task 2c

In [None]:
import torch
import os
import networkx as nx
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score
from scipy.stats import describe
import ncut
from itertools import product

### Loading the graphs

In [11]:
def csv_to_graph(path, threshold=0.3):
    """Build a loop-free graph from a thresholded matrix stored as CSV.

    Parameters
    ----------
    path : str or path-like
        Header-less CSV file containing a square numeric matrix.
    threshold : float, default 0.3
        Entry (i, j) becomes an edge when it is strictly greater than
        this value.

    Returns
    -------
    networkx.Graph
        Graph whose nodes are the row/column labels 0..n-1 assigned by
        ``pd.read_csv(..., header=None)``.
    """
    # load in csv as dataframe
    df = pd.read_csv(path, header=None)

    # Threshold into a 0/1 matrix. Kept as float so the edge weights have
    # the same dtype as the previous "int matrix minus identity" version.
    adjacency = (df > threshold).to_numpy(dtype=float, copy=True)

    # Remove self-loops by zeroing the diagonal explicitly. Subtracting the
    # identity matrix is only correct when every diagonal entry exceeds the
    # threshold; otherwise it yields -1, which networkx treats as an edge.
    np.fill_diagonal(adjacency, 0.0)

    A = pd.DataFrame(adjacency, index=df.index, columns=df.columns)

    # convert to graph
    G = nx.from_pandas_adjacency(A)

    return G

def load_graphs(folder):
    """Load the ``_1`` and ``_2`` graph of patients 1 to 60 from *folder*.

    For every patient number ``i`` the files ``{folder}/p{i:03}_1.csv`` and
    ``{folder}/p{i:03}_2.csv`` are converted with ``csv_to_graph`` using its
    default threshold.

    Returns
    -------
    tuple of (list, list)
        The graphs built from the ``_1`` files and from the ``_2`` files,
        both ordered by patient number.
    """
    graphs_1 = []
    graphs_2 = []

    for patient_no in range(1, 61):
        path_1 = f'{folder}/p{patient_no:03}_1.csv'
        graphs_1.append(csv_to_graph(path_1))

        path_2 = f'{folder}/p{patient_no:03}_2.csv'
        graphs_2.append(csv_to_graph(path_2))

    return graphs_1, graphs_2

# Per-patient graphs read from the "FC" folder: G1s from the *_1.csv files,
# G2s from the *_2.csv files (one graph per patient in each list).
G1s, G2s = load_graphs("FC")

In [12]:
def graph_list_to_multigraph(graphs):
    """Stack several graphs into a single ``nx.MultiGraph``.

    The nodes of all input graphs are merged, and every edge of every input
    graph is added. Because the target is a multigraph, an edge present in
    several inputs becomes several parallel edges. Edge attributes are not
    copied, since only the node pairs are passed on.
    """
    merged = nx.MultiGraph()
    for layer in graphs:
        merged.add_nodes_from(layer.nodes)
        merged.add_edges_from(layer.edges)
    return merged

In [13]:
# One multigraph per file suffix: all per-patient graphs of a list are
# stacked, so an edge shared by several patients appears as parallel edges.
G1 = graph_list_to_multigraph(G1s)
G2 = graph_list_to_multigraph(G2s)

### Loading the embeddings

In [14]:
# Location and file names of the pickled DMGI results, one per multigraph.
EMBEDDING_DIR = "embeddings"
G1_PATH = "best_G1_DMGI.pkl"
G2_PATH = "best_G2_DMGI.pkl"


In [18]:
# Location and file names of the deepwalk embeddings (CSV), one per multigraph.
EMBEDDING_DIR2 = os.path.join("embeddings", "deepwalk")
G1_PATH2 = "G1.csv"
G2_PATH2 = "G2.csv"

In [19]:
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code. Only load checkpoints that were produced by a trusted source.
# The loaded object is indexed with the key 'H'; squeeze() drops its
# size-1 dimensions.
# NOTE(review): presumably 'H' holds one embedding row per node - confirm
# against the code that wrote these files.
g1_model = torch.load(os.path.join(EMBEDDING_DIR, G1_PATH))
g1_embedding = g1_model['H'].squeeze()

g2_model = torch.load(os.path.join(EMBEDDING_DIR, G2_PATH))
g2_embedding = g2_model['H'].squeeze()

In [20]:
# Deepwalk embeddings as DataFrames.
# NOTE(review): read_csv is called with its default header handling, so the
# first line of each file is consumed as column names. Confirm the files
# really start with a header row; otherwise the first node's embedding is
# dropped and all row positions shift by one.
g1_embedding2 = pd.read_csv(os.path.join(EMBEDDING_DIR2, G1_PATH2))
g2_embedding2 = pd.read_csv(os.path.join(EMBEDDING_DIR2, G2_PATH2))

### Loading additional clinical data

In [53]:
# Clinical attributes; the columns SEX, AGE, BMI, MS_TYPE, THERAPY and
# DIAG_YEARS are read further below.
# NOTE(review): the later analysis indexes the per-patient NCut lists by row
# position, so rows are assumed to be in patient order 1..60 - confirm.
clinical_csv = pd.read_csv("clinical.csv")

## Validation of our NCut implementation
We validate our implementation of NCut against networkx by comparing the NCut values of random graph partitions. The reason for restricting ourselves to binary partitions is that the networkx function normalized_cut_size is only applicable to binary partitions.

In [22]:
# binary partitions validation on multigraphs
#
# For each of n_checks random two-way splits of the node set, the networkx
# normalized cut size is compared with both of our implementations on G1 and
# G2. The check stops at the first deviation larger than `precision`.

n_checks = 10
precision = 1e-10

graphs = [G1, G2]
identical = True

for i in range(n_checks):

    print(f"Checking random partition no. {i}")
    clustering = np.random.randint(0, 2, size=len(G1.nodes))

    # node ids are taken to be the positions 0..n-1 of the label vector
    nodes1 = np.where(clustering==0)[0]
    nodes2 = np.where(clustering==1)[0]

    for graph in graphs:
        nx_result = nx.normalized_cut_size(graph, nodes1, nodes2)
        our_result = ncut.ncut_multigraph(graph, nodes1, nodes2)
        our_k_result = ncut.k_ncut_multigraph(graph, np.asarray([nodes1, nodes2], dtype=object))

        deviates = any(
            abs(candidate - nx_result) > precision
            for candidate in (our_result, our_k_result)
        )
        if deviates:
            identical = False
            break

    # a mismatch on either graph ends the whole validation
    if not identical:
        break

print(f"\nOur results were{' ' if identical else ' not '}identical to those of networkx for {n_checks} random partitions.")

Checking random partition no. 0
Checking random partition no. 1
Checking random partition no. 2
Checking random partition no. 3
Checking random partition no. 4
Checking random partition no. 5
Checking random partition no. 6
Checking random partition no. 7
Checking random partition no. 8
Checking random partition no. 9

Our results were identical to those of networkx for 10 random partitions.


After validating our NCut implementation on multigraphs, we do the same for the individual patient graphs.

In [23]:
# binary partitions validation on single graphs
#
# Same check as for the multigraphs, but applied to every individual patient
# graph in G1s and G2s: for each random two-way split, the networkx
# normalized cut size must agree with both of our implementations to within
# `precision`. The validation stops at the first mismatch.

identical = True
graphs = [G1s, G2s]

n_checks = 10
precision = 1e-10

for i in range(n_checks):

    print(f"Checking random partition no. {i}")
    clustering = np.random.randint(0, 2, size=len(G1.nodes))

    # The split depends only on `clustering`, so compute it once per check
    # instead of once per patient graph.
    nodes1 = np.where(clustering==0)[0]
    nodes2 = np.where(clustering==1)[0]

    for graph in graphs:
        for subgraph in graph:

            nx_result = nx.normalized_cut_size(subgraph, nodes1, nodes2)
            our_result = ncut.ncut_multigraph(subgraph, nodes1, nodes2)
            our_k_result = ncut.k_ncut_multigraph(subgraph, np.asarray([nodes1, nodes2], dtype=object))

            if abs(our_result - nx_result) > precision or \
                abs(our_k_result - nx_result) > precision:

                identical = False
                break

        # `break` above only leaves the innermost loop, so propagate the
        # failure outwards explicitly (a for/else on this level would never
        # see that break).
        if not identical:
            break

    if not identical:
        break

print(f"\nOur results were{' ' if identical else ' not '}identical to those of networkx for {n_checks} random partitions.")

Checking random partition no. 0
Checking random partition no. 1
Checking random partition no. 2
Checking random partition no. 3
Checking random partition no. 4
Checking random partition no. 5
Checking random partition no. 6
Checking random partition no. 7
Checking random partition no. 8
Checking random partition no. 9

Our results were identical to those of networkx for 10 random partitions.


## Clustering the data
We cluster the embeddings with KMeans for every number of clusters (the n_clusters parameter) from 2 to 19 and compare the results using the NCut value. To compute the NCut value for more than two partitions, we implement the k_NCut value, which extends the NCut from two partitions to k partitions. When comparing different cluster values, it seems that higher cluster values always lead to higher NCut values. This seems intuitive, as a higher cluster value means that there are more edges between clusters rather than within clusters. \
\
When we try to use the k_Ncut value to compare the clusterings, it appears that the best cluster value is 2, since it has the lowest amount of inter-cluster connectivity divided by intra-cluster connectivity.

In [24]:
def ncut_by_k_partition(graph, partition):
    """Evaluate ``ncut.k_ncut_multigraph`` for a flat label vector.

    ``partition`` holds one cluster label per position. For every distinct
    label the positions carrying it are collected, and the resulting groups
    are handed over as an object array. Positions are used directly as node
    ids of ``graph``.
    """
    distinct_labels = np.unique(partition)
    groups = [np.where(partition == label)[0] for label in distinct_labels]
    node_lists = np.asarray(groups, dtype=object)
    return ncut.k_ncut_multigraph(graph, node_lists)

### DMGI embeddings
For all cluster values, it seems that the NCut results are slightly lower for the second graph: This is surprising, as we expect the patients' functional connectivity between different brain regions to be increased after the intervention, while the NCut-value seems to show the opposite. The reason for this might be that connectivity inside the cluster is even more increased than between the clusters. However, it needs to be noted that the clusterings are very different between the two graphs (for a comparison, see the section about NMIs) and this might also play a role.

In [25]:
# Candidate cluster counts: 2, 3, ..., 19 (the upper bound is exclusive).
n_clusters = range(2, 20)

# One tuple per cluster count:
# (description, NCut on G1, NCut on G2, labels for G1, labels for G2)
kmeans_ncut_results = []

for n in n_clusters:
    # KMeans is created without a random_state, so labels vary between runs
    kmeans = KMeans(n_clusters=n, n_init=10)

    # the same estimator is refitted for the second embedding
    clustering1 = kmeans.fit_predict(g1_embedding)
    clustering2 = kmeans.fit_predict(g2_embedding)

    g1_ncut = ncut_by_k_partition(G1, clustering1)
    g2_ncut = ncut_by_k_partition(G2, clustering2)

    entry = (f"KMeans n_clusters={n}", g1_ncut, g2_ncut, clustering1, clustering2)
    kmeans_ncut_results.append(entry)

In [26]:
# Show description and both NCut values; the label vectors are left out.
for result in kmeans_ncut_results:
    summary = result[:3]
    print(summary)

('KMeans n_clusters=2', 0.9641499975801373, 0.6615836198253547)
('KMeans n_clusters=3', 2.004927662979731, 1.5799885317808782)
('KMeans n_clusters=4', 2.9790809215881957, 2.6634369152461255)
('KMeans n_clusters=5', 4.0179075143264384, 3.606420299288393)
('KMeans n_clusters=6', 4.919152183910789, 4.641739604645354)
('KMeans n_clusters=7', 6.0023728212361895, 5.67687562566527)
('KMeans n_clusters=8', 7.05063664927966, 6.6369758194064135)
('KMeans n_clusters=9', 7.9189108746399945, 7.813814051126349)
('KMeans n_clusters=10', 9.009295139980152, 8.702179690462122)
('KMeans n_clusters=11', 9.983564042438122, 9.784246195828723)
('KMeans n_clusters=12', 10.898018969652849, 10.620592650936395)
('KMeans n_clusters=13', 12.06509846702373, 11.750046439079505)
('KMeans n_clusters=14', 13.043930200119709, 12.89536942520423)
('KMeans n_clusters=15', 14.044201370111605, 13.778201630642274)
('KMeans n_clusters=16', 15.006249313709498, 14.741124194792398)
('KMeans n_clusters=17', 16.041199566864325, 15.

#### Hemisphere cluster correspondence

For the binary clustering, we checked whether the clusters correspond to the hemispheres in the brain. As stated in the file AAL_labels.csv, odd indices correspond to left hemisphere regions and even indices to right hemisphere regions, apart from the Cerebellar Vermis. It appears that there is no connection between the cluster labels and the hemisphere for the first graph as well as the second graph.

After treatment the clusters are not equally sized anymore but cluster 1 increases by 50%.

In [27]:
# Labels of the two-cluster DMGI clustering of G1 (first entry, n_clusters=2).
clustering_DMGI_G1 = kmeans_ncut_results[0][3]

# Split the label vector by position. 0-based even positions below 108
# (odd in a 1-based numbering) are reported as left hemisphere, 0-based odd
# positions as right hemisphere, everything from 108 on as cerebellar vermis.
even_indices = clustering_DMGI_G1[:108:2]
odd_indices =  clustering_DMGI_G1[1:108:2]
cerebellum_indices = clustering_DMGI_G1[108::]

# Share of each region with KMeans label 0 ("cluster 1") / label 1 ("cluster 2").
left_share = [sum(even_indices==label)/len(even_indices) for label in (0, 1)]
right_share = [sum(odd_indices==label)/len(odd_indices) for label in (0, 1)]
vermis_share = [sum(cerebellum_indices==label)/len(cerebellum_indices) for label in (0, 1)]

print(f"Fraction of the left hemisphere which belongs to cluster 1: {left_share[0]}")
print(f"Fraction of the left hemisphere which belongs to cluster 2: {left_share[1]}")
print()

print(f"Fraction of the right hemisphere which belongs to cluster 1: {right_share[0]}")
print(f"Fraction of the right hemisphere which belongs to cluster 2: {right_share[1]}")

print()

print(f"Fraction of the cerebellar vermis which belongs to cluster 1: {vermis_share[0]}")
print(f"Fraction of the cerebellar vermis which belongs to cluster 2: {vermis_share[1]}")

Fraction of the left hemisphere which belongs to cluster 1: 0.6851851851851852
Fraction of the left hemisphere which belongs to cluster 2: 0.3148148148148148

Fraction of the right hemisphere which belongs to cluster 1: 0.5740740740740741
Fraction of the right hemisphere which belongs to cluster 2: 0.42592592592592593

Fraction of the cerebellar vermis which belongs to cluster 1: 0.625
Fraction of the cerebellar vermis which belongs to cluster 2: 0.375


In [28]:
# Labels of the two-cluster DMGI clustering of G2 (first entry, n_clusters=2).
clustering_DMGI_G2 = kmeans_ncut_results[0][4]

# Split the label vector by position. 0-based even positions below 108
# (odd in a 1-based numbering) are reported as left hemisphere, 0-based odd
# positions as right hemisphere, everything from 108 on as cerebellar vermis.
even_indices = clustering_DMGI_G2[:108:2]
odd_indices =  clustering_DMGI_G2[1:108:2]
cerebellum_indices = clustering_DMGI_G2[108::]

# Share of each region with KMeans label 0 ("cluster 1") / label 1 ("cluster 2").
left_share = [sum(even_indices==label)/len(even_indices) for label in (0, 1)]
right_share = [sum(odd_indices==label)/len(odd_indices) for label in (0, 1)]
vermis_share = [sum(cerebellum_indices==label)/len(cerebellum_indices) for label in (0, 1)]

print(f"Fraction of the left hemisphere which belongs to cluster 1: {left_share[0]}")
print(f"Fraction of the left hemisphere which belongs to cluster 2: {left_share[1]}")
print()

print(f"Fraction of the right hemisphere which belongs to cluster 1: {right_share[0]}")
print(f"Fraction of the right hemisphere which belongs to cluster 2: {right_share[1]}")

print()

print(f"Fraction of the cerebellar vermis which belongs to cluster 1: {vermis_share[0]}")
print(f"Fraction of the cerebellar vermis which belongs to cluster 2: {vermis_share[1]}")

Fraction of the left hemisphere which belongs to cluster 1: 0.2777777777777778
Fraction of the left hemisphere which belongs to cluster 2: 0.7222222222222222

Fraction of the right hemisphere which belongs to cluster 1: 0.2777777777777778
Fraction of the right hemisphere which belongs to cluster 2: 0.7222222222222222

Fraction of the cerebellar vermis which belongs to cluster 1: 1.0
Fraction of the cerebellar vermis which belongs to cluster 2: 0.0


### Deepwalk_embeddings

In [29]:
# Same sweep as for the DMGI embeddings, now on the deepwalk embeddings.
# One tuple per cluster count in `n_clusters`:
# (description, NCut on G1, NCut on G2, labels for G1, labels for G2)
kmeans_ncut_results2 = []

for n in n_clusters:
    # KMeans is created without a random_state, so labels vary between runs
    kmeans = KMeans(n_clusters=n, n_init=10)

    # the same estimator is refitted for the second embedding
    clustering1 = kmeans.fit_predict(g1_embedding2)
    clustering2 = kmeans.fit_predict(g2_embedding2)

    g1_ncut = ncut_by_k_partition(G1, clustering1)
    g2_ncut = ncut_by_k_partition(G2, clustering2)

    entry = (f"KMeans n_clusters={n}", g1_ncut, g2_ncut, clustering1, clustering2)
    kmeans_ncut_results2.append(entry)

In [30]:
# Show description and both NCut values; the label vectors are left out.
for result in kmeans_ncut_results2:
    summary = result[:3]
    print(summary)

('KMeans n_clusters=2', 0.820246617737858, 0.8042759676805062)
('KMeans n_clusters=3', 1.436341914360783, 1.411777115153281)
('KMeans n_clusters=4', 2.1701846022760685, 2.1313829786860286)
('KMeans n_clusters=5', 3.100209109778881, 3.015334490661229)
('KMeans n_clusters=6', 3.9112444900303927, 3.739448607988603)
('KMeans n_clusters=7', 4.863879869498632, 4.734529135825166)
('KMeans n_clusters=8', 5.526020416855504, 5.495878474397954)
('KMeans n_clusters=9', 6.659052495513469, 6.582577616650722)
('KMeans n_clusters=10', 7.4170779599991095, 7.436001304785729)
('KMeans n_clusters=11', 8.483456884451432, 8.306497475474824)
('KMeans n_clusters=12', 9.278219426849338, 9.220925923348958)
('KMeans n_clusters=13', 10.296711037452527, 10.083336872042299)
('KMeans n_clusters=14', 11.09692860077152, 11.036166478865658)
('KMeans n_clusters=15', 11.882606547230196, 11.920976095871492)
('KMeans n_clusters=16', 12.910496584665093, 12.793079714643016)
('KMeans n_clusters=17', 14.156569593832028, 14.076

#### Hemisphere cluster correspondence
It appears that there is no consistent correspondence between the cluster labels and the hemispheres for the deepwalk embeddings. Since the clusterings are identical for n_clusters=2 (see the NMI section), the hemisphere/cluster correspondences are also identical for the two graphs. 

In [31]:
# Labels of the two-cluster deepwalk clustering of G1 (first entry, n_clusters=2).
clustering_deepwalk_G1 = kmeans_ncut_results2[0][3]

# Split the label vector by position. 0-based even positions below 108
# (odd in a 1-based numbering) are reported as left hemisphere, 0-based odd
# positions as right hemisphere, everything from 108 on as cerebellar vermis.
even_indices = clustering_deepwalk_G1[:108:2]
odd_indices =  clustering_deepwalk_G1[1:108:2]
cerebellum_indices = clustering_deepwalk_G1[108::]

# Share of each region with KMeans label 0 ("cluster 1") / label 1 ("cluster 2").
left_share = [sum(even_indices==label)/len(even_indices) for label in (0, 1)]
right_share = [sum(odd_indices==label)/len(odd_indices) for label in (0, 1)]
vermis_share = [sum(cerebellum_indices==label)/len(cerebellum_indices) for label in (0, 1)]

print(f"Fraction of the left hemisphere which belongs to cluster 1: {left_share[0]}")
print(f"Fraction of the left hemisphere which belongs to cluster 2: {left_share[1]}")
print()

print(f"Fraction of the right hemisphere which belongs to cluster 1: {right_share[0]}")
print(f"Fraction of the right hemisphere which belongs to cluster 2: {right_share[1]}")

print()

print(f"Fraction of the cerebellar vermis which belongs to cluster 1: {vermis_share[0]}")
print(f"Fraction of the cerebellar vermis which belongs to cluster 2: {vermis_share[1]}")

Fraction of the left hemisphere which belongs to cluster 1: 0.5370370370370371
Fraction of the left hemisphere which belongs to cluster 2: 0.46296296296296297

Fraction of the right hemisphere which belongs to cluster 1: 0.5370370370370371
Fraction of the right hemisphere which belongs to cluster 2: 0.46296296296296297

Fraction of the cerebellar vermis which belongs to cluster 1: 0.0
Fraction of the cerebellar vermis which belongs to cluster 2: 1.0


In [32]:
# Labels of the two-cluster deepwalk clustering of G2 (first entry, n_clusters=2).
clustering_deepwalk_G2 = kmeans_ncut_results2[0][4]

# Split the label vector by position. 0-based even positions below 108
# (odd in a 1-based numbering) are reported as left hemisphere, 0-based odd
# positions as right hemisphere, everything from 108 on as cerebellar vermis.
even_indices = clustering_deepwalk_G2[:108:2]
odd_indices =  clustering_deepwalk_G2[1:108:2]
cerebellum_indices = clustering_deepwalk_G2[108::]

# Share of each region with KMeans label 0 ("cluster 1") / label 1 ("cluster 2").
left_share = [sum(even_indices==label)/len(even_indices) for label in (0, 1)]
right_share = [sum(odd_indices==label)/len(odd_indices) for label in (0, 1)]
vermis_share = [sum(cerebellum_indices==label)/len(cerebellum_indices) for label in (0, 1)]

print(f"Fraction of the left hemisphere which belongs to cluster 1: {left_share[0]}")
print(f"Fraction of the left hemisphere which belongs to cluster 2: {left_share[1]}")
print()

print(f"Fraction of the right hemisphere which belongs to cluster 1: {right_share[0]}")
print(f"Fraction of the right hemisphere which belongs to cluster 2: {right_share[1]}")

print()

print(f"Fraction of the cerebellar vermis which belongs to cluster 1: {vermis_share[0]}")
print(f"Fraction of the cerebellar vermis which belongs to cluster 2: {vermis_share[1]}")

Fraction of the left hemisphere which belongs to cluster 1: 0.5370370370370371
Fraction of the left hemisphere which belongs to cluster 2: 0.46296296296296297

Fraction of the right hemisphere which belongs to cluster 1: 0.5370370370370371
Fraction of the right hemisphere which belongs to cluster 2: 0.46296296296296297

Fraction of the cerebellar vermis which belongs to cluster 1: 0.0
Fraction of the cerebellar vermis which belongs to cluster 2: 1.0


## Comparison of partitions

### Comparison of NCut-values
For both embeddings, the NCut-values of the clusterings are slightly higher for the first graph.

In [33]:
# DMGI embeddings
# Turn every result tuple into a list, then summarise the per-cluster-count
# difference "NCut on G1 minus NCut on G2".
kmeans_ncut_results = list(map(list, kmeans_ncut_results))
result_diffs = np.asarray([entry[1] - entry[2] for entry in kmeans_ncut_results])
describe(result_diffs)

DescribeResult(nobs=18, minmax=(0.10509682351364535, 0.42493913119885285), mean=0.2788306241833359, variance=0.008646005917794476, skewness=-0.2805735206508417, kurtosis=-0.5444754759471722)

In [34]:
# Deepwalk embeddings
# Turn every result tuple into a list, then summarise the per-cluster-count
# difference "NCut on G1 minus NCut on G2".
kmeans_ncut_results2 = list(map(list, kmeans_ncut_results2))
result_diffs = np.asarray([entry[1] - entry[2] for entry in kmeans_ncut_results2])
describe(result_diffs)

DescribeResult(nobs=18, minmax=(-0.03836954864129538, 0.2187190801860286), mean=0.08762779650369708, variance=0.005713120793585364, skewness=0.22361209544683244, kurtosis=-0.8804935067684245)

### NMI scores
We calculate the normalized mutual information (NMI) scores of our clustering results to find out how similar the KMeans clusterings of the first graph are to those of the second graph. The DMGI embedding clusterings are very different from each other but become more similar as the number of clusters increases. The opposite effect occurs for the deepwalk embeddings, where the NMI scores fall as the number of clusters increases, although the deepwalk clusterings remain very similar in general.

In [35]:
# DMGI embeddings
# NMI between the G1 labels and the G2 labels, for every cluster count.
descriptions = [entry[0] for entry in kmeans_ncut_results]
kmeans_nmis = [
    normalized_mutual_info_score(entry[3], entry[4])
    for entry in kmeans_ncut_results
]

for description, score in zip(descriptions, kmeans_nmis):
    print(f"{description} NMI: {score}")

KMeans n_clusters=2 NMI: 0.0013288647792431474
KMeans n_clusters=3 NMI: 0.12530939738142452
KMeans n_clusters=4 NMI: 0.1109915704213008
KMeans n_clusters=5 NMI: 0.0922655033215784
KMeans n_clusters=6 NMI: 0.22280531767391124
KMeans n_clusters=7 NMI: 0.1353375397641569
KMeans n_clusters=8 NMI: 0.172201686611475
KMeans n_clusters=9 NMI: 0.22632106484514047
KMeans n_clusters=10 NMI: 0.3089558710332223
KMeans n_clusters=11 NMI: 0.27201001074100006
KMeans n_clusters=12 NMI: 0.3368954615145153
KMeans n_clusters=13 NMI: 0.40999400042573253
KMeans n_clusters=14 NMI: 0.38154879153990856
KMeans n_clusters=15 NMI: 0.3581923172489403
KMeans n_clusters=16 NMI: 0.407816252144638
KMeans n_clusters=17 NMI: 0.4143754092581678
KMeans n_clusters=18 NMI: 0.45037440969200315
KMeans n_clusters=19 NMI: 0.4396401170100803


In [36]:
# Deepwalk embeddings
# NMI between the G1 labels and the G2 labels, for every cluster count.
descriptions = [entry[0] for entry in kmeans_ncut_results2]
kmeans_nmis = [
    normalized_mutual_info_score(entry[3], entry[4])
    for entry in kmeans_ncut_results2
]

for description, score in zip(descriptions, kmeans_nmis):
    print(f"{description} NMI: {score}")

KMeans n_clusters=2 NMI: 1.0
KMeans n_clusters=3 NMI: 0.9634002398991651
KMeans n_clusters=4 NMI: 0.9457877163234529
KMeans n_clusters=5 NMI: 0.977845984359593
KMeans n_clusters=6 NMI: 0.8813324294469441
KMeans n_clusters=7 NMI: 0.905143340923118
KMeans n_clusters=8 NMI: 0.8971060282787803
KMeans n_clusters=9 NMI: 0.8407447631435722
KMeans n_clusters=10 NMI: 0.8953786770407192
KMeans n_clusters=11 NMI: 0.7920720757963902
KMeans n_clusters=12 NMI: 0.9320378086585457
KMeans n_clusters=13 NMI: 0.8820903533337405
KMeans n_clusters=14 NMI: 0.8664782335380946
KMeans n_clusters=15 NMI: 0.8304579550540191
KMeans n_clusters=16 NMI: 0.8945449038710136
KMeans n_clusters=17 NMI: 0.8145282534162017
KMeans n_clusters=18 NMI: 0.8549425565795234
KMeans n_clusters=19 NMI: 0.8212532823519301


A possible explanation for the difference between the NMI scores of DMGI and Deepwalk is that DMGI uses additional node attributes: for every node (brain region) we included the average brain region activity across all patients. This led to different clustering results for G1 and G2, suggesting that the brain activity has changed between visits. 

### Comparison by attributes in clinical.csv
In this section we investigate whether the attributes in clinical.csv (such as the age) have an influence on the NCut values of the partitions. To do that, we interpret the column values of clinical.csv as cluster labels, and compute the NCut values of the graph layers (i.e. the patients) individually. Afterwards, we group the NCut values by cluster labels and compare descriptive statistics between the groups.

We analyse sex, age, body mass index, MS type and the therapy that was used for treatment.

In [37]:
# Patient groupings used to compare NCut values.
# Each entry is [display name, one group label per row of clinical.csv].
# Numeric columns are turned into two groups (True/False) by a fixed cutoff;
# the remaining columns are used as group labels as they are.
comparison_attributes = [
    ['sex', clinical_csv['SEX']],
    ['age over 50', clinical_csv['AGE'] > 50],
    ['BMI over 25', clinical_csv['BMI'] > 25],
    ['MS type', clinical_csv['MS_TYPE']],
    ['therapy', clinical_csv['THERAPY']],
    ['more than 15 years after diagnosis', clinical_csv['DIAG_YEARS'] > 15]
]

#### DMGI Embeddings

In [38]:
# takes a while to run
#
# For every DMGI clustering, evaluate the NCut of that partition on each
# individual patient graph. One entry per cluster count:
# [description, NCuts on the G1s graphs, NCuts on the G2s graphs]

individual_ncuts = []

for res in kmeans_ncut_results:

    G1_partition = res[3]
    G1_ncuts = [ncut_by_k_partition(patient, G1_partition) for patient in G1s]

    G2_partition = res[4]
    G2_ncuts = [ncut_by_k_partition(patient, G2_partition) for patient in G2s]

    individual_ncuts.append([res[0], G1_ncuts, G2_ncuts])

In [39]:
# For every clustering and every clinical attribute, group the per-patient
# NCut values by attribute label and store mean and variance per group.
attribute_statistics = []

for part_description, G1_ncuts, G2_ncuts in individual_ncuts:
    g1_values = np.asarray(G1_ncuts)
    g2_values = np.asarray(G2_ncuts)

    for comp_description, labels in comparison_attributes:

        label_ncuts_g1, label_ncuts_g2 = {}, {}

        for label in np.unique(labels):
            # row positions of the patients carrying this label
            indices = np.where(labels == label)[0]

            # note: if desired, more descriptive statistics could be saved here
            for values, target in ((g1_values, label_ncuts_g1), (g2_values, label_ncuts_g2)):
                stats = describe(values[indices])
                target[label] = {'mean': stats.mean, 'variance': stats.variance}

        attribute_statistics.append({
            'partition_name' : part_description,
            'attribute_name' : comp_description,
            'G1' : label_ncuts_g1,
            'G2' : label_ncuts_g2,
        })

In [50]:

# NOTE(review): `attribute_statistics` is rebuilt under the same name in the
# Deepwalk section. Run the DMGI statistics cell immediately before this one,
# otherwise the Deepwalk numbers are printed here.
output_text = True  # not read anywhere in this cell

for entry in attribute_statistics:

    # only report the clusterings with at most two clusters
    cluster_count = int(entry['partition_name'].split('=')[-1])
    if cluster_count > 2:
        continue

    print("Partition:", entry['partition_name'])
    print("Attribute:", entry['attribute_name'], "\n")

    for key, g1_stats in entry['G1'].items():
        print(key)
        print("G1 mean ncut:", g1_stats['mean'])
        print("G2 mean ncut:", entry['G2'][key]['mean'])
        print()

    print("-------------------")

Partition: KMeans n_clusters=2
Attribute: sex 

F
G1 mean ncut: 0.8134142484146929
G2 mean ncut: 0.8125184923616151

M
G1 mean ncut: 0.8099525424292431
G2 mean ncut: 0.781011784832379

-------------------
Partition: KMeans n_clusters=2
Attribute: age over 50 

False
G1 mean ncut: 0.8177160266295395
G2 mean ncut: 0.7941169871725611

True
G1 mean ncut: 0.8047265677619954
G2 mean ncut: 0.8087106809483614

-------------------
Partition: KMeans n_clusters=2
Attribute: BMI over 25 

False
G1 mean ncut: 0.8037169308100777
G2 mean ncut: 0.799014913618519

True
G1 mean ncut: 0.8265451043833305
G2 mean ncut: 0.8029040250463978

-------------------
Partition: KMeans n_clusters=2
Attribute: MS type 

primary_progressive
G1 mean ncut: 0.8471059828231148
G2 mean ncut: 0.7829114215973777

relapsing_remitting
G1 mean ncut: 0.8173212333340683
G2 mean ncut: 0.8003662368936623

secondary_progressive
G1 mean ncut: 0.7926838918459606
G2 mean ncut: 0.8058267342277844

-------------------
Partition: KMeans n

Comparing the mean NCut values for n_clusters=2 across the patient groups, there is no significant difference.

#### Deepwalk embeddings

In [41]:
# takes a while to run
#
# For every deepwalk clustering, evaluate the NCut of that partition on each
# individual patient graph. One entry per cluster count:
# [description, NCuts on the G1s graphs, NCuts on the G2s graphs]

individual_ncuts = []

for res in kmeans_ncut_results2:

    G1_partition = res[3]
    G1_ncuts = [ncut_by_k_partition(patient, G1_partition) for patient in G1s]

    G2_partition = res[4]
    G2_ncuts = [ncut_by_k_partition(patient, G2_partition) for patient in G2s]

    individual_ncuts.append([res[0], G1_ncuts, G2_ncuts])

In [42]:
# For every clustering and every clinical attribute, group the per-patient
# NCut values by attribute label and store mean and variance per group.
attribute_statistics = []

for part_description, G1_ncuts, G2_ncuts in individual_ncuts:
    g1_values = np.asarray(G1_ncuts)
    g2_values = np.asarray(G2_ncuts)

    for comp_description, labels in comparison_attributes:

        label_ncuts_g1, label_ncuts_g2 = {}, {}

        for label in np.unique(labels):
            # row positions of the patients carrying this label
            indices = np.where(labels == label)[0]

            # note: if desired, more descriptive statistics could be saved here
            for values, target in ((g1_values, label_ncuts_g1), (g2_values, label_ncuts_g2)):
                stats = describe(values[indices])
                target[label] = {'mean': stats.mean, 'variance': stats.variance}

        attribute_statistics.append({
            'partition_name' : part_description,
            'attribute_name' : comp_description,
            'G1' : label_ncuts_g1,
            'G2' : label_ncuts_g2,
        })

In [51]:

# NOTE(review): `attribute_statistics` is rebuilt under the same name in the
# DMGI section. Run the Deepwalk statistics cell immediately before this one,
# otherwise the DMGI numbers are printed here.
output_text = True  # not read anywhere in this cell

for entry in attribute_statistics:

    # only report the clusterings with at most two clusters
    cluster_count = int(entry['partition_name'].split('=')[-1])
    if cluster_count > 2:
        continue

    print("Partition:", entry['partition_name'])
    print("Attribute:", entry['attribute_name'], "\n")

    for key, g1_stats in entry['G1'].items():
        print(key)
        print("G1 mean ncut:", g1_stats['mean'])
        print("G2 mean ncut:", entry['G2'][key]['mean'])
        print()

    print("-------------------")

Partition: KMeans n_clusters=2
Attribute: sex 

F
G1 mean ncut: 0.8134142484146929
G2 mean ncut: 0.8125184923616151

M
G1 mean ncut: 0.8099525424292431
G2 mean ncut: 0.781011784832379

-------------------
Partition: KMeans n_clusters=2
Attribute: age over 50 

False
G1 mean ncut: 0.8177160266295395
G2 mean ncut: 0.7941169871725611

True
G1 mean ncut: 0.8047265677619954
G2 mean ncut: 0.8087106809483614

-------------------
Partition: KMeans n_clusters=2
Attribute: BMI over 25 

False
G1 mean ncut: 0.8037169308100777
G2 mean ncut: 0.799014913618519

True
G1 mean ncut: 0.8265451043833305
G2 mean ncut: 0.8029040250463978

-------------------
Partition: KMeans n_clusters=2
Attribute: MS type 

primary_progressive
G1 mean ncut: 0.8471059828231148
G2 mean ncut: 0.7829114215973777

relapsing_remitting
G1 mean ncut: 0.8173212333340683
G2 mean ncut: 0.8003662368936623

secondary_progressive
G1 mean ncut: 0.7926838918459606
G2 mean ncut: 0.8058267342277844

-------------------
Partition: KMeans n

The deepwalk embeddings provide the same insight: there is no significant difference across the analysed attributes.

# Task 2d

In [44]:
# Two-set cut metrics from networkx; each is called as func(graph, S, T).
functions = (
    nx.conductance,
    nx.cut_size,
    nx.edge_expansion,
    nx.mixing_expansion,
)

# only compare k=2 at position 0
# the last two items of an entry are the G1 labels and the G2 labels
clustering1, clustering2 = kmeans_ncut_results[0][-2:]

# arguments for G1: graph, nodes with label 0, nodes with label 1
params1 = (G1, *(list(np.where(clustering1==label)[0]) for label in (0, 1)))

# arguments for G2: graph, nodes with label 0, nodes with label 1
params2 = (G2, *(list(np.where(clustering2==label)[0]) for label in (0, 1)))

# one row per graph, one column per metric
results = pd.DataFrame()
results['graph'] = ['G1', 'G2']

for func in functions:
    g1_value = func(*params1)
    g2_value = func(*params2)
    results[func.__name__] = [g1_value, g2_value]

results

Unnamed: 0,graph,conductance,cut_size,edge_expansion,mixing_expansion
0,G1,0.58596,24540,570.697674,0.229844
1,G2,0.487745,13452,354.0,0.128161


As the NCut value decreases, so does the cut size, together with the edge expansion and the mixing expansion, which are directly linked to the cut size. The conductance also decreases, which suggests that the volumes of the two clusters are more equal.

In [52]:
# Two-set cut metrics from networkx; each is called as func(graph, S, T).
functions = (
    nx.conductance,
    nx.cut_size,
    nx.edge_expansion,
    nx.mixing_expansion,
)

# only compare k=2 at position 0
# the last two items of an entry are the G1 labels and the G2 labels
clustering1, clustering2 = kmeans_ncut_results2[0][-2:]

# arguments for G1: graph, nodes with label 0, nodes with label 1
params1 = (G1, *(list(np.where(clustering1==label)[0]) for label in (0, 1)))

# arguments for G2: graph, nodes with label 0, nodes with label 1
params2 = (G2, *(list(np.where(clustering2==label)[0]) for label in (0, 1)))

# one row per graph, one column per metric
results = pd.DataFrame()
results['graph'] = ['G1', 'G2']

for func in functions:
    g1_value = func(*params1)
    g2_value = func(*params2)
    results[func.__name__] = [g1_value, g2_value]

results

Unnamed: 0,graph,conductance,cut_size,edge_expansion,mixing_expansion
0,G1,0.463079,21529,371.189655,0.201643
1,G2,0.452849,20769,358.086207,0.197872


We cannot see any significant differences between the graphs before and after treatment for the deepwalk embedding.

In [46]:
# Single-set metrics from networkx; each is called as func(graph, S).
functions = (
    nx.boundary_expansion,
    nx.node_expansion,
    nx.volume,
)


# one DataFrame per cluster count, one row per (graph, cluster) pair
results = []

for i, n in enumerate(n_clusters):
    df = pd.DataFrame(columns=['graph', 'partition', *[f.__name__ for f in functions]])

    # pair each graph with its own label vector (second-to-last / last item)
    graph_setups = (
        ('G1', G1, kmeans_ncut_results[i][-2]),
        ('G2', G2, kmeans_ncut_results[i][-1]),
    )

    for G_str, G, clustering in graph_setups:
        for cluster in range(n):
            members = np.where(clustering==cluster)[0]
            row = [G_str, cluster, *[f(G, members) for f in functions]]
            # append as a new last row
            df.loc[len(df.index)] = row

    results.append(df)

results[0]

Unnamed: 0,graph,partition,boundary_expansion,node_expansion,volume
0,G1,0,0.589041,1.589041,64888
1,G1,1,1.697674,2.697674,41880
2,G2,0,2.052632,3.052632,27580
3,G2,1,0.487179,1.487179,77382


As in the previous section, we can see that for the DMGI embedding the clusterings before and after treatment are very different.

In [49]:
# Single-set metrics from networkx; each is called as func(graph, S).
functions = (
    nx.boundary_expansion,
    nx.node_expansion,
    nx.volume,
)


# one DataFrame per cluster count, one row per (graph, cluster) pair
results = []

for i, n in enumerate(n_clusters):
    df = pd.DataFrame(columns=['graph', 'partition', *[f.__name__ for f in functions]])

    # pair each graph with its own label vector (second-to-last / last item)
    graph_setups = (
        ('G1', G1, kmeans_ncut_results2[i][-2]),
        ('G2', G2, kmeans_ncut_results2[i][-1]),
    )

    for G_str, G, clustering in graph_setups:
        for cluster in range(n):
            members = np.where(clustering==cluster)[0]
            row = [G_str, cluster, *[f(G, members) for f in functions]]
            # append as a new last row
            df.loc[len(df.index)] = row

    results.append(df)

results[0]

Unnamed: 0,graph,partition,boundary_expansion,node_expansion,volume
0,G1,0,1.0,2.0,60277
1,G1,1,1.0,2.0,46491
2,G2,0,1.0,2.0,59099
3,G2,1,1.0,2.0,45863


Again, the deepwalk embedding shows no big difference before and after treatment.