# Librerías

In [1]:
import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns
import pandas as pd
import numpy as np
import pickle
import h5py

In [13]:
import sys
sys.path.append('../src/GMM /')

from utils import *

# Lectura de datos

In [30]:
path_distancias = '../data/correlaciones_10x/correlaciones.pickle'
path_datos = '../data/10X_PBMC_select_2100.h5'
path_out = '../data/correlaciones_10x/'

In [31]:
with open(path_distancias, 'rb') as f:
    correlaciones_hl = pickle.load(f)

with h5py.File(path_datos) as f:
    #X = np.array(f['X'])
    y = np.array(f['Y'])

In [32]:
assert len(y) == len(correlaciones_hl)

In [33]:
#sns.heatmap(correlaciones_hl)

# Pipeline completo

In [34]:
def create_kMST(distance_matrix, correlations = True, k = None, threshold = 1e-5):
    if k is None:
        N = np.log(len(distance_matrix))
        k = int(np.floor(N))
    
    print(f'k = {k}')
    grafo = nx.Graph()
    nodos = range(len(distance_matrix))

    # Crear nodo inicial
    grafo.add_nodes_from(nodos)

    for i in range(len(distance_matrix)):
        for j in range(i + 1, len(distance_matrix[i])):
            peso = distance_matrix[i][j]
            if peso > threshold:
                # para MST necesito el inverso de las correlaciones
                if correlations:
                    grafo.add_edge(i, j, weight=1-peso)
                else:
                    grafo.add_edge(i, j, weight=peso)


    print(f'---> Number of edges: {grafo.number_of_edges()}')

    mst_antes = None
    # Creamos los MSTs
    for iter in range(k):
        mst_new = nx.minimum_spanning_tree(grafo)

        edges_to_remove = list(mst_new.edges)
        grafo.remove_edges_from(edges_to_remove)
        print(f'---> {iter}. Number of edges: {grafo.number_of_edges()}')

        if mst_antes is None:
            mst_antes = mst_new.copy()
        else:
            mst_new.add_edges_from(list(mst_antes.edges()))
            mst_antes = mst_new.copy()

    return mst_antes 

In [35]:
union_graph_msts = create_kMST(correlaciones_hl)

k = 7
---> Number of edges: 897235
---> 0. Number of edges: 895136
---> 1. Number of edges: 893037
---> 2. Number of edges: 890938
---> 3. Number of edges: 888839
---> 4. Number of edges: 886740
---> 5. Number of edges: 884641
---> 6. Number of edges: 882542


In [36]:
union_graph_msts.number_of_edges()

14693

In [37]:
with open(path_out + 'grafo_kMST_correlaciones.pickle', 'wb') as f:
    pickle.dump(union_graph_msts, f)

# Louvain sobre el kMST

### 10X PBMC

In [38]:
with open('../data/correlaciones_10x/grafo_kMST_correlaciones.pickle', 'rb') as f:
    mst = pickle.load(f) 
    
with h5py.File('../data/10X_PBMC_select_2100.h5') as f:
    X = np.array(f['X'])
    y = np.array(f['Y'])

In [39]:
assert mst.number_of_nodes() == len(y)

In [40]:
particiones = nx.community.louvain_communities(mst, seed=123)

diccionario = {}
for i, conjunto in enumerate(particiones):
    for elemento in conjunto:
        diccionario[elemento] = i 

max_elemento = max(max(particiones, key = max), default=-1)
clusters = np.array([diccionario.get(i, -1) for i in range(max_elemento + 1)])
clusters

array([1, 6, 3, ..., 5, 1, 1])

In [41]:
len(set(clusters)), len(set(y))

(7, 8)

In [42]:
from sklearn.metrics import normalized_mutual_info_score, adjusted_mutual_info_score

acc = round(cluster_acc(clusters,y), 3)
nmi = round(normalized_mutual_info_score(clusters,y), 3)
ari = round(adjusted_mutual_info_score(clusters,y), 3)

print(f'ACC: {acc}. NMI: {nmi}. ARI: {ari}')

ACC: 0.744. NMI: 0.682. ARI: 0.68


### Human Liver

In [25]:
with open('../data/correlaciones_human_liver/grafo_kMST_correlaciones.pickle', 'rb') as f:
    mst = pickle.load(f) 
    
with h5py.File('../data/HumanLiver_counts_top5000.h5') as f:
    X = np.array(f['X'])
    y = np.array(f['Y'])

In [26]:
assert mst.number_of_nodes() == len(y)

In [27]:
particiones = nx.community.louvain_communities(mst, seed=123)

diccionario = {}
for i, conjunto in enumerate(particiones):
    for elemento in conjunto:
        diccionario[elemento] = i 

max_elemento = max(max(particiones, key = max), default=-1)
clusters = np.array([diccionario.get(i, -1) for i in range(max_elemento + 1)])
clusters

array([11, 12, 11, ..., 12,  8, 10])

In [28]:
len(set(clusters)), len(set(y))

(13, 11)

In [29]:
from sklearn.metrics import normalized_mutual_info_score, adjusted_mutual_info_score

acc = round(cluster_acc(clusters,y), 3)
nmi = round(normalized_mutual_info_score(clusters,y), 3)
ari = round(adjusted_mutual_info_score(clusters,y), 3)

print(f'ACC: {acc}. NMI: {nmi}. ARI: {ari}')

ACC: 0.715. NMI: 0.778. ARI: 0.777
