In [60]:
import pickle
import random
import sys
from collections import Counter

import gensim
import networkx as nx
import numpy as np
import pandas as pd
from sklearn import cluster

In [61]:
# Load the pickled disease ontology graph from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only open this file if it comes from a trusted source.
with open('../data/disease_ontograph.pkl', 'rb') as f:
    disease_ontograph = pickle.load(f)

In [62]:
# Keep only the graph nodes whose 'label' attribute is exactly 'disease'
# (nodes without a 'label' attribute yield None and are filtered out).
diseases = [
    node
    for node, label in disease_ontograph.nodes(data='label')
    if label == 'disease'
]

In [63]:
WALK_LEN=30
N_WALKS=100
def run_random_walks(G, nodes, walk_len = WALK_LEN,num_walks=N_WALKS):
    """Generate uniform random walks that alternate node and edge-type tokens.

    Each walk has the form ``[start, edge, node, edge, node, ...]`` and is
    meant to be used as one "sentence" for an embedding model.

    Parameters
    ----------
    G : graph-like
        Object providing ``degree(node)``, ``neighbors(node)`` and
        ``get_edge_data(u, v)``. The edge token is a randomly chosen key of
        the mapping returned by ``get_edge_data`` (presumably the relation
        name of a networkx multigraph edge -- verify against the graph type).
    nodes : iterable
        Start nodes. Nodes whose degree is 0 are skipped entirely.
    walk_len : int
        Maximum number of steps (edges) per walk.
    num_walks : int
        Number of walks started from every node.

    Returns
    -------
    list[list]
        One token list per walk, in the order the walks were generated.
    """
    walks = []
    for count, node in enumerate(nodes):

        # Isolated nodes cannot be walked from; skip them.
        if G.degree(node) == 0:
            continue
        for _ in range(num_walks):
            curr_node = node
            walk = []
            for step in range(walk_len):
                # Emit the start node exactly once. Comparing curr_node with
                # the start node instead would duplicate the token every time
                # the walk wanders back to where it began.
                if step == 0:
                    walk.append(curr_node)

                neighbors = list(G.neighbors(curr_node))
                if not neighbors:
                    # Dead end (e.g. a node with no outgoing edges in a
                    # directed graph): stop this walk instead of letting
                    # random.choice fail on an empty list.
                    break
                next_node = random.choice(neighbors)

                edge_type = random.choice(list(G.get_edge_data(curr_node, next_node).keys()))

                walk.append(edge_type)
                walk.append(next_node)

                curr_node = next_node

            walks.append(walk)
        if count % 1000 == 0:
            print("Done walks for", count, "nodes")
    return walks

In [64]:
# Smoke test: draw 100 independent one-step walks (walk_len=1, num_walks=1)
# from node 'GARD:1' and print each, to eyeball which neighbours get sampled.
for i in range(100):
    print(run_random_walks(disease_ontograph,['GARD:1'],1,1))

Done walks for 0 nodes
[['GARD:1', 'presents', 'HP:0001994']]
Done walks for 0 nodes
[['GARD:1', 'presents', 'HP:0003281']]
Done walks for 0 nodes
[['GARD:1', 'presents', 'HP:0003355']]
Done walks for 0 nodes
[['GARD:1', 'presents', 'HP:0001396']]
Done walks for 0 nodes
[['GARD:1', 'presents', 'HP:0003542']]
Done walks for 0 nodes
[['GARD:1', 'presents', 'HP:0012464']]
Done walks for 0 nodes
[['GARD:1', 'presents', 'HP:0001994']]
Done walks for 0 nodes
[['GARD:1', 'presents', 'HP:0001396']]
Done walks for 0 nodes
[['GARD:1', 'presents', 'HP:0001394']]
Done walks for 0 nodes
[['GARD:1', 'presents', 'HP:0001397']]
Done walks for 0 nodes
[['GARD:1', 'presents', 'HP:0001394']]
Done walks for 0 nodes
[['GARD:1', 'presents', 'HP:0004925']]
Done walks for 0 nodes
[['GARD:1', 'presents', 'HP:0012465']]
Done walks for 0 nodes
[['GARD:1', 'presents', 'HP:0012465']]
Done walks for 0 nodes
[['GARD:1', 'presents', 'HP:0001396']]
Done walks for 0 nodes
[['GARD:1', 'presents', 'HP:0001394']]
Done wal

In [65]:
# Build the walk corpus: 20 walks of 10 steps from every disease node.
pairs = run_random_walks(
    disease_ontograph,
    diseases,
    walk_len=10,
    num_walks=20,
)

Done walks for 0 nodes
Done walks for 1000 nodes
Done walks for 2000 nodes
Done walks for 3000 nodes


In [67]:
# Write one walk per line, tokens separated by single spaces.
# Mode "w" (not "a"): re-running this cell must replace the corpus rather than
# append to it, otherwise walks from earlier runs are duplicated and end up in
# the training data. If accumulating walks across runs is intended, do it
# explicitly instead of relying on append mode.
# UTF-8 is set explicitly so the file does not depend on the platform locale.
# NOTE(review): tokens that contain whitespace would be split into several
# words when the file is read back line by line -- confirm node ids have none.
with open("../data/walks.txt", "w", encoding="utf-8") as fp:
    for p in pairs:
        fp.write(" ".join(str(sub_p) for sub_p in p) + "\n")

In [69]:
# Stream the walk corpus from disk (one whitespace-separated walk per line)
# and train a Word2Vec model on it, then persist the trained model.
sentences = gensim.models.word2vec.LineSentence("../data/walks.txt")
model = gensim.models.Word2Vec(
    sentences,
    sg=1,             # skip-gram rather than CBOW
    min_count=1,      # keep every token, even ones seen a single time
    vector_size=100,  # embedding dimensionality
    window=10,
    epochs=30,
    workers=5,
)
model.save("../data/gene_pheno_embed.model")

In [70]:
# Look up the trained embedding for every entry of `diseases`, in list order.
# NOTE(review): a disease that never appears in walks.txt (e.g. one skipped by
# run_random_walks because its degree is 0) would presumably raise KeyError
# here -- confirm every disease is in the model vocabulary.
disease_vectors = model.wv[diseases]

In [71]:
# Sanity check: expect (number of diseases, vector_size).
disease_vectors.shape

(3713, 100)

In [72]:
# Fit mini-batch k-means with 100 clusters on the disease embeddings;
# random_state=0 fixes the estimator's seed.
# NOTE(review): the cluster-size counts displayed later in this notebook show
# many clusters with only one or two members, so n_clusters=100 may be too
# high for this data -- worth revisiting.
kmeans = cluster.MiniBatchKMeans(n_clusters=100,random_state=0)
kmeans.fit(disease_vectors)

MiniBatchKMeans(n_clusters=100, random_state=0)

In [73]:
# Cluster index predicted for each row of disease_vectors (same order as `diseases`).
cluster_labels = kmeans.predict(disease_vectors)

In [74]:
# Cluster sizes: how many diseases were assigned to each cluster label.
# (Counter is imported in the notebook's top import cell.)
Counter(cluster_labels)

Counter({10: 343,
         27: 592,
         38: 203,
         39: 141,
         19: 118,
         98: 182,
         47: 175,
         44: 93,
         63: 2,
         8: 42,
         73: 136,
         16: 74,
         4: 97,
         58: 186,
         17: 18,
         21: 120,
         15: 76,
         25: 145,
         13: 122,
         41: 227,
         43: 1,
         96: 55,
         78: 35,
         6: 163,
         42: 32,
         83: 2,
         60: 1,
         64: 1,
         92: 4,
         20: 3,
         14: 69,
         2: 32,
         55: 15,
         26: 2,
         37: 1,
         0: 2,
         97: 1,
         29: 19,
         46: 1,
         51: 1,
         9: 27,
         33: 4,
         87: 1,
         30: 2,
         69: 1,
         80: 1,
         40: 1,
         86: 22,
         12: 1,
         82: 3,
         49: 1,
         67: 2,
         65: 1,
         93: 1,
         76: 1,
         11: 1,
         52: 2,
         5: 1,
         35: 23,
         24: 1,
   