In [1]:
import pandas as pd
import numpy as np

In [2]:
# Relation token placed between consecutive entities of every generated walk.
REL = 'interacts_with'

In [3]:
# Load the four preprocessed pair tables; paths are relative to the working directory.
# NOTE(review): the previews of three of these frames show an 'Unnamed: 0' column,
# i.e. the CSVs appear to carry a saved index. It is not used by the code below.
drug_gene_df = pd.read_csv('./Data/preprocessed_data/drug_gene.csv')
disease_gene_df = pd.read_csv('./Data/preprocessed_data/disease_gene.csv')
disease_drug_df = pd.read_csv('./Data/preprocessed_data/disease_drug.csv')
drug_drug_df = pd.read_csv('./Data/preprocessed_data/drug_drug.csv')

In [4]:
# Lookups for the drug-gene table, one per direction: each key maps to a list
# of its distinct partners (the intermediate set() removes repeated pairs).
# NOTE(review): `.agg(set).agg(list)` relies on Series.agg applying `list` to each
# element rather than to the whole Series - confirm on the pandas version in use.
drug_gene_map = drug_gene_df.groupby('drug_name')['gene_name'].agg(set).agg(list).to_dict()
gene_drug_map = drug_gene_df.groupby('gene_name')['drug_name'].agg(set).agg(list).to_dict()


In [5]:
# Preview the first rows of the disease-gene table.
disease_gene_df.head()

Unnamed: 0,disease_name,gene_name
0,Gastritis,diphosphoinositol pentakisphosphate kinase 2
1,Lung Injury,pregnancy specific beta-1-glycoprotein 1
2,Intestinal Diseases,KH-type splicing regulatory protein
3,Tongue Neoplasms,leukocyte associated immunoglobulin like recep...
4,Takotsubo Cardiomyopathy,tribbles pseudokinase 1


In [6]:
# Lookups for the disease-gene table, one per direction: each key maps to a list
# of its distinct partners (the intermediate set() removes repeated pairs).
# NOTE(review): `.agg(set).agg(list)` relies on Series.agg applying `list` to each
# element rather than to the whole Series - confirm on the pandas version in use.
disease_gene_map = disease_gene_df.groupby('disease_name')['gene_name'].agg(set).agg(list).to_dict()
gene_disease_map = disease_gene_df.groupby('gene_name')['disease_name'].agg(set).agg(list).to_dict()

In [25]:
# Preview the first rows of the disease-drug table.
disease_drug_df.head()

Unnamed: 0,disease_name,drug_name
0,"Glomerulosclerosis, Focal Segmental",Carbamazepine
1,Neutropenia,Atazanavir
2,"Albinism, Oculocutaneous",5-Hydroxy-2-(Hydroxymethyl)-4h-Pyran-4-One
3,Hypokinesia,Levothyroxine
4,"Multiple Endocrine Neoplasia, Type IV",Simvastatin


In [7]:
# Lookups for the disease-drug table, one per direction: each key maps to a list
# of its distinct partners (the intermediate set() removes repeated pairs).
# NOTE(review): `.agg(set).agg(list)` relies on Series.agg applying `list` to each
# element rather than to the whole Series - confirm on the pandas version in use.
disease_drug_map = disease_drug_df.groupby('disease_name')['drug_name'].agg(set).agg(list).to_dict()
drug_disease_map = disease_drug_df.groupby('drug_name')['disease_name'].agg(set).agg(list).to_dict()

In [8]:
# Preview the first rows of the drug-drug table.
drug_drug_df.head()

Unnamed: 0,drug_1_name,drug_2_name
0,Vardenafil,Telmisartan
1,Clonidine,Pentoxifylline
2,Clomipramine,Mirabegron
3,Desipramine,Perampanel
4,L-DOPA,Hydralazine


In [9]:
# Build a symmetric drug -> interacting-drugs lookup: every row (a, b) of
# drug_drug_df makes b a partner of a AND a a partner of b.
#
# Partners are accumulated in sets so that a pair listed in both orientations,
# (a, b) and (b, a), is stored only once. Merging the two directions with
# list.extend() would keep such a partner twice, which makes np.random.choice
# pick it more often than the others.
#
# One pass over the rows also avoids re-filtering the whole frame for every
# distinct drug_2_name.
drug_drug_sets = {}
for drug_1, drug_2 in zip(drug_drug_df['drug_1_name'], drug_drug_df['drug_2_name']):
    drug_drug_sets.setdefault(drug_1, set()).add(drug_2)
    drug_drug_sets.setdefault(drug_2, set()).add(drug_1)

# Lists (not sets) because the sampling code indexes into the partner collection.
drug_drug_map = {drug: list(partners) for drug, partners in drug_drug_sets.items()}

In [10]:
def get_walk_size(low=3, high=15):
    """Draw a random target size for a walk.

    The bounds used to be hard-coded; they are now parameters whose defaults
    reproduce the previous behaviour, so ``get_walk_size()`` is unchanged.

    Args:
        low: Smallest size that can be returned (inclusive). Defaults to 3.
        high: Upper bound of the draw (exclusive). Defaults to 15.

    Returns:
        An integer drawn uniformly from ``[low, high)`` using NumPy's global
        random state.
    """
    return np.random.randint(low, high)

In [11]:
# Graph schema: for each node type, the node types a walk may move to next.
# Only drugs have same-type edges; genes and diseases always switch type.
connected_nodes = dict(
    drug=['gene', 'disease', 'drug'],
    gene=['disease', 'drug'],
    disease=['gene', 'drug'],
)

In [44]:
# Pick a random starting node type, then a random entity of that type.
# The per-type entity pools are the same ones get_node_entity (defined later
# in this cell) draws from.
init_node = np.random.choice(list(connected_nodes))

if init_node == 'disease':
    init_node_entity = np.random.choice(list(disease_gene_map))
elif init_node == 'gene':
    init_node_entity = np.random.choice(list(gene_drug_map))
elif init_node == 'drug':
    init_node_entity = np.random.choice(list(drug_gene_map))


def get_next_node_type(node):
    """Randomly choose the type of the node a walk visits next.

    Args:
        node: Type of the current node; must be a key of ``connected_nodes``.

    Returns:
        One of the node types listed for ``node`` in ``connected_nodes``,
        drawn with ``np.random.choice``.
    """
    return np.random.choice(connected_nodes[node])


def get_node_entity(node_type):
    """Pick a random entity of the given node type to start a walk from.

    Entities are drawn from the keys of one lookup per type:
    drugs from ``drug_gene_map``, genes from ``gene_drug_map`` and diseases
    from ``disease_gene_map``.

    Args:
        node_type: ``'drug'``, ``'gene'`` or ``'disease'``.

    Returns:
        A randomly chosen key of the lookup that belongs to ``node_type``.

    Raises:
        ValueError: If ``node_type`` is not one of the three supported types.
            (Previously this case fell through the if/elif chain and failed
            with an ``UnboundLocalError`` on the return statement.)
    """
    if node_type == 'drug':
        entity_pool = drug_gene_map
    elif node_type == 'gene':
        entity_pool = gene_drug_map
    elif node_type == 'disease':
        entity_pool = disease_gene_map
    else:
        raise ValueError(
            f"Unknown node type {node_type!r}; expected 'drug', 'gene' or 'disease'"
        )
    return np.random.choice(list(entity_pool.keys()))


def get_next_node_entity(current_node_type, current_node_entity, next_node_type):
    """Pick a random neighbour of ``current_node_entity`` with the requested type.

    Args:
        current_node_type: Type of the current node
            (``'drug'``, ``'gene'`` or ``'disease'``).
        current_node_entity: Name of the current node.
        next_node_type: Type the neighbour must have.

    Returns:
        A neighbour drawn with ``np.random.choice`` from the matching lookup,
        or ``None`` when the type pair has no lookup or the entity is not a
        key of that lookup.
    """
    edge = (current_node_type, next_node_type)

    # Select the lookup that stores neighbours for this (from, to) type pair.
    if edge == ('drug', 'gene'):
        neighbours = drug_gene_map
    elif edge == ('drug', 'disease'):
        neighbours = drug_disease_map
    elif edge == ('drug', 'drug'):
        neighbours = drug_drug_map
    elif edge == ('gene', 'disease'):
        neighbours = gene_disease_map
    elif edge == ('gene', 'drug'):
        neighbours = gene_drug_map
    elif edge == ('disease', 'gene'):
        neighbours = disease_gene_map
    elif edge == ('disease', 'drug'):
        neighbours = disease_drug_map
    else:
        # No lookup exists for this combination of node types.
        return None

    candidates = neighbours.get(current_node_entity)
    if candidates is None:
        return None
    return np.random.choice(candidates)
    

In [79]:
def get_random_walk(n, node_type, node_entity, max_retries=100):
    """Generate one random walk through the graph, starting at ``node_entity``.

    The walk is a flat list alternating entities and the ``REL`` token:
    ``[entity, REL, entity, REL, entity, ...]``.

    Args:
        n: Target number of list items (entities *and* relation tokens).
            Each successful step appends two items, so the result always has
            an odd length and holds ``n + 1`` items when ``n`` is even.
        node_type: Type of the start node (a key of ``connected_nodes``).
        node_entity: Name of the start node.
        max_retries: Number of consecutive failed draws (no neighbour of the
            drawn type) tolerated before giving up. Without this bound the
            loop never ends when the current entity has no neighbours at all.

    Returns:
        The walk as a list. It is shorter than requested only if the walk got
        stuck for ``max_retries`` draws in a row.
    """
    walk = [node_entity]
    failed_draws = 0
    while len(walk) < n and failed_draws < max_retries:
        next_node_type = get_next_node_type(node_type)
        next_node_entity = get_next_node_entity(node_type, node_entity, next_node_type)
        if next_node_entity is None:
            # The current entity has no neighbour of the drawn type; redraw.
            failed_draws += 1
            continue
        failed_draws = 0
        walk.extend((REL, next_node_entity))
        node_type = next_node_type
        node_entity = next_node_entity
    return walk
    

In [82]:
# Smoke test: draw a random start type and entity, then display one walk
# of random size (the cell's last expression is rendered as its output).
init_node = np.random.choice(list(connected_nodes.keys()))
init_node_entity = get_node_entity(init_node)
get_random_walk(get_walk_size(), init_node , init_node_entity)

['glutamate ionotropic receptor NMDA type subunit 2D',
 'interacts_with',
 'Hematologic Diseases',
 'interacts_with',
 'glucagon like peptide 1 receptor',
 'interacts_with',
 'Acute Lung Injury']

In [83]:
from tqdm.notebook import tqdm

# Number of distinct walks to collect.
N_PATHS = 1000000

# Built once instead of on every iteration.
node_types = list(connected_nodes.keys())

# A set of tuples, so identical walks are stored only once.
kg_paths = set()

# The bar counts *distinct* walks: it advances only when a new path was added,
# so it ends exactly at N_PATHS instead of overshooting on duplicates.
# The context manager closes the bar when the loop is done.
with tqdm(total=N_PATHS, desc='Generating Random Walks') as pbar:
    while len(kg_paths) < N_PATHS:
        init_node = np.random.choice(node_types)
        init_node_entity = get_node_entity(init_node)
        path = get_random_walk(get_walk_size(), init_node, init_node_entity)
        path_key = tuple(path)
        if path_key not in kg_paths:
            kg_paths.add(path_key)
            pbar.update(1)


Generating Random Walks:   0%|          | 0/1000000 [00:00<?, ?it/s]

In [84]:
# Sanity check: number of distinct walks collected.
len(kg_paths)

1000000

In [89]:
import pickle

# Persist the generated walks (a set of tuples) to disk for later sampling.
with open('kg_paths.pkl', 'wb') as f:
    pickle.dump(kg_paths, f)


In [1]:
import pickle

In [2]:
# Reload the walks written by the generation step.
# The `with` block closes the file handle; a bare open() inside pickle.load()
# leaves it open until garbage collection.
# NOTE: pickle.load can execute arbitrary code - only load files this notebook produced.
with open('kg_paths.pkl', 'rb') as f:
    data = pickle.load(f)

('(3,4-DIHYDROXY-2-NITROPHENYL)(PHENYL)METHANONE',
 'interacts_with',
 'catechol-O-methyltransferase',
 'interacts_with',
 'N-{3-[5-(6-Amino-Purin-9-Yl)-3,4-Dihydroxy-Tetrahydro-Furan-2-Yl]-Allyl}-2,3-Dihydroxy-5-Nitro-Benzamide')

In [7]:
import numpy as np

# Turn the loaded collection of paths into a list so it can be indexed for sampling.
data = list(data)

TypeError: random_sample() takes at most 1 positional argument (2 given)

In [12]:
# Draw 10000 paths at random (with replacement, as np.random.choice does by default).
#
# Positions are sampled instead of passing `data` itself to np.random.choice:
# choice() needs a 1-D array, but `data` holds tuples of different lengths.
# Converting such ragged input only triggers a deprecation warning on older
# NumPy releases and fails outright on newer ones.
# `sample_data` is therefore a plain list of tuples rather than an object array.
sample_idx = np.random.choice(len(data), size=10000)
sample_data = [data[i] for i in sample_idx]

  """Entry point for launching an IPython kernel.


In [13]:
import pickle

# Persist the sampled subset of paths to disk.
with open('sample_data.pkl', 'wb') as f:
    pickle.dump(sample_data, f)