In [1]:
import os
import pickle
%matplotlib inline
import pandas as pd
import seaborn as sns
from tqdm import tqdm

import sys
sys.path.append('../../hetnet-ml/src')
import graph_tools as gt

In [2]:
# Load the node and edge tables. gt.remove_colons is a project helper; presumably it
# strips the neo4j-style ':' markers from the column names -- verify in graph_tools.
nodes = gt.remove_colons(pd.read_csv('../data/nodes_VER31_R.csv'))
# NOTE(review): the `eval` converter runs every `pmids` cell of the CSV as Python code in
# order to rebuild the stored set literal. Only use this on trusted files;
# ast.literal_eval would be a safer parser -- TODO confirm it handles every stored value.
edges = gt.remove_colons(pd.read_csv('../data/edges_VER31_R.csv', converters={'pmids':eval}))

In [3]:
nodes.head()

Unnamed: 0,id,label,name
0,C0349450,Activities & Behaviors,Soiling
1,C0413329,Activities & Behaviors,Partner Abuse
2,C0418940,Activities & Behaviors,Change of employment
3,C0418946,Activities & Behaviors,Parental support
4,C0422243,Activities & Behaviors,Private account


In [4]:
edges.head(10)

Unnamed: 0,start_id,end_id,type,pmids,n_pmids
0,C1273870,C1138603,ADMINISTERED_TO_ABatCI,"{15842188, 12943031}",2
1,C1273870,C0600241,ADMINISTERED_TO_ABatDO,{24831754},1
2,C0035028,C0008059,ADMINISTERED_TO_ABatLB,"{9745800, 2082268}",2
3,C1273870,C0030551,ADMINISTERED_TO_ABatLB,"{11120897, 221066, 991595, 11993452, 27830922,...",10
4,C0035028,C0085756,ADMINISTERED_TO_ABatLB,{21504264},1
5,C0035028,C0085979,ADMINISTERED_TO_ABatLB,{15834207},1
6,C1273870,C1527117,ADMINISTERED_TO_ABatLB,{10886431},1
7,C1273870,C0030705,ADMINISTERED_TO_ABatLB,"{9670657, 1572866, 16762884, 26988552, 2377524...",922
8,C0035028,C0086418,ADMINISTERED_TO_ABatLB,"{968049, 8530324, 26267901}",3
9,C1273870,C0031323,ADMINISTERED_TO_ABatLB,"{28411014, 28018151, 27579828, 26975638, 16154...",6


In [5]:
def sanitize(x):
    """Strip a trailing annotation from a pmid that arrived as text.

    Some pmids look like '2015332 [3]'. For a plain ``str`` everything from the
    first space onward is discarded; any other value is returned unchanged.
    """
    if type(x) is not str:
        return x
    # partition leaves the string intact when it contains no space
    head, _sep, _rest = x.partition(' ')
    return head

# Some pmids come through as strings rather than ints (e.g. row 6).
# Clean each one with sanitize() and coerce it, so every set holds ints only.
edges['pmids'] = edges['pmids'].apply(
    lambda ids: {int(sanitize(pmid)) for pmid in ids}
)

In [6]:
# Table that maps each original edge type (`original_edge`) to the type it is condensed
# into (`condensed_to`); its `reverse` column is later passed as the `swap` flag.
edge_map = pd.read_csv('../data/edge_condense_map.csv')

In [7]:
edge_map.head(2)

Unnamed: 0,original_edge,condensed_to,relationship,reverse,node_semtypes
0,AFFECTS_ABafAB,AFFECTS_ABafAB,neutral,False,Activities & Behaviors --- Activities & Behaviors
1,PREDISPOSES_ABpsAB,AFFECTS_ABafAB,neutral,False,Activities & Behaviors --- Activities & Behaviors


In [8]:
def change_edge_type(from_type, to_type, swap=False, edges_df=None):
    """Relabel every edge of one type, optionally reversing its direction.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    from_type : str
        Value of the ``type`` column to look for.
    to_type : str
        Value written to ``type`` on the matching rows.
    swap : bool, default False
        When true, ``start_id`` and ``end_id`` are exchanged on the matching rows.
    edges_df : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``edges`` frame, which is
        looked up at call time so that re-binding ``edges`` between calls is
        respected.
    """
    target = edges if edges_df is None else edges_df

    # Positional boolean mask instead of index labels: only the matching rows
    # are touched, even when the index contains duplicate labels.
    matches = (target['type'] == from_type).to_numpy()
    if not matches.any():
        return

    target.loc[matches, 'type'] = to_type
    if swap:
        # Read both columns in swapped order and write them back in a single
        # assignment. to_numpy() drops the column labels so pandas cannot
        # realign the values and undo the swap.
        target.loc[matches, ['start_id', 'end_id']] = (
            target.loc[matches, ['end_id', 'start_id']].to_numpy()
        )
                                             
def merge_edge_types(from_list, to_type, swap=False):
    """Fold several edge types into a single one.

    Calls change_edge_type once per entry of ``from_list``, in order, passing
    the same ``to_type`` and ``swap`` to every call.
    """
    for source_type in from_list:
        change_edge_type(source_type, to_type, swap=swap)
        
def drop_edges_from_list(drop_edges):
    """Remove, in place, every row of the notebook-level ``edges`` frame whose
    ``type`` is one of ``drop_edges``."""
    # The query string picks up the parameter by name through '@drop_edges'.
    doomed = edges.query('type in @drop_edges').index
    edges.drop(index=doomed, inplace=True)

In [9]:
# Apply the condensation map one row at a time, top to bottom. Each step rewrites
# `edges` in place, so a later row sees the types produced by earlier rows; the
# row order of edge_map must therefore be preserved.
# NOTE(review): this cell mutates `edges`, so re-running it without reloading the data
# may re-apply swaps for rows with reverse=True -- confirm before re-executing.
for row in tqdm(edge_map.itertuples(), total=len(edge_map)):
    change_edge_type(row.original_edge, row.condensed_to, swap=row.reverse)
# Drop edges whose type ended up missing (NaN) and renumber the index from 0.
# NOTE(review): presumably a blank `condensed_to` marks an edge type for removal --
# confirm against edge_condense_map.csv.
edges = edges.dropna(subset=['type']).reset_index(drop=True)

100%|██████████| 292/292 [04:39<00:00,  1.07it/s]


In [10]:
edges['type'].nunique()

2747

In [11]:
%%time

print(len(edges))

# Condensing types can leave several rows with the same (start_id, end_id, type).
# Collapse each such group into one row whose pmids is the union of the group's sets.
dedup_keys = ['start_id', 'end_id', 'type']
grpd = edges.groupby(dedup_keys)
merged_pmids = grpd['pmids'].apply(lambda pmid_sets: set.union(*pmid_sets.values))
edges = merged_pmids.reset_index()

# n_pmids was lost in the aggregation; recompute it from the merged sets
edges['n_pmids'] = edges['pmids'].map(len)

print(len(edges))

16151935
14526080
CPU times: user 20min 43s, sys: 1.28 s, total: 20min 44s
Wall time: 20min 44s


In [12]:
# Order the rows before writing to disk: nodes by label, edges by type
nodes = nodes.sort_values(by='label')
edges = edges.sort_values(by='type')

# Put back the colons that neo4j requires
nodes, edges = gt.add_colons(nodes), gt.add_colons(edges)

# Write both tables without the pandas index column
nodes.to_csv('../data/nodes_VER31_R_condensed.csv', index=False)
edges.to_csv('../data/edges_VER31_R_condensed.csv', index=False)