In [1]:
import os
import pickle
%matplotlib inline
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from collections import defaultdict

import sys
sys.path.append('../../hetnet-ml/src')
import graph_tools as gt

In [2]:
nodes = gt.remove_colons(pd.read_csv('../data/nodes_VER31_R.csv'))
edges = gt.remove_colons(pd.read_csv('../data/edges_VER31_R.csv', converters={'pmids':eval}))

In [3]:
nodes.head()

Unnamed: 0,id,name,label
0,C0260116,Personnel directors,Activities & Behaviors
1,C0008081,Child Language,Activities & Behaviors
2,C1514989,Strenuous Exercise,Activities & Behaviors
3,C0395000,School refusal,Activities & Behaviors
4,C0021189,Independent Living,Activities & Behaviors


In [4]:
edges.head(10)

Unnamed: 0,start_id,end_id,type,pmids,n_pmids
0,C1273870,C1138603,ADMINISTERED_TO_ABatCI,"{15842188, 12943031}",2
1,C1273870,C0600241,ADMINISTERED_TO_ABatDO,{24831754},1
2,C0035028,C0008059,ADMINISTERED_TO_ABatLB,"{9745800, 2082268}",2
3,C1273870,C0030551,ADMINISTERED_TO_ABatLB,"{11120897, 221066, 991595, 11993452, 27830922,...",10
4,C0035028,C0085756,ADMINISTERED_TO_ABatLB,{21504264},1
5,C0035028,C0085979,ADMINISTERED_TO_ABatLB,{15834207},1
6,C1273870,C1527117,ADMINISTERED_TO_ABatLB,{10886431},1
7,C1273870,C0030705,ADMINISTERED_TO_ABatLB,"{9670657, 1572866, 16762884, 26988552, 2377524...",922
8,C0035028,C0086418,ADMINISTERED_TO_ABatLB,"{968049, 8530324, 26267901}",3
9,C1273870,C0031323,ADMINISTERED_TO_ABatLB,"{28411014, 28018151, 27579828, 26975638, 16154...",6


In [5]:
def sanitize(x):
    """Some pmids have the appearance of '2015332 [3]' for some reason. This fixes that"""
    if type(x) == str:
        if ' ' in x:
            x = x.split(' ')[0]
    return x

# Some pmids are appearing as string, e.g. row 6.  They should all be int
edges['pmids'] = edges['pmids'].apply(lambda ids: set([int(sanitize(x)) for x in ids]))

In [6]:
edge_map = pd.read_csv('../data/edge_condense_map.csv')

In [7]:
edge_map.head(2)

Unnamed: 0,original_edge,condensed_to,relationship,reverse,node_semtypes
0,AFFECTS_ABafAB,AFFECTS_ABafAB,neutral,False,Activities & Behaviors --- Activities & Behaviors
1,PREDISPOSES_ABpsAB,AFFECTS_ABafAB,neutral,False,Activities & Behaviors --- Activities & Behaviors


In [8]:
def change_edge_type(from_type, to_type, swap=False):
    idx = edges.query('type == @from_type').index
    edges.loc[idx, 'type'] = to_type
    if swap:
        tmp = edges.loc[idx, 'start_id']
        edges.loc[idx, 'start_id'] = edges.loc[idx, 'end_id']
        edges.loc[idx, 'end_id'] = tmp
                                             
def merge_edge_types(from_list, to_type, swap=False):
    for from_type in from_list:
        change_edge_type(from_type, to_type, swap=swap)
        
def drop_edges_from_list(drop_edges):
    idx = edges.query('type in @drop_edges').index
    edges.drop(idx, inplace=True)

In [9]:
# Order is order in which condensations are made, so important to preserve
for row in tqdm(edge_map.itertuples(), total=len(edge_map)):
    change_edge_type(row.original_edge, row.condensed_to, swap=row.reverse)
edges = edges.dropna(subset=['type']).reset_index(drop=True)

100%|██████████| 291/291 [04:42<00:00,  1.03it/s]


In [10]:
edges['type'].nunique()

2743

### Fix Potential problems of duplicated undirected edges

In [11]:
abv, met = gt.get_abbrev_dict_and_edge_tuples(gt.add_colons(nodes), gt.add_colons(edges))

In [12]:
id_to_label = gt.map_id_to_value(gt.add_colons(nodes), ':LABEL')

In [13]:
edges['start_label'] = edges['start_id'].map(lambda c: id_to_label[c])
edges['end_label'] = edges['end_id'].map(lambda c: id_to_label[c])
edges['sem'] = edges['type'].map(lambda e: '_'.join(e.split('_')[:-1]))

edges['abbrev'] = edges['type'].map(lambda e: e.split('_')[-1])

proper_abbrevs = []
for e in tqdm(edges.itertuples(), total=len(edges)):
    if '>' in e.abbrev:
        abbrev = abv[e.start_label] + abv[e.sem] + '>' + abv[e.end_label]
    else:
        abbrev = abv[e.start_label] + abv[e.sem] + abv[e.end_label]
    proper_abbrevs.append(abbrev)
    
edges['calc_abbrev'] = proper_abbrevs

100%|██████████| 16151935/16151935 [00:55<00:00, 293566.95it/s]


In [14]:
edges.head(2)

Unnamed: 0,start_id,end_id,type,pmids,n_pmids,start_label,end_label,sem,abbrev,calc_abbrev
0,C1273870,C1138603,ADMINISTERED_TO_ABatCI,"{15842188, 12943031}",2,Activities & Behaviors,Concepts & Ideas,ADMINISTERED_TO,ABatCI,ABatCI
1,C1273870,C0600241,ADMINISTERED_TO_ABatDO,{24831754},1,Activities & Behaviors,Disorders,ADMINISTERED_TO,ABatDO,ABatDO


In [15]:
idx = edges['calc_abbrev'] != edges['abbrev']
idx.sum()

0

In [16]:
# Get the edges that are un-directed, between same type
idx = edges['start_label'] == edges['end_label']

self_refferential_types = edges.loc[idx, 'type'].unique()
self_refferential_types = [e for e in self_refferential_types if '>' not in e]

In [17]:
# Get a sorted CUI Map

edge_map = {}

for kind in tqdm(self_refferential_types):
    pmid_map = defaultdict(set)
    subedges = edges.query('type == @kind')
    
    for row in subedges.itertuples():
        edge_id = tuple(sorted([row.start_id, row.end_id]))
        
        pmid_map[edge_id] = pmid_map[edge_id].union(row.pmids)
        edge_map[kind] = pmid_map

100%|██████████| 287/287 [02:23<00:00,  2.23it/s]


In [18]:
# Convert back to a DataFrame
kinds = []
start_ids = []
end_ids = []
pmids = []

for kind, e_dict in edge_map.items():
    for (s_id, e_id), pms in e_dict.items():
        kinds.append(kind)
        start_ids.append(s_id)
        end_ids.append(e_id)
        pmids.append(pms)
        
fixed_edges = pd.DataFrame({'start_id': start_ids, 'end_id': end_ids, 'type': kinds, 'pmids': pmids})

In [19]:
len(fixed_edges), len(edges.loc[idx])

(2286010, 3760043)

In [20]:
fixed_edges.head()

Unnamed: 0,end_id,pmids,start_id,type
0,C0439824,{3980757},C0205087,OCCURS_IN_CIoiCI
1,C0449238,{26844580},C0439824,OCCURS_IN_CIoiCI
2,C0439824,"{15832627, 17445292}",C0205390,OCCURS_IN_CIoiCI
3,C0439824,{26736629},C0040223,OCCURS_IN_CIoiCI
4,C2362520,{19727847},C1547085,OCCURS_IN_CIoiCI


In [21]:
fixed_edges.tail()

Unnamed: 0,end_id,pmids,start_id,type
2286005,C0418981,{11092105},C0418981,lower_than_PRltPR
2286006,C0851827,{6453959},C0220905,NEG_USES_CInuCI
2286007,C0427861,{7850918},C0013203,NEG_MANIFESTATION_OF_PHnmfoPH
2286008,C0549255,{1731054},C0026606,NEG_CAUSES_ABncAB
2286009,C0518605,{12054383},C0015259,NEG_CAUSES_ABncAB


In [22]:
print('{:,}'.format(len(edges)))
edges.drop(idx[idx].index, inplace=True)
print('{:,}'.format(len(edges)))

16,151,935
12,391,892


In [23]:
edges = pd.concat([edges, fixed_edges])
print('{:,}'.format(len(edges)))

14,677,902


In [24]:
edges = edges[['start_id', 'end_id', 'type', 'pmids']]

In [25]:
edges.head()

Unnamed: 0,start_id,end_id,type,pmids
0,C1273870,C1138603,ADMINISTERED_TO_ABatCI,"{15842188, 12943031}"
1,C1273870,C0600241,ADMINISTERED_TO_ABatDO,{24831754}
2,C0035028,C0008059,ADMINISTERED_TO_ABatLB,"{9745800, 2082268}"
3,C1273870,C0030551,ADMINISTERED_TO_ABatLB,"{11120897, 221066, 14735595, 991595, 11993452,..."
4,C0035028,C0085756,ADMINISTERED_TO_ABatLB,{21504264}


In [26]:
%%time

print('{:,}'.format(len(edges)))

# Some edges now duplicated, de-duplicate and combine pmids
grpd = edges.groupby(['start_id', 'end_id', 'type'])
edges = grpd['pmids'].apply(lambda Series: set.union(*Series.values)).reset_index()

# re-count the pmid numbers
edges['n_pmids'] = edges['pmids'].apply(len)

print('{:,}'.format(len(edges)))

14677902
13284096
CPU times: user 19min 24s, sys: 1.64 s, total: 19min 26s
Wall time: 19min 26s


In [27]:
# Sort values before writing to disk
nodes = nodes.sort_values('label')
edges = edges.sort_values('type')

# Add in colons required by neo4j
nodes = gt.add_colons(nodes)
edges = gt.add_colons(edges)

nodes.to_csv('../data/nodes_VER31_R_condensed.csv', index=False)
edges.to_csv('../data/edges_VER31_R_condensed.csv', index=False)