# Data Munging 

This notebook contains code for data munging. Specifically, it converts data files generated by R scripts:
<br/>001-masinoa-exploratory.rmd
<br/>002-masinoa-pc_condition_tree.R
<br/>into the ontology and annotations formats necessary for use in the redcarpet rollup codebase.

In [None]:
# if not installed, install networkx
#!pip install -U networkx

In [1]:
import os, csv, time
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from math import log
%matplotlib inline

In [2]:
# snomed concept files
pth = "./data/rollup/primary_care_only"
files = os.listdir(pth)

In [3]:
# build networkx concept graph from R generated concept ancestor files
# the networkx impl will make it easier to check that every concept has a path to
# the root concept, clinical finding

# code assumes files are structured as 
# "concept_id","concept_name","parent_concept_id","parent_concept_name"
#  4274025,"Disease",441840,"Clinical finding"

concept_dict = {}
for fn in files:
    print("working on file {0} ...".format(fn))
    isHeader = True
    if fn.endswith('.csv'):
        with open('{0}/{1}'.format(pth, fn), 'rt') as f:
            reader = csv.reader(f)
            for row in reader:
                if isHeader:
                    isHeader = False
                else:
                    child_id = int(row[0])
                    parent_id = int(row[2])
                    if child_id in concept_dict:
                        concept_dict[child_id].append(parent_id)
                    else:
                        concept_dict[child_id] = [parent_id]
    
g = nx.DiGraph()
for child,parents in concept_dict.items():
    if child not in g.nodes():
        g.add_node(child)
    for parent in parents:
        if parent not in g:
            g.add_node(parent)
    
for child, parents in concept_dict.items():
    for parent in parents:
        g.add_edge(child, parent, relation="IS_A")

working on file level_16.csv ...
working on file level_08.csv ...
working on file level_05.csv ...
working on file level_07.csv ...
working on file level_09.csv ...
working on file level_06.csv ...
working on file level_04.csv ...
working on file level_12.csv ...
working on file level_10.csv ...
working on file level_15.csv ...
working on file level_01.csv ...
working on file level_13.csv ...
working on file level_03.csv ...
working on file level_11.csv ...
working on file level_02.csv ...
working on file level_14.csv ...


In [5]:
# load conditions file and store in visit dict 
# expected file format is csv with rows of form
# "person_id","visit_occurrence_id","age_days_visit","visit_start_date","condition_concept_id"
# 4239604,86424890,876,2006-12-11,75860

concept_dict = {}
missing_concepts = []
person_visits = {}
with open('./data/pc_conditions_with_visit_meta.csv') as f:
    isHeader = True
    reader = csv.reader(f)
    for row in reader:
        if isHeader:
            isHeader = False
        else:
            vid = int(row[1])
            cid = int(row[4])
            pid = int(row[0])
            age_days = int(row[2])
            if pid not in person_visits:
                person_visits[pid] = []
            if vid not in person_visits[pid]:
                person_visits[pid].append((vid,age_days))
            if cid not in g.nodes():
                g.add_node(cid)
                missing_concepts.append(cid)
            if cid in concept_dict:
                concept_dict[cid].append(vid)
            else:
                concept_dict[cid] = [vid]
        
print("done: found {0} concepts in visits not in ontology".format(len(missing_concepts)))        
print(missing_concepts)

done: found 1 concepts in visits not in ontology
[4228490]


In [6]:
# bridge links that do not reach clinical finding 
# these are the parents of the nodes that 
# do not have a path up to clinical finding that consist of nodes of only type domain_id=Condition

CLINICAL_FINDING = 441840
cnt = 0
for node in g.nodes():
    if node != CLINICAL_FINDING:
        # NOTE is_a relation runs in opposite direction of networkx convention
        if not nx.has_path(g, node, CLINICAL_FINDING):
            g.add_edge(node, CLINICAL_FINDING, relation="IS_A")
            cnt += 1
print("{0} nodes with no path to clinical finding".format(cnt))

18 nodes with no path to clinical finding


In [7]:
# store networkX graph as ontology file for redcarpet project
first_line = True
with open("./data/ontology.txt", 'a+') as f:
    for node in g.nodes():
        parents = g[node]
        line = "{0}:".format(node)
        for parent in parents:
            line = "{0}{1},".format(line, parent)
        line = line[:-1]
        if first_line:
            f.write(line)
            first_line = False
        else:
            line = "\n{0}".format(line)
            f.write(line)    
print("done")

done


In [8]:
# write visit_dict to annotations file
with open('./data/annotations.txt', 'a+') as f:
    first_line = True
    for cid, vid_list in concept_dict.items():
        line = "{0}:".format(cid)
        for vid in vid_list:
            line = "{0}{1},".format(line, vid)
        line = line[:-1]
        if first_line:
            f.write(line)
            first_line = False
        else:
            line="\n{0}".format(line)
            f.write(line)
print("done")   

done


In [9]:
# write person_visit to file
with open('./data/person_visits.txt', 'a+') as f:
    first_line = True
    for pid, vid_list in person_visits.items():
        line = "{0}:".format(pid)
        for tpl in vid_list:
            vid = tpl[0]
            age = tpl[1]
            line = "{0}{1},{2}:".format(line, vid,age)
        line = line[:-1]
        if first_line:
            f.write(line)
            first_line = False
        else:
            line="\n{0}".format(line)
            f.write(line)
print("done")   

done
