In [1]:
# Evaluate MetaMap

In [2]:
import os,sys,json,re
from tqdm import tqdm

In [3]:
# File locations: MetaMap output to evaluate, the annotated answer file,
# and the CoNLL-style file this notebook writes.
input_file = '../data/MetaMap_output.nd.json'
answer_file = '../original-data/test/NCBItestset_corpus.txt'
output_file = '../data/ncbi_disease_metamap.conll'

# output_file format
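The example below is taken from https://www.clips.uantwerpen.be/conll2000/chunking/output.txt.gz. Each line holds four space-separated columns: token, POS tag, gold tag, predicted tag; a blank line separates sentences.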

```
   Boeing NNP B-NP B-NP
   's POS B-NP B-NP
   747 CD I-NP I-NP
   jetliners NNS I-NP I-NP
   . . O O

   Rockwell NNP B-NP B-NP
   said VBD B-VP B-VP
   the DT B-NP B-NP
   agreement NN I-NP I-NP
```

In [4]:
# Read the input file as newline-delimited JSON: one record per line.
# The context manager closes the handle as soon as reading is done (the bare
# open() call left that to the garbage collector), and blank lines are skipped
# because json.loads('') would raise.
with open(input_file) as f:
    data=[json.loads(line) for line in f if line.strip()]
len(data)

100

In [5]:
# Look at the first record to see which keys a document carries.
item = data[0]
item

{'id': '9949209',
 'text': 'Genetic mapping of the copper toxicosis locus in Bedlington terriers to dog chromosome 10, in a region syntenic to human chromosome region 2p13-p16. Abnormal hepatic copper accumulation is recognized as an inherited disorder in man, mouse, rat and dog. The major cause of hepatic copper accumulation in man is a dysfunctional ATP7B gene, causing Wilson disease (WD). Mutations in the ATP7B genes have also been demonstrated in mouse and rat. The ATP7B gene has been excluded in the much rarer human copper overload disease non-Indian childhood cirrhosis, indicating genetic heterogeneity. By investigating the common autosomal recessive copper toxicosis (CT) in Bedlington terriers, we have identified a new locus involved in progressive liver disease. We examined whether the WD gene ATP7B was also causative for CT by investigating the chromosomal co-localization of ATP7B and C04107, using fluorescence in situ hybridization (FISH). C04107 is an anonymous microsatellit

In [6]:
def parse_pos_info(pos_info):
    """Convert a position string into a list of (start, end) offset pairs.

    Every ``start/length`` pair found in ``pos_info`` (for example
    ``"55/23;83/4"``) is turned into ``(start, start + length)``.

    Args:
        pos_info: String containing zero or more ``start/length`` pairs.

    Returns:
        List of ``(start_offset, end_offset)`` integer tuples, in the order
        the pairs occur in the string; an empty list if there is no pair.
    """
    # At least one digit is required on each side of the slash. With '*'
    # a bare "/" matched with empty groups and int('') raised ValueError.
    regex=r';?([0-9]+)\/([0-9]+)'

    positions=[]
    for pos, length in re.findall(regex, pos_info):
        start_offset=int(pos)
        positions.append((start_offset, start_offset+int(length)))

    return positions

In [7]:
# Sanity checks: a single start/length pair, then two pairs separated by ';'.
assert parse_pos_info("0/3") == [(0, 3)]
assert parse_pos_info("55/23;83/4") == [(55, 78), (83, 87)]

In [8]:
# Build mm_dict: document id -> {'text': ..., 'entities': [(start, end), ...]}.
# Only entries of item['mm'] whose 'semtypes' contain 'dsyn' contribute spans.
mm_dict={}
for item in data:
    pmid=item['id']

    # A set keeps each (start, end) span once even if several entries
    # of the same document report it.
    spans=set()
    for entity in item['mm']:
        if 'dsyn' not in entity['semtypes']:
            continue
        spans.update(parse_pos_info(entity['pos_info']))

    # Sorted so the stored list is in ascending offset order rather than
    # in arbitrary set-iteration order.
    mm_dict[pmid] = {'text': item['text'], 'entities': sorted(spans)}

len(mm_dict)

100

# Parse answer file

In [9]:
# Hand-written sample lines, one per line type matched by the regexes below:
# a "<id>|t|<title>" line, a "<id>|a|<abstract>" line and a tab-separated
# entity line. They all carry the same id, 9949209.
line_title='9949209|t|Genetic mapping of the copper toxicosis locus in Bedlington terriers to dog chromosome 10, in a region syntenic to human chromosome region 2p13-p16.'
line_abstract='9949209|a|Abnormal hepatic copper accumulation is recognized as an inherited disorder in man, mouse, rat and dog. The major cause of hepatic copper accumulation in man is a dysfunctional ATP7B gene, causing Wilson disease (WD). Mutations in the ATP7B genes have also been demonstrated in mouse and rat. The ATP7B gene has been excluded in the much rarer human copper overload disease non-Indian childhood cirrhosis, indicating genetic heterogeneity. By investigating the common autosomal recessive copper toxicosis (CT) in Bedlington terriers, we have identified a new locus involved in progressive liver disease. We examined whether the WD gene ATP7B was also causative for CT by investigating the chromosomal co-localization of ATP7B and C04107, using fluorescence in situ hybridization (FISH). C04107 is an anonymous microsatellite marker closely linked to CT. However, BAC clones containing ATP7B and C04107 mapped to the canine chromosome regions CFA22q11 and CFA10q26, respectively, demonstrating that WD cannot be homologous to CT. The copper transport genes CTR1 and CTR2 were also excluded as candidate genes for CT since they both mapped to canine chromosome region CFA11q22. 2-22. 5. A transcribed sequence identified from the C04107-containing BAC was found to be homologous to a gene expressed from human chromosome 2p13-p16, a region devoid of any positional candidate genes.'
line_entity="9949209\t23\t39\tcopper toxicosis\tModifier\tOMIM:215600"

In [10]:
# Display the repr of the sample entity line so the tab separators are visible.
line_entity

'9949209\t23\t39\tcopper toxicosis\tModifier\tOMIM:215600'

In [11]:
# Patterns that classify one line of the answer file. Every pattern starts
# with a numeric document id. '+' is used instead of '*' so that a line with
# an empty id or empty offsets is rejected, instead of matching with empty
# groups that cannot be converted with int().
# Title line:    <id>|t|<title>
regex_title=r'^([0-9]+)\|t\|(.*)$'
# Abstract line: <id>|a|<abstract>
regex_abstract=r'^([0-9]+)\|a\|(.*)$'
# Entity line:   <id> TAB <start> TAB <end>, then three more tab-separated
# fields (in the sample line: 'copper toxicosis', 'Modifier', 'OMIM:215600').
regex_entity=r'^([0-9]+)\t([0-9]+)\t([0-9]+)\t(.*)\t(.*)\t(.*)'

In [12]:
# Extract Title
# The sample title line must be rejected by the other two patterns ...
assert re.match(regex_abstract, line_title) is None
assert re.match(regex_entity, line_title) is None
# ... and accepted by the title pattern; its captured groups are displayed.
match = re.match(regex_title, line_title)
assert match is not None
match.groups()

('9949209',
 'Genetic mapping of the copper toxicosis locus in Bedlington terriers to dog chromosome 10, in a region syntenic to human chromosome region 2p13-p16.')

In [13]:
# Extract Abstract
# The sample abstract line must be rejected by the other two patterns ...
assert re.match(regex_title, line_abstract) is None
assert re.match(regex_entity, line_abstract) is None
# ... and accepted by the abstract pattern; its captured groups are displayed.
match = re.match(regex_abstract, line_abstract)
assert match is not None
match.groups()

('9949209',
 'Abnormal hepatic copper accumulation is recognized as an inherited disorder in man, mouse, rat and dog. The major cause of hepatic copper accumulation in man is a dysfunctional ATP7B gene, causing Wilson disease (WD). Mutations in the ATP7B genes have also been demonstrated in mouse and rat. The ATP7B gene has been excluded in the much rarer human copper overload disease non-Indian childhood cirrhosis, indicating genetic heterogeneity. By investigating the common autosomal recessive copper toxicosis (CT) in Bedlington terriers, we have identified a new locus involved in progressive liver disease. We examined whether the WD gene ATP7B was also causative for CT by investigating the chromosomal co-localization of ATP7B and C04107, using fluorescence in situ hybridization (FISH). C04107 is an anonymous microsatellite marker closely linked to CT. However, BAC clones containing ATP7B and C04107 mapped to the canine chromosome regions CFA22q11 and CFA10q26, respectively, demonst

In [14]:
# Extract Entities
# The sample entity line must be rejected by the other two patterns ...
assert re.match(regex_title, line_entity) is None
assert re.match(regex_abstract, line_entity) is None
# ... and accepted by the entity pattern; its captured groups are displayed.
match = re.match(regex_entity, line_entity)
assert match is not None
match.groups()

('9949209', '23', '39', 'copper toxicosis', 'Modifier', 'OMIM:215600')

In [15]:
# Build answer_dict: document id -> {'pmid', 'title', 'abstract', 'entities'},
# where 'entities' is a list of (start, end) integer offsets taken from the
# entity lines of the answer file.
answer_dict={}

# 'with' closes the file even if parsing stops part-way with an exception.
with open(answer_file) as f:
    for line in f:
        match=re.match(regex_title, line)
        if match:
            pmid, title = match.groups()
            # 'entities' is created together with the record, so a document
            # without any entity line still exposes an empty list.
            doc=answer_dict.setdefault(pmid, {'entities': []})
            doc['pmid']=pmid
            doc['title']=title
            continue

        match=re.match(regex_abstract, line)
        if match:
            pmid=match.group(1)
            # Direct indexing on purpose: an abstract line for an id whose
            # title line was never seen raises KeyError.
            answer_dict[pmid]['abstract']=match.group(2)
            continue

        match=re.match(regex_entity, line)
        if match:
            pmid=match.group(1)
            entity=(int(match.group(2)), int(match.group(3)))
            answer_dict[pmid].setdefault('entities', []).append(entity)

print(len(answer_dict))

100


In [16]:
# Number of distinct document ids collected in answer_dict.
len(answer_dict)

100

In [17]:
# Build the rows of the output file. For every span that occurs in the answer
# set, in the MetaMap set, or in both, emit one row per space-separated token:
#     "<token> MISC <answer tag> <predicted tag>"
# Spans are matched by exact (start, end) equality only.
# NOTE(review): the "outside" tag is written as 'O-MISC' on token rows but as
# plain 'O' on the per-document separator row; confirm the scorer accepts both.
lines=[]
for pmid in answer_dict:
    text=mm_dict[pmid]['text']
    mm_set=set(mm_dict[pmid]['entities'])
    # .get(): a document without any answer entity may have no 'entities' key;
    # indexing it directly aborted the whole loop with KeyError.
    answer_set=set(answer_dict[pmid].get('entities', []))

    all_entities=mm_set.union(answer_set)

    for loc in all_entities:
        # Tag of the first token of the span in each column.
        pred_first="B-MISC" if loc in mm_set else "O-MISC"
        answer_first="B-MISC" if loc in answer_set else "O-MISC"

        part=text[loc[0]:loc[1]]
        for idx, token in enumerate(part.split(' ')):
            if idx == 0:
                pred_val, answer_val = pred_first, answer_first
            else:
                # Tokens after the first continue the span: B- becomes I-.
                # 'O-MISC' contains no 'B-' and is left unchanged.
                pred_val=pred_first.replace('B-','I-')
                answer_val=answer_first.replace('B-','I-')
            lines.append(f"{token} MISC {answer_val} {pred_val}")
    # One closing row and a blank line are appended after each document.
    lines.append('. MISC O O')
    lines.append('')

In [18]:
# Write every collected row to output_file, each terminated by a newline.
with open(output_file, mode='w') as f:
    f.writelines(row + '\n' for row in lines)

In [19]:
# Shell escape: show the first rows of the file that was just written.
!head $output_file

CT MISC B-MISC O-MISC
CT MISC B-MISC O-MISC
Wilson MISC B-MISC B-MISC
disease MISC I-MISC I-MISC
Indian MISC O-MISC B-MISC
childhood MISC O-MISC I-MISC
cirrhosis MISC O-MISC I-MISC
toxicosis MISC O-MISC B-MISC
CT MISC B-MISC O-MISC
WD MISC B-MISC O-MISC
