# Run MetaMapLite on the NCBI Disease corpus

In [6]:
import os,sys,json,re
from tqdm import tqdm

In [8]:
# Newline-delimited JSON corpus that is read as input.
input_file = '../data/NCBItestset_corpus.nd.json'
# Newline-delimited JSON file that receives the annotated records.
output_file = '../data/MetaMap_output.nd.json'

In [7]:
import subprocess
def _parse_mmi_rows(raw_output):
    """Convert raw MMI output bytes into a list of concept dicts."""
    concepts = []
    for row in raw_output.strip().decode('UTF-8').split('\n'):
        mmi = row.split("|")
        # Only rows that split into exactly 11 fields are treated as concept
        # rows; anything else (blank lines, log noise) is skipped.
        # NOTE(review): 11 presumably means 10 MMI fields plus a trailing
        # delimiter -- confirm against the MetaMapLite version in use.
        if len(mmi) != 11:
            continue
        concepts.append({
            "cui": mmi[4],
            "name": mmi[3],
            "score": mmi[2],  # kept as the raw string, not converted to float
            "pos_info": mmi[8],
            # Drop the first and last character (the enclosing brackets in
            # e.g. "[dsyn]") before splitting the comma-separated list.
            "semtypes": mmi[5][1:-1].split(','),
            "trigger": mmi[6]
        })
    return concepts


def metamaplite(text):
    """Annotate ``text`` with MetaMapLite running inside a Docker container.

    The text is piped (UTF-8 encoded) to ``metamaplite.sh`` through
    ``docker exec -i cnt_metamap2020nt`` and the MMI-formatted output is
    parsed into one dict per concept row.

    Args:
        text: The document text to annotate.

    Returns:
        A list of dicts with the keys ``cui``, ``name``, ``score``,
        ``pos_info``, ``semtypes`` (list of str) and ``trigger``. All values
        other than ``semtypes`` are the raw strings taken from the MMI row.

    Raises:
        RuntimeError: If the command exits with a non-zero status. The message
            carries the exit code and the captured stderr, so a missing
            container or a crashed run is not mistaken for "no concepts".
    """
    semtypes="chem,neop,sosy,dsyn,nusq,gngm,amas,crbs,mosq,moft,phsf,orch,phsu"
    sources=("AIR,AOD,CHV,COSTAR,CSP,CST,DXP,ICD10CM,LCH_NW,LNC,MEDLINEPLUS,"
        "MSH,MTH,MTHICD9,NCI,NCI_CTCAE,NCI_FDA,NCI_NICHD,NDFRT,NLMSubSyn,SNM,"
        "SNMI,SNOMEDCT_US,SNOMEDCT_VET")

    # Build argv directly as a list instead of formatting one string and
    # splitting it on whitespace, so no argument can be broken up by accident.
    cmd = [
        "docker", "exec", "-i", "cnt_metamap2020nt",
        "/opt/public_mm_lite/metamaplite.sh",
        "--outputformat=mmi",
        "--scheduler",
        "--modelsdir=/opt/public_mm_lite/data/models",
        "--configfile=/opt/public_mm_lite/config/metamaplite.properties",
        "--specialtermsfile=/opt/public_mm_lite/data/specialterms.txt",
        "--indexdir=/opt/public_mm_lite/data/ivf/2020AA/Base",
        "--restrict_to_sts={}".format(semtypes),
        "--restrict_to_sources={}".format(sources),
        "--",
    ]

    completed = subprocess.run(cmd, input=text.encode(encoding='UTF-8'),
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Fail loudly: previously stderr was discarded and the exit status ignored,
    # so any failure silently produced an empty concept list.
    if completed.returncode != 0:
        raise RuntimeError(
            "MetaMapLite command failed with exit code {}: {}".format(
                completed.returncode,
                completed.stderr.decode('UTF-8', errors='replace').strip()))

    return _parse_mmi_rows(completed.stdout)

In [9]:
# Load the newline-delimited JSON corpus: one document per line.
# The context manager closes the file deterministically, the encoding is pinned
# to UTF-8 so the result does not depend on the platform default, and blank
# lines (e.g. a trailing empty line) are skipped instead of crashing json.loads.
with open(input_file, encoding='utf-8') as fin:
    data = [json.loads(line) for line in fin if line.strip()]
len(data)

100

In [10]:
# Peek at the first loaded record to sanity-check its structure before annotating.
data[0]

{'id': '9949209',
 'text': 'Genetic mapping of the copper toxicosis locus in Bedlington terriers to dog chromosome 10, in a region syntenic to human chromosome region 2p13-p16. Abnormal hepatic copper accumulation is recognized as an inherited disorder in man, mouse, rat and dog. The major cause of hepatic copper accumulation in man is a dysfunctional ATP7B gene, causing Wilson disease (WD). Mutations in the ATP7B genes have also been demonstrated in mouse and rat. The ATP7B gene has been excluded in the much rarer human copper overload disease non-Indian childhood cirrhosis, indicating genetic heterogeneity. By investigating the common autosomal recessive copper toxicosis (CT) in Bedlington terriers, we have identified a new locus involved in progressive liver disease. We examined whether the WD gene ATP7B was also causative for CT by investigating the chromosomal co-localization of ATP7B and C04107, using fluorescence in situ hybridization (FISH). C04107 is an anonymous microsatellit

In [16]:
def _annotate(record):
    """Run MetaMapLite on one corpus record and bundle its id, text and concepts."""
    body = record['text']
    concepts = metamaplite(body)
    return {"id": record['id'], "text": body, "mm": concepts}

# Annotate every document in corpus order; tqdm reports progress.
mm_results = [_annotate(record) for record in tqdm(data)]

len(mm_results)

100%|██████████| 100/100 [03:37<00:00,  2.18s/it]


100

In [17]:
# Inspect the first annotated record; each entry carries "id", "text" and the "mm" concept list.
mm_results[0]

{'id': '9949209',
 'text': 'Genetic mapping of the copper toxicosis locus in Bedlington terriers to dog chromosome 10, in a region syntenic to human chromosome region 2p13-p16. Abnormal hepatic copper accumulation is recognized as an inherited disorder in man, mouse, rat and dog. The major cause of hepatic copper accumulation in man is a dysfunctional ATP7B gene, causing Wilson disease (WD). Mutations in the ATP7B genes have also been demonstrated in mouse and rat. The ATP7B gene has been excluded in the much rarer human copper overload disease non-Indian childhood cirrhosis, indicating genetic heterogeneity. By investigating the common autosomal recessive copper toxicosis (CT) in Bedlington terriers, we have identified a new locus involved in progressive liver disease. We examined whether the WD gene ATP7B was also causative for CT by investigating the chromosomal co-localization of ATP7B and C04107, using fluorescence in situ hybridization (FISH). C04107 is an anonymous microsatellit

In [18]:
# Persist the annotated records as newline-delimited JSON, one record per line.
with open(output_file, mode='w') as sink:
    sink.writelines(json.dumps(record) + '\n' for record in tqdm(mm_results))

100%|██████████| 100/100 [00:00<00:00, 12614.07it/s]
