# CUI pairs differences between real and rephrased abstracts: SemRep processing

In [32]:
import pandas as pd
from pathlib import Path

from collections import defaultdict

## Initializing SemRep

In [6]:
from agatha.construct.semrep_handler import SemRepHandler

In [2]:
# Filesystem locations passed to SemRepHandler in the next cell.
# Root folder of the NLM software install (also contains replace_utf8.jar and the SemRep binary used below).
nlm_soft_folder = '/lustre/acslab/users/2288/NLM_soft'
# Folder given to the handler as its temp_folder.
sr_temp_folder = '/lustre/acslab/users/2288/scratch_dir/sr_temp/021225'
# Jar used for the "replace_utf8.jar utility" step reported in the run output.
sr_replace_utf8_path = '/lustre/acslab/users/2288/NLM_soft/replace_utf8.jar'

In [3]:
# Create the SemRep wrapper from the paths configured above.
# The recorded cell output shows existing services being killed and the SKR/WSD
# services being started while this constructor runs.
t = SemRepHandler(
    nlm_soft_path=nlm_soft_folder,
    temp_folder=sr_temp_folder,
    #restart_mm_services=True,
    replace_utf8_path=sr_replace_utf8_path,
)

Existing services killed.
Starting wsdserverctl: 
started.
loading properties file /lustre/acslab/users/2288/NLM_soft/public_mm/WSD_Server/config/disambServer.cfg

SKR and WSD services started.


In [7]:
# Override the SemRep binary the handler runs (file name indicates v1.9, 2021AB).
t.sr_binary_path = Path('/lustre/acslab/users/2288/NLM_soft/public_semrep/bin/semrep.v1.9_2021AB')

In [8]:
# Smoke test on one sentence. The result is a dict keyed by sentence id whose
# values carry 'sent_text', 'terms' and 'relations' (see the output below).
t.ProcessList_parallel(['Moderna COVID-19 vaccine treats Coronavirus disease'])

Run SemRep in interactive mode...
Processing input with replace_utf8.jar utility...


{'s:user_input_0:1': {'sent_text': 'Moderna COVID-19 vaccine treats Coronavirus disease ',
  'terms': [{'CID': 'C5424056',
    'pref_name': 'Moderna COVID-19 Vaccine',
    'extracted_text': 'Moderna COVID-19 vaccine',
    'label': 'UMLS',
    'sem_types': ['imft', 'nnon', 'phsu'],
    'negated': False},
   {'CID': 'C5401375',
    'pref_name': 'Coronaviridae Measurement',
    'extracted_text': 'Coronavirus',
    'label': 'UMLS',
    'sem_types': ['lbpr'],
    'negated': False},
   {'CID': 'C0012634',
    'pref_name': 'Disease',
    'extracted_text': 'disease',
    'label': 'UMLS',
    'sem_types': ['dsyn'],
    'negated': False}],
  'relations': [{'subj_id': 'C5424056',
    'subj_name': 'Moderna COVID-19 Vaccine',
    'subj_text': 'Moderna COVID-19 vaccine',
    'subj_sem_type': 'phsu',
    'subj_negated': False,
    'verb': 'TREATS',
    'verb_negated': False,
    'obj_id': 'C0012634',
    'obj_name': 'Disease',
    'obj_text': 'disease',
    'obj_sem_type': 'dsyn',
    'obj_negated': 

## Opening results

In [10]:
# Load the frame of abstracts with their extracted predicate tuples
# (columns shown in the output below: pmid, pmid_text, rewritten_text,
# original_text_preds, rewritten_text_preds).
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
pmids_df = pd.read_pickle('data/021225/100_pmids.pkl')

In [16]:
pmids_df

Unnamed: 0,pmid,pmid_text,rewritten_text,original_text_preds,rewritten_text_preds
0,1883399,Substrate analogues and divalent cations as in...,To explore whether glutamate decarboxylase fro...,"[(glutamate decarboxylase, source for study of...","[(Escherichia coli glutamate decarboxylase, mo..."
1,32887154,Comparison of capillary electrophoresis and zw...,This study compares zwitterionic-hydrophilic i...,"[(capillary electrophoresis, comparison), (zwi...","[(study, compares capZIC-HILIC and CE), (study..."
2,1654513,Angiotensin II receptor subtypes and biologica...,Research on angiotensin II (AII) receptor subt...,"[(Angiotensin II receptor subtypes, biological...",[(Rat and bovine adrenal zona glomerulosa cell...
3,11814052,Mechanisms of self-incompatibility in flowerin...,Self-incompatibility is a common mechanism in ...,"[(self-incompatibility, mechanism), (self-inco...","[(Self-incompatibility, mechanism in flowering..."
4,16350614,Accuracy and speed of orthographic processing ...,A study compared 39 individuals with developme...,"[(persons with developmental dyslexia, orthogr...","[(study, compared individuals with development..."
...,...,...,...,...,...
95,3682109,Leads from the MMWR. Premature mortality due t...,The MMWR reports on early deaths caused by bre...,[],"[(MMWR, reports on early deaths), (early death..."
96,21937697,High endothelial venules as traffic control po...,High endothelial venules (HEVs) serve as cruci...,"[(High endothelial venules, traffic control po...","[(High endothelial venules (HEVs), regulate th..."
97,26985970,Screening of Toxic Chemicals in a Single Drop ...,We introduce a novel approach for detecting to...,[(Surface-enhanced laser desorption/ionization...,"[(novel approach, detecting toxic chemicals), ..."
98,17340633,Predicting DNA-binding amino acid residues fro...,"To predict DNA-binding amino acid residues, we...","[(DNA-binding amino acid residues, predicted f...","[(positively charged atoms, clusters), (cluste..."


In [13]:

# Sanity check: the length of every predicted tuple, listed per abstract.
pmids_df['original_text_preds'].apply(lambda preds: list(map(len, preds)))

0                     [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
1     [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
2     [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
3     [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
4                  [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
                            ...                        
95                                                   []
96        [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
97           [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
98                 [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
99                    [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Name: original_text_preds, Length: 100, dtype: object

## Running SemRep to map entities

In [23]:
# Flatten every element of every predicted tuple into one list:
# all tuples from the original abstracts first, then those from the rewritten ones.
all_ents_list = [
    ent
    for preds_col in ('original_text_preds', 'rewritten_text_preds')
    for preds in pmids_df[preds_col]
    for pred_tuple in preds
    for ent in pred_tuple
]

In [25]:
len(all_ents_list)

5556

In [27]:
# Number of distinct entity strings when letter case is ignored.
len({ent.lower() for ent in all_ents_list})

3724

In [29]:
# Drop exact duplicates while keeping first-seen order.
# list(set(...)) yields a different order on every kernel start (string hashing is
# randomised), which changes the position-based ids SemRep assigns
# ("user_input_<n>") and makes its log messages impossible to trace back.
# dict.fromkeys keeps the same unique strings in a deterministic order.
all_ents_list = list(dict.fromkeys(all_ents_list))
len(all_ents_list)

3812

In [30]:
%%time
# Run SemRep over every unique entity string (about 2.5 minutes wall time in the recorded run).
# NOTE(review): chunkSize presumably sets how many inputs go into one SemRep batch — confirm in SemRepHandler.
# NOTE(review): log lines elsewhere in this notebook ("Bad id ...", "Skipped record ...") suggest
# some inputs may be missing from the result — verify coverage before relying on it.
r_out_dict = t.ProcessList_parallel(all_ents_list, chunkSize=50)

Run SemRep in interactive mode...
Processing input with replace_utf8.jar utility...
CPU times: user 163 ms, sys: 267 ms, total: 430 ms
Wall time: 2min 36s


## Applying SemRep mapping

In [39]:
# Map each input string to the set of UMLS preferred names SemRep extracted from it.
# Despite the variable name, the stored values are 'pref_name' strings, not CUIs.
str_terms_to_cui_dict = defaultdict(set)
for sr_record in r_out_dict.values():
    # Drop the last character of the echoed text; in the smoke-test output above
    # SemRep returns the input followed by one trailing space.
    sent_text = sr_record['sent_text'][:-1]
    # The loop variable must not be called `t`: that name holds the SemRepHandler
    # instance, and overwriting it breaks any later call to the handler.
    for term in sr_record['terms']:
        str_terms_to_cui_dict[sent_text].add(term['pref_name'])

In [41]:
str_terms_to_cui_dict['enhanced diagnostic accuracy for nodal metastasis in esophageal squamous cell carcinoma']

{'Diagnostic Accuracy',
 'Enhance (action)',
 'NODAL gene',
 'Secondary Neoplasm',
 'Squamous cell carcinoma of esophagus'}

In [42]:
pmids_df

Unnamed: 0,pmid,pmid_text,rewritten_text,original_text_preds,rewritten_text_preds
0,1883399,Substrate analogues and divalent cations as in...,To explore whether glutamate decarboxylase fro...,"[(glutamate decarboxylase, source for study of...","[(Escherichia coli glutamate decarboxylase, mo..."
1,32887154,Comparison of capillary electrophoresis and zw...,This study compares zwitterionic-hydrophilic i...,"[(capillary electrophoresis, comparison), (zwi...","[(study, compares capZIC-HILIC and CE), (study..."
2,1654513,Angiotensin II receptor subtypes and biologica...,Research on angiotensin II (AII) receptor subt...,"[(Angiotensin II receptor subtypes, biological...",[(Rat and bovine adrenal zona glomerulosa cell...
3,11814052,Mechanisms of self-incompatibility in flowerin...,Self-incompatibility is a common mechanism in ...,"[(self-incompatibility, mechanism), (self-inco...","[(Self-incompatibility, mechanism in flowering..."
4,16350614,Accuracy and speed of orthographic processing ...,A study compared 39 individuals with developme...,"[(persons with developmental dyslexia, orthogr...","[(study, compared individuals with development..."
...,...,...,...,...,...
95,3682109,Leads from the MMWR. Premature mortality due t...,The MMWR reports on early deaths caused by bre...,[],"[(MMWR, reports on early deaths), (early death..."
96,21937697,High endothelial venules as traffic control po...,High endothelial venules (HEVs) serve as cruci...,"[(High endothelial venules, traffic control po...","[(High endothelial venules (HEVs), regulate th..."
97,26985970,Screening of Toxic Chemicals in a Single Drop ...,We introduce a novel approach for detecting to...,[(Surface-enhanced laser desorption/ionization...,"[(novel approach, detecting toxic chemicals), ..."
98,17340633,Predicting DNA-binding amino acid residues fro...,"To predict DNA-binding amino acid residues, we...","[(DNA-binding amino acid residues, predicted f...","[(positively charged atoms, clusters), (cluste..."


Bad id: user_input_2709 Error code: 1
Skipped record: user_input_2710|enable regulation



In [43]:
def get_umls_pairs(r, term_map=None):
    """Expand pairs of raw strings into the set of pairs of their mapped terms.

    Parameters
    ----------
    r : iterable of 2-tuples
        Each item is a ``(t1, t2)`` pair of strings to look up.
    term_map : mapping of str -> set of str, optional
        Lookup table from a raw string to its mapped terms. When omitted, the
        notebook-level ``str_terms_to_cui_dict`` is used, so existing calls
        such as ``Series.apply(get_umls_pairs)`` keep working unchanged.

    Returns
    -------
    set of tuple
        Every combination of one mapped term for ``t1`` with one mapped term
        for ``t2``, stored as an alphabetically sorted 2-tuple so the pair is
        order-independent. Pairs where either side has no mapped terms are
        skipped.
    """
    if term_map is None:
        # Fall back to the mapping built from the SemRep output in this notebook.
        term_map = str_terms_to_cui_dict

    umls_pairs_set = set()

    for t1, t2 in r:
        # .get() (rather than indexing) avoids inserting empty entries when the
        # mapping is a defaultdict.
        cur_t1_terms = term_map.get(t1)
        cur_t2_terms = term_map.get(t2)

        # Missing or empty on either side: nothing to combine.
        if not cur_t1_terms or not cur_t2_terms:
            continue

        umls_pairs_set.update(
            tuple(sorted((t1t, t2t)))
            for t1t in cur_t1_terms
            for t2t in cur_t2_terms
        )

    return umls_pairs_set

In [45]:
# Translate the predicted string pairs of both text variants into mapped-term pairs,
# storing the result in a sibling column with the "_umls" suffix.
for preds_col in ('original_text_preds', 'rewritten_text_preds'):
    pmids_df[preds_col + '_umls'] = pmids_df[preds_col].apply(get_umls_pairs)

## Finding term differences

In [50]:
# Walk the two per-abstract sets side by side, row by row.
_umls_set_pairs = list(zip(
    pmids_df['original_text_preds_umls'],
    pmids_df['rewritten_text_preds_umls'],
))

# Pairs present in both the original and the rewritten abstract.
pmids_df['intersect_umls'] = [
    orig_pairs.intersection(rewr_pairs) for orig_pairs, rewr_pairs in _umls_set_pairs
]

# Pairs present in exactly one of the two (symmetric difference).
pmids_df['diff_umls'] = [
    orig_pairs.symmetric_difference(rewr_pairs) for orig_pairs, rewr_pairs in _umls_set_pairs
]

In [56]:
# Show the size of every cell (string length or collection size).
# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas 2.1+);
# use the new name when it exists and fall back to applymap on older pandas.
_cellwise = pmids_df.map if hasattr(pmids_df, 'map') else pmids_df.applymap
_cellwise(len)

  pmids_df.applymap(len)


Unnamed: 0,pmid,pmid_text,rewritten_text,original_text_preds,rewritten_text_preds,original_text_preds_umls,rewritten_text_preds_umls,intersect_umls,diff_umls
0,7,1196,999,11,21,62,62,19,86
1,8,1716,1384,31,18,53,34,0,87
2,7,2194,1841,24,22,129,191,72,176
3,8,1221,939,20,14,33,29,3,56
4,8,1085,993,12,8,50,34,3,78
...,...,...,...,...,...,...,...,...,...
95,7,83,86,0,4,0,3,0,3
96,8,1390,1361,15,11,17,20,5,27
97,8,1477,1182,14,19,50,40,15,60
98,8,1359,1063,12,24,96,27,11,101


In [63]:
pmids_df

Unnamed: 0,pmid,pmid_text,rewritten_text,original_text_preds,rewritten_text_preds,original_text_preds_umls,rewritten_text_preds_umls,intersect_umls,diff_umls
0,1883399,Substrate analogues and divalent cations as in...,To explore whether glutamate decarboxylase fro...,"[(glutamate decarboxylase, source for study of...","[(Escherichia coli glutamate decarboxylase, mo...","{(inhibitors, zinc cation), (Analog, Inhibitor...","{(Compound, IMPACT gene), (Potential, isophtha...","{(Bacterial, Model), (Cations, Divalent, inhib...","{(Compound, IMPACT gene), (Enzymes, Glutamate ..."
1,32887154,Comparison of capillary electrophoresis and zw...,This study compares zwitterionic-hydrophilic i...,"[(capillary electrophoresis, comparison), (zwi...","[(study, compares capZIC-HILIC and CE), (study...","{(Liquid Chromatography, Permutation), (Abunda...","{(Impacted, Nebulizers), (Analyte, L-lysine 6-...",{},"{(Liquid Chromatography, Permutation), (Impact..."
2,1654513,Angiotensin II receptor subtypes and biologica...,Research on angiotensin II (AII) receptor subt...,"[(Angiotensin II receptor subtypes, biological...",[(Rat and bovine adrenal zona glomerulosa cell...,"{(Cell Count, Receptor, Angiotensin II), (NLRP...","{(Cell Count, Receptor, Angiotensin II), (Bark...","{(Cell Count, Receptor, Angiotensin II), (Inte...","{(Bark - plant part, Cattle), (Known, SLC33A1 ..."
3,11814052,Mechanisms of self-incompatibility in flowerin...,Self-incompatibility is a common mechanism in ...,"[(self-incompatibility, mechanism), (self-inco...","[(Self-incompatibility, mechanism in flowering...","{(Endoribonucleases, pollen tube growth), (Inb...","{(Inbreeding, self incompatibility), (Family, ...","{(Inbreeding, self incompatibility), (Brassica...","{(Endoribonucleases, pollen tube growth), (Fam..."
4,16350614,Accuracy and speed of orthographic processing ...,A study compared 39 individuals with developme...,"[(persons with developmental dyslexia, orthogr...","[(study, compared individuals with development...","{(Age Cohort, Groups), (Disabled, Laboratory P...","{(Groups, Smaller), (Participant, Study), (Pho...","{(Control Groups, age differences), (Accurate ...","{(Participant, Study), (Delta (difference), Ol..."
...,...,...,...,...,...,...,...,...,...
95,3682109,Leads from the MMWR. Premature mortality due t...,The MMWR reports on early deaths caused by bre...,[],"[(MMWR, reports on early deaths), (early death...",{},"{(Malignant neoplasm of breast, United States)...",{},"{(Malignant neoplasm of breast, United States)..."
96,21937697,High endothelial venules as traffic control po...,High endothelial venules (HEVs) serve as cruci...,"[(High endothelial venules, traffic control po...","[(High endothelial venules (HEVs), regulate th...","{(Hepatitis E virus, Steady-State), (Lymphocyt...","{(High Endothelial Venule, Lesch-Nyhan Syndrom...","{(Lymphocyte, Mammals), (High Endothelial Venu...","{(High Endothelial Venule, Lesch-Nyhan Syndrom..."
97,26985970,Screening of Toxic Chemicals in a Single Drop ...,We introduce a novel approach for detecting to...,[(Surface-enhanced laser desorption/ionization...,"[(novel approach, detecting toxic chemicals), ...","{(Carbon, Ionization), (Carbon, Choose (action...","{(Background, Carbon), (Carbon, Salt Tolerance...","{(Carbon, Compound), (Macrophage Activation Sy...","{(Background, Carbon), (Carbon, Salt Tolerance..."
98,17340633,Predicting DNA-binding amino acid residues fro...,"To predict DNA-binding amino acid residues, we...","[(DNA-binding amino acid residues, predicted f...","[(positively charged atoms, clusters), (cluste...","{(Mutation, Strategy), (DNA Binding, Residue),...","{(Methods, Post-Translational Protein Processi...","{(Methods, Post-Translational Protein Processi...","{(Mutation, Strategy), (DNA Binding, Detected ..."


In [64]:
pmids_df

Unnamed: 0,pmid,pmid_text,rewritten_text,original_text_preds,rewritten_text_preds,original_text_preds_umls,rewritten_text_preds_umls,intersect_umls,diff_umls
0,1883399,Substrate analogues and divalent cations as in...,To explore whether glutamate decarboxylase fro...,"[(glutamate decarboxylase, source for study of...","[(Escherichia coli glutamate decarboxylase, mo...","{(inhibitors, zinc cation), (Analog, Inhibitor...","{(Compound, IMPACT gene), (Potential, isophtha...","{(Bacterial, Model), (Cations, Divalent, inhib...","{(Compound, IMPACT gene), (Enzymes, Glutamate ..."
1,32887154,Comparison of capillary electrophoresis and zw...,This study compares zwitterionic-hydrophilic i...,"[(capillary electrophoresis, comparison), (zwi...","[(study, compares capZIC-HILIC and CE), (study...","{(Liquid Chromatography, Permutation), (Abunda...","{(Impacted, Nebulizers), (Analyte, L-lysine 6-...",{},"{(Liquid Chromatography, Permutation), (Impact..."
2,1654513,Angiotensin II receptor subtypes and biologica...,Research on angiotensin II (AII) receptor subt...,"[(Angiotensin II receptor subtypes, biological...",[(Rat and bovine adrenal zona glomerulosa cell...,"{(Cell Count, Receptor, Angiotensin II), (NLRP...","{(Cell Count, Receptor, Angiotensin II), (Bark...","{(Cell Count, Receptor, Angiotensin II), (Inte...","{(Bark - plant part, Cattle), (Known, SLC33A1 ..."
3,11814052,Mechanisms of self-incompatibility in flowerin...,Self-incompatibility is a common mechanism in ...,"[(self-incompatibility, mechanism), (self-inco...","[(Self-incompatibility, mechanism in flowering...","{(Endoribonucleases, pollen tube growth), (Inb...","{(Inbreeding, self incompatibility), (Family, ...","{(Inbreeding, self incompatibility), (Brassica...","{(Endoribonucleases, pollen tube growth), (Fam..."
4,16350614,Accuracy and speed of orthographic processing ...,A study compared 39 individuals with developme...,"[(persons with developmental dyslexia, orthogr...","[(study, compared individuals with development...","{(Age Cohort, Groups), (Disabled, Laboratory P...","{(Groups, Smaller), (Participant, Study), (Pho...","{(Control Groups, age differences), (Accurate ...","{(Participant, Study), (Delta (difference), Ol..."
...,...,...,...,...,...,...,...,...,...
95,3682109,Leads from the MMWR. Premature mortality due t...,The MMWR reports on early deaths caused by bre...,[],"[(MMWR, reports on early deaths), (early death...",{},"{(Malignant neoplasm of breast, United States)...",{},"{(Malignant neoplasm of breast, United States)..."
96,21937697,High endothelial venules as traffic control po...,High endothelial venules (HEVs) serve as cruci...,"[(High endothelial venules, traffic control po...","[(High endothelial venules (HEVs), regulate th...","{(Hepatitis E virus, Steady-State), (Lymphocyt...","{(High Endothelial Venule, Lesch-Nyhan Syndrom...","{(Lymphocyte, Mammals), (High Endothelial Venu...","{(High Endothelial Venule, Lesch-Nyhan Syndrom..."
97,26985970,Screening of Toxic Chemicals in a Single Drop ...,We introduce a novel approach for detecting to...,[(Surface-enhanced laser desorption/ionization...,"[(novel approach, detecting toxic chemicals), ...","{(Carbon, Ionization), (Carbon, Choose (action...","{(Background, Carbon), (Carbon, Salt Tolerance...","{(Carbon, Compound), (Macrophage Activation Sy...","{(Background, Carbon), (Carbon, Salt Tolerance..."
98,17340633,Predicting DNA-binding amino acid residues fro...,"To predict DNA-binding amino acid residues, we...","[(DNA-binding amino acid residues, predicted f...","[(positively charged atoms, clusters), (cluste...","{(Mutation, Strategy), (DNA Binding, Residue),...","{(Methods, Post-Translational Protein Processi...","{(Methods, Post-Translational Protein Processi...","{(Mutation, Strategy), (DNA Binding, Detected ..."


In [61]:
# Persist the frame together with the derived *_umls, intersect_umls and diff_umls columns.
pmids_df.to_pickle('data/021225/100_pmids_w_umls_terms.pkl')