In [1]:
import numpy as np 
import pandas as pd

### MRSTY, extract semantic type

In [5]:
with open('2020AB/META/MRSTY.RRF', 'r') as f: 
    data = f.readlines()
data = [x.split('|') for x in data]

columns = ['CUI', 'unknown1', 'unknown2', 'STY', 'unknown3', 'unknown4','unknown5']

df_umls = pd.DataFrame(data, columns = columns)
#df_umls = df_umls.query('language=="ENG"')
df_umls.head()

Unnamed: 0,CUI,unknown1,unknown2,STY,unknown3,unknown4,unknown5
0,C0000005,T116,A1.4.1.2.1.7,"Amino Acid, Peptide, or Protein",AT17648347,256,\n
1,C0000005,T121,A1.4.1.1.1,Pharmacologic Substance,AT17575038,256,\n
2,C0000005,T130,A1.4.1.1.4,"Indicator, Reagent, or Diagnostic Aid",AT17634323,256,\n
3,C0000039,T109,A1.4.1.2.1,Organic Chemical,AT45562015,256,\n
4,C0000039,T121,A1.4.1.1.1,Pharmacologic Substance,AT17567371,256,\n


In [7]:
df_sty=df_umls[['CUI','STY']]
df_sty.head()

Unnamed: 0,CUI,STY
0,C0000005,"Amino Acid, Peptide, or Protein"
1,C0000005,Pharmacologic Substance
2,C0000005,"Indicator, Reagent, or Diagnostic Aid"
3,C0000039,Organic Chemical
4,C0000039,Pharmacologic Substance


In [8]:
# get number of rows
df_sty.shape[0]

4733589

In [9]:
# get value counts
df_sty['STY'].value_counts()

Eukaryote                              979982
Bacterium                              420675
Finding                                308353
Therapeutic or Preventive Procedure    307004
Organic Chemical                       246667
                                        ...  
Chemical                                   29
Entity                                     27
Molecular Sequence                         12
Fully Formed Anatomical Structure           8
Carbohydrate Sequence                       2
Name: STY, Length: 127, dtype: int64

In [10]:
# drop NA values
df_sty.dropna(axis=0,how='any', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [11]:
df_sty.shape[0]

4733589

In [12]:
len(pd.unique(df_sty['CUI']))

4413090

In [13]:
pd.unique(df_sty['STY'])

array(['Amino Acid, Peptide, or Protein', 'Pharmacologic Substance',
       'Indicator, Reagent, or Diagnostic Aid', 'Organic Chemical',
       'Enzyme', 'Biologically Active Substance',
       'Hazardous or Poisonous Substance', 'Hormone',
       'Nucleic Acid, Nucleoside, or Nucleotide', 'Vitamin', 'Antibiotic',
       'Immunologic Factor', 'Tissue', 'Manufactured Object',
       'Intellectual Product', 'Body Location or Region',
       'Sign or Symptom', 'Finding', 'Neoplastic Process',
       'Body Part, Organ, or Organ Component', 'Virus',
       'Disease or Syndrome', 'Congenital Abnormality',
       'Functional Concept', 'Anatomical Abnormality',
       'Injury or Poisoning', 'Classification', 'Embryonic Structure',
       'Pathologic Function', 'Patient or Disabled Group',
       'Health Care Related Organization', 'Regulation or Law',
       'Therapeutic or Preventive Procedure', 'Quantitative Concept',
       'Research Activity', 'Mental Process', 'Individual Behavior',
     

In [14]:
len(pd.unique(df_sty['STY']))  #127 types of entity

127

### Disorder

Where the Disorder semantic group was defined to be all semantic types in the UMLS Disorders Semantic Group except Findings, a total of 11 types (Congenital Abnormality, Acquired Abnormality, Injury or Poisoning, Pathologic Function, Disease or Syndrome, Mental or Behavioral Dysfunction, Cell or Molecular Dysfunction, Experimental Model of Disease, Signs and Symptoms, Anatomical Abnormality, Neoplastic Process).

**Reference Paper:** Machine Learning of Patient Characteristics to Predict Admission Outcomes in the Undiagnosed Diseases Network

In [15]:
disorder_entity=['Congenital Abnormality', 
                 'Acquired Abnormality',
                 'Injury or Poisoning', 
                 'Pathologic Function', 
                 'Disease or Syndrome', 
                 'Mental or Behavioral Dysfunction', 
                 'Cell or Molecular Dysfunction', 
                 'Experimental Model of Disease', 
                 'Sign and Symptom', 
                 'Anatomical Abnormality', 
                 'Neoplastic Proces']

In [18]:
df_sty_disorder=df_sty.loc[df_sty['STY'].isin(disorder_entity)]
df_sty_disorder=pd.DataFrame(data=df_sty_disorder)
df_sty_disorder.head()

Unnamed: 0,CUI,STY
254,C0000744,Disease or Syndrome
255,C0000754,Congenital Abnormality
256,C0000762,Congenital Abnormality
261,C0000768,Congenital Abnormality
263,C0000770,Anatomical Abnormality


In [19]:
df_sty_disorder.shape[0]

288682

In [20]:
len(pd.unique(df_sty_disorder['CUI']))

288119

In [21]:
#extract all CUI
disorder_cui=df_sty_disorder['CUI']

### Disease

In [24]:
df_sty_disease=df_sty.loc[df_sty['STY']=='Disease or Syndrome']
df_sty_disease=pd.DataFrame(data=df_sty_disease)
df_sty_disease.head()

Unnamed: 0,CUI,STY
254,C0000744,Disease or Syndrome
267,C0000774,Disease or Syndrome
290,C0000809,Disease or Syndrome
293,C0000814,Disease or Syndrome
300,C0000823,Disease or Syndrome


In [25]:
df_sty_disease.shape[0]

112853

In [26]:
len(pd.unique(df_sty_disease['CUI']))

112853

In [27]:
#extract all cui
disease_cui=df_sty_disease['CUI']

### MRDEF
Extract definition.

In [28]:
with open('2020AB/META/MRDEF.RRF', 'r') as f: 
    data_def = f.readlines()
data_def = [x.split('|') for x in data_def]

In [31]:
columns_def = ['CUI', 'unknown1', 'unknown2', 
               'space', 'source', 'description',
               'unknown4','unknown5','unknown6']

df_umls_def = pd.DataFrame(data_def, columns = columns_def)
#df_umls = df_umls.query('language=="ENG"')
df_umls_def.head()

Unnamed: 0,CUI,unknown1,unknown2,space,source,description,unknown4,unknown5,unknown6
0,C0000039,A0016515,AT38152019,,MSH,Synthetic phospholipid used in liposomes and l...,N,,\n
1,C0000039,A13096036,AT254753550,,MSHPOR,Fosfolipídeo sintético utilizado em lipossomos...,N,,\n
2,C0000052,A0016535,AT38148809,,MSH,"In glycogen or amylopectin synthesis, the enzy...",N,,\n
3,C0000052,A9139513,AT254753551,,MSHPOR,"Na síntese de glicogênio ou amilopectina, a en...",N,,\n
4,C0000084,A0016576,AT38151982,,MSH,"Found in various tissues, particularly in four...",N,,\n


In [33]:
df_def=df_umls_def[['CUI','source','description']]
df_def.head()

Unnamed: 0,CUI,source,description
0,C0000039,MSH,Synthetic phospholipid used in liposomes and l...
1,C0000039,MSHPOR,Fosfolipídeo sintético utilizado em lipossomos...
2,C0000052,MSH,"In glycogen or amylopectin synthesis, the enzy..."
3,C0000052,MSHPOR,"Na síntese de glicogênio ou amilopectina, a en..."
4,C0000084,MSH,"Found in various tissues, particularly in four..."


In [34]:
df_def.shape[0]

383351

### Check all source

In [35]:
len(pd.unique(df_def['source']))

72

In [36]:
pd.unique(df_def['source'])

array(['MSH', 'MSHPOR', 'CSP', 'MSHCZE', 'NCI', 'PDQ', 'NCI_NCI-GLOSS',
       'MSHNOR', 'CHV', 'NCI_CRCH', 'NCI_CareLex', 'NCI_CDISC-GLOSS',
       'UWDA', 'FMA', 'SNOMEDCT_US', 'SCTSPA', 'HPO', 'NCI_CTCAE',
       'NCI_NICHD', 'MEDLINEPLUS', 'NCI_ACC-AHA', 'NCI_CDISC', 'NCI_FDA',
       'NCI_GAIA', 'HL7V3.0', 'NEU', 'PSY', 'MSHFRE', 'SPN', 'AIR', 'GO',
       'CCC', 'UMD', 'NIC', 'ALT', 'NCI_EDQM-HC', 'NCI_INC', 'LNC',
       'JABL', 'NUCCPT', 'ICF', 'ICF-CY', 'NCI_BRIDG_3_0_3',
       'NCI_BRIDG_5_3', 'NANDA-I', 'PNDS', 'NOC', 'OMS', 'NCI_CTEP-SDC',
       'NCI_DICOM', 'NCI_KEGG', 'NCI_BioC', 'MCM', 'AOT', 'SOP', 'MSHSCR',
       'NCI_CTCAE_5', 'NCI_CTCAE_3', 'MDR', 'MDRFRE', 'MDRGER', 'MDRITA',
       'MDRPOR', 'MDRCZE', 'MDRJPN', 'MDRHUN', 'MDRSPA', 'MDRDUT',
       'MDRRUS', 'MDRKOR', 'MDRBPO', 'NCI_BRIDG'], dtype=object)

 'MSH'(English),'MSHPOR', 'CSP'(English), 'MSHCZE', 'NCI'(English), 'PDQ'(English), 'NCI_NCI-GLOSS'(English),
       'MSHNOR', 'CHV'(English), 'NCI_CRCH'(English), 'NCI_CareLex'(English), 'NCI_CDISC-GLOSS'(English),
       'UWDA'(English), 'FMA'(English), 'SNOMEDCT_US'(English), 'SCTSPA', 'HPO'(English), 'NCI_CTCAE'(English),
       'NCI_NICHD'(English), 'MEDLINEPLUS'(English), 'NCI_ACC-AHA'(English), 'NCI_CDISC'(English), 'NCI_FDA'(English),
           'NCI_GAIA'(English), 'HL7V3.0'(English), 'NEU'(English), 'PSY'(English), 'MSHFRE', 'SPN'(English), 'AIR'(English), 'GO'(English),
       'CCC'(English), 'UMD'(English), 'NIC'(English), 'ALT'(English), 'NCI_EDQM-HC'(English), 'NCI_INC'(English), 'LNC'(English),
       'JABL'(English), 'NUCCPT'(English), 'ICF'(English), 'ICF-CY'(English), 'NCI_BRIDG_3_0_3'(English),
           'NCI_BRIDG_5_3'(English), 'NANDA-I'(English), 'PNDS'(English), 'NOC'(English), 'OMS'(English), 'NCI_CTEP-SDC'(English),
       'NCI_DICOM'(English), 'NCI_KEGG'(English), 'NCI_BioC'(English), 'MCM'(English), 'AOT'(Website), 'SOP'(English), 'MSHSCR',
       'NCI_CTCAE_5'(English), 'NCI_CTCAE_3'(English), 'MDR'(English), 'MDRFRE', 'MDRGER', 'MDRITA',
       'MDRPOR', 'MDRCZE', 'MDRJPN', 'MDRHUN', 'MDRSPA', 'MDRDUT',
       'MDRRUS', 'MDRKOR', 'MDRBPO', 'NCI_BRIDG'

In [110]:
english_source=['MSH','CSP','NCI','PDQ','NCI_NCI-GLOSS','CHV',
               'NCI_CRCH','NCI_CareLex','NCI_CDISC-GLOSS','UWDA','FMA','SNOMEDCT_US','HPO',
               'NCI_CTCAE','NCI_NICHD','MEDLINEPLUS','NCI_ACC-AHA','NCI_CDISC','NCI_FDA',
               'NCI_GAIA','HL7V3.0','NEU','PSY','SPN','AIR','GO','CCC','UMD','NIC',
               'ALT','NCI_EDQM-HC','NCI_INC','LNC','JABL','NUCCPT','ICF','ICF-CY','NCI_BRIDG_3_0_3',
               'NCI_BRIDG_5_3','NANDA-I','PNDS','NOC','OMS','NCI_CTEP-SDC','NCI_DICOM','NCI_KEGG',
               'NCI_BioC','MCM','SOP','NCI_CTCAE_5','NCI_CTCAE_3','MDR']

len(english_source) #52 English Source

52

#### Extract all English Source

In [111]:
df_def_eng=df_def.loc[df_def['source'].isin(english_source)]

In [112]:
df_def_eng=pd.DataFrame(data=df_def_eng)

In [114]:
df_def_eng.shape[0]

324660

### Disorder

In [119]:
disorder_def=df_def_eng.loc[df_def_eng['CUI'].isin(disorder_cui)]
disorder_def.shape[0]

32571

In [120]:
disorder_def.head()

Unnamed: 0,CUI,source,description
308,C0000744,MSH,An autosomal recessive disorder of lipid metab...
309,C0000744,CSP,disorder of lipid metabolism inherited as an a...
310,C0000744,NCI,An autosomal recessive disorder characterized ...
311,C0000744,HPO,An absence of low-density lipoprotein choleste...
314,C0000768,MSH,Malformations of organs or body parts during d...


In [121]:
len(pd.unique(disorder_def['CUI']))

20411

In [129]:
disorder_def['source'].value_counts()

NCI                9508
HPO                6399
MSH                4482
SNOMEDCT_US        4336
NCI_NICHD          2408
CSP                1340
CHV                 728
JABL                720
NCI_CTCAE           555
MEDLINEPLUS         551
NCI_NCI-GLOSS       392
NCI_CDISC           277
PSY                 219
NCI_FDA             160
GO                  101
HL7V3.0              99
NCI_ACC-AHA          83
PDQ                  54
NANDA-I              35
LNC                  32
AIR                  27
CCC                  26
NCI_GAIA             17
NCI_CDISC-GLOSS       7
FMA                   3
UWDA                  2
NCI_BRIDG_3_0_3       1
NCI_KEGG              1
ICF-CY                1
ICF                   1
PNDS                  1
NOC                   1
NCI_BRIDG_5_3         1
MDR                   1
NCI_EDQM-HC           1
OMS                   1
Name: source, dtype: int64

In [122]:
disorder_def.to_csv('umls_def_disorder.csv')

### Disease

In [123]:
disease_def=df_def_eng.loc[df_def_eng['CUI'].isin(disease_cui)]

In [124]:
disease_def.shape[0]

20344

In [125]:
disease_def.head()

Unnamed: 0,CUI,source,description
308,C0000744,MSH,An autosomal recessive disorder of lipid metab...
309,C0000744,CSP,disorder of lipid metabolism inherited as an a...
310,C0000744,NCI,An autosomal recessive disorder characterized ...
311,C0000744,HPO,An absence of low-density lipoprotein choleste...
383,C0000809,MSH,Three or more consecutive spontaneous abortions.


In [126]:
len(pd.unique(disease_def['CUI']))

11345

In [127]:
disease_def.to_csv('umls_def_disease.csv')