In [2]:
import pickle


class UMLS_CUIs:
    """
    In-memory lookup from a UMLS CUI to the English term names and the
    source-vocabulary codes (SNOMEDCT_US, MSH, MDR, RXNORM) found for it in
    MRCONSO.RRF.

    ``structure`` maps each UMLS CUI to a five-element list::

        [names, snomed_codes, mesh_codes, meddra_codes, rxnorm_codes]

    ``names`` is ``None`` until the first accepted row for that CUI, after
    which it is a list of term strings (duplicates are kept). The four code
    slots are lists of distinct code strings in first-seen order.
    """

    # Position inside each ``structure`` entry of the code list belonging to
    # each supported source vocabulary (position 0 holds the term names).
    cui = {'SNOMEDCT_US':1, 'MSH':2, 'MDR':3, 'RXNORM':4}

    def __init__(self):
        """Creates an empty lookup; call :meth:`process` to fill it."""
        self.structure = {}

    def process(self, location):
        """
        Reads a pipe-delimited MRCONSO.RRF file line by line and records the
        names and codes of every accepted row (see ``__addCUI`` for the
        acceptance rule).

        Blank lines are skipped. A non-blank line with fewer than 15 fields
        still raises IndexError.

        :param location: path of MRCONSO.RRF
        """
        # MRCONSO.RRF is UTF-8; do not depend on the platform default encoding.
        with open(location, 'r', encoding='utf-8') as fil:
            for line in fil:
                if not line.strip():
                    continue
                split = line.split('|')
                umls_cui, language, kb, term_type, kb_cui, kb_name = self.__getConcepts(split)
                self.__addCUI(umls_cui, language, kb, term_type, kb_cui, kb_name)

    def __getConcepts(self, split):
        """
        Picks the fields of interest out of one pre-split MRCONSO.RRF row.

        :param split: the list produced by splitting one row on '|'
        :return: a tuple of stripped strings
                 (umls_cui, language, kb, term_type, kb_cui, kb_name) taken
                 from positions 0, 1, 11, 12, 13 and 14 of ``split``
        """
        umls_cui = split[0].strip()
        language = split[1].strip()
        kb = split[11].strip()
        term_type = split[12].strip()
        kb_cui = split[13].strip()
        kb_name = split[14].strip()
        return umls_cui, language, kb, term_type, kb_cui, kb_name

    def __addCUI(self, umls_cui, language, kb, term_type, kb_cui, kb_name):
        """
        Records one row in the structure.

        An entry for ``umls_cui`` is always created, even when the row itself
        is rejected. The row's name and code are stored only when all of the
        following hold:

        - ``language`` is 'ENG'
        - ``kb`` is one of the keys of ``cui``
        - ``kb`` is 'RXNORM' or 'MSH' (any term type is accepted for these),
          or ``term_type`` is one of 'PT', 'MH', 'SCD', 'MIN'

        :param umls_cui: the UMLS CUI
        :param language: language of the row (ENG etc.)
        :param kb: source vocabulary of the row
        :param term_type: term type of the row (PT etc.)
        :param kb_cui: the code of the term in the source vocabulary
        :param kb_name: the string of the term
        """
        self.__initializeEntityIfNotPresent(umls_cui)

        if language != 'ENG' or kb not in self.cui:
            return
        if kb not in ('RXNORM', 'MSH') and term_type not in ('PT', 'MH', 'SCD', 'MIN'):
            return

        entry = self.structure[umls_cui]
        if entry[0] is None:
            entry[0] = []
        # Names are appended unconditionally, so repeated names stay repeated.
        entry[0].append(kb_name)

        codes = entry[self.cui[kb]]
        if kb_cui not in codes:
            codes.append(kb_cui)

    def __inStructure(self, umls_cui):
        """
        :param umls_cui: the UMLS CUI to look for
        :return: True if the CUI has an entry in the structure, else False
        """
        return (umls_cui in self.structure)

    def __initializeEntityIfNotPresent(self, umls_cui):
        """
        Creates the entry ``[None, [], [], [], []]`` for ``umls_cui`` when the
        structure has no entry for it yet; an existing entry is left as is.

        :param umls_cui: the key of the UMLS structure
        """
        if not (self.__inStructure(umls_cui)):
            self.structure[umls_cui] = [None, [], [], [], []]

    def __pipedCodes(self, umls_cui, kb):
        """
        :param umls_cui: The UMLS CUI being retrieved
        :param kb: a key of ``cui`` naming the source vocabulary
        :return: the codes stored for that vocabulary joined with '|', or
                 None when the CUI is unknown or has no such code
        """
        if not self.__inStructure(umls_cui):
            return None
        return self.listToPipe(self.structure[umls_cui][self.cui[kb]])

    def getName(self, umls_cui):
        """
        :param umls_cui: The UMLS CUI being retrieved
        :return: the list of names recorded for the CUI, or None when the CUI
                 is unknown or no row was accepted for it
        """
        if self.__inStructure(umls_cui):
            return self.structure[umls_cui][0]
        return None

    def getSnomedct_usCui(self, umls_cui):
        """
        :param umls_cui: The UMLS CUI being retrieved
        :return: The SNOMEDCT_US codes joined with '|' if present, else None
        """
        return self.__pipedCodes(umls_cui, 'SNOMEDCT_US')

    def getMeshCui(self, umls_cui):
        """
        :param umls_cui: The UMLS CUI being retrieved
        :return: The MeSH codes joined with '|' if present, else None
        """
        return self.__pipedCodes(umls_cui, 'MSH')

    def getMeddraCui(self, umls_cui):
        """
        :param umls_cui: The UMLS CUI being retrieved
        :return: The MedDRA codes joined with '|' if present, else None
        """
        return self.__pipedCodes(umls_cui, 'MDR')

    def getRxnormCui(self, umls_cui):
        """
        :param umls_cui: The UMLS CUI being retrieved
        :return: The RxNorm codes joined with '|' if present, else None
        """
        return self.__pipedCodes(umls_cui, 'RXNORM')

    def listToPipe(self, lis):
        """
        :param lis: a list of strings
        :return: the strings joined with '|', or None for an empty list
        """
        if not lis:
            return None
        return '|'.join(lis)

In [3]:
# Build the UMLS lookup used by the later cells. The path is relative to the
# notebook's working directory.
inp = '../Volumes/MRCONSO.RRF'
umls_cuis = UMLS_CUIs()
# Reads the file line by line and fills umls_cuis.structure.
umls_cuis.process(inp)

In [4]:
def select_term(List):
    """
    Pick one representative label out of a list of term names.

    The most frequent term wins. When several terms share the highest
    frequency the shortest one wins, and terms of equal length are ordered
    alphabetically so the result is the same on every run.

    :param List: non-empty list of term strings (duplicates are meaningful)
    :return: the chosen term with spaces replaced by '_' and lower-cased
    :raises ValueError: if ``List`` is empty
    """
    from collections import Counter

    if not List:
        raise ValueError('select_term() needs at least one term')

    # One pass instead of calling List.count() for every element.
    counts = Counter(List)
    top_count = max(counts.values())
    most_freq_terms = [term for term, c in counts.items() if c == top_count]

    # Replacing ' ' with '_' does not change the length, so compare raw terms.
    best = min(most_freq_terms, key=lambda term: (len(term), term))

    # Lower-case on every path; previously a single winner kept its case while
    # a tie-break result was lower-cased.
    return best.replace(' ', '_').lower()

In [5]:
# Spot-check on one CUI: print every name collected for it, then display the
# label that select_term derives from those names.
testCUI = 'C4762564'
print(umls_cuis.getName(testCUI))
select_term(umls_cuis.getName(testCUI))

['5 ML hyaluronidase-oysk 2000 UNT/ML / trastuzumab-oysk 120 MG/ML Injection', 'trastuzumab-hyaluronidase-oysk 600 MG / 10,000 UNT per 5 ML Injection', 'trastuzumab-hyaluronidase-oysk 600 MG / 10,000 UNT in 5 mL Injection', 'trastuzumab-oysk 600 MG / hyaluronidase-oysk 10,000 UNT per 5 ML Injection']


'trastuzumab-hyaluronidase-oysk_600_mg_/_10,000_unt_in_5_ml_injection'

In [6]:
# Relation names of interest. In the visible cells this list is referenced only
# by commented-out `if relation in rel_list` filters, so it currently has no
# effect on which entities are collected.
rel_list = [
    'gene_associated_with_disease',
    'disease_has_associated_gene',
    'gene_mapped_to_disease',
    'disease_mapped_to_gene',
    'may_be_treated_by',
    'may_treat',
    'may_be_prevented_by',
    'may_prevent',
]

In [14]:
import json
from tqdm import tqdm

# Collect every entity id that appears as head or tail in the corpus JSON
# files or in the knowledge-graph triple files. No relation filter is applied.
entity_set = set()

# `with` closes each file as soon as it has been read.
with open('../TuckER/data/UMLS-PubMed/raw_data/train.json', encoding='utf-8') as f:
    dc = json.load(f)

with open('../TuckER/data/UMLS-PubMed/raw_data/test_5sent.json', encoding='utf-8') as f:
    tmp = json.load(f)

dc = dc + tmp

data_corpus = []

for item in tqdm(dc):
    head, tail, relation = item['head']['id'], item['tail']['id'], item['relation']
    # set.add is already a no-op for members, so no membership test is needed.
    entity_set.add(head)
    entity_set.add(tail)


with open('../TuckER/data/UMLS-PubMed/raw_data/train.txt') as f_train, \
        open('../TuckER/data/UMLS-PubMed/raw_data/test.txt') as f_test, \
        open('../TuckER/data/UMLS-PubMed/raw_data/valid.txt') as f_dev:
    data_kg_raw = f_train.readlines() + f_test.readlines() + f_dev.readlines()

data_kg = []
for line in tqdm(data_kg_raw):
    # Each line is head<TAB>relation<TAB>tail. Strip only the newline:
    # slicing off the last character would truncate the tail id of a final
    # line that has no trailing newline.
    items = line.rstrip('\n').split('\t')
    head, relation, tail = items[0], items[1], items[2]
    entity_set.add(head)
    entity_set.add(tail)




100%|██████████| 5645558/5645558 [00:03<00:00, 1501191.48it/s]
100%|██████████| 2048286/2048286 [00:01<00:00, 1033547.25it/s]


In [15]:
import pandas as pd

# VANDF table; later cells query it by its 'CUI' and 'VA generic name' columns.
van_df = pd.read_csv('../Volumes/VANDF.csv')


In [16]:
import re

# Map each entity that has a VANDF row to a normalised 'VA generic name' label.
van_dict = {}

# Build the CUI -> name lookup once instead of scanning the whole DataFrame
# for every entity. keep='first' mirrors the former `.iloc[0]` choice.
van_first_rows = van_df.drop_duplicates(subset='CUI', keep='first')
van_names = dict(zip(van_first_rows['CUI'], van_first_rows['VA generic name']))

for entity in tqdm(entity_set):
    label_ = van_names.get(entity)
    # Entities without a VANDF row, or whose first row has no usable name
    # (e.g. NaN), get no entry. This replaces a bare `except: continue` that
    # also hid unrelated errors.
    if not isinstance(label_, str):
        continue
    label_ = label_.lower()
    # Raw string: same pattern, without the invalid '\.' escape warning.
    label = re.sub(r'[^a-zA-Z0-9 \n\.]', '_', label_)
    van_dict[entity] = label.replace(' ', '_')


100%|██████████| 59225/59225 [01:57<00:00, 504.31it/s]


In [17]:
# Number of entities that received a VANDF-derived label.
len(van_dict)

4290

In [18]:
# Read the NCI mapping file into memory; nci_dict is filled from these lines.
nci_dict = {}
# `with` closes the file after reading; it was previously left open.
with open('../Volumes/nci_code_cui_map_202208.dat.txt') as nci:
    nci_ = nci.readlines()

In [19]:
# Fill nci_dict: second '|' field is the key, third is the term
# (lower-cased, spaces replaced by '_').
for item in nci_:
    # Drop the newline first so it can never end up inside a term.
    fields = item.rstrip('\n').split('|')
    if len(fields) < 3:
        # Blank or truncated line: nothing to map.
        continue
    id_, term = fields[1], fields[2]
    nci_dict[id_] = term.lower().replace(' ', '_')

In [20]:
# Write entity2label.txt: one "entity<TAB>label" line per entity.
# Label priority: VANDF name, then UMLS term, then NCI term, else the id itself.
mapping_lines = []

unk_cnt = 0
# Raw string: same pattern as before, without the invalid '\.' escape warning.
label_pattern = re.compile(r'[^a-zA-Z0-9 \n\.]')

for entity in tqdm(entity_set):
    if entity in van_dict:
        label = van_dict[entity].lower()
    else:
        terms = umls_cuis.getName(entity)
        if terms is not None:
            english_term = select_term(terms)
            label = label_pattern.sub('_', english_term).lower()
        elif entity in nci_dict:
            label = label_pattern.sub('_', nci_dict[entity]).lower()
        else:
            # No source knows this entity: fall back to the id and count it.
            label = entity
            unk_cnt += 1
    mapping_lines.append(entity + '\t' + label + '\n')

# Join once instead of growing a string inside the loop.
mapping = ''.join(mapping_lines)

# `with` guarantees the file is flushed and closed. print() is kept so the
# file content, including its final extra newline, is unchanged.
with open("./entity2label.txt", 'w', encoding='utf-8') as map_:
    print(mapping, file=map_)
print(unk_cnt)

100%|██████████| 59225/59225 [00:00<00:00, 127581.29it/s]

25347





In [20]:
# Write entity2label_code.txt: every entity mapped to its own id.
# NOTE(review): the recorded output for this cell shows 9737 entities while the
# previous cell shows 59225, so it was presumably run against a different
# entity_set - confirm before relying on the saved file.
mapping_lines = []

for entity in tqdm(entity_set):
    mapping_lines.append(entity + '\t' + entity + '\n')

# Join once instead of growing a string inside the loop.
mapping = ''.join(mapping_lines)

# `with` guarantees the file is flushed and closed. print() is kept so the
# file content, including its final extra newline, is unchanged.
with open("./entity2label_code.txt", 'w', encoding='utf-8') as map_:
    print(mapping, file=map_)

100%|██████████| 9737/9737 [00:00<00:00, 622038.51it/s]


In [48]:
# Indices of doc_scores ordered from highest to lowest score.
# NOTE(review): `np` and `doc_scores` are not defined in the visible cells -
# presumably left over from another session; confirm or remove this cell.
np.argsort(doc_scores)[::-1]

array([3, 5, 4, 2, 1, 0])

In [1]:
# Scratch check of str.lower() on a sample label.
'Macrofollicular_adenoma'.lower()

'macrofollicular_adenoma'

In [51]:
# NOTE(review): `tokenized_corpus` is not defined in the visible cells -
# presumably left over from another session; confirm or remove this cell.
tokenized_corpus

[['Hello', 'there', 'good', 'man!'],
 ['It', 'is', 'quite', 'windy', 'in', 'London'],
 ['London', 'is', 'beautiful', 'city', 'which', 'is', 'windy', 'often'],
 ['windy', 'london'],
 ['London', 'is', 'sunny'],
 ['How', 'is', 'the', 'weather', 'today?']]

In [22]:
# Scratch check of str.startswith() on a sample id.
'C100'.startswith('B')

False