### Como manipular XML com XPath

In [10]:
from gensim.models import KeyedVectors
import unicodedata

import gzip
import xml.etree.ElementTree as ET

# Parse the gzip-compressed XML file into an ElementTree. gzip.open() is called
# without a mode, so it uses its default (binary read) and the resulting file
# object is handed straight to ET.parse.
with gzip.open('pordesc2018-small.xml.gz') as pordesc2018:
    tree = ET.parse(pordesc2018)
    

### Monta Dicionário

In [11]:
from unicodedata import normalize
def remover_acentos(txt):
    """Return *txt* reduced to its ASCII characters, with accents stripped.

    The text is first decomposed with Unicode NFKD, which turns each accented
    letter into a base letter followed by combining marks.  Encoding to ASCII
    with ``'ignore'`` then drops those marks, together with any other
    character that has no ASCII form.
    """
    decomposto = normalize('NFKD', txt)
    somente_ascii = decomposto.encode('ASCII', 'ignore')
    return somente_ascii.decode('ASCII')

In [12]:
# Pre-trained word2vec vectors, used below to append close neighbours of each
# term to the descriptor's term list.
# NOTE(review): `wordModel.vocab` is the pre-4.0 gensim API; confirm the
# installed gensim version before upgrading.
wordModel = KeyedVectors.load_word2vec_format('health_w2v_unigram_50.bin', binary=True)

# Qualifiers worth recording; the first one found on a descriptor is kept.
QUALIFICADORES_ALVO = ('anatomy & histology', 'pharmacology', 'methods', 'diagnosis')

# Build one entry per DescriptorRecord, keyed by its DescriptorUI:
#   ID, name, scope (text of the last ScopeNote found), terms (set of term
#   strings plus "<i>...</i>"-wrapped word2vec neighbours) and qualifier.
dictMesh = {}
for d in tree.findall("./DescriptorRecord"):
    terms = []
    # Reset per descriptor.  Without these defaults a record that has no
    # allowable qualifiers or no ScopeNote would silently reuse the values of
    # the previous record (or raise NameError on the very first record).
    qualifier = "#"
    scope = ""

    for aql in d.findall('.AllowableQualifiersList/AllowableQualifier/QualifierReferredTo/QualifierName'):
        teste_qualifier = aql.find('./String').text
        if teste_qualifier in QUALIFICADORES_ALVO:
            qualifier = teste_qualifier
            break

    # The trailing "/" makes ElementTree select every child of ConceptList.
    for c in d.findall('.ConceptList/'):
        scope_note = c.find('./ScopeNote')
        # An empty <ScopeNote/> has text None; skip it instead of crashing.
        if scope_note is not None and scope_note.text:
            scope = scope_note.text.replace('\n', '').strip()

        for t in c.findall('./TermList/'):
            termo = t.find('./String').text
            terms.append(termo)

            if termo.lower() not in wordModel.vocab:
                continue
            # The model is queried with the accent-stripped form, so that form
            # must be in the vocabulary too or most_similar_cosmul raises
            # KeyError.
            chave = remover_acentos(termo).lower()
            if chave not in wordModel.vocab:
                continue

            for similar, porcentagem in wordModel.most_similar_cosmul(chave, topn=10):
                if porcentagem > 0.9:
                    # Mark model-suggested words so they can be told apart
                    # from the terms that come from the XML.
                    terms.append("<i>" + similar + "</i>")

    descriptor_ui = d.find('.DescriptorUI').text
    dictMesh[descriptor_ui] = {
        'ID': descriptor_ui,
        'name': d.find('.DescriptorName/String').text,
        'scope': scope,
        'terms': set(terms),
        'qualifier': qualifier,
    }

len(dictMesh)

1859

### Imprime Exemplo

In [13]:
#for i in dictMesh:
#    print (i)

### Salva Dicionário

In [14]:
import gzip, pickle

# Persist the descriptor dictionary as a gzip-compressed pickle.  The ``with``
# statement closes the file on exit, so no explicit close() is needed.
with gzip.open('dictMesh.dict.gz', 'wb') as fp:
    pickle.dump(dictMesh, fp)

### Carrega Dicionário

In [16]:
import gzip, pickle

# Reload the descriptor dictionary from its gzip-compressed pickle.  The
# ``with`` statement closes the file on exit, so no explicit close() is needed.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load a
# file that this notebook (or another trusted source) produced.
with gzip.open('dictMesh.dict.gz', 'rb') as fp:
    dictMesh = pickle.load(fp)

# Display one entry as a sanity check.
dictMesh['D000212']

{'ID': 'D000212',
 'name': 'Aciclovir[Acyclovir]',
 'scope': 'A GUANOSINE analog that acts as an antimetabolite. Viruses are especially susceptible. Used especially against herpes.',
 'terms': {'9-((2-Hydroxyethoxy)methyl)guanine',
  '<i>bactrim</i>',
  '<i>bactrin</i>',
  '<i>clindamicina</i>',
  '<i>fluconazol</i>',
  '<i>ganciclovir</i>',
  '<i>sulfadiazina</i>',
  'Aci Sanorania',
  'Aci-Sanorania',
  'Acic',
  'Aciclobeta',
  'Acicloguanosina',
  'Aciclostad',
  'Aciclovir',
  'Aciclovir Alonga',
  'Aciclovir Sanorania',
  'Aciclovir-Sanorania',
  'Acifur',
  'Acipen Solutab',
  'Acivir',
  'Activir',
  'Acyclo V',
  'Acyclo-V',
  'Acycloguanosine',
  'Acyclovir',
  'Acyclovir Sodium',
  'Alonga, Aciclovir',
  'Antiherpes Creme',
  'Avirax',
  'Cicloferon',
  'Clonorax',
  'Cusiviral',
  'Genvir',
  'Herpetad',
  'Herpofug',
  'Herpotern',
  'Herpoviric',
  'Isavir',
  'Laciken',
  'Mapox',
  'Maynar',
  'Milavir',
  'Opthavir',
  'Sodium, Acyclovir',
  'Solutab, Acipen',
  'Supra

In [48]:
# Look up every word of the note -- on its own, or joined to the word that
# follows it -- among the terms of each descriptor.  The joined term list of a
# descriptor is printed only when the two-word form matches.
evolucao = 'Wellcome 248U'
evolucao = evolucao.split(' ')
for i, palavra in enumerate(evolucao):
    palavra_min = palavra.lower()
    tem_proxima = i + 1 < len(evolucao)

    # Search for the word in MeSH
    for dui, d in dictMesh.items():
        for t in d['terms']:
            # Compare against the bare term, without the <i>...</i> markers
            # that wrap the word2vec-suggested entries.
            termo_min = t.replace('<i>', '').replace('</i>', '').lower()

            if tem_proxima and palavra_min + " " + evolucao[i + 1].lower() == termo_min:
                teste = d['terms']
                termos = '<br/>- '.join(teste)
                print(termos)
                # Leaves only the term loop; the remaining descriptors are
                # still scanned.
                break
            if termo_min == palavra_min:
                # Single-word match: the joined list is stored, not printed.
                teste = d['terms']
                termos = '<br/>- '.join(teste)

<i>clindamicina</i><br/>- Virherpes<br/>- Acipen Solutab<br/>- Aciclobeta<br/>- Maynar<br/>- Mapox<br/>- Virolex<br/>- Acycloguanosine<br/>- Zyclir<br/>- Cusiviral<br/>- Zovirax<br/>- Isavir<br/>- Acyclovir Sodium<br/>- Cicloferon<br/>- <i>sulfadiazina</i><br/>- Herpotern<br/>- <i>fluconazol</i><br/>- Aci Sanorania<br/>- Alonga, Aciclovir<br/>- Virax Puren<br/>- Virmen<br/>- Aciclovir Alonga<br/>- Avirax<br/>- Acivir<br/>- Aciclovir-Sanorania<br/>- Virupos<br/>- Vipral<br/>- Laciken<br/>- Viclovir<br/>- Sodium, Acyclovir<br/>- Aci-Sanorania<br/>- Solutab, Acipen<br/>- Virax-Puren<br/>- Milavir<br/>- Wellcome248U<br/>- Herpetad<br/>- Acyclovir<br/>- Opthavir<br/>- aciclovir von ct<br/>- Activir<br/>- Clonorax<br/>- Zoliparin<br/>- Acyclo V<br/>- Acic<br/>- Acifur<br/>- Aciclostad<br/>- <i>bactrin</i><br/>- Virzin<br/>- 9-((2-Hydroxyethoxy)methyl)guanine<br/>- Wellcome 248U<br/>- <i>bactrim</i><br/>- ViraxPuren<br/>- Aciclovir<br/>- Herpoviric<br/>- Wellcome-248U<br/>- Antiherpes Creme<b

In [None]:
import html

termos = " "
confirma = False
evolucao = "abd"
strr = ""

# Check each word of the note against the dictionary; on a match, wrap the
# word in a link that carries the matched descriptor's terms and scope.
# The note is split into a list of words because a str is immutable and
# iterating it directly would yield single characters.
palavras = evolucao.split(' ')
for cont, palavra in enumerate(palavras):
    palavra_min = palavra.lower()
    for d in dictMesh.values():
        if not any(t.lower() == palavra_min for t in d['terms']):
            continue
        # Take the terms of the descriptor that actually matched.
        termos = ' '.join(d['terms'])
        # Attribute values are escaped so that quotes or markup inside the
        # scope/terms cannot terminate the attribute early.
        palavras[cont] = (
            '<a href="#" data-ui="das" data-terms="' + html.escape(termos)
            + '" data-scope="' + html.escape(d['scope']) + '">'
            + palavra + '</a>'
        )
        # The first matching descriptor wins for this word.
        break

evolucao = ' '.join(palavras)
strr += ' ' + evolucao

# Display the terms of the last descriptor that matched (" " when none did).
termos

In [None]:
# Display the 'terms' entry of `d`.  `d` is not assigned in this cell, so it is
# whatever descriptor an earlier cell's loop left bound -- presumably the last
# one iterated; verify which cell ran most recently.
d['terms']

In [None]:
# Insert a space before every opening bracket, then show the result.
strr = "meu[nome]".replace('[', ' [')
print(strr)
