### Como manipular XML com XPath

In [61]:
from gensim.models import KeyedVectors
import unicodedata

import gzip
import xml.etree.ElementTree as ET

with gzip.open('pordesc2018-small.xml.gz') as pordesc2018:
    tree = ET.parse(pordesc2018)
    

### Monta Dicionario

In [62]:
from unicodedata import normalize
def remover_acentos(txt):
    return normalize('NFKD', txt).encode('ASCII', 'ignore').decode('ASCII')

In [65]:
wordModel = KeyedVectors.load_word2vec_format('health_w2v_unigram_50.bin', binary=True)

dictMesh = {}
for d in tree.findall("./DescriptorRecord"):
    terms = []
    
    qualifier = '#'
    
    for aql in d.findall('.AllowableQualifiersList/AllowableQualifier/QualifierReferredTo/QualifierName'):
        teste_qualifier = aql.find('./String').text
        if(teste_qualifier == 'anatomy & histology' or teste_qualifier == 'pharmacology' or teste_qualifier == 'methods' or teste_qualifier == 'diagnosis'):
            qualifier = teste_qualifier
            break
    
    for c in d.findall('.ConceptList/'):
        if c.find('./ScopeNote') != None:
            scope = c.find('./ScopeNote').text.replace('\n','').strip()
        for t in c.findall('./TermList/'):
            terms.append(t.find('./String').text)
            
            palavra_similar = []
            
            if t.find('./String').text.lower() in wordModel.vocab:
                sem_assento = remover_acentos(t.find('./String').text)
                palavra_similar = wordModel.most_similar_cosmul(sem_assento.lower(),topn=10)
                for similar, porcentagem in palavra_similar:
                    if(porcentagem > 0.9 and qualifier != 'pharmacology'):
                        ID = d.find('.DescriptorUI').text
                        terms.append("<i>"+similar+"</i>" + " <input type='radio' name='"+similar+"' value='1'/> Certo <input type='radio' name='"+similar+"'' value='0'/> Errado")
                    elif(porcentagem > 0.95 and qualifier == 'pharmacology'):
                        ID = d.find('.DescriptorUI').text
                        terms.append("<i>"+similar+"</i>")
    
    terms.append(" <input type='hidden' name='ID' value='"+ID+"'/> ")                         
    dictMesh[d.find('.DescriptorUI').text] = {
        'ID': d.find('.DescriptorUI').text,
        'name': d.find('.DescriptorName/String').text,
        'scope': scope,
        'terms': sorted(set(terms), reverse=True),
        'qualifier': qualifier
    }
                
    #break
dictMesh

{'D000001': {'ID': 'D000001',
  'name': 'Calcimicina[Calcimycin]',
  'scope': 'An ionophorous, polyether antibiotic from Streptomyces chartreusensis. It binds and transports CALCIUM and other divalent cations across membranes and uncouples oxidative phosphorylation while inhibiting ATPase of rat liver mitochondria. The substance is used mostly as a biochemical tool to study the role of divalent cations in various biological systems.',
  'terms': ['Calcimycin',
   'Calcimicina',
   'Antibiotic A23187',
   'A23187, Antibiotic',
   'A23187',
   'A-23187',
   'A 23187',
   " <input type='hidden' name='ID' value='D000339'/> "],
  'qualifier': 'pharmacology'},
 'D000002': {'ID': 'D000002',
  'name': 'Temefós[Temefos]',
  'scope': 'An organothiophosphate insecticide.',
  'terms': ['Temephos',
   'Temefós',
   'Temefos',
   'Difos',
   'Abate',
   " <input type='hidden' name='ID' value='D000339'/> "],
  'qualifier': 'pharmacology'},
 'D000003': {'ID': 'D000003',
  'name': 'Matadouros[Abattoirs

### Imprime Exemplo

In [57]:
sorted(terms)

[" <input type='hidden' name='ID' value='D000339'/> ",
 'Afibrinogenemia',
 'Afibrinogenemia',
 'Afibrinogenemias',
 'Deficiency, Fibrinogen',
 'Deficiência de Fibrinogênio',
 'Fibrinogen Deficiencies',
 'Fibrinogen Deficiency']

### Salva Dicionario

In [66]:
import gzip, pickle

with gzip.open('dictMesh.dict.gz','wb') as fp:
    pickle.dump(dictMesh,fp)
    fp.close()

### Carrega Dicionario

In [59]:
import gzip, pickle

with gzip.open('dictMesh.dict.gz','rb') as fp:
    dictMesh = pickle.load(fp)
    fp.close()


In [30]:
evolucao = 'Wellcome 248U'
evolucao = evolucao.split(' ')
for i, palavra in enumerate(evolucao):

    ## Busca palavra no Mesh
    for dui in dictMesh:
        d = dictMesh[dui]
        for t in d['terms']:
            new_t = t.replace('<i>', '')
            new_t = new_t.replace('</i>', '')

            if i+1 < (len(evolucao)) and palavra.lower()+" "+evolucao[i+1].lower() == new_t.lower():
                    teste = dictMesh[dui]['terms']
                    termos = '<br/>- '.join(teste)
                    print(termos)
                    break
            elif new_t.lower() == palavra.lower():
                teste = dictMesh[dui]['terms']
                termos = '<br/>- '.join(teste)

9-((2-Hydroxyethoxy)methyl)guanine<br/>- Aci Sanorania<br/>- Aci-Sanorania<br/>- Acic<br/>- Aciclobeta<br/>- Acicloguanosina<br/>- Aciclostad<br/>- Aciclovir<br/>- Aciclovir<br/>- Aciclovir Alonga<br/>- Aciclovir Sanorania<br/>- Aciclovir-Sanorania<br/>- Acifur<br/>- Acipen Solutab<br/>- Acivir<br/>- Activir<br/>- Acyclo V<br/>- Acyclo-V<br/>- Acycloguanosine<br/>- Acyclovir<br/>- Acyclovir Sodium<br/>- Alonga, Aciclovir<br/>- Antiherpes Creme<br/>- Avirax<br/>- Cicloferon<br/>- Clonorax<br/>- Cusiviral<br/>- Genvir<br/>- Herpetad<br/>- Herpofug<br/>- Herpotern<br/>- Herpoviric<br/>- Isavir<br/>- Laciken<br/>- Mapox<br/>- Maynar<br/>- Milavir<br/>- Opthavir<br/>- Sodium, Acyclovir<br/>- Solutab, Acipen<br/>- Supraviran<br/>- Viclovir<br/>- Vipral<br/>- Virax Puren<br/>- Virax-Puren<br/>- ViraxPuren<br/>- Virherpes<br/>- Virmen<br/>- Virolex<br/>- Virupos<br/>- Virzin<br/>- Wellcome 248U<br/>- Wellcome-248U<br/>- Wellcome248U<br/>- Zoliparin<br/>- Zovirax<br/>- Zyclir<br/>- aciclovir vo

In [27]:
termos=" "
confirma = False
evolucao = "abd"
strr = ""

#Verifica a lista para ver se a palavra esta no dicionario
cont = 0
for palavra in evolucao:
    for dui in dictMesh:
        d = dictMesh[dui]
        for t in d['terms']:
            if t.lower() == palavra.lower():
                evolucao[cont] = '<a href="#" data-ui="das" data-term="" data-scope="'+d['scope']+'">'+palavra+'</a>'
                cont +=1

    teste = dictMesh[dui]['terms']
    termos = ' '.join(teste)
evolucao = evolucao.replace('data-terms=""', 'data-terms="'+termos+'"')#pegar os termos da lista e por aqui
strr += ' '+evolucao

#print(termos)
termos

'Fibrinogen Deficiency Deficiência de Fibrinogênio Fibrinogen Deficiencies Afibrinogenemia Afibrinogenemias Deficiency, Fibrinogen'

In [1]:
dictMesh

NameError: name 'dictMesh' is not defined

In [108]:
import gzip, pickle
with gzip.open("dictValida.dict.gz",'rb') as fd:
    dicValida = pickle.load(fd)
    fd.close()
    
dicValida

{'adomen': {'ID': 'D000005', 'target': '1'},
 'adbome': {'ID': 'D000005', 'target': '1'},
 'abm': {'ID': 'D000005', 'target': '0'},
 'abdome': {'ID': 'D000005', 'target': '1'},
 'abd': {'ID': 'D000005', 'target': '0'}}

In [71]:
dicValida['adomen']['target']

'1'