# Create Question Words from MeSH terms

**Links úteis:**

* MeSH Browser: https://meshb.nlm.nih.gov/search

* Unified Medical Language System ®: https://uts.nlm.nih.gov/home.html

* UMLS® Reference Manual [Internet]. 
https://www.ncbi.nlm.nih.gov/books/NBK9685/

* Metathesaurus
https://www.nlm.nih.gov/research/umls/knowledge_sources/metathesaurus/index.html

* Gensim
https://www.pydoc.io/pypi/gensim-3.2.0/index.html



**Question Words:**
* Pares de palavras de domínio amplo:
https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/test/test_data/questions-words.txt (inglês)
https://github.com/nlx-group/LX-DSemVectors/blob/master/testsets/LX-4WAnalogiesBr.txt (português)



**Word Embeddings:**
* Embeddings de Artigos Médicos em Inlgês:
http://evexdb.org/pmresources/vec-space-models/

* Embeddings do PubMed + MIMIC:
https://github.com/ncbi-nlp/BioSentVec#biowordvec-biomedical-word-embeddings-with-fasttext

* Embeddings de Domínio Amplo Português:
http://nilc.icmc.usp.br/embeddings

* Wikipedia: https://fasttext.cc/docs/en/pretrained-vectors.html

* Pucrs: http://www.inf.pucrs.br/linatural/wordpress/recursos-e-ferramentas/word-embeddings-para-saude/






## Mount GoogleDrive

In [0]:
#Montar google drive
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


## Imports

In [0]:
#imports
import xml.etree.ElementTree as ET
import pandas as pd
import random
import requests
import os
import logging
import tarfile


from datetime import datetime
from urllib.request import urlopen
from zipfile import ZipFile
from datetime import datetime, timezone, timedelta
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models import FastText
from gensim.test.utils import datapath

## Log Definitions

In [0]:
#logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

In [0]:
import warnings
warnings.filterwarnings("ignore")

## Files

### Create directorys

In [0]:
dataDir = "./data/"
if not os.path.exists(dataDir):
    os.makedirs(dataDir)

QWDir = "./qw/"
if not os.path.exists(QWDir):
    os.makedirs(QWDir)

### Filenames MeSH and UMLS
(google drive)


In [0]:
# Localização dos arquivos
filenameMESH="/content/drive/My Drive/Embeddings/desc2019.xml"
filenameUMLS="/content/drive/My Drive/Embeddings/MRCONSO.RRF"

#filenameMESH="/content/drive/My Drive/Embeddings - old/mesh/MESH_FILES/xmlmesh/desc2019/desc2019.xml"
#filenameUMLS="/content/drive/My Drive/Embeddings - old/UMLS/2019AA/2019AA/META/MRCONSO.RRF"

### filenames word embeddings

In [0]:
# INGLES
filePubMed= "http://evexdb.org/pmresources/vec-space-models/PubMed-w2v.bin"
fileBioWordVec="https://ftp.ncbi.nlm.nih.gov/pub/lu/Suppl/BioSentVec/BioWordVec_PubMed_MIMICIII_d200.vec.bin"
fileWikipediaENG = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip"

#PORT
filePucrs = "http://grupopln.inf.pucrs.br/health/health_word2vec_300v1.tar.gz"
fileNilc = "http://143.107.183.175:22980/download.php?file=embeddings/fasttext/skip_s300.zip"
fileWikipediaPOR = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.pt.zip"


### filenames question words

In [0]:
QWDominioAmploENG= "https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/test/test_data/questions-words.txt"
QWDominioAmploPOR = "/content/drive/My Drive/Embeddings/LX-4WAnalogiesBr.txt"

### filename models


In [0]:
fileModelPucrs = "/content/data/health_word2vec_300.model"
fileModelNilc = "/content/data/skip_s300.txt"
fileModelWikipediaPOR = "/content/data/wiki.pt.vec"

fileModelWikipediaENG="/content/data/wiki.en.vec"
fileModelPubMed="/content/data/PubMed-w2v.bin"
fileModelBioWordVec="/content/data/BioWordVec_PubMed_MIMICIII_d200.vec.bin"

## Constants

In [0]:
#MESH xml Constants:
DESCRIPTOR_RECORD = "DescriptorRecord"
DESCRIPTOR_UI = "DescriptorUI"
DESCRIPTOR_NAME = "DescriptorName/String"

PHARM_ACTION_LIST = "PharmacologicalActionList/PharmacologicalAction"
DESCRIPTOR_REF_UI = "DescriptorReferredTo/DescriptorUI"
DESCRIPTOR_REF_NAME = "DescriptorReferredTo/DescriptorName/String"


#TERM_LIST = "ConceptList/Concept/TermList"
#TERM_UI = "TermUI" '''

## Functions

## Basic Functions

### log

In [0]:
def log(logInfo):
  dh = datetime.now()
  dh = dh.astimezone(timezone(timedelta(hours=-3)))
  dh = dh.strftime('%d/%m/%Y %H:%M:%S')
  logger.debug(dh + " - " + logInfo)

### logProgress

In [0]:
def logProgress(n, tot, i=6000):
  if(n % i == 0):
    p = (n/tot)*100
    logger.debug("Progress... " + "{:.0f}".format(p) + "%")  

### newEntry

In [0]:
def newEntry(heading, word1UI, word1, word2UI, word2):
  pair = {
        "heading": heading,
        "word1UI":  word1UI,
        "word1":    word1,
        "word2UI":  word2UI,
        "word2":    word2,
    }
  return pair

### clearTerm

In [0]:
def clearTerm(term):
  if("[" in term):
    i = term.index("[")
    term = term[:i]
  
  if("(" in term):
    i = term.index("(")
    term = term[:i]
  
  term = term.strip()
  return term

### downloadFile

In [0]:
def downloadFile(url, filePath, filename=""):  
  if(filename == ""):
    name = url.split("/")[-1]
  else:
    name = filename
  filePath = filePath+name
  
  if(not os.path.exists(filePath)): 
    log("Downloading... " + filePath)
    r = requests.get(url, allow_redirects=True)
    file = open(filePath, 'wb').write(r.content)
    log("File downloaded " + filePath)
  else:
    print("File already downloaded.")
  return filePath

## Dictionaries functions

### loadMeSH


In [0]:
def loadMeSHXML(filename):
  log("Loading MeSH XML")
  tree = ET.parse(filename)
  root = tree.getroot()
  log("MeSH XML Loaded")
  return root

### loadUMLSDict

In [0]:
def loadUMLSDict():
  log("Loading dictionaries file " )
  dictionaries = pd.read_csv(filenameUMLS, index_col = False, sep="|", header=0, 
                      names = ["CUI", "LAT", "TS", "LUI", "STT", "SUI", 
                               "ISPREF", "AUI", "SAUI", "SCUI", "SDUI", "SAB", "TTY",
                               "CODE", "STR", "SRL", "SUPPRESS", "CVF"],
                      error_bad_lines=False)
  
  dictionaries = dictionaries[["CUI", "LAT", "SUI", "TTY", "CODE", "STR"]]  
  
  log("dictionaries file loaded " )
  return dictionaries

### getDictLanguage

In [0]:
def getDictLanguage(lang):
  log("Loading dict " + lang)
  dic = dictionaries[dictionaries["LAT"] == lang]
  log("Dict loaded ")
  return dic


### listTerms

In [0]:
def listTerms(root):
    listTerms = []
    for pDesc in root.findall(DESCRIPTOR_RECORD):
      descriptorUI = pDesc.find(DESCRIPTOR_UI).text  
      descriptorName = pDesc.find(DESCRIPTOR_NAME).text
      listTerms.append(descriptorUI + " - " + descriptorName)  
    return listTerms;

SyntaxError: ignored

### getDictDescriptorName

In [0]:
def getDictDescriptorName(dic, descriptorUI):
  descriptorName = dic[(dic["CODE"] == descriptorUI) & (dic["TTY"] == "MH")]["STR"]
  if(not descriptorName.empty):
    descriptorName = clearTerm(descriptorName.iloc[0])
  else:
    descriptorName = ""

  return descriptorName
  

## Pairs functions

### pairDescriptorPharmacologicalActionFiltering

In [0]:
def pairDescriptorPharmacologicalActionFiltering(root, dic): 
  
  
  pairs = [] 
  notFound = 0
  moreWords = 0
  i = 0
  tot = len(root.findall(DESCRIPTOR_RECORD))
  for pDesc in root.findall(DESCRIPTOR_RECORD): 
      
    logProgress(i, tot)

    i = i + 1
      
    descriptorUI = pDesc.find(DESCRIPTOR_UI).text 
    descriptorName = dic[dic["CODE"] == descriptorUI]["STR"][0:1]
    
    if(descriptorName.empty):
      notFound = notFound + 1  
    else:      
      descriptorName = clearTerm(descriptorName.iloc[0])
      

      if(len(descriptorName.split(" ")) == 1):
        pharmList = pDesc.findall(PHARM_ACTION_LIST) 

        for pharm in pharmList: 
          pharmUI = pharm.find(DESCRIPTOR_REF_UI).text 
          pharmTerm = dic[(dic["CODE"] == pharmUI) & (dic["TTY"] == "MH")]["STR"][0:1]
          
          if(pharmTerm.empty):
            notFound = notFound + 1 
          else:
            pharmTerm = clearTerm(pharmTerm.iloc[0])
            if(len(pharmTerm.split(" ")) == 1):
              pairs.append(newEntry("", descriptorUI, descriptorName.lower(), pharmUI, pharmTerm.lower()))
        
        
      else:
        moreWords = moreWords + 1;
        

  log("Translations not found: " + str(notFound))  
  log("Descriptors with more than one word: " + str(moreWords))  
  log("Total pairs: " + str(len(pairs)))  

  
  return pairs; 


### filterTermsWordEmbbedings

In [0]:
def filterTermsWordEmbbedings(pairs, models):
  pairs2 = pairs.copy()
  
  for model in models:
    print("Filtering " + str(len(pairs2)) )
    for p in pairs:
      if((p['word1'] not in model.wv.index2word[:300000]) or (p['word2'] not in model.wv.index2word[:300000])):
        if(p in pairs2):
          pairs2.remove(p)
  print("Result " + str(len(pairs2)) )
 
  return pairs2

### questionWordsFile2PairsList

In [0]:
def questionWordsFile2PairsList(file):
  pairs = []

  for l in open(file , "r"):
    words = l.split(" ", 4)
    if(len(words) == 4):    
      pairs.append(newEntry("", "", clearTerm(words[0].lower()), "", clearTerm(words[1].lower())))
      pairs.append(newEntry("", "", clearTerm(words[2].lower()), "", clearTerm(words[3].replace("\n", ""))))

  
  return pairs




## File Functions

### createQuestionWordsFile

In [0]:
def createQuestionWordsFile(fileName, pairsList,  section="section", size=1000):
  
  log("Creating File...")
  
  if os.path.exists(fileName):
    os.remove(fileName)

  file  = open(fileName, 'a+')  
  file.write(": "+ section.lower() + " \n")
  i = 0

  lines = []
  
  while i < size:

    logProgress(i, size, 10)
      
    pair1 = random.choice(pairsList)
    pair2 = random.choice(pairsList)
    
    
    if(pair1 != pair2 and (pair1, pair2) not in lines):
      lines.append((pair1, pair2))
      file.write(pair1["word1"] + " " + pair1["word2"] + " " + pair2["word1"] + " " + pair2["word2"]+"\n")
      i = i + 1  
  
  log("File created:" + fileName)

### createCompleteQuestionWordsFile

In [0]:
def createCompleteQuestionWordsFile(fileName, pairsList, section = "section"):  
  
  log("Creating File...")

  lines = []

  if os.path.exists(fileName):
    os.remove(fileName)

  file  = open(fileName, 'a+')  
  section.replace(" ", "-")
  log(section)
  file.write(": "+ section.lower() + " \n")
  
  pairsList1 = pairsList.copy()
  totalPairs = 0  
  ignore = 0
  total = len(pairsList)
  i = 0
  log("Total pairs :" + str(total))
  while len(pairsList1) > 0:   
    i += 1

    pair1 = pairsList1[0];
    word1 = pair1["word1"]
    word2 = pair1["word2"]
    
    for pair2 in pairsList1[1:]:    
        word3 = pair2["word1"]
        word4 = pair2["word2"]

        line = (word1, word2, word3, word4)
        lines.append(line)

          
    pairsList1.pop(0)
       

  mylist = list( dict.fromkeys(lines) )
  lines = list(mylist)


  for line in lines:
    totalPairs +=  1 
    file.write(line[0] + " " + line[1] + " " + line[2] + " " + line[3] + "\n")        

  
  file.close()
  


### fileLineCount

In [0]:
def fileLineCount(fname):
    return sum(1 for line in open(fname))

## Evaluation Functions

### printScore

In [0]:
def printScore(analogy_scores, qwFile):
  log("\n ### Result ### ")
  
  totalPairs = fileLineCount(qwFile) - 1
  testedPairs = len(analogy_scores[1][0]['correct']) + len(analogy_scores[1][0]['incorrect'])
  notTestedPairs = totalPairs - testedPairs
  corrects = len(analogy_scores[1][0]['correct'])
  incorrects =  testedPairs - corrects
  
  if(totalPairs > 0):
    pTested = (testedPairs/totalPairs)*100
    pNotTestedPairs = (notTestedPairs/totalPairs)*100
  else:
    pTested = 0       
    pNotTestedPairs = 0

  if(testedPairs > 0):                                                   
    pCorrects = (corrects/testedPairs)*100
    pIncorrects = (incorrects/testedPairs)*100
  else:
    pCorrects = 0
    pIncorrects = 0
                                             
    
  print("# Score = " + str(analogy_scores[0]))
  print("# Total pairs = " + str(totalPairs))
  print("# Tested pairs = " + str(testedPairs) + " - " + "{:.4f}".format(pTested) + "%")  
  print("# Not Tested pairs = " + str(notTestedPairs) + " - " + "{:.4f}".format(pNotTestedPairs) + "%")  
  print("# Corrects = "  + str(corrects) + " - " + "{:.4f}".format(pCorrects) + "%")  
  print("# Incorrects = "  + str(incorrects) + " - " + "{:.4f}".format(pIncorrects) + "%")  
  #printCorrects(analogy_scores)
  print("\n")



### printCorrects

In [0]:
def printCorrects(analogy_scores):
  print("Corrects:")
  for c in analogy_scores[1][0]['correct']:
      print(c)

### printIncorrects

In [0]:
def printIncorrects(analogy_scores):
  print("Incorrects:")
  for i in analogy_scores[1][0]['incorrect']:
      print(i)
  

##Create Question Words Files


In [0]:
root = loadMeSHXML(filenameMESH)
dictionaries = loadUMLSDict() 

### Define idioma dos experimentos

In [0]:
lang = 'pt' # ou 'en'

### Português

In [0]:
if lang == 'pt':
  tf = tarfile.open(downloadFile(filePucrs, dataDir))
  tf.extractall(dataDir)

In [0]:
if lang == 'pt': fn = downloadFile(fileNilc, dataDir)

In [0]:
if lang == 'pt':
  zip = ZipFile(fn)
  zip.extractall(dataDir)

In [0]:
if lang == 'pt':
  zip = ZipFile(downloadFile(fileWikipediaPOR, dataDir))
  zip.extractall(dataDir)

In [0]:
if lang == 'pt':
  # Carregar modelos: PUCRS 
  modelPucrs = Word2Vec.load(fileModelPucrs)

  # Carregar modelo Nilc
  modelNilc = KeyedVectors.load_word2vec_format(fileModelNilc)

  # Carregar modelo Wikipedia
  modelWikipediaPOR = KeyedVectors.load_word2vec_format(fileModelWikipediaPOR)

In [0]:
# Separar pares (pharm - pharmAction)
if lang == 'pt': pairsPT = pairDescriptorPharmacologicalActionFiltering(root, getDictLanguage("POR"))

In [0]:
# Filtrar por pares conhecidos
if lang == 'pt':
  pairsPucrs = filterTermsWordEmbbedings(pairsPT, [modelPucrs])
  print("# Total de pares no modelo da Pucrs: " + str(len(pairsPucrs)) + "\n")

  pairsNilc = filterTermsWordEmbbedings(pairsPT, [modelNilc])
  print("# Total de pares no modelo Nilc: " + str(len(pairsNilc)) + "\n")

  pairsWiki = filterTermsWordEmbbedings(pairsPT, [modelWikipediaPOR])
  print("# Total de pares no modelo Wikipedia: " + str(len(pairsWiki)) + "\n")

  pairsTotal = filterTermsWordEmbbedings(pairsPT, [modelPucrs, modelNilc, modelWikipediaPOR])
  print("# Total de pares em todos modelos : " + str(len(pairsTotal)) + "\n")

Filtering 2002
Filtering 2002
Result 128
# Total de pares no modelo da Pucrs: 128

Filtering 2002
Result 128
# Total de pares no modelo da Pucrs: 128

Filtering 2002
Result 269
# Total de pares no modelo Nilc: 269

Filtering 2002
Result 269
# Total de pares no modelo Nilc: 269

Filtering 2002
Result 253
# Total de pares no modelo Wikipedia: 253

Filtering 2002
Result 253
# Total de pares no modelo Wikipedia: 253

Filtering 2002
Filtering 128
Filtering 128
Filtering 52
Filtering 52
Result 50
# Total de pares em todos modelos : 50

Result 50
# Total de pares em todos modelos : 50



In [0]:
# Gerar arquivo completo
if lang == 'pt':
  QW_Pharm_POR=QWDir+"QW_POR.txt"
  createCompleteQuestionWordsFile(QW_Pharm_POR, pairsTotal, section="pharm + pharmAction")
  print("Total de pares no arquivo: " + str(fileLineCount(QW_Pharm_POR) - 1))

In [0]:
if lang == 'pt': 
  pairsQW = questionWordsFile2PairsList(QWDominioAmploPOR)


  # Filtrar por pares conhecidos
if lang == 'pt':
  pairsPucrs = filterTermsWordEmbbedings(pairsQW, [modelPucrs])
  print("# Total de pares no modelo da Pucrs: " + str(len(pairsPucrs)) + "\n")

  pairsNilc = filterTermsWordEmbbedings(pairsQW, [modelNilc])
  print("# Total de pares no modelo Nilc: " + str(len(pairsNilc)) + "\n")

  pairsWiki = filterTermsWordEmbbedings(pairsQW, [modelWikipediaPOR])
  print("# Total de pares no modelo Wikipedia: " + str(len(pairsWiki)) + "\n")

  pairsTotal = filterTermsWordEmbbedings(pairsQW, [modelPucrs, modelNilc, modelWikipediaPOR])
  print("# Total de pares em todos modelos : " + str(len(pairsTotal)) + "\n")

Filtering 35116
Result 7199
# Total de pares no modelo da Pucrs: 7199

Filtering 35116
Result 24874
# Total de pares no modelo Nilc: 24874

Filtering 35116
Result 24350
# Total de pares no modelo Wikipedia: 24350

Filtering 35116
Filtering 7199
Filtering 7199
Result 7199
# Total de pares em todos modelos : 7199



In [0]:
# Gerar arquivo completo
if lang == 'pt':
  QWDominioAmploPOR_2=QWDir+"QW_POR_2.txt"
  createCompleteQuestionWordsFile(QWDominioAmploPOR_2, pairsTotal, section="pharm + pharmAction")
  print("Total de pares no arquivo: " + str(fileLineCount(QWDominioAmploPOR_2) - 1))

Total de pares no arquivo: 6947


### Inglês

In [0]:
#if lang == 'en': downloadFile(filePubMed, dataDir)

In [0]:
if lang == 'en':
  zip = ZipFile(downloadFile(fileWikipediaENG, dataDir))
  zip.extractall(dataDir)

In [0]:
# Carregar modelo PubMed
if lang == 'en': modelPubMed = KeyedVectors.load_word2vec_format(fileModelPubMed, binary=True)

In [0]:
if lang == 'en':
  # Carregar Modelo Bioword
  modelBioWordVec = KeyedVectors.load_word2vec_format(downloadFile(fileBioWordVec, dataDir), binary=True)

  # Carregar modelo Wikipedia
  modelWikipediaENG =  KeyedVectors.load_word2vec_format(fileModelWikipediaENG)

In [0]:
if lang == 'en':
  # Gerar pares com 1 palavra
  pairsENG = pairDescriptorPharmacologicalActionFiltering(root, getDictLanguage("ENG"))

  # Pares existentes no modelo PubMed
  pairsPubMed = filterTermsWordEmbbedings(pairsENG, [modelPubMed])

In [0]:
if lang == 'en':
  # Pares existentes no modelo BioWordVec
  pairsBioWordVec = filterTermsWordEmbbedings(pairsENG, [modelBioWordVec])

  # Pares existentes no modelo Wikipedia
  pairsWikipedia = filterTermsWordEmbbedings(pairsENG, [modelWikipediaENG])

  # Pares existentes nos 2 modelos
  totalPairsENG = filterTermsWordEmbbedings(pairsENG, [modelPubMed, modelBioWordVec, modelWikipediaENG])

In [0]:
# Gerar um arquivo com no máximo 1000 pares
if lang == 'en':
  QW_Pharm_ENG=QWDir + "/QW_ENG.txt"
  createQuestionWordsFile(QW_Pharm_ENG, totalPairsENG,  size=1225)
  print("Total de pares no arquivo: " + str(fileLineCount(QW_Pharm_ENG) - 1))

# Evaluate Word Embeddings

## Português

### Domínimo Específico

In [0]:
# Avaliar:
if lang == 'pt':
  asPucrsEspPOR = modelPucrs.wv.evaluate_word_analogies(QW_Pharm_POR, case_insensitive=True)
  printScore(asPucrsEspPOR, QW_Pharm_POR)

  asNilcEspPOR = modelNilc.evaluate_word_analogies(QW_Pharm_POR, case_insensitive=True)
  printScore(asNilcEspPOR, QW_Pharm_POR)

  asWikiEspPOR = modelWikipediaPOR.evaluate_word_analogies(QW_Pharm_POR, case_insensitive=True)
  printScore(asWikiEspPOR, QW_Pharm_POR)

### Domínimo Amplo

In [0]:
# Avaliar:
if lang == 'pt':
  asPucrsAmpPOR = modelPucrs.wv.evaluate_word_analogies(QWDominioAmploPOR_2, case_insensitive=True)
  printScore(asPucrsAmpPOR, QWDominioAmploPOR_2)

  asNilcAmpPOR = modelNilc.evaluate_word_analogies(QWDominioAmploPOR_2, case_insensitive=True)
  printScore(asNilcAmpPOR, QWDominioAmploPOR_2)

  asWikiAmpPOR = modelWikipediaPOR.evaluate_word_analogies(QWDominioAmploPOR_2, case_insensitive=True)
  printScore(asWikiAmpPOR, QWDominioAmploPOR_2)

# Score = 0.04563120771556067
# Total pairs = 6947
# Tested pairs = 6947 - 100.0000%
# Not Tested pairs = 0 - 0.0000%
# Corrects = 317 - 4.5631%
# Incorrects = 6630 - 95.4369%


# Score = 0.2149129120483662
# Total pairs = 6947
# Tested pairs = 6947 - 100.0000%
# Not Tested pairs = 0 - 0.0000%
# Corrects = 1493 - 21.4913%
# Incorrects = 5454 - 78.5087%


# Score = 0.25435439758168993
# Total pairs = 6947
# Tested pairs = 6947 - 100.0000%
# Not Tested pairs = 0 - 0.0000%
# Corrects = 1767 - 25.4354%
# Incorrects = 5180 - 74.5646%




## Inglês

### Domínio Específico

In [0]:
if lang == 'en':
  asPubMedEspENG = modelPubMed.evaluate_word_analogies(QW_Pharm_ENG, case_insensitive=True)
  printScore(asPubMedEspENG, QW_Pharm_ENG)


In [0]:
if lang == 'en':
  asBioWordVecEspENG = modelBioWordVec.evaluate_word_analogies(QW_Pharm_ENG, case_insensitive=True)
  printScore(asBioWordVecEspENG, filenameQW_ENG)

  asWikipediaEspENG = modelWikipediaENG.evaluate_word_analogies(QW_Pharm_ENG, case_insensitive=True)
  printScore(asWikipediaEspENG, filenameQW_ENG)

### Domínio Amplo

In [0]:
if lang == 'en':
  asPubMedAmpENG = modelPubMed.evaluate_word_analogies(datapath('questions-words.txt'), case_insensitive=True)
  printScore(asPubMedAmpENG, datapath('questions-words.txt'))

In [0]:
if lang == 'en':
  asBioWordVecAmpENG = modelBioWordVec.evaluate_word_analogies(datapath('questions-words.txt'), case_insensitive=True)
  printScore(asBioWordVecAmpENG, QWDominioAmploENG)

  asWikipediaAmpENG = modelWikipediaENG.evaluate_word_analogies(datapath('questions-words.txt'), case_insensitive=True)
  printScore(asWikipediaAmpENG, QWDominioAmploENG)

## Other Languages

In [0]:
languages = list(dictionaries["LAT"].unique())
languages.remove("ENG")
languages.remove("POR")
print(languages)


In [0]:
for lan in languages:
  print("Creating file for language: " + lan)
  pairs = pairDescriptorPharmacologicalActionFiltering(root, getDictLanguage(lan))
  if len(pairs) > 0:
    createQuestionWordsFile("qw/QW_"+lan+".txt", pairs,  size=2500)