## SHOWS PMR STATS

In [1]:
### LOAD ONTOLOGY DICTIONARIES
from nltk.corpus import stopwords
import gzip, pickle, re
# Load the pickled ontology dictionaries. The context manager closes the gzip
# handle even if unpickling raises, which the manual close() call did not.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a dataSource/pmr_onto.gz that comes from a trusted source.
with gzip.GzipFile('dataSource/pmr_onto.gz', 'rb') as file:
    ontologies = pickle.load(file)

### LOAD STOPWORDS
# Rebinds `stopwords` from the imported nltk corpus reader to a plain set of
# English stopwords; the code below relies on the set.
stopwords = set(stopwords.words('english'))

### SEPARATE TEXT INTO TOKENS
def tokenise(text):
    """Return the lower-cased token texts of `text`, in document order.

    Empty or whitespace-only input yields an empty list without touching
    the `nlp` pipeline (which is not defined in the visible cells).
    """
    if not text.strip():
        return []
    lowered = []
    for sentence in nlp(text).sentences:
        for token in sentence.tokens:
            lowered.append(token.text.lower())
    return lowered

### SEPARATE TEXT INTO TOKENS AND REMOVE STOPWORDS
def stopAndToken(text):
    """Tokenise `text` and drop every token that is in the `stopwords` set."""
    return [token for token in tokenise(text) if token not in stopwords]

### ORGANISED K,V WHERE K IS CLASSID, V IS A SET OF TERMS IN THE ONTOLOGY CLASS
ontoClasses = {}
for ontology in ontologies.values():
    for rawID, features in ontology['data'].items():
        # Drop '_' and ':' separators and the 'OPB#' prefix to get the short class ID.
        shortID = ''.join(re.split('_|:', rawID)).replace('OPB#','')
        # features[0] is a single entry, features[1] a list; only string entries contribute terms.
        labels = [features[0]] + features[1]
        words = {
            word
            for label in labels if isinstance(label, str)
            for word in label.lower().split()
        }
        ontoClasses[shortID] = words - stopwords

In [None]:
# extract data source
import zipfile, os
# Unpack experiment_data.zip into the current working directory.
workDir = os.path.abspath('.')
print(workDir)
archivePath = os.path.join(workDir, 'experiment_data.zip')
with zipfile.ZipFile(archivePath, 'r') as zip_ref:
    zip_ref.extractall(workDir)

In [4]:
### NORMALISED LINK RELATED TO MODEL
def normalisedLink(link):
    """Rewrite a '/rawfile/<revision>/<path>' link to '/rawfile/HEAD/<path>'.

    Links without a '/rawfile/' segment are returned unchanged. The part
    before the first '/rawfile/' is kept, and the first path component after
    the last '/rawfile/' is replaced by 'HEAD'.
    """
    pieces = link.split('/rawfile/')
    if len(pieces) <= 1:
        return link
    head, tail = pieces[0], pieces[-1]
    # Drop everything up to and including the first '/', if there is one.
    remainder = tail.split('/', 1)[-1]
    return head + '/rawfile/HEAD/' + remainder

### NORMALISED FULL FORMAT CLASSID TO SHORTER CLASSID
def getShortID(bioClass):
    """Shorten a full class URI to a compact ID, e.g. '.../CHEBI_24870' -> 'CHEBI24870'.

    Angle brackets and surrounding whitespace are removed first. An empty
    string is returned when the value does not start with 'http' or when the
    last path segment contains neither '_' nor ':'.
    """
    uri = bioClass.replace('<', '').replace('>', '').strip(' \t\n\r')
    if not uri.startswith('http'):
        return ''
    localName = uri.rsplit('/', 1)[-1]
    if '_' not in localName and ':' not in localName:
        return ''
    # Normalise '_' to ':' and then drop only the first ':' separator.
    prefix, _, rest = localName.replace('_', ':').partition(':')
    return prefix + rest

### SHOW PMR STATISTIC
import json
print("""STATISTIC PMR""")
with open('dataSource/listOfObjects.txt', 'r') as fp:
    lines = fp.readlines()
print("Number of objects: ",len(lines))
# Keep http objects that are not model links. Only the line terminator is
# stripped: slicing off the last character would truncate a final line that
# has no trailing newline.
newLines = {line.rstrip('\n') for line in lines if line[:4]=='http' and 'models' not in line}
print("Number of ontology links: ",len(newLines))
# Reduce each link to its last path segment, with ':' normalised to '_'.
newLines = {line[line.rfind('/')+1:].replace(':','_') for line in newLines}
print("Number of distinct ontologies: ",len(newLines),"\n")
with open('dataSource/rdfPaths.json', 'r') as fp:
    rdfPaths = json.load(fp)
print("Number of links with rdf", len(rdfPaths))

# ORGANISED LINK ANNOTATED WITH ONTOLOGY CLASSES TO validLinks
# ORGANISED ONTOLOGY CLASSES USED IN LINK TO validOnto
validLinks = {}
validOnto = set()
for rdfPath in rdfPaths:
    for triple in rdfPath['paths']:
        target = triple['o']
        # Only external http objects are candidates; links back to the model repository are skipped.
        if target[:4] != 'http' or 'models.phys' in target:
            continue
        # normalised the link
        link = normalisedLink(rdfPath['link'])
        classID = getShortID(target)
        if classID not in ontoClasses:
            continue
        entry = validLinks.setdefault(link, {'terms': set(), 'classes': {}})
        entry['terms'].update(ontoClasses[classID])
        entry['classes'][classID] = ontoClasses[classID]
        validOnto.add(classID)
print("Number of models annotated to ontology class: ",len(validLinks))
print("Number of URIs Ontology Classes: ",len(validOnto))

STATISTIC PMR
Number of objects:  3739
Number of ontology links:  573
Number of distinct ontologies:  529 

Number of links with rdf 204
Number of models annotated to ontology class:  184
Number of URIs Ontology Classes:  521


## TEST DATA PREPARATION

In [7]:
### LOAD QUERY AND ANSWERS FROM QUERY LOG
import ujson as json
site = 'https://models.physiomeproject.org/'
with open('dataSource/query_workspace.json','r') as fp:
    queryWorkspaces = json.load(fp)
print('The total number of raw query and answer is: ',len(queryWorkspaces))

### NORMALISED ALL QUERY AND ANSWERS
# Each answer list is rewritten in place.
for answers in queryWorkspaces.values():
    answers[:] = [normalisedLink(answer) for answer in answers]

### LOAD CLUSTER
with open('dataSource/cellmlClusterer.json','r') as fp:
    cellmlClusters = json.load(fp)

### ENRICH QUERY AND ANSWERS WITH CLUSTER
# Every answer that belongs to a cluster pulls in all members of that cluster;
# the answers become a set from here on.
for query, answers in queryWorkspaces.items():
    enriched = set(answers)
    for answer in answers:
        if answer in cellmlClusters['url2Cluster']:
            clusterID = cellmlClusters['url2Cluster'][answer]
            enriched.update(set(cellmlClusters['cluster'][clusterID]))
    queryWorkspaces[query] = enriched

### REMOVE QUERY AND ANSWERS WHERE THE ANSWERS IS NOT ANNOTATED WITH ONTOLOGY CLASSES
for query in list(queryWorkspaces):
    answers = queryWorkspaces[query]
    unannotated = {answer for answer in answers if (site + answer) not in validLinks}
    answers.difference_update(unannotated)
    if not answers:
        del queryWorkspaces[query]

print('The number of query and answer annotated with ontology classes: ',len(queryWorkspaces))

The total number of raw query and answer is:  771
The number of query and answer annotated with ontology classes:  261


In [9]:
### FILTER FOR QUERIES HAVING TERMS INDEXED BY ONTOLOGY CLASSES (CONSIDERING PREFERRED LABEL AND SYNONYM ONLY)
from NLIMED import NLIMED
# nli = NLIMED(repo='pmr', parser='benepar', pl=1, alpha=3, beta=1, gamma=1, delta=1, theta=1, cutoff=0, quite=True)
nli = NLIMED(repo='pmr', parser='CoreNLP', pl=1, alpha=3, beta=2, gamma=0, delta=0, theta=0, cutoff=0, tfMode=1, quite=True)

# Keep only the queries for which the annotator returns at least one result,
# mapping each kept answer link to its validLinks entry.
native_QR_PMR = {}
for position, (query, answers) in enumerate(queryWorkspaces.items()):
    if len(nli.getAnnotated(query)['result']) > 0:
        native_QR_PMR[query] = {
            answer: validLinks[site+answer]
            for answer in answers
            if site+answer in validLinks
        }

    # progress indicator
    if position % 20 == 0:
        print(position, end=' ')

print('The number of query and answes annotated with ontology classes, considering preferred label and synonym only: ', len(native_QR_PMR))

0 20 40 60 80 100 120 140 160 180 200 220 240 260 The number of query and answes annotated with ontology classes, considering preferred label and synonym only:  110


In [10]:
# FILTER NATIVE QUERY TO REMOVE A MODEL WHERE IT'S CLASS ONTOLOGY FEATURES DO NOT CONTAIN TERMS IN ITS QUERY
onto_native_QR_PMR = {}
for query, answers in native_QR_PMR.items():
    queryTerms = query.split()
    # A query is kept as soon as any of its whitespace-separated terms occurs
    # in the ontology terms of any of its annotated answers.
    sharesTerm = any(
        term in validLinks[site+answer]['terms']
        for answer in answers if site+answer in validLinks
        for term in queryTerms
    )
    if sharesTerm:
        onto_native_QR_PMR[query] = answers
print('The number of query and answer where terms in ontology classes appear in query: ', len(onto_native_QR_PMR))

The number of query and answer where terms in ontology classes appear in query:  46


In [21]:
### GET A CLASS WITH MAXIMUM PROPORTION OF TERMS IN QUERY TO TERMS IN CLASS ONTOLOGY
# onto_native_QR_PMR_max ==> query to max poportion in onto_native_QR_PMR
# native_QR_PMR_max ==> query to max poportion in native_QR_PMR

onto_native_QR_PMR_max = {}
for q, v in onto_native_QR_PMR.items():
    qTerms = set(q.split())
    best = 0
    for link in v:
        for classTerms in validLinks[site+link]['classes'].values():
            # A class whose terms were all stopwords (or that had no string
            # features) has an empty term set; skip it rather than divide by zero.
            if not classTerms:
                continue
            proportion = len(qTerms & classTerms) / len(classTerms)
            if proportion > best:
                best = proportion
    onto_native_QR_PMR_max[q] = best

# Queries with no term overlap at all get proportion 0.
native_QR_PMR_max = onto_native_QR_PMR_max.copy()
native_QR_PMR_max.update({q:0 for q in native_QR_PMR if q not in onto_native_QR_PMR_max})

In [11]:
### SAVE / LOAD THE FOUNDED QUERY-RESULTS PAIRS FROM QUERY LOGS
if 'native_QR_PMR' not in globals():
    # Nothing was computed in this kernel session: restore the saved pairs.
    with open('saveJson/native_QR_PMR.json', 'r') as fp:
        native_QR_PMR = json.load(fp)
else:
    # Convert each value to a list so the pairs can be serialised to JSON, then save.
    native_QR_PMR = {query: list(answers) for query, answers in native_QR_PMR.items()}
    with open('saveJson/native_QR_PMR.json', 'w') as fp:
        json.dump(native_QR_PMR, fp)

In [12]:
### GET QUERY RESULTS UTILISING CoreNLP NLIMED (SUPPOSSED TO BE THE BEST SETTING)
def getNliResults(nli):
    """Run every query in `native_QR_PMR` through `nli` and collect model paths.

    For each query, every entry of `nli.getModels(query)['results']` is turned
    into '<graph without site prefix>/rawfile/HEAD/<Model_entity up to "#">'.
    Duplicates and paths ending in 'sedml' are dropped; result order is kept.
    A progress counter is printed every 20 queries.

    Returns:
        dict mapping each query to its ordered list of model paths.
    """
    nliResults = {}
    for position, query in enumerate(native_QR_PMR.keys()):
        if position % 20 == 0:
            print(position, end=' ')
        ranked = []
        for hit in nli.getModels(query)['results']:
            entity = hit['Model_entity']
            workspace = hit['graph'].replace('https://models.physiomeproject.org/','')
            path = workspace + '/rawfile/HEAD/' + entity[:entity.find('#')]
            if path.endswith('sedml') or path in ranked:
                continue
            ranked.append(path)
        nliResults[query] = ranked
    return nliResults


In [13]:
### GET MEAN AVERAGE PRECISION (MAP)
def getMAP(results, reverence, k, isVerbose = False):
    """Mean average precision at `k` of `results` against a reference set.

    For each query in `reverence`, the first `k` entries of `results[query]`
    are scanned. The query's average precision is the sum of precision-at-rank
    over the relevant hits, divided by the number of relevant hits found in
    the top `k`. Queries with no hit contribute 0.

    Args:
        results: mapping of query -> ordered list of retrieved models.
        reverence: mapping of query -> collection of relevant models (the reference).
        k: number of top results to consider.
        isVerbose: when True, print the running state at every relevant hit.

    Returns:
        dict with 'map' (summed AP / number of reference queries), 'rate'
        (share of queries with at least one hit), 'identify' (count of such
        queries) and 'totRel' (number of reference queries). All values are
        zero for an empty reference instead of raising ZeroDivisionError.

    Raises:
        KeyError: if a reference query is missing from `results`.
    """
    totRel = len(reverence)
    if totRel == 0:
        return {'map': 0.0, 'rate': 0.0, 'identify': 0, 'totRel': 0}

    totIdentify = 0
    totAP = 0
    for query, relModels in reverence.items():
        retrieved = results[query]
        relevances = [0]*k
        hits = 0  # running count of relevant results, replaces repeated prefix sums
        ap = 0
        for i in range(min(k, len(retrieved))):
            if retrieved[i] in relModels:
                relevances[i] = 1
                hits += 1
                ap += hits/(i+1)
                if isVerbose:
                    print(i+1, ap, hits, relevances, query)
        if hits > 0:
            totAP += ap/hits
            totIdentify += 1
    return {'map':totAP/totRel, 'rate':totIdentify/totRel, 'identify':totIdentify, 'totRel':totRel}
                

# CALCULATE MAP FOR EACH SETTING

In [14]:
# LOAD NLIMED BEST SETTINGS FROM NLIMED SUMMARY RESULTS
import  ujson as  json
with open('saveJson/nlimedSummaryResults.json', 'r') as fp:
    nliSettings = json.load(fp)

In [15]:
# ANALYSIS OF THE BEST CUTOFF AND THE NUMBER OF RETURN
# RETURNING DATAFRAME OF CUTOFF AND NUMBER RETURN, AND THE BEST SETTING
import numpy as np
import pandas as pd
# Alias (not a copy) of the loaded summary results; analyseData reads it under this name.
dataAucPR = nliSettings
def analyseData(idxType, tfMode, parser, recallType, limit=0):
    """Tabulate precision/recall per cutoff and derive a setting from the best F-measure.

    Reads `dataAucPR[idxType][tfMode][parser]`: the `recallType` entry supplies
    'settings', 'precisions' and 'recalls', and the parser entry supplies
    'cutoffs' (truncated to the number of recalls).

    Args:
        idxType, tfMode, parser, recallType: keys into `dataAucPR`.
        limit: factor used for the 'preclimit' column, 1/(1 + recall*limit).

    Returns:
        (df, setting). `df` has one row per cutoff with columns cutoff, recall,
        precision, fmeasure and preclimit, sorted by fmeasure then precision,
        descending. `setting` is the column-wise mean of the best settings
        without its last entry, followed by the mean of element [0] and the
        floored mean of element [1] of the cutoffs that reach the highest
        F-measure within the first 50 sorted rows.
        NOTE(review): each cutoff entry is assumed to be a (cutoff, return
        number) pair - confirm against the summary file.
    """
    # Local import: the notebook only imports math in a later cell.
    import math

    print(idxType, tfMode, parser, recallType)
    maxResult = dataAucPR[idxType][tfMode][parser][recallType]
    print('Best settings: ',maxResult['settings'])
    meanMaxSetting = np.mean(np.array(maxResult['settings']),axis=0)
    print('Mean of best setting: ', meanMaxSetting)
    precisions = maxResult['precisions']
    recalls = maxResult['recalls']
    cutoffs = dataAucPR[idxType][tfMode][parser]['cutoffs'][:len(recalls)]
    data = []
    for i, cutoff in enumerate(cutoffs):
        denominator = precisions[i]+recalls[i]
        # F-measure is defined as 0 when precision and recall are both 0
        # (the plain formula would divide by zero).
        fmeasure = (precisions[i]*recalls[i])/denominator*2 if denominator else 0.0
        precLimit = 1/(1+(recalls[i]*limit))
        data += [{'cutoff':cutoff, 'recall':recalls[i], 'precision':precisions[i], 'fmeasure':fmeasure, 'preclimit':precLimit}]
    df = pd.DataFrame(data)
    df = df.sort_values(['fmeasure', 'precision'], ascending=False)

    # get rate of cutoff and return number
    maxFmeasure = df['fmeasure'].iloc[:50].max()
    dfMax = df.loc[df['fmeasure'] == maxFmeasure]
    maxCutoff = dfMax['cutoff'].map(lambda x: x[0]).mean()
    maxReturn = math.floor(dfMax['cutoff'].map(lambda x: x[1]).mean())

    setting = list(meanMaxSetting[:-1]) + [maxCutoff, maxReturn]
    return df, setting

In [16]:
### GET RESULTS FROM NCBO
# Baseline: retrieve models using the 'ncbo' parser and score them at three depths.
nliNcbo = NLIMED(repo='pmr', parser='ncbo')
nliNcboResult = getNliResults(nliNcbo)
ncboMap5 = getMAP(nliNcboResult, native_QR_PMR,5)
ncboMap10 = getMAP(nliNcboResult, native_QR_PMR,10)
# Previously computed with k=10, which made 'map1000' a copy of 'map10'.
ncboMap1000 = getMAP(nliNcboResult, native_QR_PMR,1000)

0 	http://bhi.washington.edu/OPB#OPB_00506 is not in cellml, it is ignored
	http://purl.obolibrary.org/obo/NCBITaxon_9606 is not in cellml, it is ignored
	http://bhi.washington.edu/OPB#OPB_00506 is not in cellml, it is ignored
	http://purl.org/sig/ont/fma/fma7088 is not in cellml, it is ignored
	http://purl.org/sig/ont/fma/fma9637 is not in cellml, it is ignored
	http://purl.org/sig/ont/fma/fma62343 is not in cellml, it is ignored
	http://purl.org/sig/ont/fma/fma7088 is not in cellml, it is ignored
	http://purl.obolibrary.org/obo/CHEBI_24870 is not in cellml, it is ignored
	http://purl.obolibrary.org/obo/UBERON_0000467 is not in cellml, it is ignored
	http://purl.obolibrary.org/obo/UBERON_0003978 is not in cellml, it is ignored
	http://purl.org/sig/ont/fma/fma62343 is not in cellml, it is ignored
	http://purl.obolibrary.org/obo/PR_000003197 is not in cellml, it is ignored
	http://purl.obolibrary.org/obo/CHEBI_53439 is not in cellml, it is ignored
	http://purl.org/sig/ont/fma/fma83109 i

In [17]:
#  GET MAP FOR EACH SETTING
import numpy as np
import shutil
import os
import math
import copy
from NLIMED import NLIMED, __file__

# `__file__` is the name imported from NLIMED above, not this notebook's own
# path, so both locations are resolved relative to that module's directory.
# dest: the 'indexes' directory that moveIndex copies into.
dest = os.path.join(os.path.dirname(__file__), 'indexes')
# source: parent of the per-index-mode subdirectories that moveIndex copies from.
source = os.path.join(os.path.dirname(__file__), 'indexes/pmr_inv')

def moveIndex(idxMode):
    """Copy every entry of `source/<idxMode>` into `dest`, keeping the file names.

    Existing files with the same name in `dest` are overwritten by shutil.copy.
    """
    modeDir = os.path.join(source, idxMode)
    for name in os.listdir(modeDir):
        shutil.copy(os.path.join(modeDir, name), os.path.join(dest, name))
        
def getMapForAllSettings(isBest = False, cutoff=0, pl=0, multipliers=None, resultSettings=None):
    """Run NLIMED for every stored configuration, record its MAP, and save the summary.

    For each (index mode, TF mode, parser) entry in `resultSettings` that has a
    'maxAuc100' record, the setting from analyseData is adjusted, an NLIMED
    instance is configured with it, and the query results plus MAP@5, MAP@10
    and MAP@1000 are stored back into that record. Entries that already hold
    'results' are skipped. The NCBO baseline is added under the 'ncbo' key and
    the whole structure is written to
    'saveJson/nlimedSummaryResultsComplete<suffix>.json'.

    Args:
        isBest: when True, keep the cutoff chosen by analyseData; otherwise
            replace it with `cutoff` and, if given, apply `multipliers`.
        cutoff: cutoff written to setting[-2] when `isBest` is False.
        pl: when > 0, written to setting[-1]; 0 keeps analyseData's value.
        multipliers: optional sequence overwriting the leading setting entries
            (only when `isBest` is False).
        resultSettings: nested dict to process; it is modified in place. A
            fresh dict is created when omitted, so calls no longer share
            (and mutate) one default object.
    """
    if resultSettings is None:
        resultSettings = {}
    for idxMode, v1 in resultSettings.items():
        if idxMode=='ncbo': continue
        # Copy this index mode's files into the shared index directory first.
        moveIndex(idxMode)
        for tfMode, v2 in v1.items():
            for parser, v3 in v2.items():
                if isinstance(v3,list): continue
                if 'maxAuc100' not in v3: continue
                df, setting = analyseData(idxMode, tfMode, parser,'maxAuc100')
                if pl > 0:
                    setting[-1] = pl
                if not isBest:
                    setting[-2] = cutoff
                    if multipliers is not None:
                        for i, v in enumerate(multipliers): setting[i]=v
                if 'results' in v3['maxAuc100']:
                    print(idxMode, tfMode, parser, setting, 'SKIPPED')
                    continue
                print(idxMode, tfMode, parser, setting)
                nli = NLIMED(repo='pmr', parser=parser)
                nli.setWeighting(*setting)

                nliResult = getNliResults(nli)
                v3['maxAuc100']['results'] = nliResult
                v3['maxAuc100']['map5'] = getMAP(nliResult, native_QR_PMR,5)
                v3['maxAuc100']['map10'] = getMAP(nliResult, native_QR_PMR,10)
                v3['maxAuc100']['map1000'] = getMAP(nliResult, native_QR_PMR,1000)
                print('\nMAP5:',v3['maxAuc100']['map5'],' \tMAP10', v3['maxAuc100']['map10'],'\n')
    ### SET RESULTS FROM NCBO
    resultSettings['ncbo'] = {'results':nliNcboResult, 'map5':ncboMap5, 'map10':ncboMap10, 'map1000':ncboMap1000}
    ### SAVE TO FILE
    # Suffix encodes the run: multipliers (if any), then cutoff or 'best', then pl or 'best'.
    addName = '_'+'_'.join(str(m) for m in multipliers) if multipliers is not None else ''
    addName += '_best' if isBest else '_'+str(cutoff)
    addName += '_best' if pl == 0 else '_'+str(pl)
    with open('saveJson/nlimedSummaryResultsComplete'+addName+'.json', 'w') as fp:
        json.dump(resultSettings, fp)
        
# initialised resultSettings
# One independent deep copy of the loaded settings per experiment run below.
resultSettings = [copy.deepcopy(nliSettings) for _ in range(6)]

In [33]:
# Run 0: isBest=True and pl=0, so the cutoff and last setting entry from analyseData are kept.
getMapForAllSettings(isBest=True, resultSettings=resultSettings[0])

wple mode3 stanza maxAuc100
Best settings:  [[3.0, 0.5, 0.0, 0.0, 0.21, 0]]
Mean of best setting:  [3.   0.5  0.   0.   0.21 0.  ]
wple mode3 stanza [3.0, 0.5, 0.0, 0.0, 0.21, 0.0, 1]
0 

KeyboardInterrupt: 

In [339]:
# Run 1: keeps analyseData's cutoff (isBest=True) but forces the last setting entry to pl=5.
getMapForAllSettings(isBest=True, pl=5, resultSettings=resultSettings[1])

wple mode3 stanza maxAuc100
Best settings:  [[3.0, 0.5, 0.0, 0.0, 0.21, 0]]
Mean of best setting:  [3.   0.5  0.   0.   0.21 0.  ]
wple mode3 stanza [3.0, 0.5, 0.0, 0.0, 0.21, 0, 5]
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 
MAP5: {'map': 0.090551500405515, 'rate': 0.13138686131386862}  	MAP10 {'map': 0.09012570965125709, 'rate': 0.13138686131386862} 

wple mode3 stanford maxAuc100
Best settings:  [[3.0, 0.5, 0.0, 0.0, 0.31, 0]]
Mean of best setting:  [3.   0.5  0.   0.   0.31 0.  ]
wple mode3 stanford [3.0, 0.5, 0.0, 0.0, 0.31, 1.4, 5]
Stanford server has been started
0 1 2 3 4 5 6 7 8 9 10 11 12

  'with `validate_args=False` to turn off validation.')


1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 
MAP5: {'map': 0.09314679643146796, 'rate': 0.1386861313868613}  	MAP10 {'map': 0.09161163248754489, 'rate': 0.15328467153284672} 

wple mode3 mixed maxAuc100
Best settings:  [[3.0, 0.5, 0.0, 0.0, 0.21, 0]]
Mean of best setting:  [3.   0.5  0.   0.   0.21 0.  ]
wple mode3 mixed [3.0, 0.5, 0.0, 0.0, 0.21, 0.1, 5]
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 
MAP5: {'map': 0.07120843471208434, 'rate': 0.10948905109489052}  	MAP10 {'map': 0.07042637006140656, 'rate': 0.10948905109489052} 

wpl mode3 nltk maxAuc100
Best settings:  [[3.0, 3.0, 0.0, 0.0, 1.5, 0]]
Mean of best setting:  [3.  3.  0.  0.  1.5 0. ]
wpl mode3 nltk [3.0, 3.0, 0.0, 0.0, 1.5, 2.6, 5]
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 
MAP5: {'map': 0.090551500405515, 'rate': 0.13138686131386862}  	MAP10 {'map': 0.09012570965125709, 'rate': 0.13138686131386862} 

wpure mode3 stanford maxAuc100
Best settings:  [[3.0, 0.5, 0.0, 0.0, 0.43, 0]]
Mean of best setting:  [3.   0.5  0.   0.   0.43 0.  ]
wpure mode3 stanford [3.0, 0.5, 0.0, 0.0, 0.43, 0.1, 5]
Stanford server has been started
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72

  'with `validate_args=False` to turn off validation.')


1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 
MAP5: {'map': 0.0024330900243309003, 'rate': 0.0072992700729927005}  	MAP10 {'map': 0.00194647201946472, 'rate': 0.0072992700729927005} 

wpure mode2 mixed maxAuc100
Best settings:  [[3.0, 0.0, 0.0, 0.0, 0.19, 0]]
Mean of best setting:  [3.   0.   0.   0.   0.19 0.  ]
wpure mode2 mixed [3.0, 0.0, 0.0, 0.0, 0.19, 3.1, 5]
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83

61 	http://purl.obolibrary.org/obo/CHEBI_26523 is not in cellml, it is ignored
62 	http://purl.obolibrary.org/obo/GO_0006099 is not in cellml, it is ignored
	http://purl.obolibrary.org/obo/GO_0006119 is not in cellml, it is ignored
	http://purl.obolibrary.org/obo/CHEBI_25212 is not in cellml, it is ignored
	http://purl.obolibrary.org/obo/GO_0006810 is not in cellml, it is ignored
63 	http://purl.obolibrary.org/obo/GO_0005739 is not in cellml, it is ignored
64 	http://purl.obolibrary.org/obo/CL_0000084 is not in cellml, it is ignored
65 66 	http://purl.obolibrary.org/obo/GO_0006096 is not in cellml, it is ignored
67 68 	http://purl.obolibrary.org/obo/PR_000000791 is not in cellml, it is ignored
69 	http://purl.obolibrary.org/obo/CHEBI_22984 is not in cellml, it is ignored
70 71 72 73 74 75 76 	http://purl.org/sig/ont/fma/fma67969 is not in cellml, it is ignored
77 	http://purl.obolibrary.org/obo/NCBITaxon_9606 is not in cellml, it is ignored
	http://purl.org/sig/ont/fma/fma68646 is not 

In [29]:
# Run 2: cutoff forced to 0 and last setting entry forced to pl=5; saved with suffix '_0_5'.
getMapForAllSettings(isBest=False, cutoff=0, pl=5, resultSettings=resultSettings[2])

wple mode3 stanza maxAuc100
Best settings:  [[3.0, 0.5, 0.0, 0.0, 0.21, 0]]
Mean of best setting:  [3.   0.5  0.   0.   0.21 0.  ]
wple mode3 stanza [3.0, 0.5, 0.0, 0.0, 0.21, 0, 5] SKIPPED
wple mode3 coreNLP maxAuc100
Best settings:  [[3.0, 0.5, 0.0, 0.0, 0.31, 0]]
Mean of best setting:  [3.   0.5  0.   0.   0.31 0.  ]
wple mode3 coreNLP [3.0, 0.5, 0.0, 0.0, 0.31, 0, 5] SKIPPED
wple mode3 xStanza maxAuc100
Best settings:  [[3.0, 0.5, 0.0, 0.0, 0.21, 0]]
Mean of best setting:  [3.   0.5  0.   0.   0.21 0.  ]
wple mode3 xStanza [3.0, 0.5, 0.0, 0.0, 0.21, 0, 5] SKIPPED
wple mode3 benepar maxAuc100
Best settings:  [[3.0, 0.5, 0.0, 0.1, 0.86, 0]]
Mean of best setting:  [3.   0.5  0.   0.1  0.86 0.  ]
wple mode3 benepar [3.0, 0.5, 0.0, 0.1, 0.86, 0, 5] SKIPPED
wple mode2 stanza maxAuc100
Best settings:  [[3.0, 0.7, 0.0, 0.0, 2.67, 0]]
Mean of best setting:  [3.   0.7  0.   0.   2.67 0.  ]
wple mode2 stanza [3.0, 0.7, 0.0, 0.0, 2.67, 0, 5] SKIPPED
wple mode2 coreNLP maxAuc100
Best settings: 

  'with `validate_args=False` to turn off validation.')


20 40 60 80 100 
MAP5: {'map': 0.10202020202020202, 'rate': 0.15454545454545454, 'identify': 17, 'totRel': 110}  	MAP10 {'map': 0.10428030303030303, 'rate': 0.18181818181818182, 'identify': 20, 'totRel': 110} 

wpl mode2 stanza maxAuc100
Best settings:  [[3.0, 0.7, 0.0, 0.0, 2.67, 0]]
Mean of best setting:  [3.   0.7  0.   0.   2.67 0.  ]
wpl mode2 stanza [3.0, 0.7, 0.0, 0.0, 2.67, 0, 5]
repo pmr
parser stanza
0 20 40 60 80 100 
MAP5: {'map': 0.08914141414141415, 'rate': 0.11818181818181818, 'identify': 13, 'totRel': 110}  	MAP10 {'map': 0.0940638528138528, 'rate': 0.15454545454545454, 'identify': 17, 'totRel': 110} 

wpl mode2 coreNLP maxAuc100
Best settings:  [[3.0, 1.0, 0.0, 0.3, 2.0, 0]]
Mean of best setting:  [3.  1.  0.  0.3 2.  0. ]
wpl mode2 coreNLP [3.0, 1.0, 0.0, 0.3, 2.0, 0, 5]
repo pmr
parser coreNLP
CoreNLP server has been started
0 20 40 60 80 100 
MAP5: {'map': 0.09292929292929292, 'rate': 0.12727272727272726, 'identify': 14, 'totRel': 110}  	MAP10 {'map': 0.093901515151

0 20 40 60 80 100 
MAP5: {'map': 0.11222222222222222, 'rate': 0.16363636363636364, 'identify': 18, 'totRel': 110}  	MAP10 {'map': 0.11205808080808081, 'rate': 0.17272727272727273, 'identify': 19, 'totRel': 110} 



In [342]:
# Run 3: cutoff forced to 0; pl=0 leaves the last setting entry as computed by analyseData.
getMapForAllSettings(isBest=False, cutoff=0, pl=0, resultSettings=resultSettings[3])

wple mode3 stanza maxAuc100
Best settings:  [[3.0, 0.5, 0.0, 0.0, 0.21, 0]]
Mean of best setting:  [3.   0.5  0.   0.   0.21 0.  ]
wple mode3 stanza [3.0, 0.5, 0.0, 0.0, 0.21, 0, 1]
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 
MAP5: {'map': 0.038929440389294405, 'rate': 0.043795620437956206}  	MAP10 {'map': 0.038929440389294405, 'rate': 0.043795620437956206} 

wple mode3 stanford maxAuc100
Best settings:  [[3.0, 0.5, 0.0, 0.0, 0.31, 0]]
Mean of best setting:  [3.   0.5  0.   0.   0.31 0.  ]
wple mode3 stanford [3.0, 0.5, 0.0, 0.0, 0.31, 0, 1]
Stanford server has been started
0 1 2 3 4 5 6 7 8 9 10 1

  'with `validate_args=False` to turn off validation.')


1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 
MAP5: {'map': 0.04987834549878345, 'rate': 0.058394160583941604}  	MAP10 {'map': 0.04508457884370293, 'rate': 0.072992700729927} 

wple mode3 mixed maxAuc100
Best settings:  [[3.0, 0.5, 0.0, 0.0, 0.21, 0]]
Mean of best setting:  [3.   0.5  0.   0.   0.21 0.  ]
wple mode3 mixed [3.0, 0.5, 0.0, 0.0, 0.21, 0, 1]
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 8

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 
MAP5: {'map': 0.04744525547445255, 'rate': 0.051094890510948905}  	MAP10 {'map': 0.04173908006024794, 'rate': 0.058394160583941604} 

wpl mode3 nltk maxAuc100
Best settings:  [[3.0, 3.0, 0.0, 0.0, 1.5, 0]]
Mean of best setting:  [3.  3.  0.  0.  1.5 0. ]
wpl mode3 nltk [3.0, 3.0, 0.0, 0.0, 1.5, 0, 1]
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 
MAP5: {'map': 0.038929440389294405, 'rate': 0.043795620437956206}  	MAP10 {'map': 0.038929440389294405, 'rate': 0.043795620437956206} 

wpure mode3 stanford maxAuc100
Best settings:  [[3.0, 0.5, 0.0, 0.0, 0.43, 0]]
Mean of best setting:  [3.   0.5  0.   0.   0.43 0.  ]
wpure mode3 stanford [3.0, 0.5, 0.0, 0.0, 0.43, 0, 1]
Stanford server has been started
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 7

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 
MAP5: {'map': 0.046228710462287104, 'rate': 0.051094890510948905}  	MAP10 {'map': 0.046228710462287104, 'rate': 0.051094890510948905} 

0 	http://bhi.washington.edu/OPB#OPB_00506 is not in cellml, it is ignored
	http://purl.obolibrary.org/obo/NCBITaxon_9606 is not in cellml, it is ignored
1 	http://bhi.washington.edu/OPB#OPB_00506 is not in cellml, it is ignored
2 	http://purl.org/sig/ont/fma/fma7088 is not in cellml, it is ignored
	http://purl.org/sig/ont/fma/fma9637 is not in cellml, it is ignored
3 	http://purl.org/sig/ont/fma/fma62343 is not in cellml

116 	http://purl.obolibrary.org/obo/PR_P22700 is not in cellml, it is ignored
117 	http://purl.obolibrary.org/obo/NCBITaxon_10114 is not in cellml, it is ignored
118 	http://purl.obolibrary.org/obo/UBERON_0003978 is not in cellml, it is ignored
119 	http://purl.org/sig/ont/fma/fma82743 is not in cellml, it is ignored
120 	http://purl.obolibrary.org/obo/CHEBI_30216 is not in cellml, it is ignored
121 	http://purl.org/sig/ont/fma/fma67328 is not in cellml, it is ignored
122 123 124 125 126 	http://purl.obolibrary.org/obo/NCBITaxon_10090 is not in cellml, it is ignored
	http://purl.org/sig/ont/fma/fma83109 is not in cellml, it is ignored
	http://purl.obolibrary.org/obo/GO_0001508 is not in cellml, it is ignored
	http://purl.obolibrary.org/obo/CHEBI_22984 is not in cellml, it is ignored
127 	http://purl.obolibrary.org/obo/GO_0005131 is not in cellml, it is ignored
128 	http://purl.obolibrary.org/obo/UBERON_0000060 is not in cellml, it is ignored
129 	http://purl.obolibrary.org/obo/PR_00001

In [354]:
# Run 4: cutoff 0, pl=5, and the first five setting entries overwritten by the given multipliers.
getMapForAllSettings(isBest=False, cutoff=0, pl=5, multipliers=[3,3,1,1,1], resultSettings=resultSettings[4])

In [90]:
# Run 5: same as the previous run, with the last three multipliers set to 0.5.
getMapForAllSettings(isBest=False, cutoff=0, pl=5, multipliers=[3,3,0.5,0.5,0.5], resultSettings=resultSettings[5])

wple mode3 stanza maxAuc100
Best settings:  [[3.0, 0.5, 0.0, 0.0, 0.21, 0]]
Mean of best setting:  [3.   0.5  0.   0.   0.21 0.  ]
wple mode3 stanza [3, 3, 0.5, 0.5, 0.5, 0, 5] SKIPPED
wple mode3 stanford maxAuc100
Best settings:  [[3.0, 0.5, 0.0, 0.0, 0.31, 0]]
Mean of best setting:  [3.   0.5  0.   0.   0.31 0.  ]
wple mode3 stanford [3, 3, 0.5, 0.5, 0.5, 0, 5] SKIPPED
wple mode3 nltk maxAuc100
Best settings:  [[3.0, 0.5, 0.0, 0.1, 0.86, 0]]
Mean of best setting:  [3.   0.5  0.   0.1  0.86 0.  ]
wple mode3 nltk [3, 3, 0.5, 0.5, 0.5, 0, 5] SKIPPED
wple mode3 mixed maxAuc100
Best settings:  [[3.0, 0.5, 0.0, 0.0, 0.21, 0]]
Mean of best setting:  [3.   0.5  0.   0.   0.21 0.  ]
wple mode3 mixed [3, 3, 0.5, 0.5, 0.5, 0, 5] SKIPPED
wple mode2 stanza maxAuc100
Best settings:  [[3.0, 0.7, 0.0, 0.0, 2.67, 0]]
Mean of best setting:  [3.   0.7  0.   0.   2.67 0.  ]
wple mode2 stanza [3, 3, 0.5, 0.5, 0.5, 0, 5] SKIPPED
wple mode2 stanford maxAuc100
Best settings:  [[3.0, 1.0, 0.3, 0.3, 1.67, 0]]

20 	http://purl.obolibrary.org/obo/GO_0001508 is not in cellml, it is ignored
	http://purl.org/sig/ont/fma/fma14067 is not in cellml, it is ignored
	http://purl.org/sig/ont/fma/fma63841 is not in cellml, it is ignored
	http://purl.obolibrary.org/obo/CHEBI_22984 is not in cellml, it is ignored
	http://purl.obolibrary.org/obo/GO_0007049 is not in cellml, it is ignored
	http://purl.obolibrary.org/obo/NCBITaxon_10114 is not in cellml, it is ignored
	http://purl.obolibrary.org/obo/NCBITaxon_10114 is not in cellml, it is ignored
	http://purl.obolibrary.org/obo/GO_0001508 is not in cellml, it is ignored
	http://purl.obolibrary.org/obo/NCBITaxon_9606 is not in cellml, it is ignored
	http://purl.org/sig/ont/fma/fma63877 is not in cellml, it is ignored
	http://purl.org/sig/ont/fma/fma7088 is not in cellml, it is ignored
	http://purl.obolibrary.org/obo/CHEBI_27732 is not in cellml, it is ignored
	http://purl.org/sig/ont/fma/fma9670 is not in cellml, it is ignored
	http://purl.org/sig/ont/fma/fma8

# ANALYSIS OF MAP RESULTS

In [18]:
# COMPARE QUERY RESULT TO REFERRENCE
import pprint
import operator
def compareTo(nlimedResultsFile, dataToCompare, isVerbose=True):
    """Score every stored parser configuration against a reference set.

    Loads the summary JSON at `nlimedResultsFile` and, for each
    (index mode, TF mode, parser) entry that holds 'maxAuc100' results,
    computes getMAP at k = 5, 10 and 1000 against `dataToCompare`.

    Args:
        nlimedResultsFile: path to a summary JSON file (presumably one written
            by getMapForAllSettings - verify against the caller).
        dataToCompare: mapping of query -> relevant models, passed to getMAP
            as the reference.
        isVerbose: accepted for backward compatibility; currently unused.

    Returns:
        DataFrame with columns tfIndex, tfMode, parser, mAP@5, mAP@10, mAP,
        one row per scored configuration.
    """
    with open(nlimedResultsFile, 'r') as fp:
        summary = json.load(fp)
    tfModes = {'mode1':'mode_1', 'mode2':'mode_2', 'mode3':'mode_3'}
    columns = ['tfIndex','tfMode', 'parser','mAP@5','mAP@10','mAP']
    # Rows are collected first and the frame is built once, instead of growing it row by row.
    records = []
    for idxMode, v1 in summary.items():
        if idxMode=='ncbo': continue
        for tfMode, v2 in v1.items():
            if tfMode == 'ncbo': continue
            for parser, v3 in v2.items():
                # Same guards as getMapForAllSettings: skip non-dict entries
                # and entries without stored query results.
                if not isinstance(v3, dict): continue
                if 'results' not in v3.get('maxAuc100', {}): continue
                results = v3['maxAuc100']['results']
                scores = [getMAP(results, dataToCompare, k)['map'] for k in (5, 10, 1000)]
                records.append([idxMode, tfModes[tfMode], parser] + scores)
    return pd.DataFrame(records, columns=columns)

In [22]:
# SEPARATE BASED ON THE MAX OF TERMS IN QUERY AND TERMS IN ONTOLOGY CLASS PROPORTION
import pandas as pd
mapColumns = ['tfIndex','tfMode', 'parser','mAP@5','mAP@10','mAP','proportion(p)']
propLabels = {'0':'p=0', '0.5':'0<p<=0.5', '1.0':'0.5<p<=1.0'}
propSetting = [0,0.5,1.0]
# Bucket every query by its maximum query/class term proportion: each query
# goes into the first bucket whose upper bound is >= its proportion.
# native_QR_PMR_max comes from the max-proportion cell, which must have run first.
native_QR_PMR_prop ={ps:{} for ps in propSetting}
for q, v in native_QR_PMR.items():
    prop = native_QR_PMR_max[q]
    for ps in propSetting:
        if prop <= ps:
            native_QR_PMR_prop[ps][q] = v
            break

# GET MAP FOR EACH PROPORTION and different nlimedResults
nlimedResultsFiles=[
                   'saveJson/nlimedSummaryResultsComplete_0_5.json',
                   'saveJson/nlimedSummaryResultsComplete_3_3_1_1_1_0_5.json',
                   'saveJson/nlimedSummaryResultsComplete_3_3_0.5_0.5_0.5_0_5.json',
                    ]
# DataFrame.append was removed in pandas 2.0: collect the per-bucket frames
# and concatenate them once.
frames = []
for nlimedResultsFile in nlimedResultsFiles:
    for prop, native_prop in native_QR_PMR_prop.items():
        df = compareTo(nlimedResultsFile, native_prop, isVerbose=False)
        df['proportion(p)'] = propLabels[str(prop)]
        frames.append(df)
mapDf = pd.concat(frames, ignore_index=True)[mapColumns]

In [23]:
### PLOT RESULTS
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="ticks", color_codes=True)

# Box plot of mAP@10 per proportion bucket, coloured by parser;
# one facet row per TF mode and one facet column per index type.
catplotOptions = dict(
    data=mapDf,
    kind='box',
    x='proportion(p)',
    y='mAP@10',
    hue='parser',
    hue_order=['benepar','stanza','coreNLP','xStanza'],
    row='tfMode',
    col='tfIndex',
    margin_titles=True,
    aspect=1,
    height=4,
)
p = sns.catplot(**catplotOptions)
p.set_titles(row_template = '{row_name}', col_template = '{col_name}')
p._legend.set_title('')
plt.savefig('saveFigures/nlimed_native_behaviour.pdf',dpi=300, bbox_inches="tight")