In [1]:
from NLIMED import NLIMED

In [2]:
import json
# Load the benchmark entries that timeMeasurement and compareAnnotator iterate over.
# JSON text is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's default locale encoding.
with open('DataTest.json', 'r', encoding='utf-8') as fp:
    dataTest = json.load(fp)


In [12]:
"""NLIMED - PMR - Stanford Parser"""
# Build an NLIMED instance over the 'pmr' repository using the Stanford parser.
# alpha/beta/gamma/delta are the multipliers that the experiment cells further
# down vary (there they are fed from the prefDef, synonym, definition and
# mention/description settings respectively).
# NOTE(review): the meaning of `pl` is not visible in this notebook - confirm
# against the NLIMED documentation.
nlimed = NLIMED(repo='pmr', parser='stanford', pl=2, alpha=0.4, beta=0.1, gamma=1.0, delta=1.0)
# Free-text query reused by the demonstration cells that follow.
query = 'flux of sodium'

Stanford server has been started


In [13]:
# Annotate the query. Per the saved output, the JSON form holds the extracted
# 'phrases' and a 'result' list of [list of URIs, numeric score] candidates.
annotation = nlimed.getAnnotated(query=query,format='json')
print(annotation)

{'phrases': ['flux', 'sodium'], 'result': [[['http://identifiers.org/opb/OPB_00593', 'http://purl.obolibrary.org/obo/CHEBI_29101'], 1.4553906641369228], [['https://identifiers.org/opb/OPB_00593', 'http://purl.obolibrary.org/obo/CHEBI_29101'], 1.4537913504562157], [['http://identifiers.org/opb/OPB_00593', 'http://identifiers.org/chebi/CHEBI:29101'], 1.4539927349265542], [['https://identifiers.org/opb/OPB_00593', 'http://identifiers.org/chebi/CHEBI:29101'], 1.4523934212458471]]}


In [9]:
# Ask NLIMED for the SPARQL form of the same query.
# NOTE(review): the output saved under this cell carries execution count 9, which
# is lower than the counts of the two cells above (12 and 13), so it predates
# their last run - re-run the notebook top to bottom to refresh it.
sparql = nlimed.getSparql(query=query,format='json')
print(sparql)

Stanford server has been started
{'phrases': ['flux', 'sodium'], 'result': [[['http://identifiers.org/opb/OPB_00593', 'http://purl.obolibrary.org/obo/CHEBI_29101'], 1.4553906641369228], [['https://identifiers.org/opb/OPB_00593', 'http://purl.obolibrary.org/obo/CHEBI_29101'], 1.4537913504562157], [['http://identifiers.org/opb/OPB_00593', 'http://identifiers.org/chebi/CHEBI:29101'], 1.4539927349265542], [['https://identifiers.org/opb/OPB_00593', 'http://identifiers.org/chebi/CHEBI:29101'], 1.4523934212458471]]}


In [None]:
# Ask NLIMED for the models matching the query. No output was saved for this
# cell, so the shape of the returned value cannot be seen here.
models = nlimed.getModels(query=query,format='json')
print(models)

In [None]:
"""EXERIMENT'S FUNCTIONS"""
import time
import matplotlib.pyplot as plt
import string
from mpl_toolkits.mplot3d import Axes3D
import numpy as np

def timeMeasurement(rigid, repeat, **vargs):
    """Print the mean wall-clock time each annotator needs for one pass over dataTest.

    rigid  -- not used by this function.
    repeat -- number of full passes over dataTest; the total time is divided by it.
    vargs  -- label=annotator pairs; every annotator is called as
              annotator.annotate(query).
    """
    # NOTE(review): compareAnnotator calls getAnnotated(query=..., format='json')
    # while this function calls annotate(...) - confirm the annotators passed in
    # really provide annotate().
    print('EXPERIMENT: EXECUTION TIME')
    print('Approach \t time rate (%d)'%repeat)
    for annType in vargs:
        annotator = vargs[annType]
        began = time.time()
        for _ in range(repeat):
            for data in dataTest.values():
                annotator.annotate(data['query'])
        elapsed = time.time() - began
        print('%s \t %f second'%(annType, elapsed/repeat))
    print('\n')
        
import re
def getURICode(uris):
    """Reduce every URI to a compact code taken from its last path segment.

    For each URI the text after the final '/' (the whole string when it has
    no '/') is lower-cased and every character other than a-z and 0-9 is
    dropped. Returns a new list in the same order as ``uris``.
    """
    return [re.sub('[^a-z0-9]', '', uri.rsplit('/', 1)[-1].lower()) for uri in uris]
    
def _addQueryStat(bucket, queryStat):
    """Add the counters of one query to ``bucket`` in place; absent counters start at 0."""
    for counter, value in queryStat.items():
        bucket[counter] = bucket.get(counter, 0) + value

def compareAnnotator(rigid, **vargs):
    """Score every annotator in ``vargs`` against the expected annotations in dataTest.

    Parameters
    ----------
    rigid : bool
        When true, expected and predicted URIs are compared verbatim; otherwise
        both sides are first reduced with getURICode.
    **vargs
        label=annotator pairs. Each annotator must provide
        getAnnotated(query=..., format='json') returning a dict with 'phrases'
        and 'result'; result[0][0] is read as the URI list of the first candidate.

    Returns
    -------
    dict with four entries:
        'statGeneral'   -- {label: counters} summed over all queries
        'statQueryLen'  -- {label: {number of words in the query: counters}}
        'statPhraseNum' -- {label: {number of expected phrases: counters}}
        'detilData'     -- dataTest itself, including the per-query details
    where counters holds 'shouldRetrieved', 'retrievedNum', 'correct', 'false',
    'queryCorrect' and 'queryFalse'.

    Side effects: every entry of the global dataTest gains the keys 'numPhrase'
    and 'wordLength' plus one key per annotator label.

    An annotator that returns no candidate for a query is counted as having
    predicted nothing for it, instead of aborting the run with IndexError.
    """
    statQueryLen = {annType: {} for annType in vargs}
    statGeneral = {annType: {} for annType in vargs}
    statPhraseNum = {annType: {} for annType in vargs}

    # Punctuation is replaced by spaces before counting words; the table does
    # not depend on the query, so build it once.
    punctuationToSpace = str.maketrans(string.punctuation, ' '*len(string.punctuation))

    for key, data in dataTest.items():
        data['numPhrase'] = len(data['phrase'])
        data['wordLength'] = len(data['query'].translate(punctuationToSpace).lower().split())
        correctResult = data['annotation'] if rigid else getURICode(data['annotation'])
        for annType, annotator in vargs.items():
            # per-query detail
            ann = annotator.getAnnotated(query=data['query'],format='json')
            topAnnotation = ann['result'][0][0] if ann['result'] else []
            predictResult = topAnnotation if rigid else getURICode(topAnnotation)
            annCorrect = sum(1 for cResult in correctResult if cResult in predictResult)
            annFalse = len(predictResult) - annCorrect
            retrievedNum = len(ann['phrases'])
            data[annType] = {'phrase':ann['phrases'],'numPhrase':retrievedNum,'annotation':topAnnotation,'correct':annCorrect,'false':annFalse}

            # A query counts as correct only when the phrase count matches the
            # expectation and every retrieved phrase was annotated correctly.
            queryCorrect = 1 if data['numPhrase'] == retrievedNum and annCorrect == retrievedNum else 0
            queryStat = {'shouldRetrieved':data['numPhrase'],'retrievedNum':retrievedNum,'correct':annCorrect,'false':annFalse,'queryCorrect':queryCorrect,'queryFalse':1-queryCorrect}

            # The same counters feed three aggregations: by query word count,
            # by expected phrase count, and overall.
            _addQueryStat(statQueryLen[annType].setdefault(data['wordLength'], {}), queryStat)
            _addQueryStat(statPhraseNum[annType].setdefault(data['numPhrase'], {}), queryStat)
            _addQueryStat(statGeneral[annType], queryStat)
    returnData = {'statGeneral':statGeneral,'statQueryLen':statQueryLen,'statPhraseNum':statPhraseNum,'detilData':dataTest}
    return returnData

def drawPlot(showData,xlabel):
    """Plot F-measure against a query property, one line per approach, then save and show it.

    showData -- {label: {x value: counters}}; each counters dict must hold
                'correct', 'retrievedNum' and 'shouldRetrieved'.
    xlabel   -- x-axis caption; the figure is also written to xlabel + '.pdf'.

    A bucket in which nothing was retrieved or nothing was expected scores 0
    instead of raising ZeroDivisionError, and the line styles are reused
    cyclically when there are more series than styles.
    """
    linestyles = ['--', ':', '-.', '-']
    for pattern, (label, stat) in enumerate(showData.items()):
        pairData = []
        for numOfWord, val in stat.items():
            precision = val['correct']/val['retrievedNum'] if val['retrievedNum'] else 0
            recall = val['correct']/val['shouldRetrieved'] if val['shouldRetrieved'] else 0
            fmeasure = 2*precision*recall/(precision+recall) if (precision+recall)>0 else 0
            pairData.append((numOfWord,fmeasure))
        # Draw the points left to right regardless of dict order.
        pairData.sort(key=lambda tup: tup[0])
        legendLabel = label+' parser' if label != 'NCBO' else 'NCBO Annotator'
        plt.plot(*zip(*pairData),linestyles[pattern % len(linestyles)],label=legendLabel,linewidth=1.5)
    plt.xlabel(xlabel)
    plt.ylabel('F-Measure')
    plt.legend()
    plt.savefig(xlabel+'.pdf',dpi=400)
    plt.show()

def getGeneralResult(result):
    """Turn the overall counters of each approach into precision, recall, F-measure and query accuracy.

    result -- dict with a 'statGeneral' entry of the form {label: counters},
              where counters holds 'correct', 'retrievedNum', 'shouldRetrieved',
              'queryCorrect' and 'queryFalse'.

    Returns {label: {'precision', 'recall', 'fmeasure', 'qAccuracy'}}.
    Any ratio whose denominator is zero is reported as 0.0 (the same convention
    drawPlot uses for the F-measure) instead of raising ZeroDivisionError.
    """
    summary = {}
    for key, stat in result['statGeneral'].items():
        precision = stat['correct'] / stat['retrievedNum'] if stat['retrievedNum'] else 0.0
        recall = stat['correct'] / stat['shouldRetrieved'] if stat['shouldRetrieved'] else 0.0
        # No correct element at all makes precision + recall zero.
        fmeasure = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        totalQueries = stat['queryCorrect'] + stat['queryFalse']
        qAccuracy = stat['queryCorrect'] / totalQueries if totalQueries else 0.0
        summary[key] = {'precision':precision, 'recall':recall, 'fmeasure':fmeasure, 'qAccuracy':qAccuracy}
    return summary
        
def printGeneralResult(result, **kwargs):
    """Print the element-level score table; optionally draw the plots and save the result.

    result -- dict with 'statGeneral', 'statQueryLen' and 'statPhraseNum'
              entries, as returned by compareAnnotator.
    kwargs -- plot=True draws F-measure per query length and per phrase count
              through drawPlot; save=True writes ``result`` as JSON to
              'result_annotation.json'.
    """
    print('Performance of all approaches at element level')
    print('Approach \t Precision \t Recall \t F-Measure \t Query accuracy')
    for approach, scores in getGeneralResult(result).items():
        row = (approach, scores['precision'], scores['recall'], scores['fmeasure'], scores['qAccuracy'])
        print('%s \t %f \t %f \t %f \t %f'%row)

    # print and save the plots
    if kwargs.get('plot', False) == True:
        print('Performance of all approaches based on query length')
        drawPlot(result['statQueryLen'],'number of terms per query')

        print('Performance of all approaches based on num of phrases per query')
        drawPlot(result['statPhraseNum'],'number of phrases per query')

    # save to file
    if kwargs.get('save', False) == True:
        with open('result_annotation.json', 'w') as fp:
            json.dump(result, fp)
                
def print4DPlot(config):
    """Draw one 3-D scatter per parser, coloured by that parser's values over the multiplier grid.

    config -- dict of lists keyed 'synonym', 'definition' and 'description'
              (used as x, y and z coordinates) plus 'stanford' and 'nltk'
              (used to colour the points; the search cell stores F-measures there).
    Each figure is written to 'multiplierSetup<parser>.pdf' and then shown.
    """
    def printPlot(parser):
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.set_xlabel('\N{greek small letter beta}:'+' synonym')
        ax.set_ylabel('\N{greek small letter gamma}:'+' definition')
        ax.set_zlabel('\N{greek small letter delta}:'+' description')
        # plt.get_cmap is the stable accessor; plt.cm.get_cmap is deprecated and
        # no longer available in recent matplotlib releases.
        colourMap = plt.get_cmap('Spectral')
        img = ax.scatter(config['synonym'], config['definition'], config['description'], c=config[parser], cmap=colourMap)
        # Caption the colour scale itself rather than renaming the colormap
        # object, which mutates it without labelling anything in the figure.
        fig.colorbar(img, label='F-measure')
        plt.savefig('multiplierSetup'+parser+'.pdf',dpi=400)
        plt.show()
    # print for stanford
    printPlot('stanford')
    # print for nltk
    printPlot('nltk')

In [None]:
"""ANNOTATION EXPERIMENTS"""
"""BEST MULTIPLIER"""

# Grid search over the four multipliers: alpha runs over 1..10, beta and gamma
# over 0.0..1.0 in steps of 0.1, and delta over 0.0..3.0 in steps of 0.3. Every
# combination is scored for both parsers; all F-measures are kept in `config`
# and the best combination(s) per parser in `maxFMeasure`.
print('PrefDef \t Synonym \t Definition \t Mentioned \t STANFORD \t NLTK')
config = {'prefDef':[],'synonym':[],'definition':[],'description':[],'stanford':[],'nltk':[]}
precision = 0.1
maxRange = 11
maxFMeasure = {'stanford':{'value':0,'combination':[]},'nltk':{'value':0,'combination':[]}}
for m_prefDef in range(1,maxRange):
    print("current preffered def is %d"%m_prefDef)
    for syn in range(maxRange):
        m_synonym = round(syn * precision,1)
        for define in range(maxRange):
            m_definition = round(define * precision,1)
            for mention in range(maxRange):
                m_mention = round(mention * precision * 3,1)
                annoSt = NLIMED(repo='pmr', parser='stanford', pl=1, alpha=m_prefDef, beta=m_synonym, gamma=m_definition, delta=m_mention)
                annoNLTK = NLIMED(repo='pmr', parser='nltk', pl=1, alpha=m_prefDef, beta=m_synonym, gamma=m_definition, delta=m_mention)
                result = compareAnnotator(False,STANFORD=annoSt,NLTK=annoNLTK)
                summary = getGeneralResult(result)

                # record the grid point
                combination = (m_prefDef,m_synonym,m_definition,m_mention)
                for configKey, multiplier in zip(('prefDef','synonym','definition','description'), combination):
                    config[configKey].append(multiplier)

                # record each parser's score and keep every combination that ties for the best
                for parserKey, approach in (('stanford','STANFORD'),('nltk','NLTK')):
                    fmeasure = summary[approach]['fmeasure']
                    config[parserKey].append(fmeasure)
                    best = maxFMeasure[parserKey]
                    if fmeasure > best['value']:
                        best['value'] = fmeasure
                        best['combination'] = [combination]
                    elif fmeasure == best['value']:
                        best['combination'].append(combination)
    print("finish preffered def %d"%m_prefDef)

print4DPlot(config)
with open('maxFMeasureSetup.json', 'w') as fp:
    json.dump(maxFMeasure, fp)
with open('configSetup.json', 'w') as fp:
    json.dump(config, fp)
                

In [None]:
"""ANNOTATION EXPERIMENTS"""
"""MULTIPLIER EFFECT"""
# Effect of a single multiplier: for every alpha in 1..10 and every level
# 0.0..1.0, switch on exactly one of beta, gamma, delta at that level (the other
# two stay 0) and print the F-measure of both parsers.
print('PrefDef \t Synonym \t Definition \t Mentioned \t STANFORD \t NLTK')
for alpha in range(1,11):
    for other1 in range(11):
        oth = other1 * 0.1
        for beta, gamma, delta in ((oth,0,0), (0,oth,0), (0,0,oth)):
            annoSt = NLIMED(repo='pmr', parser='stanford', pl=1, alpha=alpha, beta=beta, gamma=gamma, delta=delta)
            annoNLTK = NLIMED(repo='pmr', parser='nltk', pl=1, alpha=alpha, beta=beta, gamma=gamma, delta=delta)
            result = compareAnnotator(False,STANFORD=annoSt,NLTK=annoNLTK)
            summary = getGeneralResult(result)
            # The first column is this loop's own alpha, not a variable left
            # behind by another cell, so the cell also runs on a fresh kernel.
            print('%d \t%f \t%f \t%f \t%f \t%f'%(alpha,beta,gamma,delta,summary['STANFORD']['fmeasure'],summary['NLTK']['fmeasure']))

In [None]:
"""ANNOTATION EXPERIMENTS"""
"""COMPARE PARSER"""
# One annotator per approach. The Stanford and NLTK instances get different
# multiplier settings.
# NOTE(review): alpha=0.4 lies outside the 1..10 range that the BEST MULTIPLIER
# cell searches - confirm where these settings come from.
annoSt = NLIMED(repo='pmr', parser='stanford', pl=1, alpha=0.4, beta=0.1, gamma=1.0, delta=1.0)
annoNLTK = NLIMED(repo='pmr', parser='nltk', pl=1, alpha=1.0, beta=0.7, gamma=0.0, delta=0.7)
# The NCBO-based instance is created without explicit multipliers.
annoNCBO = NLIMED(repo='pmr', parser='ncbo', pl=1)
# rigid=False: URIs are compared through their getURICode form.
result = compareAnnotator(False,STANFORD=annoSt,NLTK=annoNLTK,NCBO=annoNCBO)
# NOTE(review): summary is not read again in this cell; printGeneralResult
# derives its own summary from result.
summary = getGeneralResult(result)
# Print the score table, draw both F-measure plots and write result_annotation.json.
printGeneralResult(result, plot=True, save=True)
