In [None]:
from IPython.display import clear_output 

""" ORIGINAL
This is for testing performance of different wikification methods.
"""

def getWiki5000Entities(annotationData):
    """
    Description:
        Converts the wiki5000 annotation JSON into the entity format shared
        with the other datasets.
    Args:
        annotationData: A JSON string encoding a list of objects, each of
            which has a 'url' entry.
    Return:
        A list of [None, entity] pairs, one per annotation, where entity is the
        annotation's 'url' with spaces replaced by underscores.
    """

    annotations = json.loads(annotationData)
    return [[None, annotation['url'].replace(' ', '_')]
            for annotation in annotations]

# Root directory of the evaluation datasets. Set the WSD_DATASETS_DIR
# environment variable to point at another location instead of editing this
# cell; when it is unset the original hard-coded location is used.
#pathStrt = '/users/cs/amaral/wsd-datasets'
pathStrt = os.environ.get('WSD_DATASETS_DIR', 'C:\\Temp\\wsd-datasets')

# the data sets for performing on
datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')},
            {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')},
            {'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')},
            {'name':'wiki5000', 'path':os.path.join(pathStrt,'wiki.5000.json')}]

# short for quick tests
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}, {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')}]
#datasets = [{'name':'wiki5000', 'path':os.path.join(pathStrt,'wiki.5000.json')}]

methods = ['popular']

# performances[dataset name][method] ->
# [avg precision split, avg precision manual, avg recall split, avg recall manual]
performances = {}

# for each dataset, run all methods
for dataset in datasets:
    performances[dataset['name']] = {}

    # get the data from dataset: one JSON document per line. The with-block
    # closes the file even when a line fails to parse.
    dataLines = []
    with open(dataset['path'], 'r') as dataFile:
        for line in dataFile:
            dataLines.append(json.loads(line.decode('utf-8').strip()))

    print(dataset['name'])

    # run each method on the data set
    for mthd in methods:
        # reset counters
        totalPrecS = 0
        totalPrecM = 0
        totalRecS = 0
        totalRecM = 0
        totalLines = 0

        # each method tests all lines
        for line in dataLines:
            # different structure for wiki
            if dataset['name'] == 'wiki5000':
                resultS = None # no pre-split text
                resultM = wikifyEval(line['opening_text'].encode('utf-8').strip(), False, mthd)

                # for unification of format for statistical testing
                trueEntities = getWiki5000Entities(line['opening_annotation'])
            else:
                # original split string
                resultS = wikifyEval(line['text'], True, mthd)
                # unsplit string
                resultM = wikifyEval((" ".join(line['text'])).encode('utf-8').strip(), False, mthd)

                trueEntities = line['mentions']

            ## get statistical results from true entities and results S and M

            # wiki5000 exception: no pre-split result, so the split scores count as 0
            if resultS is not None:
                precS = precision(trueEntities, resultS[1]) # precision of pre-split
            else:
                precS = 0

            precM = precision(trueEntities, resultM[1]) # precision of manual split

            # wiki5000 exception
            if resultS is not None:
                recS = recall(trueEntities, resultS[1]) # recall of pre-split
            else:
                recS = 0

            recM = recall(trueEntities, resultM[1]) # recall of manual split

            clear_output() # delete this after
            print(str(precS) + ' ' + str(precM) + ' ' + str(recS) + ' ' + str(recM))

            # track results
            totalPrecS += precS
            totalPrecM += precM
            totalRecS += recS
            totalRecM += recM
            totalLines += 1

            print(str(totalLines) + '\n')

        # record results for this method on this dataset
        # [avg precision split, avg precision manual, avg recall split, avg recall manual]
        if totalLines == 0:
            # empty dataset file: nothing to average, avoid dividing by zero
            performances[dataset['name']][mthd] = [0, 0, 0, 0]
            continue

        # float() keeps the averages true divisions even if
        # 'from __future__ import division' has not been executed yet
        numLines = float(totalLines)
        performances[dataset['name']][mthd] = [totalPrecS/numLines, totalPrecM/numLines,
                                              totalRecS/numLines, totalRecM/numLines,]

print(performances)

In [None]:
""" ORIGINAL
Evaluation of wikifications
"""

from wikipedia import *
from operator import itemgetter
import requests
import json
from __future__ import division

# Thresholds applied by wikifyEval's clean-up step when strict is true.
MIN_MENTION_LENGTH = 3 # mentions must be at least this long (compared against len() of the mention)
MIN_FREQUENCY = 20 # anchor with frequency below is ignored
     
def splitWords(phrase):
    """
    Description:
        Splits a phrase into its words / possible entities by posting it to the
        local Solr anchor tagger and slicing out every span it reports.
    Args:
        phrase: The text to be split.
    Return:
        A list holding the substring of phrase for each tag in the response,
        in the order the tags were returned.
    """

    taggerUrl = 'http://localhost:8983/solr/enwikianchors20160305/tag'
    taggerArgs = {'overlaps':'ALL', 'tagsLimit':'5000', 'fl':'id','wt':'json','indent':'on'}
    response = requests.post(taggerUrl, params=taggerArgs, data=phrase)

    # each tag is a flat record; positions 1 and 3 are used as the slice bounds
    return [phrase[tag[1]:tag[3]] for tag in response.json()['tags']]

def precision(truthSet, mySet):
    """
    Description:
        Calculates the precision of mySet against the truthSet.
    Args:
        truthSet: The 'right' answers for what the entities are. Each item's
            [1] is an entity title, resolved to an id with title2id.
        mySet: My code's output for what it thinks the right entities are.
            Each item's [1] is an entity id.
    Return:
        The precision: (# of correct entities)/(# of found entities), as a
        float. Returns 0 when mySet is empty.
    """

    numFound = len(mySet)
    if numFound == 0:
        return 0

    # Resolve every truth title once up front, rather than once per
    # (found entity, truth entity) pair as the nested loops used to do.
    truthIds = [title2id(entity[1]) for entity in truthSet]

    # a found entity is correct when its id matches any truth id
    numCorrect = sum(1 for entity in mySet if entity[1] in truthIds)

    # float() keeps this a true division even when
    # 'from __future__ import division' is not in effect
    return float(numCorrect) / numFound

def recall(truthSet, mySet):
    """
    Description:
        Calculates the recall of mySet against the truthSet.
    Args:
        truthSet: The 'right' answers for what the entities are. Each item's
            [1] is an entity title, resolved to an id with title2id.
        mySet: My code's output for what it thinks the right entities are.
            Each item's [1] is an entity id.
    Return:
        The recall: (# of correct entities)/(# of actual entities), as a
        float. Returns 0 when truthSet is empty.
    """

    numActual = len(truthSet)
    if numActual == 0:
        return 0

    # Resolve every truth title once up front, rather than once per
    # (found entity, truth entity) pair as the nested loops used to do.
    truthIds = [title2id(entity[1]) for entity in truthSet]

    # a found entity is correct when its id matches any truth id
    numCorrect = sum(1 for entity in mySet if entity[1] in truthIds)

    # float() keeps this a true division even when
    # 'from __future__ import division' is not in effect
    return float(numCorrect) / numActual
    
def wikifyPopular(phrase):
    """
    Description:
        Takes in a phrase split into different words or anchors and returns the most likely entities
        in the phrase based on popularity method.
    Args:
        phrase: The original phrase in split form: [w1, w2, ...]
    Return:
        The anchors/word ids along with the corresponding entity id. Of the form:
        [[wid, entityId, entityFreq],...]
        wid is the word's index in phrase. Words for which anchor2concept
        returns nothing are left out.
    """

    proposedEntities = []

    # enumerate() keeps wid equal to the word's real position in phrase; the
    # old hand-rolled counter only advanced when a word had candidates, so
    # later ids pointed at the wrong word.
    for wid, word in enumerate(phrase):
        concepts = anchor2concept(word)
        if concepts: # some results
            # most frequent concept; on ties the first one listed wins
            entity = max(concepts, key = itemgetter(1))
            proposedEntities.append([wid, entity[0], entity[1]])

    return proposedEntities

def wikifyEval(phrase, preSplitWords, method='popular', strict = True):
    """
    Description:
        Takes the phrase string, and wikifies it for evaluation purposes using the desired method.
    Args:
        phrase: The text to wikify: a list of words [w1,w2,...] when
            preSplitWords is true, otherwise the original string, which is
            split with splitWords.
        preSplitWords: Whether to use pre-split words from the dataset, or use our own method of
            splitting words.
        method: The method used to wikify. Only 'popular' is supported.
        strict: Whether to use such rules as minimum mention length, or minimum frequency of concept.
    Return:
        The original split text and the anchors along with their best matched concept from wikipedia.
        Of the form: [[w1,w2,...], [[wid,entityId,freq],...]]
        Each entityId appears at most once; the first occurrence is kept.
    Raises:
        ValueError: If method is not a supported method name.
    """

    # Reject unknown methods up front; this used to surface later as an
    # IndexError on the missing second element of the result.
    if method != 'popular':
        raise ValueError('unknown wikification method: ' + repr(method))

    # words are not in pre-split form
    if not preSplitWords:
        phrase = splitWords(phrase) # modify phrase into split form

    entities = wikifyPopular(phrase)

    # small clean-up to do: drop rare concepts and too-short mentions
    if strict:
        entities = [item for item in entities
                    if item[2] >= MIN_FREQUENCY
                    and len(phrase[item[0]]) >= MIN_MENTION_LENGTH]

    # remove duplicates, keeping the first entry for each entity id
    seenIds = set()
    uniqueEntities = []
    for item in entities:
        if item[1] not in seenIds:
            seenIds.add(item[1])
            uniqueEntities.append(item)

    return [phrase, uniqueEntities]

In [None]:
%%bash
# Manual check of the local Solr anchor tagger: POST a sample sentence as
# plain text to the tag endpoint (overlaps=NO_SUB, JSON output) and show the
# raw response.
curl -X POST \
  'http://localhost:8983/solr/enwikianchors20160305/tag?overlaps=NO_SUB&tagsLimit=5000&fl=id&wt=json&indent=on' \
  -H 'Content-Type:text/plain' -d 'I met David in Spain'

In [None]:
def getMostFrequentConcept(anchorSearchRepresentations):
    """
    Description:
        Finds the anchor-search representation (asr) with the most frequency in the list
        of anchorSearchRepresentations.
    Args:
        anchorSearchRepresentations: a list of possible representations of an anchor
        for searching purposes.
    Return:
        A tuple (asr, concept, concept frequency) for the representation whose
        most frequent concept has the highest frequency. Representations for
        which anchor2concept returns nothing are skipped; if none of them has
        a concept, (None, None, 0) is returned so that frequency thresholds
        applied by callers discard the result.
    """

    # The inputted asrs along with the frequency of their most popular concept
    asrFrequencies = []
    for asr in anchorSearchRepresentations:
        concepts = anchor2concept(asr)
        if not concepts:
            # nothing known for this representation; it used to raise IndexError
            continue
        # gets the most frequent concept from the current asr (first one wins ties)
        mostFrequent = max(concepts, key = itemgetter(1))
        # (asr, concept, concept frequency)
        asrFrequencies.append((asr, mostFrequent[0], mostFrequent[1]))

    if not asrFrequencies:
        return (None, None, 0)

    # get and return the asr with the highest frequency
    return max(asrFrequencies, key = itemgetter(2))

In [None]:
def getAnchorPossibilities(data):
    """
    Description:
        Builds the list of potential anchors from the records in data, leaving
        out records whose span is shorter than the global minWordLength.
    Args:
        data: The records to get potential anchors from. Each record is read
            positionally: [1] and [3] are the span bounds, [5] the possible
            anchors.
    Return:
        A list of tuples in the following format:
        (start index, end index, source word, possible anchors)
        The source word is sliced out of the global q.
    """

    return [(rec[1], rec[3], q[rec[1]:rec[3]], rec[5])
            for rec in data
            if rec[3] - rec[1] >= minWordLength]

In [None]:
print(q)

minAcceptedFrequency = 20 # do not use entities with less frequencies than this
minWordLength = 3 # for efficiency do not even look at words with less size than this

# the total amount of potential mentions including overlaps
potentialMentionsAmount = len(queryResult)

# records appear to be sorted by start index

# each word together with the variations of that potential anchor
anchorsPossibilities = getAnchorPossibilities(queryResult)

# attach the most popular concept among the variations of each potential anchor
anchorPossibilityFrequencies = [
    (possibility[0], possibility[1], possibility[2],
     getMostFrequentConcept(possibility[3]))
    for possibility in anchorsPossibilities]

# keep only the anchors whose best concept reaches the frequency threshold
prunedPotentialAnchors = [candidate for candidate in anchorPossibilityFrequencies
                          if candidate[3][2] >= minAcceptedFrequency]

# display final results
for prunedAnchor in prunedPotentialAnchors:
    wikiUrl = "https://en.wikipedia.org/wiki/" + id2title(prunedAnchor[3][1])
    print(prunedAnchor[2] + "-->" + wikiUrl)

In [None]:
from wikipedia import *
from operator import itemgetter
import requests

# Thresholds used by stripSmallMentions and wikify in this cell.
MIN_MENTION_LENGTH = 3 # mentions must be at least this long (end - start of the mention span)
MIN_FREQUENCY = 20 # anchor with frequency below is ignored

def stripSmallMentions(potAnchors):
    """
    Description:
        Filters out the potential anchors whose span (end - start) is shorter
        than MIN_MENTION_LENGTH.
    Args:
        potAnchors: The list of potential anchors, each a dictionary with at
            least the keys 'start' and 'end'.
    Return:
        A new list with the remaining potential anchors, in their original
        order. The input list is not modified.
    """
    return [candidate for candidate in potAnchors
            if candidate['end'] - candidate['start'] >= MIN_MENTION_LENGTH]

def getMostFrequentConcept(mentions):
    """
    Description:
        Finds the mention with the candidate concept with the most frequency.
    Args:
        mentions: A list of mentions to look for the most popular in.
    Return:
        A dictionary of the form {'mention', 'conceptId', 'freq'}. Mentions for
        which anchor2concept returns nothing are skipped; if no mention has a
        concept the result is {'mention': None, 'conceptId': None, 'freq': 0},
        which frequency thresholds applied by callers will discard.
    """

    # The inputted mentions along with the frequency of their most popular concept
    mentionConceptFreqs = []
    for mention in mentions:
        concepts = anchor2concept(mention)
        if not concepts:
            # nothing known for this mention; it used to raise IndexError
            continue
        # most frequent concept of the current mention (first one wins ties)
        # mostFrequent[0] is conceptId, mostFrequent[1] is frequency of that concept
        mostFrequent = max(concepts, key = itemgetter(1))
        mentionConceptFreqs.append((mention, mostFrequent[0], mostFrequent[1]))

    if not mentionConceptFreqs:
        return {'mention': None, 'conceptId': None, 'freq': 0}

    # get the mention with the highest frequency
    bestMention = max(mentionConceptFreqs, key = itemgetter(2))

    return {'mention': bestMention[0],
            'conceptId': bestMention[1],
            'freq': bestMention[2]}

def wikifyPopular(potAnchors, useOriginalMention):
    """
    Description:
        Takes in a list potential anchors and returns the resulting anchors.
    Args:
        potAnchors: The list of potential anchors, along with some additional information.
            [{'start', 'end', 'mention', 'mentionVars'},...]
        useOriginalMention: Whether to use the mention from the original or one of the word forms.
    Return:
        The same potAnchors list, with 'conceptId' and 'freq' added to each
        dictionary in place. When the original mention is used and
        anchor2concept returns nothing for it, 'conceptId' is None and 'freq'
        is 0.
    """

    for potAnchor in potAnchors:
        # kept as an equality test so that non-bool arguments take the same
        # branch as they always did
        if useOriginalMention == False:
            # use the mention variation with the most frequent results
            bestMention = getMostFrequentConcept(potAnchor['mentionVars'])
            conceptId = bestMention['conceptId']
            freq = bestMention['freq']
        else:
            concepts = anchor2concept(potAnchor['mention'])
            if concepts:
                # most frequent concept; on ties the first one listed wins
                mentionData = max(concepts, key = itemgetter(1))
                conceptId = mentionData[0]
                freq = mentionData[1]
            else:
                # unknown surface form: used to raise IndexError
                conceptId = None
                freq = 0

        # either way adds 'conceptId', and 'freq'
        potAnchor['conceptId'] = conceptId
        potAnchor['freq'] = freq

    return potAnchors

def wikify(query, useOriginalMention, method='popular'):
    """
    Description:
        Wikifies the query string: asks the local Solr tagger for potential
        anchors, drops the too-short ones, attaches a concept to each with the
        chosen method and keeps those that are frequent enough.
    Args:
        query: The string to wikify.
        useOriginalMention: Whether to use mention from the original or a potential variation.
        method: The method used to wikify.
    Return:
        A list of dictionaries {'start', 'end', 'mention', 'wikiTitle'}, one
        per anchor whose concept frequency is at least MIN_FREQUENCY.
    """

    # first get the potential anchors from solr
    taggerUrl = 'http://localhost:8983/solr/enwikianchors20160305/tag'
    taggerArgs = {'overlaps':'NO_SUB', 'tagsLimit':'5000', 'fl':'id','wt':'json','indent':'on'}
    tags = requests.post(taggerUrl, params=taggerArgs, data=query).json()['tags']

    # one dictionary per tag, minus those below the size threshold
    potAnchors = stripSmallMentions([
        {'start':tag[1], 'end':tag[3],
         'mention':query[tag[1]:tag[3]],
         'mentionVars':tag[5]}
        for tag in tags])

    if method == 'popular':
        potAnchors = wikifyPopular(potAnchors, useOriginalMention)

    # deal with overlap on anchors here

    # anchors in finalized form, frequent concepts only
    return [{'start':candidate['start'], 'end':candidate['end'],
             'mention':candidate['mention'],
             'wikiTitle':id2title(candidate['conceptId'])}
            for candidate in potAnchors
            if candidate['freq'] >= MIN_FREQUENCY]

In [5]:
"""This is for seeing how often correct entity shows up in candidates"""

from IPython.display import clear_output
import copy
from datetime import datetime
import tagme
import numpy as np
import matplotlib.pyplot as plt
from wikipedia import *
from operator import itemgetter
import requests
import json
from __future__ import division


def generateCandidates1(textData, maxC, oText):
    """
    Description:
        Generates up to maxC candidates for each possible mention word in phrase,
        by asking Solr to score the mention's concepts against the sentence
        that contains the mention.
    Args:
        textData: A text in split form along with its suspected mentions.
        maxC: The max amount of candidates to accept.
        oText: The original text; passed to getMentionSentence together with
            each mention to obtain the query context.
    Return:
        One list per mention, holding [conceptId, frequency, relevance score]
        entries sorted by frequency, highest first. The list is empty when Solr
        returns no documents or its response cannot be processed.
    """

    candidates = []
    #print textData['text']
    for mention in textData['mentions']:
        mentionText = textData['text'][mention[0]]

        # get all concepts for the anchor
        concepts = anchor2concept(mentionText)

        # frequency by concept id, replacing a scan of concepts per returned doc;
        # a later duplicate id overrides an earlier one, as the scan did
        freqById = dict((concept[0], concept[1]) for concept in concepts)

        # get the ids as string for solr query
        strIds = ['id:' +  str(strId[0]) for strId in concepts]

        context = getMentionSentence(oText, mention)
        context = escapeStringSolr(context)
        mentionStr = escapeStringSolr(mentionText)

        # gets the relevancy scores of all of the given potential concepts
        addr = 'http://localhost:8983/solr/enwiki20160305/select'
        params={'fl':'id score', 'indent':'on', 'start': '0', 'rows': str(maxC),
                'fq':" ".join(strIds),
                'q':'text:('+context.encode('utf-8')+')^1 title:(' + mentionStr.encode('utf-8')+')^1.35',
                'wt':'json'}
        r = requests.get(addr, params = params)

        solrRes = []
        try:
            body = r.json() # parse the response once
            if 'response' in body and 'docs' in body['response']:
                for doc in body['response']['docs'][:maxC]:
                    freq = freqById.get(int(doc['id']), 0)
                    solrRes.append([long(doc['id']), freq, doc['score']])
        except Exception:
            # Best effort: an unusable response means no candidates. Exception
            # (not a bare except) so that interrupting the kernel still works.
            solrRes = []

        # sort by frequency
        solrRes = sorted(solrRes, key = itemgetter(1), reverse = True)

        #print '\nMention: ' + textData['text'][mention[0]]
        #for res in solrRes:
        #    print '[' + id2title(res[0]) + '] -> freq: ' + str(res[1]) + ', rel: ' + str(res[2])

        candidates.append(solrRes) # take up to maxC of the results

    return candidates

def generateCandidates2(textData, maxC):
    """
    Description:
        Picks, for each mention in textData, its maxC most frequent concepts.
    Args:
        textData: A text in split form along with its suspected mentions; each
            mention's [0] is the index of its word in textData['text'].
        maxC: The max amount of candidates to accept.
    Return:
        One list per mention with at most maxC of the concepts returned by
        anchor2concept, ordered by frequency, highest first.
    """

    def topConcepts(mention):
        # rank this mention's concepts by their frequency field
        ranked = list(anchor2concept(textData['text'][mention[0]]))
        ranked.sort(key = itemgetter(1), reverse = True)
        return ranked[:maxC]

    return [topConcepts(mention) for mention in textData['mentions']]

def generateCandidates3(textData, maxC):
    """
    Description:
        Generates up to maxC candidates for each mention, ranking every concept
        of the mention by the summed frequency of all anchors that id2anchor
        returns for that concept.
    Args:
        textData: A text in split form along with its suspected mentions; each
            mention's [0] is the index of its word in textData['text'].
        maxC: The max amount of candidates to accept.
    Return:
        One list per mention with at most maxC [conceptId, totalFreq] pairs,
        ordered by totalFreq, highest first.
    """

    candidates = []

    for mention in textData['mentions']:
        # total anchor frequency of each concept linked to this mention
        ranked = [[concept[0], sum(wanchor[1] for wanchor in id2anchor(concept[0]))]
                  for concept in anchor2concept(textData['text'][mention[0]])]
        ranked.sort(key = itemgetter(1), reverse = True)

        candidates.append(ranked[:maxC]) # take up to maxC of the results

    return candidates

def generateCandidates5(textData, maxC):
    """
    Description:
        Generates up to maxC candidates for each possible mention word in phrase,
        by asking Solr to score the mention's concepts using all the other
        mentions of the text as context.
    Args:
        textData: A text in split form along with its suspected mentions.
        maxC: The max amount of candidates to accept.
    Return:
        One list per mention, holding [conceptId, frequency, relevance score]
        entries in the order Solr returned them. The list is empty when Solr
        returns no documents or its response cannot be processed.
    """

    candidates = []
    #print textData['text']
    for mention in textData['mentions']:
        mentionText = textData['text'][mention[0]]

        # get all concepts for the anchor
        concepts = anchor2concept(mentionText)

        # frequency by concept id, replacing a scan of concepts per returned doc;
        # a later duplicate id overrides an earlier one, as the scan did
        freqById = dict((concept[0], concept[1]) for concept in concepts)

        # get the ids as string for solr query
        strIds = ['id:' +  str(strId[0]) for strId in concepts]

        # Context is the text of every other mention, one list entry per
        # mention. (Using += with a string here added the escaped text one
        # character at a time, so the query context became single letters.)
        context = [escapeStringSolr(textData['text'][mention2[0]])
                   for mention2 in textData['mentions']
                   if mention2 != mention]
        context = " ".join(context)
        mentionStr = escapeStringSolr(mentionText)

        # gets the relevancy scores of all of the given potential concepts
        addr = 'http://localhost:8983/solr/enwiki20160305/select'
        params={'fl':'id score', 'indent':'on', 'start': '0', 'rows': str(maxC),
                'fq':" ".join(strIds),
                'q':'text:('+context.encode('utf-8')+')^1 title:(' + mentionStr.encode('utf-8')+')^1.35',
                'wt':'json'}
        r = requests.get(addr, params = params)

        solrRes = []
        try:
            body = r.json() # parse the response once
            if 'response' in body and 'docs' in body['response']:
                for doc in body['response']['docs'][:maxC]:
                    freq = freqById.get(int(doc['id']), 0)
                    solrRes.append([long(doc['id']), freq, doc['score']])
        except Exception:
            # Best effort: an unusable response means no candidates. Exception
            # (not a bare except) so that interrupting the kernel still works.
            solrRes = []

        #print '\nMention: ' + textData['text'][mention[0]]
        #for res in solrRes:
        #    print '[' + id2title(res[0]) + '] -> freq: ' + str(res[1]) + ', rel: ' + str(res[2])

        candidates.append(solrRes) # take up to maxC of the results

    return candidates

# Root directory of the evaluation datasets. Set the WSD_DATASETS_DIR
# environment variable to point at another location instead of editing this
# cell; when it is unset the original hard-coded location is used.
pathStrt = os.environ.get('WSD_DATASETS_DIR', '/users/cs/amaral/wsd-datasets')
#pathStrt = 'C:\\Temp\\wsd-datasets'

# the data sets for performing on
datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')},
            {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')},
            {'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')},
            {'name':'wiki5000', 'path':os.path.join(pathStrt,'wiki-mentions.5000.json')}]

# short for quick tests
#datasets = [{'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}, {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')}]
#datasets = [{'name':'wiki5000', 'path':os.path.join(pathStrt,'wiki-mentions.5000.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}, {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')}, {'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}, {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')}, {'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')},{'name':'wiki500', 'path':os.path.join(pathStrt,'wiki-mentions.500.json')}]

maxC = 20

# One tally per candidate generator. Keys are str(index) for every index in
# range(-1, maxC), each starting at 0; every tally is its own dictionary.
correctCands1 = dict((str(rank), 0) for rank in range(-1, maxC))
correctCands2 = dict((str(rank), 0) for rank in range(-1, maxC))
correctCands3 = dict((str(rank), 0) for rank in range(-1, maxC))
correctCands4 = dict((str(rank), 0) for rank in range(-1, maxC))
correctCands5 = dict((str(rank), 0) for rank in range(-1, maxC))

totalMentions = 0

def _tallyCorrectRanks(candidateLists, truthMentions, tally):
    """
    Description:
        For each mention, records in tally the index at which the correct
        entity appears among that mention's candidates.
    Args:
        candidateLists: One candidate list per mention; each candidate's [0]
            is an entity id.
        truthMentions: The true mentions, aligned with candidateLists; each
            mention's [2] is compared against the candidate ids.
        tally: Dictionary of counters keyed by str(index), with '-1' counting
            the mentions whose correct entity is not among the candidates.
            Updated in place.
    """
    for mentionIdx, cands in enumerate(candidateLists):
        for rank, cand in enumerate(cands):
            if cand[0] == truthMentions[mentionIdx][2]:
                tally[str(rank)] += 1
                break
        else:
            # loop finished without a match
            tally['-1'] += 1

for dataset in datasets:
    # get the data from dataset: one JSON document per line. The with-block
    # closes the file even when a line fails to parse.
    dataLines = []
    with open(dataset['path'], 'r') as dataFile:
        # put in all lines
        for line in dataFile:
            dataLines.append(json.loads(line.decode('utf-8').strip()))

    print(dataset['name'] + '\n')
    #print str(datetime.now()) + '\n'
    totalLines = 0

    for line in dataLines:
        print(str(totalLines + 1))

        totalMentions += len(line['mentions'])

        # truth is computed from a copy, before line['mentions'] is replaced below
        oMentions = mentionStartsAndEnds(copy.deepcopy(line), forTruth = True)

        # get in right format
        line['mentions'] = mentionStartsAndEnds(line) # put mentions in right form
        oText = " ".join(line['text'])

        cands1 = generateCandidates1(line, maxC, oText)
        cands2 = generateCandidates2(line, maxC)
        cands3 = generateCandidates3(line, maxC)
        # same candidates as cands1, re-ranked by their third field instead
        cands4 = [sorted(cands, key = itemgetter(2), reverse = True)
                  for cands in cands1]
        cands5 = generateCandidates5(line, maxC)

        # one tally per candidate generator (was five copies of the same loop)
        for candLists, tally in ((cands1, correctCands1),
                                 (cands2, correctCands2),
                                 (cands3, correctCands3),
                                 (cands4, correctCands4),
                                 (cands5, correctCands5)):
            _tallyCorrectRanks(candLists, oMentions, tally)

        totalLines += 1

kore

1


NameError: name 'mentionStartsAndEnds' is not defined

In [2]:
"""Shows the effectiveness of different candidate generating functions"""

x = range(-1, maxC)

# counts per index (-1 first) for each candidate generating function
candsPopRel = [correctCands1[str(rank)] for rank in x]
candsPop = [correctCands2[str(rank)] for rank in x]
candsPopPop = [correctCands3[str(rank)] for rank in x]
candsRelSentence = [correctCands4[str(rank)] for rank in x]
candsRelMentions = [correctCands5[str(rank)] for rank in x]

print('Total Mentions: ' + str(totalMentions))

# one bar chart per generator, followed by its caption and the raw counts
for counts, barColor, caption in [
        (candsPopRel, 'red', 'Most popular of most relevant'),
        (candsPop, 'orange', 'Most popular, mention total'),
        (candsPopPop, 'green', 'Most poplular, page total'),
        (candsRelSentence, 'blue', 'Most relevant with sentence context'),
        (candsRelMentions, 'purple', 'Most relevant with all mentions as context')]:
    plt.bar(x, counts, 0.5, color=barColor)
    plt.show()
    print(caption)
    print(str(counts) + '\n\n')

NameError: name 'maxC' is not defined