In [None]:
from wikipedia import *
from operator import itemgetter
import requests
# Scratch check: look up what anchor2concept returns for the anchor text 'Hello'.
# NOTE(review): anchor2concept is presumably provided by the star import of
# `wikipedia` above - confirm.
anchor2concept('Hello')

In [None]:
# Scratch check: resolve a page id to its title (the trailing L is a Python 2 long literal).
id2title(48324759L)

```curl -X POST \
  'http://localhost:8983/solr/geonames/tag?overlaps=NO_SUB&tagsLimit=5000&fl=id,name,countrycode&wt=json&indent=on' \
  -H 'Content-Type:text/plain' -d 'Hello New York City'```
 

In [15]:
"""
Wikification for evaluation purposes
"""

from wikipedia import *
from operator import itemgetter
import requests
import json
from __future__ import division

MIN_MENTION_LENGTH = 3 # strict mode only: mentions whose text is shorter than this are dropped before candidate generation
MIN_FREQUENCY = 20 # strict mode only: winning concepts with a frequency below this are dropped from the result
     
def splitWords(phrase):
    """
    Description:
        Sends the phrase to the Solr tagger and cuts out the pieces of text it tags.
    Args:
        phrase: The text to be split.
    Return:
        A dict in the same format as the provided datasets: 'text' holds the tagged
        pieces in the order Solr reported them, and 'mentions' holds one
        [index, '0'] pair per piece ('0' is a placeholder for the entity).
    """

    response = requests.post(
        'http://localhost:8983/solr/enwikianchors20160305/tag',
        params={'overlaps':'LONGEST_DOMINANT_RIGHT', 'tagsLimit':'5000', 'fl':'id','wt':'json','indent':'on'},
        data=phrase)

    # every tag is indexed positionally: slot 1 is used as the start offset and
    # slot 3 as the end offset of the tagged piece inside phrase
    pieces = [phrase[tag[1]:tag[3]] for tag in response.json()['tags']]

    # every tagged piece is treated as a mention with a placeholder entity
    placeholders = [[position, '0'] for position in range(len(pieces))]

    return {'text': pieces, 'mentions': placeholders}

def generateCandidates(phrase, maxC):
    """
    Description:
        Looks up the concepts for the text of every mention and keeps the maxC
        entries with the largest value at index 1 (the frequency).
    Args:
        phrase: A phrase in split form along with its suspected mentions.
        maxC: The maximum number of candidates to keep for each mention.
    Return:
        One list per mention, in mention order, each holding at most maxC
        candidates sorted from highest to lowest frequency.
    """
    byFrequency = itemgetter(1)

    def rankedFor(word):
        # sort a private copy, most frequent first, then cut to the limit
        ranked = list(anchor2concept(word))
        ranked.sort(key = byFrequency, reverse = True)
        return ranked[:maxC]

    return [rankedFor(phrase['text'][mention[0]]) for mention in phrase['mentions']]

def precision(truthSet, mySet):
    """
    Description:
        Calculates the precision of mySet against the truthSet. Also prints the
        number of correct and found entities.
    Args:
        truthSet: The 'right' answers for what the entities are. Index 1 of each
            item is a title that is converted with title2id before comparing.
        mySet: My code's output for what it thinks the right entities are. Index 1
            of each item is the entity id.
    Return:
        The precision: (# of correct entities)/(# of found entities), or 0 when
        nothing was found.
    """

    numFound = len(mySet)

    # Resolve each truth title once instead of once per (found, truth) pair.
    # Nothing is resolved when there is nothing to compare against.
    if mySet:
        truthIds = [title2id(entity[1]) for entity in truthSet]
    else:
        truthIds = []

    # a found entity is correct when its id equals the id of any truth entity
    numCorrect = sum(1 for entity in mySet if entity[1] in truthIds)

    print('correct: ' + str(numCorrect) + '\nfound: ' + str(numFound))
    if numFound == 0:
        return 0

    # float() keeps this a true division even without `from __future__ import division`
    return float(numCorrect) / numFound

def recall(truthSet, mySet):
    """
    Description:
        Calculates the recall of mySet against the truthSet.
    Args:
        truthSet: The 'right' answers for what the entities are. Index 1 of each
            item is a title that is converted with title2id before comparing.
        mySet: My code's output for what it thinks the right entities are. Index 1
            of each item is the entity id.
    Return:
        The recall: (# of correct entities)/(# of actual entities), or 0 when
        there are no actual entities.
    """

    numActual = len(truthSet)

    # Resolve each truth title once instead of once per (found, truth) pair.
    # Nothing is resolved when there is nothing to compare against.
    if mySet:
        truthIds = [title2id(entity[1]) for entity in truthSet]
    else:
        truthIds = []

    # NOTE(review): this counts entries of mySet that match some truth entity, so
    # two found entries matching the same truth entity are both counted -
    # presumably mySet has at most one entry per mention; verify against callers.
    numCorrect = sum(1 for entity in mySet if entity[1] in truthIds)

    if numActual == 0:
        return 0

    # float() keeps this a true division even without `from __future__ import division`
    return float(numCorrect) / numActual
    
def getSurroundingWords(phrase, axis, branchSize):
    """
    Description:
        Returns the words as a list that surround the given axis. Expanding out branchSize elements
        on both sides.
    Args:
        phrase: A list of words.
        axis: The index of the word that is the center of where to get surrounding words.
        branchSize: The amount of words to the left and right to get.
    Return:
        The words as a list that surround the given axis. Expanding out branchSize elements
        on both sides (fewer where the phrase ends first). The axis word itself is not included.
    """

    # clamp the window to the bounds of the phrase
    imin = max(axis - branchSize, 0)
    # the slice end is exclusive, so +1 is needed to really take branchSize words
    # to the right of the axis (without it the right side was one word short)
    imax = min(axis + branchSize + 1, len(phrase))

    # return surrounding part of word minus the axis word
    return (phrase[imin:axis] + phrase[axis+1:imax])

def escapeStringSolr(text):
    """
    Description:
        Escapes a given string for use in Solr.
    Args:
        text: The string to escape.
    Return:
        The escaped text: every backslash becomes three backslashes, and each of
        the tokens listed below gets a single backslash put in front of it.
    """

    backslash = '\\'

    # Backslashes are handled first so that the backslashes inserted for the
    # other tokens below are not expanded again.
    escaped = text.replace(backslash, backslash * 3)

    # '&&' and '||' are escaped as a pair; a single '&' or '|' is left alone
    for token in ('+', '-', '&&', '||', '!', '(', ')', '{', '}',
                  '[', ']', '^', '"', '~', '*', '?', ':'):
        escaped = escaped.replace(token, backslash + token)

    return escaped

def bestContextMatch(mention, context, candidates):
    """
    Description:
        Uses Solr to find the candidate that gives the highest relevance when given the context.
    Args:
        mention: The mention as it appears in the text
        context: The words that surround the target word.
        candidates: A list of candidates that each have the entity id and its frequency/popularity.
    Return:
        The index of the candidate with the best relevance score from the context.
        Falls back to 0 (the first candidate) when Solr reports an error, returns no
        documents, or returns an id that is not among the candidates.
    """

    # put text in right format
    # The escaped text is sent as is. Running decode('string_escape') over it (as
    # was done before) turned \" back into a bare quote, which undoes the escaping,
    # and raised ValueError on input with a trailing backslash.
    text = escapeStringSolr((" ".join(context)).encode('utf-8'))
    mention = escapeStringSolr(mention.encode('utf-8'))

    # restrict the search to the candidate documents
    strIds = ['id:' + str(cand[0]) for cand in candidates]

    # select the candidate docs from Solr; no sort is requested, so this relies on
    # Solr's default ordering (by score, highest first)
    addr = 'http://localhost:8983/solr/enwiki20160305/select'
    params={'fl':'id score', 'fq':" ".join(strIds), 'indent':'on',
            'q':'text:(' + text + ') title:(' + mention + ')^0.5',
            'wt':'json'}
    r = requests.get(addr, params = params)

    reply = r.json() # parse the body once and reuse it

    if 'response' not in reply:
        print('fail: error')
        return 0 # default to most popular

    results = reply['response']['docs']
    if len(results) == 0:
        print('fail: no results')
        return 0 # default to most popular

    best = results[0]
    bestId = long(best['id'])
    print(best['score'])

    # find which index has bestId
    for index, cand in enumerate(candidates):
        if cand[0] == bestId:
            return index

    # The id was not among the candidates. Returning len(candidates) here (as
    # before) made the caller index out of range, so default to most popular.
    return 0
    
def wikifyPopular(phrase, candidates):
    """
    Description:
        Picks the first candidate of every mention, i.e. the most popular one when
        the candidates are sorted by frequency.
    Args:
        phrase: A phrase in split form along with its suspected mentions.
        candidates: For each mention, a list of candidates that each have the
            entity id and its frequency/popularity.
    Return:
        One [word index, entity id, entity frequency] entry per mention; a mention
        without candidates gets [word index, 0, 0].
    """

    def pick(wordIndex, ranked):
        # no candidates at all marks a bad mention
        if len(ranked) == 0:
            return [wordIndex, 0, 0]
        best = ranked[0]
        return [wordIndex, best[0], best[1]]

    # candidates[position] belongs to the mention at the same position
    return [pick(mention[0], candidates[position])
            for position, mention in enumerate(phrase['mentions'])]

def wikifyContext(phrase, candidates, ctxBrchSz = 5):
    """
    Description:
        For every mention, picks the candidate that bestContextMatch selects for
        the words surrounding the mention.
    Args:
        phrase: A phrase in split form along with its suspected mentions.
        candidates: For each mention, a list of candidates that each have the
            entity id and its frequency/popularity.
        ctxBrchSz: How many words on both sides of a mention to search.
    Return:
        One [word index, entity id, entity frequency] entry per mention; a mention
        without candidates gets [word index, 0, 0].
    """

    topCandidates = []

    # candidates[position] belongs to the mention at the same position
    for position, mention in enumerate(phrase['mentions']):
        wordIndex = mention[0]
        options = candidates[position]

        if len(options) == 0:
            topCandidates.append([wordIndex, 0, 0]) # a bad mention
            continue

        # the words around the mention decide which candidate wins
        context = getSurroundingWords(phrase['text'], wordIndex, ctxBrchSz)
        bestIndex = bestContextMatch(phrase['text'][wordIndex], context, options)
        topCandidates.append([wordIndex, options[bestIndex][0], options[bestIndex][1]])

    return topCandidates

def wikifyEval(phrase, mentionsGiven, maxC = 20, method='popular', strict = False):
    """
    Description:
        Takes the phrase, and wikifies it for evaluation purposes using the desired method.
    Args:
        phrase: The text to wikify. Either the original string (mentionsGiven is False), or
            the pre-split form {'text': [w1,w2,...], 'mentions': [[wid,entity],...]}
            (mentionsGiven is True). A pre-split phrase is not modified.
        mentionsGiven: Whether the mentions are given to us and the text is already split.
        maxC: The max amount of candidates to extract.
        method: The method used to wikify: 'popular' or 'context'.
        strict: Whether to use such rules as minimum mention length, or minimum frequency of concept.
    Return:
        The split text and the mentions along with their best matched concept from wikipedia.
        Of the form: [[w1,w2,...], [[wid,entityId,frequency],...]]
    Raises:
        ValueError: If method is not one of the supported methods.
    """

    # Fail early with a clear message. An unknown method used to produce a result
    # without the second element, which only surfaced later as an IndexError.
    if method not in ('popular', 'context'):
        raise ValueError('unknown wikification method: ' + repr(method))

    # words are not in pre-split form
    if not(mentionsGiven):
        phrase = splitWords(phrase) # modify phrase into split form

    text = phrase['text']
    mentions = phrase['mentions']

    # get rid of small mentions
    if strict:
        mentions = [item for item in mentions
                    if len(text[item[0]]) >= MIN_MENTION_LENGTH]

    # Work on a private dict so that filtering never changes the caller's phrase
    # (for pre-split input that is the dataset line itself).
    phrase = {'text': text, 'mentions': mentions}

    candidates = generateCandidates(phrase, maxC)

    if method == 'popular':
        chosen = wikifyPopular(phrase, candidates)
    else:
        # the whole text is used as context
        chosen = wikifyContext(phrase, candidates, ctxBrchSz = len(text))

    # get rid of very unpopular mentions
    if strict:
        chosen = [item for item in chosen
                  if item[2] >= MIN_FREQUENCY]

    # entries that resolve to the same entity are kept; no de-duplication is done
    return [text, chosen]

In [16]:
from IPython.display import clear_output

"""
This is for testing performance of different wikification methods.
"""

def getWiki5000Entities(annotationData):
    """
    Description:
        A helper method that converts the annotations of wiki5000 into the entity
        format used by the other datasets.
    Args:
        annotationData: A json string holding a list of annotations, each with a 'url' entry.
    Return:
        The entities in the usual format of [[something, entity],...], where the
        first slot is None and the entity is the url with spaces turned into underscores.
    """

    annotations = json.loads(annotationData)
    return [[None, annotation['url'].replace(' ', '_')] for annotation in annotations]

import os # needed for the dataset paths; imported here so the cell does not depend on a star import

#pathStrt = '/users/cs/amaral/wsd-datasets'
pathStrt = 'C:\\Temp\\wsd-datasets'

# the data sets for performing on
datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')},
            {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')},
            {'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')},
            {'name':'wiki5000', 'path':os.path.join(pathStrt,'wiki.5000.json')}]

# short for quick tests; the last uncommented assignment is the one that is used
#datasets = [{'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}]
#datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}, {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')}]
#datasets = [{'name':'wiki5000', 'path':os.path.join(pathStrt,'wiki.5000.json')}]
datasets = [{'name':'kore', 'path':os.path.join(pathStrt,'kore.json')}, {'name':'AQUAINT', 'path':os.path.join(pathStrt,'AQUAINT.txt.json')}, {'name':'MSNBC', 'path':os.path.join(pathStrt,'MSNBC.txt.json')}]

methods = ['context']

performances = {}

# for each dataset, run all methods
for dataset in datasets:
    performances[dataset['name']] = {}

    # get the data from dataset: one json document per line
    # (the with-block makes sure the file is closed again)
    with open(dataset['path'], 'r') as dataFile:
        dataLines = [json.loads(line.decode('utf-8').strip()) for line in dataFile]

    print(dataset['name'])

    # run each method on the data set
    for mthd in methods:
        print(mthd)

        # reset counters
        totalPrecS = 0
        totalPrecM = 0
        totalRecS = 0
        totalRecM = 0
        totalLines = 0

        # each method tests all lines
        for line in dataLines:
            # different structure for wiki
            if dataset['name'] == 'wiki5000':
                resultS = None # no pre-split text
                resultM = wikifyEval(line['opening_text'].encode('utf-8').strip(), False, method = mthd)

                # for unification of format for statistical testing
                trueEntities = getWiki5000Entities(line['opening_annotation'])
            else:
                # original split string
                resultS = wikifyEval(line, True, method = mthd)

                # The manual split is switched off for these data sets; an empty
                # result keeps the statistics below uniform. This placeholder used
                # to be assigned after the if/else, which also threw away the
                # wiki5000 result computed above.
                #resultM = wikifyEval((" ".join(line['text'])).encode('utf-8').strip(), False, method = mthd)
                resultM = [[],[]]

                trueEntities = line['mentions']

            ## get statistical results from true entities and results S and M

            hasPreSplit = resultS is not None # wiki5000 exception

            precS = precision(trueEntities, resultS[1]) if hasPreSplit else 0 # precision of pre-split
            precM = precision(trueEntities, resultM[1]) # precision of manual split
            recS = recall(trueEntities, resultS[1]) if hasPreSplit else 0 # recall of pre-split
            recM = recall(trueEntities, resultM[1]) # recall of manual split

            print(str(precS) + ' ' + str(precM) + ' ' + str(recS) + ' ' + str(recM))

            # track results
            totalPrecS += precS
            totalPrecM += precM
            totalRecS += recS
            totalRecM += recM
            totalLines += 1

            print(str(totalLines) + '\n')

        # An empty data file has nothing to average: all totals are 0 then, so
        # dividing by 1 reports 0.0 instead of raising ZeroDivisionError.
        # The float keeps the averages true divisions.
        lineCount = float(totalLines) if totalLines else 1.0

        # record results for this method on this dataset
        performances[dataset['name']][mthd] = {'Pre-Split Precision':totalPrecS/lineCount,
                                               'Manual Split Precision':totalPrecM/lineCount,
                                               'Pre-Split Recall':totalRecS/lineCount,
                                               'Manual Split Recall':totalRecM/lineCount}

print(performances)

kore
popular
correct: 0
found: 2
correct: 0
found: 0
0.0 0 0.0 0.0
1

correct: 0
found: 2
correct: 0
found: 0
0.0 0 0.0 0.0
2

correct: 1
found: 2
correct: 0
found: 0
0.5 0 0.5 0.0
3

correct: 0
found: 2
correct: 0
found: 0
0.0 0 0.0 0.0
4

correct: 1
found: 1
correct: 0
found: 0
1.0 0 1.0 0.0
5

correct: 1
found: 2
correct: 0
found: 0
0.5 0 0.5 0.0
6

correct: 0
found: 3
correct: 0
found: 0
0.0 0 0.0 0.0
7

correct: 1
found: 3
correct: 0
found: 0
0.333333333333 0 0.333333333333 0.0
8

correct: 0
found: 2
correct: 0
found: 0
0.0 0 0.0 0.0
9

correct: 2
found: 5
correct: 0
found: 0
0.4 0 0.4 0.0
10

correct: 2
found: 4
correct: 0
found: 0
0.5 0 0.5 0.0
11

correct: 1
found: 3
correct: 0
found: 0
0.333333333333 0 0.333333333333 0.0
12

correct: 0
found: 3
correct: 0
found: 0
0.0 0 0.0 0.0
13

correct: 3
found: 5
correct: 0
found: 0
0.6 0 0.6 0.0
14

correct: 3
found: 6
correct: 0
found: 0
0.5 0 0.5 0.0
15

correct: 0
found: 3
correct: 0
found: 0
0.0 0 0.0 0.0
16

correct: 1
found: 3
corr

In [None]:
"""
Test individual text on wikification.
"""

# one dataset line in its raw json form
rawLine = """{"text": ["Three", "of", "the", "greatest", "guitarists", "started", "their", "career", "in", "a", "single", "band", ":", "Clapton", ",", "Beck", ",", "and", "Page", "."], "mentions": [[13, "Eric_Clapton"], [15, "Jeff_Beck"], [18, "Jimmy_Page"]]}
"""
data = json.loads(rawLine.decode('utf-8').strip())

print(str(data) + '\n')

# the words joined back into one string, which is what gets wikified
sentence = " ".join(data['text']).encode('utf-8').strip()
print(sentence)

# wikify the unsplit string, so the mentions are found by splitWords
results = wikifyEval(sentence, False, method='popular')
print(results[0])
for result in results[1]:
    print(id2title(result[1]))

# compare against the mentions that came with the data
prec = precision(data['mentions'], results[1])
rec = recall(data['mentions'], results[1])

print('\nprecision: ' + str(prec) + ', rec: ' + str(rec) + '\n')

In [None]:
"""
This is for testing if the wikification works.
"""

from IPython.core.display import display, HTML

phrase = 'Three of the greatest guitarists started their career in a single band : Clapton , Beck , and Page'
print(phrase + "\n")

# list the links wikify finds, first with False and then with True as its second
# argument; the anchors of the second run are the ones rendered below
for flag in (False, True):
    anchors = wikify(phrase, flag)
    for anchor in anchors:
        print(anchor['mention'] + '-->' + anchor['wikiTitle'])

    print('')

# Rebuild the phrase as html: text outside the anchors is copied character by
# character, each anchor is replaced by a link to its wikipedia page.
anchors = sorted(anchors, key=itemgetter('start')) # make sure anchors are sorted
pieces = []
anchorIndex = 0 # keep track of current anchor added
i = 0
while i < len(phrase):
    atAnchor = anchorIndex < len(anchors) and i == anchors[anchorIndex]['start']
    if not atAnchor:
        pieces.append(phrase[i])
        i += 1
        continue

    anchor = anchors[anchorIndex]
    pieces.append("<a href=\"https://en.wikipedia.org/wiki/" + anchor['wikiTitle']
                  + "\" target=\"_blank\">" + anchor['mention'] + "</a>")
    i = anchor['end'] # continue right behind the anchor text
    anchorIndex += 1

newText = "".join(pieces)

display(HTML(newText))

In [None]:
"""
Ideas:
    -anchor frequency adjuster
    -use similarity with other anchors

Sample Queries:
    'I walked down to the park and found a duck and a pebble'
    'I walked into an electronic store and bought a pebble'
    'I walked down to the park and found a duck studying quantum mechanics'
    'I walked down to the park and found a duck studying quantum mechanical systems'
    'I met David in Spain'
    'An entomologist spots what might be a rare subspecies of beetle, due to the pattern on its back'
"""

In [None]:
# concepts for the anchor "David Edgar", ordered by index 1 (the frequency), highest first
tmp = list(anchor2concept("David Edgar"))
tmp.sort(key = itemgetter(1), reverse = True)

# show the id and the title of every concept
for tmpp in tmp:
    print('id: ' + str(tmpp[0]) + ', title: ' + id2title(tmpp[0]))

In [None]:
# split a sample sentence into its mentions and show the result
sampleSentence = 'I walked down to the park and found a duck studying quantum mechanical systems'
split = splitWords(sampleSentence)
print(split)

# up to 20 candidates for every mention
cands = generateCandidates(split, 20)

print(cands)

In [None]:
# Scratch check: resolve a page id to its title (the trailing L is a Python 2 long literal).
id2title(33509L)

In [None]:
# Scratch test: escape a sample sentence and send it to Solr as a context query
# that is restricted to two candidate documents.
text = " ".join(["Three", "of", "the", "greatest", "guitarists", "started", "their", "career", "in", "a", "single", "band", ":", "Clapton", ",", "Beck", ",", "and", "Page", "."])
print(text)

# Use the shared helper instead of a copy of its replace rules. The escaped text
# is used as is: decode('string_escape') leaves these escapes untouched except
# for \" which it would turn back into a bare quote.
text = escapeStringSolr(text)

print(text + '\n\n')

addr = 'http://localhost:8983/solr/enwiki20160305/select'
# the query field is 'text'; it used to be written '\text', which python reads
# as a TAB character followed by 'ext'
params={'fl':'title id score', 'fq':'id:8551 id:8618', 'indent':'on', 'q':'text:(' + text + ')', 'wt':'json'}
r = requests.get(addr, params = params)
textData = r.json()

print(textData)