In [1]:
from wikipedia import *
from operator import itemgetter
import requests
# Look up the concepts recorded for the anchor text 'Hello'. The result shown
# below is a tuple of pairs, which later cells read as (concept id, frequency).
anchor2concept('Hello')

((173129L, 1L),
 (436657L, 1L),
 (561291L, 1L),
 (619375L, 9L),
 (829828L, 2L),
 (1122016L, 33L),
 (1231628L, 7L),
 (1929746L, 5L),
 (2677658L, 1L),
 (2952280L, 17L),
 (3333421L, 4L),
 (3545769L, 3L),
 (3600798L, 1L),
 (4139463L, 7L),
 (5631711L, 1L),
 (5829200L, 37L),
 (5991011L, 4L),
 (6710844L, 7L),
 (7956284L, 4L),
 (8273746L, 3L),
 (8579043L, 13L),
 (10111613L, 2L),
 (10220560L, 1L),
 (11908540L, 1L),
 (11937084L, 1L),
 (12066052L, 3L),
 (12559739L, 7L),
 (17073430L, 2L),
 (18994196L, 1L),
 (19020847L, 3L),
 (25156814L, 2L),
 (26217848L, 1L),
 (26256152L, 2L),
 (28061309L, 5L),
 (29972030L, 25L),
 (31112163L, 2L),
 (32314734L, 1L),
 (33825268L, 3L),
 (33942076L, 3L),
 (34655850L, 6L),
 (36482433L, 4L),
 (40049488L, 2L),
 (42098234L, 3L),
 (43826902L, 6L),
 (47757964L, 2L),
 (48324759L, 53L),
 (48443879L, 1L))

In [2]:
# Resolve a concept id to its title; 48324759 is the id paired with the largest
# count (53) in the 'Hello' result above.
id2title(48324759L)

'Hello_(Adele_song)'

```bash
curl -X POST \
  'http://localhost:8983/solr/geonames/tag?overlaps=NO_SUB&tagsLimit=5000&fl=id,name,countrycode&wt=json&indent=on' \
  -H 'Content-Type:text/plain' -d 'Hello New York City'
```
 

In [32]:
# Shortest mention (in characters of the query) that is still considered.
MIN_MENTION_LENGTH = 3
# Anchors whose chosen concept has a frequency below this value are ignored.
MIN_FREQUENCY = 20

def stripSmallMentions(potAnchors, minLength=None):
    """
    Description:
        Removes potential anchors with mentions that are too small from the list.
    Args:
        potAnchors: The list of potential anchors, along with some additional information.
            [{'start', 'end', 'mention', 'mentionVars'},...]
        minLength: The smallest accepted mention length, measured as 'end' - 'start'.
            When omitted, the notebook-level MIN_MENTION_LENGTH is used.
    Return:
        A new list of potential anchors; the list passed in is not modified.
    """
    # Resolve the default at call time so a later change of the constant is honoured.
    if minLength is None:
        minLength = MIN_MENTION_LENGTH

    return [potAnchor for potAnchor in potAnchors
            if potAnchor['end'] - potAnchor['start'] >= minLength]

def getMostFrequentConcept(mentions):
    """
    Description:
        Finds the mention whose most frequent candidate concept has the highest
        frequency of all the given mentions.
    Args:
        mentions: A list of mentions to look for the most popular in.
    Return:
        A dictionary of the form {'mention', 'conceptId', 'freq'}. When there are
        no mentions, or none of them has any concept, the dictionary is
        {'mention': None, 'conceptId': None, 'freq': 0}.
    Note:
        A later cell of this notebook defines a function with the same name that
        returns a tuple instead; whichever definition ran last is the one in
        effect, and wikifyPopular relies on the dictionary form returned here.
    """

    # (mention, conceptId, frequency) for the most popular concept of every mention
    mentionConceptFreqs = []
    for mention in mentions:
        concepts = anchor2concept(mention)
        # NOTE(review): assumes an anchor without concepts comes back empty or
        # None -- confirm against anchor2concept; such mentions are skipped.
        if not concepts:
            continue
        # max keeps the first of several equally frequent concepts, which is what
        # the previous stable descending sort followed by [0] selected.
        mostFrequent = max(concepts, key=itemgetter(1))
        # mostFrequent[0] is conceptId, mostFrequent[1] is frequency of that concept
        mentionConceptFreqs.append((mention, mostFrequent[0], mostFrequent[1]))

    # A frequency of 0 lets callers that filter on frequency drop the entry
    # instead of failing on an empty candidate list.
    if not mentionConceptFreqs:
        return {'mention': None, 'conceptId': None, 'freq': 0}

    # get the mention with the highest frequency
    bestMention = max(mentionConceptFreqs, key=itemgetter(2))

    return {'mention': bestMention[0],
            'conceptId': bestMention[1],
            'freq': bestMention[2]}

def wikifyPopular(potAnchors, useOriginalMention):
    """
    Description:
        Takes in a list of potential anchors and attaches to each one its most
        frequent concept.
    Args:
        potAnchors: The list of potential anchors, along with some additional information.
            [{'start', 'end', 'mention', 'mentionVars'},...]
        useOriginalMention: Whether to use the mention from the original or one of the word forms.
    Return:
        The same list that was passed in; every dictionary in it gains the keys
        'conceptId' and 'freq'. An original mention without any concept gets
        conceptId None and freq 0.
    """

    # Kept as an explicit comparison with False so that exactly the same argument
    # values as before select the mention-variation branch.
    useVariations = (useOriginalMention == False)

    for potAnchor in potAnchors:
        if useVariations:
            # use the mention variation with the most frequent results
            bestMention = getMostFrequentConcept(potAnchor['mentionVars'])
            conceptId = bestMention['conceptId']
            freq = bestMention['freq']
        else:
            concepts = anchor2concept(potAnchor['mention'])
            if concepts:
                # first of the most frequent concepts, same pick as a stable
                # descending sort followed by [0]
                mentionData = max(concepts, key=itemgetter(1))
                conceptId = mentionData[0]
                freq = mentionData[1]
            else:
                # NOTE(review): assumes an unknown anchor yields an empty result or
                # None -- confirm against anchor2concept. Frequency 0 lets a
                # frequency threshold discard the anchor later.
                conceptId = None
                freq = 0

        # either way adds 'conceptId', and 'freq'
        potAnchor['conceptId'] = conceptId
        potAnchor['freq'] = freq

    return potAnchors

def wikify(query, useOriginalMention, method='popular'):
    """
    Description:
        Takes the query string, and wikifies it using the desired method.
    Args:
        query: The string to wikify.
        useOriginalMention: Whether to use mention from the original or a potential variation.
        method: The method used to wikify. Only 'popular' is implemented.
    Return:
        A list of {'mention', 'wikiTitle'} dictionaries, one for every anchor whose
        concept frequency is at least MIN_FREQUENCY.
    Raises:
        ValueError: If method is not a supported wikification method.
        requests.exceptions.HTTPError: If the tagger request returns an error status.
    """

    # Reject unknown methods before doing any work: no other method fills in
    # 'freq', so the final filter used to fail with a KeyError instead.
    if method != 'popular':
        raise ValueError("unsupported wikify method: %r" % (method,))

    # first get the potential anchors from solr
    addr = 'http://localhost:8983/solr/enwikianchors20160305/tag'
    params = {'overlaps':'ALL', 'tagsLimit':'5000', 'fl':'id','wt':'json','indent':'on'}
    r = requests.post(addr, params=params, data=query)
    # surface an HTTP failure here rather than as a confusing JSON/key error below
    r.raise_for_status()
    queryResult = r.json()['tags']

    # One dictionary per potential anchor. Each tag record is read by position:
    # record[1] is the start offset, record[3] the end offset and record[5] the
    # mention variations.
    potAnchors = [{'start':record[1], 'end':record[3],
                   'mention':query[record[1]:record[3]],
                   'mentionVars':record[5]}
                  for record in queryResult]

    # don't use any potential anchors below size threshold
    potAnchors = stripSmallMentions(potAnchors)

    potAnchors = wikifyPopular(potAnchors, useOriginalMention)

    # TODO: deal with overlap on anchors here

    # return anchors in finalized form
    return [{'mention':potAnchor['mention'],
             'wikiTitle':id2title(potAnchor['conceptId'])}
            for potAnchor in potAnchors
            if potAnchor['freq'] >= MIN_FREQUENCY]

In [None]:
"""
Ideas:
    -anchor frequency adjuster
    -use similarity with other anchors

Sample Queries:
    'I walked down to the park and found a duck and a pebble'
    'I walked into an electronic store and bought a pebble'
    'I walked down to the park and found a duck studying quantum mechanics'
    'I walked down to the park and found a duck studying quantum mechanical systems'
    'I met David in Spain'
    'An entomologist spots what might be a rare subspecies of beetle, due to the pattern on its back'
"""

In [38]:
phrase = 'I met David in Spain'
print(phrase + "\n")

# Wikify the same phrase twice: first with the original mentions, then with the
# most frequent mention variation. A blank line separates the two listings.
# Single-argument print(...) calls behave the same under Python 2 and Python 3.
for useOriginalMention in (True, False):
    anchors = wikify(phrase, useOriginalMention)
    for anchor in anchors:
        print(anchor['mention'] + '-->' + anchor['wikiTitle'])

    if useOriginalMention:
        print("")

I met David in Spain

David-->David
Spain-->Spain

David-->David
Spain-->Spain


In [13]:
# NOTE(review): q and queryResult are not assigned in any cell visible in this
# notebook, so this cell appears to rely on leftover kernel state -- confirm.
print(q)

# do not use entities with less frequencies than this
minAcceptedFrequency = 20
# for efficiency do not even look at words with less size than this
# (getAnchorPossibilities reads this name as a global)
minWordLength = 3

# the total amount of potential mentions including overlaps
potentialMentionsAmount = len(queryResult)

# records appear to be sorted by start index

# every word together with the variations of that potential anchor
anchorsPossibilities = getAnchorPossibilities(queryResult)

# Pair each potential anchor with the most popular concept among its variations.
# The indexing further down expects getMostFrequentConcept to return the
# (representation, concept, frequency) tuple of the variant defined in cell In [4].
anchorPossibilityFrequencies = []
for possibility in anchorsPossibilities:
    bestConcept = getMostFrequentConcept(possibility[3])
    anchorPossibilityFrequencies.append(
        (possibility[0], possibility[1], possibility[2], bestConcept))

# keep only the anchors whose frequency reaches the threshold
prunedPotentialAnchors = [candidate for candidate in anchorPossibilityFrequencies
                          if candidate[3][2] >= minAcceptedFrequency]

# display final results
for anchor in prunedPotentialAnchors:
    wikiUrl = "https://en.wikipedia.org/wiki/" + id2title(anchor[3][1])
    print(anchor[2] + "-->" + wikiUrl)

I walked down to the park and found a duck studying quantum mechanical systems
[(2, 8, 'walked', [u'walked']), (9, 13, 'down', [u'down', u'"down"']), (17, 20, 'the', [u'the', u'"the"']), (17, 25, 'the park', [u'the park']), (21, 25, 'park', [u'park']), (26, 29, 'and', [u'and', u'"+" and "\u2212"']), (30, 35, 'found', [u'found']), (36, 42, 'a duck', [u'a duck']), (38, 42, 'duck', [u'duck', u"duck's"]), (43, 51, 'studying', [u'studying']), (52, 59, 'quantum', [u'quantum', u'"quantum"']), (52, 70, 'quantum mechanical', [u'quantum mechanical', u'quantum-mechanical']), (52, 78, 'quantum mechanical systems', [u'quantum mechanical systems']), (60, 70, 'mechanical', [u'mechanical']), (60, 78, 'mechanical systems', [u'mechanical systems']), (71, 78, 'systems', [u'systems', u'.systems', u"systems'"])]
[(2, 8, 'walked', (u'walked', 3802L, 206L)), (9, 13, 'down', (u'down', 1084904L, 102L)), (21, 25, 'park', (u'park', 166459L, 1330L)), (38, 42, 'duck', (u'duck', 37674L, 1284L)), (52, 59, 'quantum',

In [3]:
def getAnchorPossibilities(data, text=None, minLength=None):
    """
    Description: 
        Extracts all potential anchors from each record in the data.
    Args:
        data: The records to get potential anchors from. Each record is read by
            position: record[1] is the start index, record[3] the end index and
            record[5] the possible anchors.
        text: The string the start and end indices refer to. When omitted, the
            notebook-level variable q is used.
        minLength: The shortest source word to keep, measured as end index minus
            start index. When omitted, the notebook-level minWordLength is used.
    Return: 
        An array of possible anchors for the source word in the following format:
        (start index, end index, source word, possible anchors)
    """

    records = list(data)
    # Nothing to extract; returning here also means the notebook globals are
    # only looked up when there is at least one record, as before.
    if not records:
        return []

    # Fall back to the notebook globals this function originally read directly.
    if text is None:
        text = q
    if minLength is None:
        minLength = minWordLength

    # don't take words below the threshold
    return [(record[1], record[3], text[record[1]:record[3]], record[5])
            for record in records
            if record[3] - record[1] >= minLength]

In [4]:
def getMostFrequentConcept(anchorSearchRepresentations):
    """
    Description:
        Finds the anchor-search representation (asr) with the most frequency in the list 
        of anchorSearchRepresentations.
    Args:
        anchorSearchRepresentations: a list of possible representations of an anchor
        for searching purposes.
    Return:
        A tuple (asr, concept, concept frequency) for the anchor-search
        representation that gives the concept with the most frequencies. When the
        list is empty, or no representation has any concept, (None, None, 0).
    Note:
        Another cell of this notebook defines a function with the same name that
        takes mentions and returns a dictionary; whichever definition ran last is
        the one in effect.
    """

    # The inputted asrs along with the frequency of their most popular concept
    asrFrequencies = []
    for asr in anchorSearchRepresentations:
        concepts = anchor2concept(asr)
        # NOTE(review): assumes a representation without concepts comes back empty
        # or None -- confirm against anchor2concept; such entries are skipped.
        if not concepts:
            continue
        # max keeps the first of several equally frequent concepts, which is what
        # the previous stable descending sort followed by [0] selected.
        mostFrequent = max(concepts, key=itemgetter(1))
        # (asr, concept, concept frequency)
        asrFrequencies.append((asr, mostFrequent[0], mostFrequent[1]))

    # A frequency of 0 lets a frequency threshold discard the entry instead of
    # this function failing on an empty candidate list.
    if not asrFrequencies:
        return (None, None, 0)

    # get and return the asr with the highest frequency
    return max(asrFrequencies, key=itemgetter(2))

In [None]:
%%bash
# POST the sample sentence as plain text to the /tag handler of the
# enwikianchors20160305 Solr core. This request uses overlaps=NO_SUB, whereas
# wikify() in this notebook sends overlaps=ALL.
curl -X POST \
  'http://localhost:8983/solr/enwikianchors20160305/tag?overlaps=NO_SUB&tagsLimit=5000&fl=id&wt=json&indent=on' \
  -H 'Content-Type:text/plain' -d 'I met David in Spain'