In [131]:
from wikipedia import *
from operator import itemgetter
# Sanity check: look up the concepts recorded for the anchor text "I's".
# The displayed result is a tuple of pairs; elsewhere in this notebook each pair
# is read as (concept id, frequency).
anchor2concept('I\'s')

((1053836L, 1L),)

In [132]:
# Resolve the concept id returned by the anchor lookup above to its page title.
id2title(1053836L)

'I"s'

```bash
curl -X POST \
  'http://localhost:8983/solr/geonames/tag?overlaps=NO_SUB&tagsLimit=5000&fl=id,name,countrycode&wt=json&indent=on' \
  -H 'Content-Type:text/plain' -d 'Hello New York City'
```
 

In [148]:
import requests

# NOTE: this cell calls getAnchorPossibilities and getMostFrequentConcept, whose
# defining cells appear further down in this notebook; run those cells first.

# Solr tagger endpoint backed by the Wikipedia anchor index
qstr = 'http://localhost:8983/solr/enwikianchors20160305/tag'
q='I walked down to the park and found a duck and a pebble'
# alternative sample sentences:
#q='I walked into an electronic store and bought a pebble'
#q='I met David in Spain'
#q='An entomologist spots what might be a rare subspecies of beetle, due to the pattern on its back.'
params={'overlaps':'ALL', 'tagsLimit':'5000', 'fl':'id','wt':'json','indent':'on'}
r = requests.post(qstr, params=params, data=q)
# surface an HTTP failure here instead of as a KeyError on the missing 'tags' key
r.raise_for_status()
queryResult = r.json()['tags']

minAcceptedFrequency = 20 # do not use entities with less frequencies than this
minWordLength = 3 # for efficiency do not even look at words with less size than this

potentialMentionsAmount = len(queryResult) # the total amount of potential mentions including overlaps

# records appear to be sorted by start index

# get each word and the corresponding variations of that potential anchor
# -> list of (start index, end index, source word, possible anchors)
anchorsPossibilities = getAnchorPossibilities(queryResult)

# get the most popular concept for the variations of each potential anchor
# -> list of (start index, end index, source word, (anchor, concept, frequency))
anchorPossibilityFrequencies = [
    (start, end, word, getMostFrequentConcept(candidates))
    for start, end, word, candidates in anchorsPossibilities]

# keep only the anchors whose best concept reaches the frequency threshold
prunedPotentialAnchors = [
    potentialAnchor for potentialAnchor in anchorPossibilityFrequencies
    if potentialAnchor[3][2] >= minAcceptedFrequency]

# single-argument print(...) behaves the same under Python 2 and Python 3
print(prunedPotentialAnchors)

# display final results
for start, end, word, (asr, conceptId, frequency) in prunedPotentialAnchors:
    print(word + "-->" + "https://en.wikipedia.org/wiki/"
          + id2title(conceptId))

[(2, 8, 'walked', (u'walked', 3802L, 206L)), (9, 13, 'down', (u'down', 1084904L, 102L)), (21, 25, 'park', (u'park', 166459L, 1330L)), (38, 42, 'duck', (u'duck', 37674L, 1284L)), (49, 55, 'pebble', (u'pebble', 314610L, 128L))]
walked-->https://en.wikipedia.org/wiki/Base_on_balls
down-->https://en.wikipedia.org/wiki/Down_feather
park-->https://en.wikipedia.org/wiki/Park
duck-->https://en.wikipedia.org/wiki/Duck
pebble-->https://en.wikipedia.org/wiki/Pebble


In [127]:
def getAnchorPossibilities(data, text=None, minLength=None):
    """
    Description:
        Extracts all potential anchors from each record in the data.
    Args:
        data: tag records to get potential anchors from. Each record is read
            positionally: record[1] is the start index, record[3] the end
            index and record[5] the possible anchors.
        text: the string the indexes refer to; the source word is sliced out
            of it. Defaults to the notebook-level query `q`.
        minLength: shortest span (end index - start index) that is kept.
            Defaults to the notebook-level `minWordLength`.
    Return:
        An array of possible anchor for the source word in the following format:
        (start index, end index, source word, possible anchors)
    """

    # Fall back to the notebook globals only when the caller did not pass the
    # values explicitly, so existing one-argument calls keep working.
    if text is None:
        text = q
    if minLength is None:
        minLength = minWordLength

    # NOTE(review): slicing assumes the indexes count the same units as `text`
    # (characters vs. bytes) - confirm for non-ASCII input.
    return [(record[1], record[3], text[record[1]:record[3]], record[5])
            for record in data
            # don't take words below the threshold
            if record[3] - record[1] >= minLength]

In [133]:
def getMostFrequentConcept(anchorSearchRepresentations):
    """
    Description:
        Finds the anchor-search representation (asr) with the most frequency in the list
        of anchorSearchRepresentations.
    Args:
        anchorSearchRepresentations: a list of possible representations of an anchor
        for searching purposes.
    Return:
        A tuple (asr, concept, concept frequency) for the anchor-search
        representation that gives the concept with the most frequencies.
        On ties the earliest candidate wins. If no representation yields any
        concept (including an empty input list), returns (None, None, 0) so
        that a frequency-threshold check simply discards the result instead
        of the lookup raising IndexError.
    """

    # The inputted asrs along with the frequency of their most popular concept
    asrFrequencies = []
    for asr in anchorSearchRepresentations:
        concepts = anchor2concept(asr)
        if not concepts:
            # nothing recorded for this representation; it cannot win
            continue
        # each entry is read as (concept, frequency); max keeps the first
        # entry among equals, like a stable descending sort would
        mostFrequent = max(concepts, key=itemgetter(1))
        # (asr, concept, concept frequency)
        asrFrequencies.append((asr, mostFrequent[0], mostFrequent[1]))

    if not asrFrequencies:
        return (None, None, 0)

    # get and return the asr with the highest frequency
    return max(asrFrequencies, key=itemgetter(2))

In [4]:
%%bash
# POST one sentence to the tagger endpoint and print the raw JSON response.
# Each entry of "tags" comes back as a flat list:
#   ["startOffset", <start>, "endOffset", <end>, "ids", [<anchor>, ...]]
# -sS hides curl's progress meter (it otherwise lands in the cell output)
# while still reporting errors.
curl -sS -X POST \
  'http://localhost:8983/solr/enwikianchors20160305/tag?overlaps=NO_SUB&tagsLimit=5000&fl=id&wt=json&indent=on' \
  -H 'Content-Type:text/plain' -d 'I met David in Spain'

{
  "responseHeader":{
    "status":0,
    "QTime":3},
  "tagsCount":4,
  "tags":[[
      "startOffset",0,
      "endOffset",1,
      "ids",["I's",
        "I",
        "^I",
        "\"I\"",
        "[I",
        "∃I",
        "I`",
        "I^",
        "I ...",
        "I ♥",
        "I ♥...",
        "I-",
        "I,",
        "I:",
        "I.",
        "I'",
        "I£",
        "I*",
        "I++",
        "I′",
        "I♥...",
        "Í",
        "Ì",
        "Ĭ",
        "Î",
        "Ǐ",
        "Ï",
        "Ḯ",
        "Ï»¿",
        "Ĩ",
        "Į",
        "Ī",
        "Ỉ",
        "Ȉ",
        "Ȋ",
        "Ị",
        "Ḭ",
        "İ",
        "Ï¿½",
        " I "]],
    [
      "startOffset",2,
      "endOffset",5,
      "ids",["met"]],
    [
      "startOffset",6,
      "endOffset",11,
      "ids",["David's",
        "David’s",
        "David",
        "\"David",
        "\"David\"",
        "David)",
        "Dávid",
        "Davíd",
        "Davið",
        "Da

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  1870  100  1851  100    19   120k   1266 --:--:-- --:--:-- --:--:--  120k
