In [None]:
# Editing Ryan's code
def context_to_profile_sim(mention, context, candidates):
    """
    Description:
        Uses Solr to find the relevancy scores of the candidates based on the context.
        The query runs against the enwiki20160305 core: the context is matched
        against the `text` field and the mention against the `title` field
        (boosted by 1.35), restricted to the candidates' ids through a filter query.
    Args:
        mention: The mention as it appears in the text
        context: The words that surround the target word.
        candidates: A list of candidates that each have the entity id and its frequency/popularity.
    Return:
        The score for each candidate in the same order as the candidates.
        A candidate that Solr does not return gets 0.0. If the context or the
        candidate list is empty, no query is sent and the result is all zeros
        (an empty list for an empty candidate list).
    """
    # Nothing to score against, or nothing to score: skip the Solr round-trip.
    if not context or not candidates:
        return [0]*len(candidates)

    # put text in right format
    context = solr_escape(context)
    mention = solr_escape(mention)

    # Restrict the search to the candidate documents only.
    filter_ids = " ".join(['id:' + str(tid) for tid, _ in candidates])

    # select the candidate docs from Solr together with their scores
    qst = 'http://localhost:8983/solr/enwiki20160305/select'
    q = 'text:(' + context + ')^1 title:(' + mention + ')^1.35'

    # Solr returns only 10 rows unless `rows` is given, which silently zeroed
    # the score of every candidate past the tenth. Ask for one row per
    # candidate (assumes `id` is unique per document in this core).
    params = {'fl': 'id score', 'fq': filter_ids, 'indent': 'on',
              'q': q, 'wt': 'json', 'rows': len(candidates)}

    r = requests.get(qst, params=params).json()['response']['docs']

    # int() yields the same dictionary keys as long() on Python 2 (it promotes
    # automatically) and, unlike long(), also exists on Python 3.
    id_score_map = defaultdict(float, {int(ri['id']): ri['score'] for ri in r})
    id_score = [id_score_map[c] for c, _ in candidates]
    return id_score

# Important TODO
# This query is heavily skewed toward popularity; it would be better to replace spaces with AND.
# !!!! I don't like this implementation: instead of retrieving documents and counting them
# here, it would be better to let Solr do the counting.
def context_to_context_sim(mention, context, candidates, rows=10):
    """
    Description:
        Uses Solr to score the candidates by counting how many of the top
        `rows` documents returned from the enwiki20160305_context core belong
        to each candidate. The context is matched against the `_context_`
        field and the mention against the `entity` field, restricted to the
        candidates' ids through a filter query on `entityid`.
    Args:
        mention: The mention as it appears in the text
        context: The words that surround the target word.
        candidates: A list of candidates that each have the entity id and its frequency/popularity.
        rows: Maximum number of documents to retrieve from Solr and count.
    Return:
        The score (number of retrieved documents) for each candidate in the
        same order as the candidates. If the context or the candidate list is
        empty, no query is sent and the result is all zeros (an empty list for
        an empty candidate list).
    """
    # Nothing to score against, or nothing to score: skip the Solr round-trip.
    if not context or not candidates:
        return [0]*len(candidates)

    # put text in right format
    context = solr_escape(context)
    mention = solr_escape(mention)

    # Restrict the search to documents that belong to one of the candidates.
    filter_ids = " ".join(['entityid:' + str(tid) for tid, _ in candidates])

    # select the best-scoring docs from Solr, at most `rows` of them
    qstr = 'http://localhost:8983/solr/enwiki20160305_context/select'
    q = "_context_:(%s) entity:(%s)" % (context, mention)

    params = {'fl': 'entityid', 'fq': filter_ids, 'indent': 'on',
              'q': q, 'wt': 'json', 'rows': rows}
    r = requests.get(qstr, params=params)

    # Tally the retrieved documents per entity. int() yields the same keys as
    # long() on Python 2 and, unlike long(), also exists on Python 3.
    cnt = Counter(int(doc['entityid']) for doc in r.json()['response']['docs'])

    # A Counter answers 0 for entities that never showed up in the results.
    id_score = [cnt[c] for c, _ in candidates]
    return id_score



In [18]:
from wsd_util import *
# Toy sentence, split into tokens on whitespace.
S = "I like programming with Python".split()
# Two mentions to disambiguate, each given as [number, 'UNKNOWN'].
# Presumably the number is the token index in S (2 -> "programming",
# 4 -> "Python") -- TODO confirm against generate_candidates.
M = [[2, 'UNKNOWN'], [4, 'UNKNOWN']]

print(S)
print(M)
# Per the output below: one list per mention, each holding (entity id, score) tuples.
C = generate_candidates(S, M)
print(C)

['I', 'like', 'programming', 'with', 'Python']
[[2, 'UNKNOWN'], [4, 'UNKNOWN']]
[[(5311L, 0.4923599320882852), (585746L, 0.2461799660441426), (23015L, 0.050933786078098474), (1068736L, 0.050933786078098474), (52033L, 0.044142614601018676), (19508643L, 0.03735144312393888), (103661L, 0.03395585738539898), (773853L, 0.03225806451612903), (311632L, 0.008488964346349746), (5783L, 0.003395585738539898)], [(23862L, 0.9369318181818181), (83036L, 0.025), (18942L, 0.010227272727272727), (4920126L, 0.007954545454545454), (645111L, 0.00625), (23329L, 0.003977272727272727), (317752L, 0.003977272727272727), (19309718L, 0.0022727272727272726), (6081823L, 0.0017045454545454545), (6548215L, 0.0017045454545454545)]]


In [25]:
# Inline copy of the context_to_profile_sim query for a single mention, kept
# at cell level so the query string and the number of hits can be inspected.
# Note: unlike the function, nothing is passed through solr_escape here.
mention = 'Python'
context = 'I like programming with'
candidates = C[1]
filter_ids = " ".join(['id:' + str(tid) for tid, _ in candidates])

# Query the profile core: context against `text`, mention against `title`.
qst = 'http://localhost:8983/solr/enwiki20160305/select'
q = 'text:(' + context + ')^1 title:(' + mention + ')^1.35'
print(q)

# rows is raised to 100 so the result is not cut off at Solr's default page size.
params = {
    'fl': 'id score',
    'fq': filter_ids,
    'indent': 'on',
    'q': q,
    'wt': 'json',
    'rows': 100,
}

r = requests.get(qst, params=params).json()['response']['docs']
print(len(r))

# Map each returned id to its score; candidates missing from the response
# fall back to the defaultdict(float) default of 0.0.
id_score_map = defaultdict(float, {long(ri['id']): ri['score'] for ri in r})
id_score = [id_score_map[c] for c, _ in candidates]


text:(I like programming with)^1 title:(Python)^1.35
9


In [22]:
# Show the per-candidate scores. A 0.0 entry is the defaultdict(float) default,
# i.e. no Solr document came back for that candidate.
# NOTE(review): this cell ran as In[22], before the In[25] cell above -- confirm
# the output still matches after a Restart & Run All.
id_score

[16.369732,
 9.627789,
 17.094542,
 9.627789,
 11.230347,
 0.0,
 13.628618,
 7.6420765,
 12.981809,
 9.941278]

In [15]:
# NOTE(review): the recorded output is the full two-mention nested list (same
# value as C), not C[1]. This cell ran as In[15], before the cell that assigns
# candidates = C[1] (In[25]), so it reflects stale kernel state -- re-run it.
candidates

[[(5311L, 0.4923599320882852),
  (585746L, 0.2461799660441426),
  (23015L, 0.050933786078098474),
  (1068736L, 0.050933786078098474),
  (52033L, 0.044142614601018676),
  (19508643L, 0.03735144312393888),
  (103661L, 0.03395585738539898),
  (773853L, 0.03225806451612903),
  (311632L, 0.008488964346349746),
  (5783L, 0.003395585738539898)],
 [(23862L, 0.9369318181818181),
  (83036L, 0.025),
  (18942L, 0.010227272727272727),
  (4920126L, 0.007954545454545454),
  (645111L, 0.00625),
  (23329L, 0.003977272727272727),
  (317752L, 0.003977272727272727),
  (19309718L, 0.0022727272727272726),
  (6081823L, 0.0017045454545454545),
  (6548215L, 0.0017045454545454545)]]

In [17]:
# NOTE(review): duplicate of the In[15] display cell. The recorded output is
# again the full two-mention nested list, captured at In[17], before
# candidates = C[1] was assigned in In[25] -- re-run or delete this cell.
candidates

[[(5311L, 0.4923599320882852),
  (585746L, 0.2461799660441426),
  (23015L, 0.050933786078098474),
  (1068736L, 0.050933786078098474),
  (52033L, 0.044142614601018676),
  (19508643L, 0.03735144312393888),
  (103661L, 0.03395585738539898),
  (773853L, 0.03225806451612903),
  (311632L, 0.008488964346349746),
  (5783L, 0.003395585738539898)],
 [(23862L, 0.9369318181818181),
  (83036L, 0.025),
  (18942L, 0.010227272727272727),
  (4920126L, 0.007954545454545454),
  (645111L, 0.00625),
  (23329L, 0.003977272727272727),
  (317752L, 0.003977272727272727),
  (19309718L, 0.0022727272727272726),
  (6081823L, 0.0017045454545454545),
  (6548215L, 0.0017045454545454545)]]