In [None]:
%%writefile wikify_util.py 
"""A few general modules for disambiguation
"""
import sys
from itertools import chain
from itertools import product
from itertools import combinations


sys.path.insert(0,'..')
from wikisim.config import *

from wikisim.calcsim import *

def generate_candidates(S, M, max_t=10, enforce=False):
    """ Given a sentence list (S) and a mentions list (M), returns a list of candidates
        Inputs:
            S: segmented sentence [w1, ..., wn]
            M: mentions [m1, ... , mj]; each mention is read as m[0] (index into S)
               and m[1] (title handed to title2id when enforce is set)
            max_t: maximum number of candidates kept per mention
            enforce: Makes sure the "correct" entity is among the candidates
        Outputs:
         Candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(cnm, pnm)]]
             where cij is the jth candidate for ith mention and pij is the relative frequency of cij
    
    """
    candslist=[]
    for m in M:
        
        # Candidate (concept, frequency) pairs for the surface form at the mention position
        clist = anchor2concept(S[m[0]])
        if not clist:
            # Nothing returned: fall back to a single placeholder pair (id 0, frequency 1)
            clist=((0L,1L),)
        
        # Most frequent first, then keep at most max_t candidates
        clist = sorted(clist, key=lambda x: -x[1])
        clist = clist[:max_t]
        
        smooth=0    
        if enforce:          
            wid = title2id(m[1])            
    #         if wid is None:
    #             raise Exception(m[1].encode('utf-8') + ' not found')
            
                        
            # (index, (concept, frequency)) of the gold entity inside the truncated list
            trg = [(i,(c,f)) for i,(c,f) in enumerate(clist) if c==wid]
            if not trg:
                # Gold entity absent: queue it with frequency 0 and enable add-one smoothing
                trg=[(len(clist), (wid,0))]
                smooth=1

                
            # NOTE(review): clist is already cut to max_t entries, so an index found in it
            # is always < max_t; in practice only smooth==1 triggers this branch — confirm intent.
            # The last (least frequent) candidate is dropped even when the list is shorter than max_t.
            if smooth==1 or trg[0][0]>=max_t: 
                if clist:
                    clist.pop()
                clist.append(trg[0][1])
            
        # Turn the (optionally smoothed) frequencies into relative frequencies
        s = sum(c[1]+smooth for c in clist )        
        clist = [(c,float(f+smooth)/s) for c,f in clist ]
            
        candslist.append(clist)
    return  candslist 

def get_tp(gold_titles, ids):
    """Counts how many predicted ids match the gold titles.

       Inputs: gold_titles: the gold mentions; element [1] of each is the correct title
               ids: the predicted entity ids, aligned with gold_titles
       Outputs: a two-element list [true_positives, total_number_of_ids]
    """
    hits = sum(1 for mention, predicted in zip(gold_titles, ids)
               if title2id(mention[1]) == predicted)
    return [hits, len(ids)]

def get_prec(tp_list):
    """Returns micro- and macro-averaged precision.

       Inputs: a list of [true_positives, total_number] pairs; pairs whose
               true_positives is None are skipped entirely
       Output: (micro_prec, macro_prec)
               micro_prec: sum of true positives / sum of totals
               macro_prec: mean of the per-pair precisions
               Both are 0.0 when there is nothing to average. A pair whose total
               is 0 contributes a precision of 0 instead of raising
               ZeroDivisionError.
    """
    overall_tp = 0
    overall_count = 0
    scored_pairs = 0
    macro_sum = 0.0
    for tp, count in tp_list:
        if tp is None:
            continue
        scored_pairs += 1
        overall_tp += tp
        overall_count += count
        if count:
            macro_sum += float(tp)/count

    macro_prec = macro_sum/scored_pairs if scored_pairs else 0.0
    micro_prec = float(overall_tp)/overall_count if overall_count else 0.0

    return micro_prec, macro_prec




In [None]:
"""Diiferent coherence (context, key-entity) calculation, and 
    disambiguation.
"""
%%writefile coherence.py 

from wikify_util import *
import numpy as np

def get_candidate_representations(candslist, direction, method):
    '''returns an array of vector representations. 
       Inputs: 
           candslist: candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(cnm, pnm)]]
           direction: embedding direction
           method: similarity method
      Outputs
           cvec_arr: Candidate embeddings, a two dimensional array; each ROW is the
                   representation of one candidate (the concatenated frame is transposed).
                   An empty (0, 0) array when there is no candidate at all.
           cveclist_bdrs: a list of pairs (beginning, end), one per mention, such that
                   the embeddings of candidates [ci1...cik] in candslist are
                   cvec_arr[cveclist_bdrs[i][0]:cveclist_bdrs[i][1]] 
    '''
    
    cframelist = []
    cveclist_bdrs = []
    for cands in candslist:
        cands_rep = [conceptrep(encode_entity(c[0], method, get_id=False),
                                method=method, direction=direction, get_titles=False)
                     for c in cands]
        cveclist_bdrs.append((len(cframelist), len(cframelist) + len(cands_rep)))
        cframelist.extend(cands_rep)

    if not cframelist:
        # pd.concat raises on an empty list; hand back an empty matrix instead
        return np.zeros((0, 0)), cveclist_bdrs

    # Outer join aligns the representations on their index; gaps become 0
    cvec_fr = pd.concat(cframelist, join='outer', axis=1)
    cvec_fr = cvec_fr.fillna(0)
    # .values instead of the deprecated (and later removed) DataFrame.as_matrix()
    cvec_arr = cvec_fr.values.T
    return cvec_arr, cveclist_bdrs

def entity_to_context_scores(candslist, direction, method):
    ''' finds the similarity between each entity and its context representation
        Inputs:
            candslist: the list of candidates [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(cnm, pnm)]]
            direction: embedding direction
            method: similarity method
        Returns:
           cvec_arr: Candidate embeddings, a two dimensional array with one row per candidate
           cveclist_bdrs: a list of pairs (beginning, end), to indicate where the 
                   representations of the candidates of the i-th mention reside in cvec_arr
           cands_score_list: scores in the form of [[s11,...s1k],...[sn1,...snm]]
                    where sij is the cosine similarity of c[i,j] to the context of
                    mention i; the context is the sum of the embeddings of all
                    candidates of every OTHER mention
    '''
    cvec_arr, cveclist_bdrs = get_candidate_representations(candslist, direction, method)

    # One aggregate vector per mention: the sum of its candidates' embeddings
    aggr_cveclist = np.zeros(shape=(len(candslist), cvec_arr.shape[1]))
    for i, (b, e) in enumerate(cveclist_bdrs):
        aggr_cveclist[i] = cvec_arr[b:e].sum(axis=0)

    cands_score_list = []
    for i in range(len(candslist)):
        b, e = cveclist_bdrs[i]
        # Context of mention i: everything except its own aggregate
        convec = aggr_cveclist[:i].sum(axis=0) + aggr_cveclist[i+1:].sum(axis=0)
        scores = []
        for v in cvec_arr[b:e]:
            try:
                s = 1 - sp.spatial.distance.cosine(convec, v)
            except Exception:
                # best effort: a failed similarity counts as "no similarity"
                s = 0
            if np.isnan(s):
                # an undefined similarity (NaN) is also scored 0
                s = 0
            scores.append(s)
        cands_score_list.append(scores)

    return cvec_arr, cveclist_bdrs, cands_score_list

def key_criteria(cands_score):
    ''' helper for find_key_concept: scores how clearly a mention has a single best candidate
        Input:
            a pair (i, [(ri1, si1), ..., (rik, sik)]) where the second element holds the
            (candidate index, similarity) pairs of mention i, best first
        Returns:
            -inf when the mention has no candidate, +inf when it has exactly one
            candidate or the runner-up similarity is 0, otherwise the relative gap
            (best - runner_up) / runner_up
    '''
    ranked = cands_score[1]
    how_many = len(ranked)
    if how_many == 0:
        return float("-inf")
    if how_many == 1:
        return float("inf")

    runner_up = ranked[1][1]
    if runner_up == 0:
        return float("inf")

    best = ranked[0][1]
    return (best - runner_up) / runner_up

def find_key_concept(candslist, direction, method):
    ''' finds the key entity in the candidate list
        Inputs:
            candslist: the list of candidates [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(cnm, pnm)]]
            direction: embedding direction
            method: similarity method
        Returns:
            cvec_arr: candidate embeddings as returned by entity_to_context_scores
            cveclist_bdrs: (beginning, end) pairs locating each mention's candidates in cvec_arr
            key_concept: index of the mention one of whose candidates is the key entity
            key_entity: index (within that mention's candidates) of the key entity
            key_entity_vector: the embedding of the key entity
    '''
    cvec_arr, cveclist_bdrs, cands_score_list = entity_to_context_scores(candslist, direction, method)

    # For every mention: (candidate index, score) pairs, highest score first
    ranked = []
    for mention_scores in cands_score_list:
        ranked.append(sorted(enumerate(mention_scores), key=lambda pair: -pair[1]))

    # The key mention is the one key_criteria rates highest
    key_concept = max(enumerate(ranked), key=key_criteria)[0]
    key_entity = ranked[key_concept][0][0]

    first, last = cveclist_bdrs[key_concept]
    key_entity_vector = cvec_arr[first:last][key_entity]
    return cvec_arr, cveclist_bdrs, key_concept, key_entity, key_entity_vector

def keyentity_candidate_scores(candslist, direction, method, ver):
    '''scores every candidate by its cosine similarity to the key entity
       Inputs: 
           candslist: candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(cnm, pnm)]]
           direction: embedding direction
           method: similarity method
           ver: accepted for compatibility; not read by this function
           
       Returns:
           Scores [[s11,...s1k],...[sn1,...snm]] where sij is cij similarity to the key-entity
    '''
    cvec_arr, cveclist_bdrs, _, _, key_entity_vector = find_key_concept(candslist, direction, method)

    def similarity_to_key(vec):
        # Any failure or an undefined (NaN) similarity is scored as 0
        try:
            sim = 1-sp.spatial.distance.cosine(key_entity_vector, vec)
        except:
            sim = 0
        if np.isnan(sim):
            sim = 0
        return sim

    candslist_scores = []
    for idx in range(len(candslist)):
        first, last = cveclist_bdrs[idx]
        candslist_scores.append([similarity_to_key(vec) for vec in cvec_arr[first:last]])
    return candslist_scores



def coherence_scores_driver(C, ws, method='rvspagerank', direction=DIR_BOTH, op_method="keydisamb"):
    """ Assigns a coherence score to every candidate, processing C in chunks
        Inputs:
            C: Candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(cnm, pnm)]]
            ws: window size (number of mentions per chunk)
            method: similarity method
            direction: embedding type
            op_method: 'keydisamb' (key-entity scores) or 'entitycontext'
                       (entity-to-context scores); any other value yields no scores
        Returns:
            the concatenated per-chunk score lists, aligned with C
    """
    total = len(C)
    windows = []
    for start in range(0, total, ws):
        windows.append([start, min(start+ws, total)])

    # A trailing chunk with fewer than 2 mentions is merged into the previous chunk
    if len(windows) > 1:
        tail = windows[-1]
        if tail[1]-tail[0] < 2:
            windows.pop()
            windows[-1][1] = total

    scores = []
    for begin, end in windows:
        chunk_c = C[begin:end]
        if op_method == 'keydisamb':
            scores.extend(keyentity_candidate_scores(chunk_c, direction, method, 4))
        elif op_method == 'entitycontext':
            _, _, chunk_scores = entity_to_context_scores(chunk_c, direction, method)
            scores.extend(chunk_scores)

    return scores


In [None]:
"""Testing Coherence
"""

# S=["Carlos", "met", "David", "and" , "Victoria", "in", "Madrid"]
# M=[[0, "Roberto_Carlos"], [2, "David_Beckham"], [4, "Victoria_Beckham"], [6, "Madrid"]]


S=["Three", "of", "the", "greatest", "guitarists", "started", "their", "career", "in", "a", "single", "band", ":", "Clapton", ",", "Beck", ",", "and", "Page", "."]
M=[[13, "Eric_Clapton"], [15, "Jeff_Beck"], [18, "Jimmy_Page"]]

# S=["Phoenix, Arizona"] 
# M=[[0, "Phoenix,_Arizona"]]

C = generate_candidates(S, M, max_t=5, enforce=False)
print "Candidates: ", C, "\n"


coh_scores = coherence_scores_driver(C, ws=5, method='rvspagerank', direction=DIR_BOTH, op_method="entitycontext")
print coh_scores

In [None]:
"""Context-based disambiguation and also Learning-To-Rank combination
    of several features.
"""
%%writefile wikify.py 


from __future__ import division

from collections import Counter
import cPickle as pickle
import sys
from coherence import *
#sys.path.insert(0,'..')

#from wikisim.calcsim import *
#from wsd.wsd import *
# My methods
#from senseembed_train_test.ipynb

# Learning-to-rank model used by learned_scores; loaded once at import time.
# NOTE(review): pickle.load can execute arbitrary code - only point this at a
# trusted model file.
disam_model_file_name = os.path.join(home,'backup/datasets/ner/ltr.pkl')
with open(disam_model_file_name, 'rb') as _disam_model_file:
    # the with-block closes the handle, which the bare open() call never did
    disam_model = pickle.load(_disam_model_file)

def get_context(anchor, eid):
    """Returns the context
       Inputs: 
           anchor: the anchor text
           eid: The id of the entity this anchor points to
       Output:
           The context (windows size is, I guess, 20)       
    """
    params={'wt':'json', 'rows':'50000'}
    anchor = solr_escape(anchor)
    
    q='anchor:"%s" AND entityid:%s' % (anchor, eid)
    params['q']=q
    
#     session = requests.Session()
#     http_retries = Retry(total=20,
#                     backoff_factor=.1)
#     http = requests.adapters.HTTPAdapter(max_retries=http_retries)
#     session.mount('http://localhost:8983/solr', http)
    
    r = session.get(qstr, params=params).json()
    if 'response' not in r: 
        print "[terminating]\t%s",(str(r),)
        sys.stdout.flush()
        os._exit(0)
        
    if not r:
        return []
    return r['response']['docs']

#from wsd
def word2vec_context_candidate_scores (S, M, candslist, ws):
    '''scores every candidate by its similarity to the words surrounding the mention
       Inputs: 
           S: Sentence
           M: Mentions
           candslist: candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(cnm, pnm)]]
           ws: window size (number of words taken on each side of the mention)
       Returns:
           Scores [[s11,...s1k],...[sn1,...snm]] where sij is the cosine similarity
           of candidate cij to the summed word vectors of the context of mention i
    '''

    def context_similarity(context_vec, cand):
        # Any lookup/similarity failure or an undefined (NaN) value is scored as 0
        try:
            cand_vector = getentity2vector(encode_entity(cand[0],'word2vec', get_id=False))
            sim = 1-sp.spatial.distance.cosine(context_vec, cand_vector)
        except:
            sim = 0
        if np.isnan(sim):
            sim = 0
        return sim

    candslist_scores = []
    for idx, cands in enumerate(candslist):
        pos = M[idx][0]
        # ws words before and ws words after the mention, the mention itself excluded
        window = S[max(pos-ws,0):pos]+S[pos+1:pos+ws+1]
        context_vec = sp.zeros(getword2vec_model().vector_size)
        for word in window:
            context_vec += getword2vector(word).as_matrix()
        candslist_scores.append([context_similarity(context_vec, cand) for cand in cands])

    return candslist_scores

#from wsd
def word2vec_context_disambiguate(S, M, candslist, ws ):
    '''Disambiguate a sentence using word-context similarity
       Inputs: 
           S: Sentence
           M: Mentions
           candslist: candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(cnm, pnm)]]
           ws: window size (words taken on each side of a mention)
           
       Returns: 
           a list of entity ids and a list of titles
    '''
    candslist_scores = word2vec_context_candidate_scores (S, M, candslist, ws)
    # Picking the best-scored candidate per mention is exactly what find_max does
    return find_max(candslist, candslist_scores)


def solr_escape(s):
    """
        Escape a string for solr: every query special character is prefixed
        with a backslash.
    """
    #ToDo: probably && and || need to be escaped as a whole, and also AND, OR, NOT are not included
    special_chars = '[' + re.escape(r'+-&&||!(){}[]^"~*?:\/') + ']'

    def add_backslash(match):
        return '\\' + match.group(0)

    return re.sub(special_chars, add_backslash, s)

#from wikisim
def get_solr_count(s):
    """ Gets the number of documents whose text matches the string
        NOTE: Multi words should be quoted
    Arg:
        s: the string (can contain AND, OR, ..)
    Returns:
        The 'numFound' value of the Solr response
    """
    endpoint = 'http://localhost:8983/solr/enwiki20160305/select'
    # rows=0: only the hit count is needed, no documents
    query_params = {'indent':'on', 'wt':'json', 'q':'+text:(%s)'%(s,), 'rows':0}
    reply = requests.get(endpoint, params=query_params).json()
    return reply['response']['numFound']



# Editing Ryan's code
def context_to_profile_sim(mention, context, candidates):
    """
    Description:
        Asks Solr for the relevancy score of each candidate's document, given the
        context (matched against text) and the mention (matched against title).
    Args:
        mention: The mention as it appears in the text
        context: The words that surround the target word.
        candidates: A list of (entity id, frequency/popularity) pairs.
    Return:
        One score per candidate, in candidate order; candidates absent from the
        reply score 0.0. With an empty context every score is 0.
    """
    if not context:
        return [0]*len(candidates)

    # escape both strings before embedding them in the query
    safe_context = solr_escape(context)
    safe_mention = solr_escape(mention)

    # restrict the search to the candidate documents
    id_filter = " ".join('id:' + str(tid) for tid, _ in candidates)

    endpoint = 'http://localhost:8983/solr/enwiki20160305/select'
    query = 'text:(%s)^1 title:(%s)^1.35' % (safe_context, safe_mention)
    request_params = {'fl':'id score', 'fq':id_filter, 'indent':'on',
                      'q':query, 'wt':'json'}

    docs = requests.get(endpoint, params = request_params).json()['response']['docs']
    score_by_id = {}
    for doc in docs:
        score_by_id[long(doc['id'])] = doc['score']
    return [score_by_id.get(cid, 0.0) for cid, _ in candidates]

# Important TODO
# This queriy is very much skewed toward popularity, better to replace space with AND
#!!!! I don't like this implementation, instead of retrieving and counting, better to let the 
# solr does the counting, 
def context_to_context_sim(mention, context, candidates, rows=10):
    """
    Description:
        Retrieves the best-matching stored contexts from Solr and counts, per
        candidate, how many of the retrieved rows belong to it.
    Args:
        mention: The mention as it appears in the text
        context: The words that surround the target word.
        candidates: A list of (entity id, frequency/popularity) pairs.
        rows: how many rows to retrieve
    Return:
        One count per candidate, in candidate order. With an empty context
        every score is 0.
    """
    if not context:
        return [0]*len(candidates)

    # escape both strings before embedding them in the query
    safe_context = solr_escape(context)
    safe_mention = solr_escape(mention)

    # restrict the search to contexts of the candidate entities
    id_filter = " ".join('entityid:' + str(tid) for tid, _ in candidates)

    endpoint = 'http://localhost:8983/solr/enwiki20160305_context/select'
    query = "_context_:(%s) entity:(%s)" % (safe_context, safe_mention)
    request_params = {'fl':'entityid', 'fq':id_filter, 'indent':'on',
                      'q':query, 'wt':'json', 'rows':rows}

    reply = requests.get(endpoint, params = request_params)
    hits = Counter(long(doc['entityid']) for doc in reply.json()['response']['docs'])

    return [hits[cid] for cid, _ in candidates]

def get_mention_count(s):
    """
    Description:
        Returns how many times the given string is used as a mention: the sum
        of the counts of all (concept, count) pairs anchor2concept yields for it.
    Args:
        s: the string to look up
    Return:
        The total count over all concepts the string points to
    """
    total = 0
    for _concept, frequency in anchor2concept(s):
        total += frequency
    return total

def mention_prob(text):
    """
    Description:
        Returns the probability that the text is a mention in Wikipedia:
        (times used as a mention) / (documents containing it).
    Args:
        text: the candidate mention; periods are removed before the document count
    Return:
        The ratio above, or 0 when the text matches no document at all.
    """
    as_mention = get_mention_count(text)
    anywhere = get_solr_count(text.replace(".", ""))
    if anywhere == 0:
        # a mention never used probably is not a good link
        return 0
    return float(as_mention)/anywhere


def context_candidate_scores (S, M, candslist, ws, method='c2c', rows=10):
    '''returns entity scores using context search
       Inputs: 
           S: Sentence
           M: Mentions
           candslist: candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(cnm, pnm)]]
           ws: window size (words taken on each side of a mention)
           method: Either 'c2p': for context to profile, or 'c2c' for context to context
           rows: How many rows to retrieve (used by 'c2c' only)
       Returns:
           Scores [[s11,...s1k],...[sn1,...snm]], one list per mention, in candidate order
       Raises:
           ValueError: if method is neither 'c2p' nor 'c2c' (this used to surface
           as an unrelated unbound-variable error)
    '''
    
    candslist_scores=[]
    for i, cands in enumerate(candslist):
        pos = M[i][0]
        mention = S[pos]
        # ws words before and ws words after the mention, joined into one query string
        context = " ".join(S[max(pos-ws,0):pos]+S[pos+1:pos+ws+1])
        
        if method == 'c2p':
            cand_scores = context_to_profile_sim(mention, context, cands)
        elif method == 'c2c':
            cand_scores = context_to_context_sim(mention, context, cands, rows=rows)
        else:
            raise ValueError("unknown method %r; expected 'c2p' or 'c2c'" % (method,))
            
        candslist_scores.append(cand_scores) 

    return candslist_scores

def popularity_score(candslist):
    """Extracts, for every mention, the second element (popularity) of each
       candidate pair in candslist.
    """
    scores = []
    for cands in candslist:
        scores.append([popularity for _entity, popularity in cands])
    return scores

def normalize(scores_list):
    """Normalize a matrix row-wise so that each row sums to 1.
       A row containing a 0 gets add-one smoothing before normalizing.
    """
    normalized_rows = []
    for row in scores_list:
        bump = 1 if 0 in row else 0
        total = sum(value+bump for value in row)
        normalized_rows.append([float(value+bump)/total for value in row])
    return normalized_rows
        
def normalize_minmax(scores_list):
    """Normalize a matrix, row-wise, using minmax technique

       Each row is rescaled to [0, 1] as (s - min) / (max - min).
       A row whose values are all equal becomes all zeros, and an empty row
       stays empty (it used to crash min()).
    """
    normalized_scoreslist=[]
    for scores in scores_list:
        if len(scores) == 0:
            # nothing to rescale; min()/max() would raise on an empty row
            normalized_scoreslist.append([])
            continue
        scores_min = min(scores)        
        scores_max = max(scores)        
        if scores_min == scores_max:
            # constant row: avoid dividing by zero
            n_scores = [0]*len(scores)
        else:
            n_scores = [(float(s)-scores_min)/(scores_max-scores_min) for s in scores]
        normalized_scoreslist.append(n_scores)
    return normalized_scoreslist

def find_max(candslist,candslist_scores):
    '''Disambiguate a sentence by picking, per mention, the best-scored candidate
       Inputs: 
           candslist: candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(cnm, pnm)]]
           candslist_scores: scores aligned with candslist [[s11,...s1k],...[sn1,...snm]]
       Returns: 
           a list of entity ids and a list of titles
    '''

    def best_candidate(cands, cands_scores):
        # on ties the earliest candidate wins
        top_index, _ = max(enumerate(cands_scores), key=lambda pair: pair[1])
        return cands[top_index][0]

    winners = [best_candidate(cands, cands_scores)
               for cands, cands_scores in zip(candslist, candslist_scores)]
    return winners, ids2title(winners)


def get_scores(S, M, C, ws, method, rows=10):
    """ Scores the candidates in C with one scoring method and min-max
        normalizes the scores per mention
        Inputs:
            S: segmented sentence [w1, ..., wn]
            M: mentions [m1, ... , mj]
            C: Candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(cnm, pnm)]]
            ws: window size
            method: one of 'popularity', 'keydisamb', 'entitycontext',
                    'context2context', 'context2profile', 'learned'
            rows: number of rows retrieved (used by 'context2context' only)
        Returns:
            Normalized scores [[s11,...s1k],...[sn1,...snm]]
        Raises:
            ValueError: if method is not one of the names above (this used to
            fail later with an unrelated TypeError on None)
    """
    if method == 'popularity':
        scores = popularity_score(C)
    elif method == 'keydisamb':
        scores = coherence_scores_driver(C, ws, method='rvspagerank', direction=DIR_BOTH, op_method="keydisamb")
    elif method == 'entitycontext':
        scores = coherence_scores_driver(C, ws, method='rvspagerank', direction=DIR_BOTH, op_method="entitycontext")
    elif method == 'context2context':
        scores = context_candidate_scores (S, M, C, ws, method='c2c', rows=rows)
    elif method == 'context2profile':
        scores = context_candidate_scores (S, M, C, ws, method='c2p')
    elif method == 'learned':
        scores = learned_scores (S, M, C, ws)
    else:
        raise ValueError("unknown scoring method: %r" % (method,))

    return normalize_minmax(scores)

def formated_scores(scores):
    """Only for pretty-printing: every score becomes a string with two decimals
    """
    pretty = []
    for cand_scores in scores:
        pretty.append(['{0:.2f}'.format(value) for value in cand_scores])
    return pretty

def formated_all_scores(scores):
    """Only for pretty-printing: like formated_scores, but each candidate holds
       several scores, which are returned as a tuple of two-decimal strings
    """
    pretty = []
    for cand_scores in scores:
        row = []
        for sub_scores in cand_scores:
            row.append(tuple('{0:.2f}'.format(value) for value in sub_scores))
        pretty.append(row)
    return pretty

def get_all_scores(S, M, C, ws, rows=10):
    """Computes all five base scores for every candidate
        Inputs:
            S: segmented sentence [w1, ..., wn]
            M: mentions [m1, ... , mj]
            C: candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(cnm, pnm)]]
            ws: window size
            rows: number of rows, for calculating context-based similarities

        Output:
            Scores, in this format [[(c111, ..., c11s),...(c1k1, ..., c1ks)],...]
            where cijk is the k-th score of candidate cij; the score order is
            popularity, keydisamb, entitycontext, context2context, context2profile
    """
    feature_methods = ['popularity','keydisamb','entitycontext','context2context','context2profile']
    per_method = []
    for method in feature_methods:
        per_method.append(get_scores(S, M, C, ws, method, rows))

    # Regroup from method -> mention -> candidate to mention -> candidate -> method
    combined = []
    for mention_rows in zip(*per_method):
        combined.append(zip(*mention_rows))
    return combined



def keyentity_disambiguate(candslist, direction=DIR_OUT, method='rvspagerank', ver=4):
    '''Disambiguate a sentence using key-entity method
       Inputs: 
           candslist: candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(cnm, pnm)]]
           direction: embedding direction
           method: similarity method
           ver: forwarded to keyentity_candidate_scores
       Returns: 
           a list of entity ids and a list of titles
    '''
    candslist_scores = keyentity_candidate_scores (candslist, direction, method, ver)
    # Picking the best-scored candidate per mention is exactly what find_max does
    return find_max(candslist, candslist_scores)

def learned_scores (S, M, candslist, ws):
    '''returns entity scores predicted by the learned (learning-to-rank) model
       Inputs: 
           S: Sentence
           M: Mentions
           candslist: candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(cnm, pnm)]]
           ws: window size
       Returns:
           One disam_model.predict(...) result per mention, computed from the
           features returned by get_all_scores (called with rows=10)
    '''
    feature_rows = get_all_scores(S, M, candslist, ws, 10)
    predictions = []
    for cand_features in feature_rows:
        predictions.append(disam_model.predict(cand_features))
    return predictions

def wikify(S, M, C, ws, method, rows=10):
    """Disambiguates the mentions of a sentence with the chosen scoring method
       Inputs:
           S: segmented sentence
           M: mentions
           C: candidate list
           ws: window size
           method: scoring method name, passed to get_scores
           rows: number of rows, passed to get_scores
       Returns:
           what find_max returns: a list of entity ids and a list of titles
    """
    scores_per_mention = get_scores(S, M, C, ws, method, rows)
    return find_max(C, scores_per_mention)

In [None]:
%%writefile gen_trainrep.py 
""" Create a train-set 
    entity_id, query_id, scores1, score2, ..., scoren, true/false (is it a correct entity)
"""
from wikify import *
sys.stdout.flush()
# Candidates kept per mention and window size used for the scores
max_t=5
ws=5
# NOTE(review): outdir is assigned but not read anywhere in this script
outdir = os.path.join(baseresdir, 'wikify')
# Output: one tab-separated row per (mention, candidate)
outfile = os.path.join(home,'backup/datasets/ner/trainrepository.30000.5000.tsv')

# Input: one JSON object per line with "text" and "mentions" fields
dsname = os.path.join(home,'backup/datasets/ner/wiki-mentions.30000.json')

# Stop after this many input lines
max_count=5000
count = 0          
with open(dsname,'r') as ds, open(outfile,'w') as outf:
    # qid numbers the mentions (one query per mention) across all lines
    qid=0
    for line in ds:                           
        js = json.loads(line.decode('utf-8').strip());
        S = js["text"]
        M = js["mentions"]
        count +=1        
        print "%s:\tS=%s\n\tM=%s" % (count, json.dumps(S, ensure_ascii=False).encode('utf-8'),json.dumps(M, ensure_ascii=False).encode('utf-8'))        
        # enforce=False: the gold entity is not forced into the candidate list
        C = generate_candidates(S, M, max_t=max_t, enforce=False)
        #print C
        all_scores=get_all_scores(S,M,C, ws, 10)
        for i in  range(len(C)):
            m=M[i]
            cands = C[i]
            cand_scores = all_scores[i]
            # id of the gold title of this mention
            wid = title2id(m[1]) 
            for (eid,_),scores in zip (cands, cand_scores):
                is_true_eid = (wid == eid)
                string_scores=[str(s) for s in scores]
                # row layout: entity id, query id, the scores, 1/0 label
                outf.write("\t".join([str(eid), str(qid)]+string_scores+[str(int(is_true_eid))])+"\n")
            qid += 1
        if count >= max_count:
            break
print "Done"             
        

        

In [None]:
%%writefile train_ltr.py 
""" Train a LambdaMart (LTR) Method
"""
import pyltr
import pandas as pd
import os
from wikify import *

outdir = os.path.join(baseresdir, 'wikify')
tr_file_name = os.path.join(home,'backup/datasets/ner/trainrepository.30000.5000.tsv')
data=pd.read_table(tr_file_name, header=None)
#data=data.head(100)

grouped=data.groupby(1)
total_len=len(grouped)
group = grouped.filter(lambda x:x.iloc[0,1] >= 0 and x.iloc[0,1] < 0.6*total_len)
TrX = group.iloc[:,2:7].as_matrix()
TrY = group.iloc[:,7].as_matrix()
Trqid = group.iloc[:,1].as_matrix()

group=grouped.filter(lambda x:x.iloc[0,1] >= 0.6*total_len and x.iloc[0,1] < 0.8*total_len)
VaX = group.iloc[:,2:7].as_matrix()
VaY = group.iloc[:,7].as_matrix()
Vaqid = group.iloc[:,1].as_matrix()


group=grouped.filter(lambda x:x.iloc[0,1] >= 0.8*total_len and x.iloc[0,1] < 1.0*total_len)
TsX = group.iloc[:,2:7].as_matrix()
TsY = group.iloc[:,7].as_matrix()
Tsqid = group.iloc[:,1].as_matrix()

monitor = pyltr.models.monitors.ValidationMonitor(
     VaX, VaY, Vaqid, metric=pyltr.metrics.NDCG(k=10), stop_after=250)
model = pyltr.models.LambdaMART(n_estimators=300, learning_rate=0.1, verbose = 0)
#lmart.fit(TX, TY, Tqid, monitor=monitor)
model.fit(TrX, TrY, Trqid, monitor=monitor)

metric = pyltr.metrics.NDCG(k=10)
Ts_pred = model.predict(TsX)
print 'Random ranking:', metric.calc_mean_random(Tsqid, TsY)
print 'Our model:', metric.calc_mean(Tsqid, TsY, Ts_pred)

import cPickle as pickle
model_file_name = os.path.join(home,'backup/datasets/ner/ltr.pkl')

pickle.dump(model, open(model_file_name, 'wb'))

print 'Model saved'

In [None]:
# %load_ext autoreload
# %autoreload

# %aimport wsd
# import sys
from __future__ import division
from wikify import *

import sys
sys.path.insert(0,'..')
#from wikisim.calcsim import *
from wsd.wsd import *

import time
# Window size handed to every get_scores call below
ws=5
# Segmented sentence and its mentions: [position in S, gold title]
S=["Carlos", "met", "David", "and" , "Victoria", "in", "Madrid"]
M=[[0, "Roberto_Carlos"], [2, "David_Beckham"], [4, "Victoria_Beckham"], [6, "Madrid"]]

# S=["Carlos", "met", "David", "and" , "Victoria", "in", "Madrid"]
# M=[[2, "David_Beckham"], [4, "Victoria_Beckham"], [6, "Madrid"]]

# S=["Three", "of", "the", "greatest", "guitarists", "started", "their", "career", "in", "a", "single", "band", ":", "Clapton", ",", "Beck", ",", "and", "Page", "."]
# M=[[13, "Eric_Clapton"], [15, "Jeff_Beck"], [18, "Jimmy_Page"]]

# S=["Phoenix, Arizona"] 
# M=[[0, "Phoenix,_Arizona"]]

# NOTE(review): start is recorded but never read in this cell
start = time.time()
C = generate_candidates(S, M, max_t=5, enforce=False)
print "Candidates: ", C, "\n"



# Each block below prints the normalized scores of one scoring method;
# the printed labels are numbered 1-5 in the order the methods are called.
pop_scores = get_scores(S,M,C, ws, method="popularity")
print "Key Scores_method_1: ", formated_scores(pop_scores), "\n"

candslist_scores = get_scores(S,M,C, ws, method="keydisamb")
print "Key Scores_method_2: ", formated_scores(candslist_scores), "\n"

candslist_scores = get_scores(S,M,C, ws, method='entitycontext')
print "Key Scores_method_3: ", formated_scores(candslist_scores), "\n"

candslist_scores = get_scores(S,M,C, ws, method="context2context")
print "Key Scores_method_4: ", formated_scores(candslist_scores), "\n"

candslist_scores = get_scores(S,M,C, ws, method='context2profile')
print "Key Scores_method_5: ", formated_scores(candslist_scores), "\n"


# candslist_scores = get_scores(S,M,C, ws, method='learned')
# print "Key Scores_method_learned: ", formated_scores(candslist_scores), "\n"
    

In [4]:
"""Mention Detection Methods"""
from pycorenlp import StanfordCoreNLP

# Client for a CoreNLP server addressed at localhost:9000
core_nlp = StanfordCoreNLP('http://localhost:9000')
# Annotate a sample sentence with the 'entitymentions' annotator, asking for JSON output
mentions = core_nlp.annotate("I like Python programming language", properties={
    'annotators': 'entitymentions',
    'outputFormat': 'json'})
# Last expression of the cell: displays the per-sentence annotations
mentions['sentences']

[{u'entitymentions': [],
  u'index': 0,
  u'tokens': [{u'after': u' ',
    u'before': u'',
    u'characterOffsetBegin': 0,
    u'characterOffsetEnd': 1,
    u'index': 1,
    u'lemma': u'I',
    u'ner': u'O',
    u'originalText': u'I',
    u'pos': u'PRP',
    u'word': u'I'},
   {u'after': u' ',
    u'before': u' ',
    u'characterOffsetBegin': 2,
    u'characterOffsetEnd': 6,
    u'index': 2,
    u'lemma': u'like',
    u'ner': u'O',
    u'originalText': u'like',
    u'pos': u'VBP',
    u'word': u'like'},
   {u'after': u' ',
    u'before': u' ',
    u'characterOffsetBegin': 7,
    u'characterOffsetEnd': 13,
    u'index': 3,
    u'lemma': u'Python',
    u'ner': u'O',
    u'originalText': u'Python',
    u'pos': u'NNP',
    u'word': u'Python'},
   {u'after': u' ',
    u'before': u' ',
    u'characterOffsetBegin': 14,
    u'characterOffsetEnd': 25,
    u'index': 4,
    u'lemma': u'programming',
    u'ner': u'O',
    u'originalText': u'programming',
    u'pos': u'NN',
    u'word': u'programmi