In [None]:
%%writefile wsd_util.py 
"""A few general modules for disambiguation
"""
from __future__ import division

import sys
from itertools import chain
from itertools import product
from itertools import combinations
import unicodedata

sys.path.insert(0,'..')
from wikisim.config import *

from wikisim.calcsim import *
def generate_candidates(S, M, max_t=20, enforce=False):
    """ Given a sentence list (S) and  a mentions list (M), returns a list of candiates
        Inputs:
            S: segmented sentence [w1, ..., wn]
            M: mensions [m1, ... , mj]
            max_t: maximum candiate per mention
            enforce: Makes sure the "correct" entity is among the candidates
        Outputs:
         Candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(c1m, p1m)]]
             where cij is the jth candidate for ith mention and pij is the relative frequency of cij
    
    """
    candslist=[]
    for m in M:
        
        clist = anchor2concept(S[m[0]])
        if not clist:
            clist=((0L,1L),)
        
        clist = sorted(clist, key=lambda x: -x[1])
        clist = clist[:max_t]
        
        smooth=0    
        if enforce:          
            wid = title2id(m[1])            
    #         if wid is None:
    #             raise Exception(m[1].encode('utf-8') + ' not found')
            
                        
            trg = [(i,(c,f)) for i,(c,f) in enumerate(clist) if c==wid]
            if not trg:
                trg=[(len(clist), (wid,0))]
                smooth=1

                
            if smooth==1 or trg[0][0]>=max_t: 
                if clist:
                    clist.pop()
                clist.append(trg[0][1])
            
        s = sum(c[1]+smooth for c in clist )        
        clist = [(c,float(f+smooth)/s) for c,f in clist ]
            
        candslist.append(clist)
    return  candslist 

def get_tp(gold_titles, ids):
    """Count the predicted ids that match the gold-standard titles
       Inputs: gold_titles: mentions whose second element (m[1]) is the correct title
               ids: the predicted ids, aligned with gold_titles
       Outputs: a two-element list [true_positives, total_number_of_ids]

    """
    hits = sum(1 for mention, predicted in zip(gold_titles, ids)
               if title2id(mention[1]) == predicted)
    return [hits, len(ids)]

def get_prec(tp_list):
    """Compute micro- and macro-averaged precision
       Inputs: tp_list: a list of (true_positives, total_number) pairs.
               Pairs whose true_positives is None or whose total is 0 are
               skipped, since they have no defined precision
       Output: a tuple (micro_precision, macro_precision);
               (0.0, 0.0) when no usable pair is left
    """
    overall_tp = 0
    overall_count = 0
    ratios = []
    for tp, count in tp_list:
        # None marks a missing result; a zero total would divide by zero
        if tp is None or not count:
            continue
        overall_tp += tp
        overall_count += count
        ratios.append(float(tp) / count)

    if not ratios:
        return 0.0, 0.0

    # macro: mean of the per-item precisions; micro: pooled counts
    macro_prec = sum(ratios) / len(ratios)
    micro_prec = float(overall_tp) / overall_count

    return micro_prec, macro_prec

def solr_escape(s):
    """
        Put a backslash in front of every Solr special character found in s
    """
    #ToDo: probably && and || need to be escaped as a whole, and also AND, OR, NOT are not included
    special_chars = re.escape(r'+-&&||!(){}[]^"~*?:\/')
    char_class = '[%s]' % (special_chars,)
    return re.sub(char_class, lambda hit: '\\' + hit.group(0), s)

def solr_unescape(s):
    """
        Undo solr_escape: drop the backslash that precedes a Solr special character
    """
    #ToDo: probably && and || need to be handled as a whole, and also AND, OR, NOT are not included
    to_sub = re.escape(r'+-&&||!(){}[]^"~*?:\/')
    # Raw string: the former non-raw pattern relied on the invalid escape
    # sequence '\(' being passed through unchanged
    return re.sub(r'\\([%s])' % (to_sub,), r'\g<1>', s)

def solr_encode(inputstr):
    '''Reduce a unicode string to plain ascii.
        Ideally the text would be sent in its proper encoding, but for now
        everything is made ascii: the string is NFKD-normalized and every
        character that still has no ascii form is dropped.
        Input:
            A unicode string
        Output:
            Ascii encoded string
    '''
    decomposed = unicodedata.normalize('NFKD', inputstr)
    return decomposed.encode('ascii', 'ignore')

In [None]:
%%writefile coherence.py 
"""Diiferent coherence (context, key-entity) calculation, and 
    disambiguation.
"""
from __future__ import division

from wsd_util import *
import numpy as np

def get_candidate_representations(candslist, direction, method):
    '''returns an array of vector representations.
       Inputs:
           candslist: candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(c1m, p1m)]]
           direction: embedding direction
           method: similarity method
      Outputs
           cvec_arr: Candidate embeddings, a two dimensional array, each row
                   is the representation of a candidate. An empty (0, 0)
                   array when there is no candidate at all
           cveclist_bdrs: a list of pairs (beginning, end), to indicate where
                   the embeddings of a mention's candidates start and end.
                   In other words, the embeddings of candidates [ci1...cik]
                   in candslist are
                   cvec_arr[cveclist_bdrs[i][0]:cveclist_bdrs[i][1]]
    '''

    cframelist = []
    cveclist_bdrs = []
    for cands in candslist:
        # conceptrep is expected to return a pandas object that pd.concat can
        # align on its index (presumably one Series per entity -- confirm)
        cands_rep = [conceptrep(encode_entity(c[0], method, get_id=False), method=method, direction=direction, get_titles=False) for c in cands]
        cveclist_bdrs.append((len(cframelist), len(cframelist) + len(cands_rep)))
        cframelist.extend(cands_rep)

    if not cframelist:
        # pd.concat raises ValueError when given nothing to concatenate
        return np.zeros((0, 0)), cveclist_bdrs

    # Outer join: dimensions missing from a candidate become NaN, then 0
    cvec_fr = pd.concat(cframelist, join='outer', axis=1)
    cvec_fr.fillna(0, inplace=True)
    # .values replaces as_matrix(), which newer pandas releases no longer have
    cvec_arr = cvec_fr.values.T
    return cvec_arr, cveclist_bdrs

def entity_to_context_scores(candslist, direction, method):
    ''' finds the similarity between each entity and its context representation
        Inputs:
            candslist: the list of candidates [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(c1m, p1m)]]
            direction: embedding direction
            method: similarity method
        Returns:
           cvec_arr: Candidate embeddings, a two dimensional array, one row
                   per candidate
           cveclist_bdrs: a list of pairs (beginning, end), to indicate where the
                   representations of the candidates of each mention reside
           cands_score_list: scores in the form of [[s11,...s1k],...[sn1,...s1m]]
                    where sij is the cosine similarity of c[i,j] to the
                    context of the i-th mention; the context is the sum of
                    the candidate vectors of all the other mentions
    '''
    cvec_arr, cveclist_bdrs = get_candidate_representations(candslist, direction, method)

    # One aggregated vector per mention: the sum of its candidates' vectors
    aggr_cveclist = np.zeros(shape=(len(candslist), cvec_arr.shape[1]))
    for i, (b, e) in enumerate(cveclist_bdrs):
        aggr_cveclist[i] = cvec_arr[b:e].sum(axis=0)

    cands_score_list = []
    for i, (b, e) in enumerate(cveclist_bdrs):
        # Leave-one-out context: every mention except the i-th
        convec = aggr_cveclist[:i].sum(axis=0) + aggr_cveclist[i+1:].sum(axis=0)
        scores = []
        for v in cvec_arr[b:e]:
            try:
                s = 1 - sp.spatial.distance.cosine(convec, v)
            except Exception:
                # Best effort: a vector that cannot be compared scores 0
                s = 0
            if np.isnan(s):
                # NaN is treated as "no similarity"
                s = 0
            scores.append(s)
        cands_score_list.append(scores)

    return cvec_arr, cveclist_bdrs, cands_score_list

def key_criteria(cands_score):
    ''' helper function for find_key_concept: scores how good a key a mention is
        Input:
            a pair (i, [(ri1, si1), ..., (rik, sik)]) for mention i, where
            (rij, sij) says that sij is the similarity of candidate c[i][rij]
            to the context of mention i; the pairs are expected to be sorted
            by decreasing similarity
        Output:
            -inf when there is no candidate, +inf when there is a single
            candidate or the runner-up scores 0, otherwise the relative
            margin between the best and the second-best score
    '''
    ranked = cands_score[1]
    if len(ranked) == 0:
        return -float("inf")
    if len(ranked) == 1:
        return float("inf")

    runner_up = ranked[1][1]
    if runner_up == 0:
        return float("inf")

    best = ranked[0][1]
    return (best - runner_up) / runner_up

def find_key_concept(candslist, direction, method):
    ''' finds the key entity in the candidate list
        Inputs:
            candslist: the list of candidates [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(c1m, p1m)]]
            direction: embedding direction
            method: similarity method
        Returns:
            cvec_arr: candidate embeddings, as returned by entity_to_context_scores
            cveclist_bdrs: a list of pairs (beginning, end) locating the
                    embeddings of each mention's candidates in cvec_arr
            key_concept: index of the mention for which one of the candidates is the key entity
            key_entity: index, within that mention's candidates, of the key entity
            key_entity_vector: the embedding of the key entity
    '''
    cvec_arr, cveclist_bdrs, cands_score_list = entity_to_context_scores(candslist, direction, method)

    # Per mention: (candidate index, score) pairs, best score first
    ranked_scores = []
    for mention_scores in cands_score_list:
        ranked_scores.append(sorted(enumerate(mention_scores), key=lambda x: -x[1]))

    # The key mention is the one that maximizes key_criteria
    key_concept = max(enumerate(ranked_scores), key=key_criteria)[0]
    key_entity = ranked_scores[key_concept][0][0]

    b, e = cveclist_bdrs[key_concept]
    key_entity_vector = cvec_arr[b:e][key_entity]
    return cvec_arr, cveclist_bdrs, key_concept, key_entity, key_entity_vector

def keyentity_candidate_scores(candslist, direction, method):
    '''returns entity scores using key-entity scoring
       Inputs:
           candslist: candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(c1m, p1m)]]
           direction: embedding direction
           method: similarity method

       Returns:
           Scores [[s11,...s1k],...[sn1,...s1m]] where sij is the cosine
           similarity of cij to the key-entity (0 when it cannot be computed)
    '''
    cvec_arr, cveclist_bdrs, _, _, key_entity_vector = find_key_concept(candslist, direction, method)

    candslist_scores = []
    for b, e in cveclist_bdrs:
        cand_scores = []
        for v in cvec_arr[b:e]:
            try:
                d = 1 - sp.spatial.distance.cosine(key_entity_vector, v)
            except Exception:
                # Best effort: a vector that cannot be compared scores 0
                d = 0
            if np.isnan(d):
                # NaN is treated as "no similarity"
                d = 0
            cand_scores.append(d)
        candslist_scores.append(cand_scores)
    return candslist_scores



def coherence_scores_driver(C, ws=5, method='rvspagerank', direction=DIR_BOTH, op_method="keydisamb"):
    """ Assigns a coherence score to every candidate, one window of mentions at a time
        Inputs:
            C: Candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(c1m, p1m)]]
            ws: window size (number of mentions) used for chunking
            method: similarity method
            direction: embedding type
            op_method: scoring method, either 'keydisamb' or 'entitycontext';
                       any other value yields an empty result
        Output:
            Candidate scores, one list per mention, in the order of C

    """
    total = len(C)
    windows = []
    for start in range(0, total, ws):
        windows.append([start, min(start + ws, total)])

    # A trailing window holding fewer than two mentions is merged into the
    # window that precedes it
    if len(windows) > 1:
        tail_start, tail_end = windows[-1]
        if tail_end - tail_start < 2:
            windows[-2][1] = total
            windows.pop()

    scores = []
    for begin, end in windows:
        chunk_c = C[begin:end]
        if op_method == 'keydisamb':
            scores += keyentity_candidate_scores(chunk_c, direction, method)
        elif op_method == 'entitycontext':
            scores += entity_to_context_scores(chunk_c, direction, method)[2]

    return scores


In [None]:
"""Testing Coherence
"""
from coherence import *
# Alternative sample input (disabled):
# S=["Carlos", "met", "David", "and" , "Victoria", "in", "Madrid"]
# M=[[0, "Roberto_Carlos"], [2, "David_Beckham"], [4, "Victoria_Beckham"], [6, "Madrid"]]


# S is the segmented sentence; each entry of M is [position in S, gold title]
S=["Three", "of", "the", "greatest", "guitarists", "started", "their", "career", "in", "a", "single", "band", ":", "Clapton", ",", "Beck", ",", "and", "Page", "."]
M=[[13, "Eric_Clapton"], [15, "Jeff_Beck"], [18, "Jimmy_Page"]]

# Alternative sample input (disabled): a single multi-word mention
# S=["Phoenix, Arizona"] 
# M=[[0, "Phoenix,_Arizona"]]

# At most 5 candidates per mention; the gold entity is not forced in
C = generate_candidates(S, M, max_t=5, enforce=False)
print "Candidates: ", C, "\n"


# Score every candidate against the context formed by the other mentions
coh_scores = coherence_scores_driver(C, ws=5, method='rvspagerank', direction=DIR_BOTH, op_method="entitycontext")
print coh_scores

In [None]:
%%writefile wsd.py 
"""Context-based disambiguation and also Learning-To-Rank combination
    of several features.
"""

from __future__ import division

from collections import Counter
import cPickle as pickle
import sys
from coherence import *
#sys.path.insert(0,'..')

#from wikisim.calcsim import *
#from wsd.wsd import *
# My methods
#from senseembed_train_test.ipynb

# Learning-to-rank model used by learned_scores, loaded once at import time.
disam_model_file_name = os.path.join(home,'backup/datasets/ner/ltr.pkl')
# NOTE: unpickling can execute arbitrary code; only load a model file that
# comes from a trusted source.
# The with-block closes the file handle; the underscore keeps the temporary
# name out of `from wsd import *`.
with open(disam_model_file_name, 'rb') as _model_file:
    disam_model = pickle.load(_model_file)


def get_context(anchor, eid, rows=50000):
    """Returns the context documents of an anchor/entity pair
       Inputs:
           anchor: the anchor text
           eid: The id of the entity this anchor points to
           rows: maximum number of documents requested
       Output:
           The list of matching documents (r['response']['docs']).
           If the reply has no 'response' entry, the reply is printed and
           the whole process is terminated with os._exit(0).
    """
    params = {'wt': 'json', 'rows': rows}
    anchor = solr_escape(anchor)
    params['q'] = 'anchor:"%s" AND entityid:%s' % (anchor, eid)

    # NOTE(review): `session` and `qstr` are not defined in this function;
    # presumably they are module-level names provided by the star imports
    # -- confirm.
    r = session.get(qstr, params=params).json()
    if 'response' not in r:
        # The message used to be printed with a comma instead of %, so the
        # placeholder was never filled in
        sys.stdout.write("[terminating]\t%s\n" % (str(r),))
        sys.stdout.flush()
        os._exit(0)

    return r['response']['docs']

#from wsd
def word2vec_context_candidate_scores (S, M, candslist, ws=5):
    '''returns entity scores using the similarity with their context
       Inputs:
           S: Sentence
           M: Mentions
           candslist: candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(c1m, p1m)]]
           ws: number of words taken on each side of the mention
       Returns:
           Scores [[s11,...s1k],...[sn1,...s1m]] where sij is the cosine
           similarity of cij to the summed word vectors of its context
           (0 when it cannot be computed)
    '''

    candslist_scores = []
    for i, cands in enumerate(candslist):
        pos = M[i][0]
        # ws words before and ws words after the mention, mention excluded
        context = S[max(pos-ws,0):pos] + S[pos+1:pos+ws+1]
        # np.zeros / .values replace sp.zeros / as_matrix(), which newer
        # SciPy and pandas releases no longer provide
        context_vec = np.zeros(getword2vec_model().vector_size)
        for word in context:
            context_vec += getword2vector(word).values

        cand_scores = []
        for c in cands:
            try:
                cand_vector = getentity2vector(encode_entity(c[0],'word2vec', get_id=False))
                d = 1 - sp.spatial.distance.cosine(context_vec, cand_vector)
            except Exception:
                # Best effort: an entity without a usable vector scores 0
                d = 0
            if np.isnan(d):
                d = 0

            cand_scores.append(d)
        candslist_scores.append(cand_scores)

    return candslist_scores

#from wsd
def word2vec_context_disambiguate(S, M, candslist):
    '''Disambiguate a sentence using word-context similarity
       Inputs:
           S: Sentence
           M: Mentions
           candslist: candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(c1m, p1m)]]

       Returns:
           a list of entity ids (the best-scoring candidate of every mention)
           and the list of their titles
    '''
    per_mention_scores = word2vec_context_candidate_scores (S, M, candslist)

    true_entities = []
    for cands, cands_scores in zip(candslist, per_mention_scores):
        # On ties the first candidate with the top score wins
        best_index = max(enumerate(cands_scores), key=lambda pair: pair[1])[0]
        true_entities.append(cands[best_index][0])

    return true_entities, ids2title(true_entities)



#from wikisim
def get_solr_count(s):
    """ Gets the number of documents in which the string occurs
        NOTE: Multi words should be quoted
    Arg:
        s: the string (can contain AND, OR, ..)
    Returns:
        The number of documents ('numFound' of the Solr response)
    """
    endpoint = 'http://localhost:8983/solr/enwiki20160305/select'
    # rows=0: only the hit count is needed, not the documents
    query = {'indent':'on', 'wt':'json', 'q':'+text:(%s)'%(s,), 'rows':0}
    reply = requests.get(endpoint, params=query)
    return reply.json()['response']['numFound']



# Editing Ryan's code
def context_to_profile_sim(mention, context, candidates):
    """
    Description:
        Uses Solr to score the candidates' documents against the context.
    Args:
        mention: The mention as it appears in the text
        context: The words that surround the target word.
        candidates: A list of candidates that each have the entity id and its frequency/popularity.
    Return:
        The Solr score of each candidate, in the same order as the candidates
        (0.0 for a candidate missing from the reply, and all 0 when the
        context is empty).
    """
    if not context:
        return [0]*len(candidates)

    # put text in right format
    context = solr_escape(context)
    # only needed by the disabled title-boost query below
    mention = solr_escape(mention)

    # restrict the search to the candidates' own documents
    id_filter = " ".join(['id:' + str(cand_id) for cand_id, _ in candidates])

    endpoint = 'http://localhost:8983/solr/enwiki20160305/select'
    #query='text:('+context+')^1 title:(' + mention+')^1.35'
    query = 'text:('+context+')'

    params = {'fl':'id score', 'fq':id_filter, 'indent':'on',
              'q':query, 'wt':'json', 'rows':len(candidates)}

    docs = requests.get(endpoint, params=params).json()['response']['docs']

    score_by_id = defaultdict(float)
    for doc in docs:
        score_by_id[long(doc['id'])] = doc['score']
    return [score_by_id[cand_id] for cand_id, _ in candidates]

# Important TODO
# This query is heavily skewed toward popularity; it would be better to replace spaces with AND.
# !!!! I don't like this implementation: instead of retrieving and counting, it would be better
# to let Solr do the counting.
def context_to_context_sim(mention, context, candidates, rows=1000):
    """
    Description:
        Uses Solr to count, for each candidate, how many of the retrieved
        context documents point to it.
    Args:
        mention: The mention as it appears in the text
        context: The words that surround the target word.
        candidates: A list of candidates that each have the entity id and its frequency/popularity.
        rows: maximum number of context documents retrieved
    Return:
        The count for each candidate in the same order as the candidates
        (all 0 when the context is empty).
    """
    if not context:
        return [0]*len(candidates)

    # put text in right format
    context = solr_escape(context)
    # only needed by the disabled variant of the query below
    mention = solr_escape(mention)

    # restrict the search to contexts of the candidate entities
    entity_filter = " ".join(['entityid:' + str(cand_id) for cand_id, _ in candidates])

    endpoint = 'http://localhost:8983/solr/enwiki20160305_context/select'
    # disabled variant: "_context_:(%s) entity:(%s)" % (context,mention)
    query = "_context_:(%s) " % (context)

    params = {'fl':'entityid', 'fq':entity_filter, 'indent':'on',
              'q':query, 'wt':'json', 'rows':rows}
    reply = requests.get(endpoint, params=params)

    # one vote per retrieved context document
    votes = Counter(long(doc['entityid']) for doc in reply.json()['response']['docs'])
    return [votes[cand_id] for cand_id, _ in candidates]


def context_candidate_scores (S, M, candslist, ws=5, method='c2c', skip_current=1):
    '''returns entity scores using context search
       Inputs:
           S: Sentence
           M: Mentions
           candslist: candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(c1m, p1m)]]
           ws: number of words taken on each side of the mention
           method: Either 'c2p': for context to profile, or 'c2c' for context to context
           skip_current: 1 leaves the current mention out of the context,
                         0 keeps it
       Returns:
           Scores [[s11,...s1k],...[sn1,...s1m]] where sij is the score of
           cij for the context of the i-th mention
       Raises:
           ValueError: if method is neither 'c2p' nor 'c2c'
    '''
    candslist_scores = []
    for i, cands in enumerate(candslist):
        pos = M[i][0]
        mention = S[pos]
        context = " ".join(S[max(pos-ws,0):pos] + S[pos+skip_current:pos+ws+1])

        if method == 'c2p':
            cand_scores = context_to_profile_sim(mention, context, cands)
        elif method == 'c2c':
            cand_scores = context_to_context_sim(mention, context, cands)
        else:
            # Used to fall through with cand_scores unbound, or stale from
            # the previous mention
            raise ValueError("unknown method %r: expected 'c2p' or 'c2c'" % (method,))

        candslist_scores.append(cand_scores)

    return candslist_scores

def mention_to_title_sim(mention, candidates):
    """
    Description:
        Uses Solr to score the candidates' titles against the mention.
    Args:
        mention: The mention as it appears in the text
        candidates: A list of candidates that each have the entity id and its frequency/popularity.
    Return:
        The Solr score of each candidate, in the same order as the candidates
        (0.0 for a candidate missing from the reply).
    """
    # put text in right format
    escaped_mention = solr_escape(mention)

    # restrict the search to the candidates' own documents
    id_filter = " ".join(['id:' + str(cand_id) for cand_id, _ in candidates])

    endpoint = 'http://localhost:8983/solr/enwiki20160305/select'
    query = 'title:(' + escaped_mention + ')'

    params = {'fl':'id score', 'fq':id_filter, 'indent':'on',
              'q':query, 'wt':'json', 'rows':len(candidates)}

    docs = requests.get(endpoint, params=params).json()['response']['docs']

    score_by_id = defaultdict(float)
    for doc in docs:
        score_by_id[long(doc['id'])] = doc['score']
    return [score_by_id[cand_id] for cand_id, _ in candidates]

def mention_candidate_score(S, M, candslist):
    """Scores every mention's candidates with mention_to_title_sim
       Inputs:
           S: Sentence
           M: Mentions; m[0] is the position of the mention in S
           candslist: candidate list, aligned with M
       Returns:
           One list of scores per (mention, candidates) pair
    """
    scores = []
    for mention, cands in zip(M, candslist):
        surface = S[mention[0]]
        scores.append(mention_to_title_sim(surface, cands))
    return scores

def popularity_score(candslist):
    """Extracts the popularity (second element of every candidate pair)
       from the candslist, keeping its nested structure
    """
    scores = []
    for cands in candslist:
        scores.append([popularity for _, popularity in cands])
    return scores

def normalize(scores_list):
    """Normalize a matrix, row-wise, so that every row sums to 1.
       A row that contains a 0 is add-one smoothed first.
    """
    normalized = []
    for row in scores_list:
        offset = 1 if 0 in row else 0
        shifted = [s + offset for s in row]
        total = sum(shifted)
        normalized.append([float(v) / total for v in shifted])
    return normalized
        
def normalize_minmax(scores_list):
    """Normalize a matrix, row-wise, using minmax technique:
       every row is rescaled to [0, 1]. A constant row becomes all 0 and
       an empty row stays empty.
    """
    normalized_scoreslist = []
    for scores in scores_list:
        # len() rather than truthiness: a row may be a numpy array
        if len(scores) == 0:
            # min()/max() would raise on an empty row
            normalized_scoreslist.append([])
            continue

        scores_min = min(scores)
        scores_max = max(scores)
        if scores_min == scores_max:
            n_scores = [0]*len(scores)
        else:
            span = scores_max - scores_min
            n_scores = [(float(s) - scores_min) / span for s in scores]
        normalized_scoreslist.append(n_scores)
    return normalized_scoreslist

def find_max(candslist,candslist_scores):
    '''Disambiguate a sentence by picking the best-scoring candidate of every mention
       Inputs:
           candslist: candidate list [[(c11, s11),...(c1k, s1k)],...[(cn1, sn1),...(c1m, s1m)]]
           candslist_scores: one list of scores per mention, aligned with candslist
       Returns:
           a list of entity ids and a list of titles
    '''
    true_entities = []
    for cands, cands_scores in zip(candslist, candslist_scores):
        # On ties the first candidate with the top score wins
        best_index = max(enumerate(cands_scores), key=lambda pair: pair[1])[0]
        true_entities.append(cands[best_index][0])

    return true_entities, ids2title(true_entities)

#Delete, useless
def disambiguate_random(C):
    '''Disambiguate by taking the first candidate of every mention, i.e.
       following the given order (which can be random)
        Input:
            C: Candlist
        Output:
            Disambiguated entity ids and their titles
    '''
    ids = []
    for cands in C:
        ids.append(cands[0][0])
    return ids, ids2title(ids)

def get_scores(S, M, C, method):
    """ Score the candidates in C with one scoring method
        Inputs:
            S: Sentence
            M: Mentions
            C: Candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(c1m, p1m)]]
            method: scoring method, one of 'popularity', 'keydisamb',
                    'entitycontext', 'mention2entity', 'context2context',
                    'context2profile' or 'learned'
        Output:
            The scores of every mention's candidates, min-max normalized
            per mention
        Raises:
            ValueError: if method is not one of the names above

    """
    if method == 'popularity':
        scores = popularity_score(C)
    elif method == 'keydisamb':
        scores = coherence_scores_driver(C, method='rvspagerank', direction=DIR_BOTH, op_method="keydisamb")
    elif method == 'entitycontext':
        scores = coherence_scores_driver(C, method='rvspagerank', direction=DIR_BOTH, op_method="entitycontext")
    elif method == 'mention2entity':
        scores = mention_candidate_score (S, M, C)
    elif method == 'context2context':
        scores = context_candidate_scores (S, M, C, method='c2c')
    elif method == 'context2profile':
        scores = context_candidate_scores (S, M, C, method='c2p')
    elif method == 'learned':
        scores = learned_scores (S, M, C)
    else:
        # Used to hand None to normalize_minmax and fail with a TypeError
        raise ValueError("unknown scoring method: %r" % (method,))

    return normalize_minmax(scores)

def formated_scores(scores):
    """Only for pretty-printing: renders every score with two decimals
    """
    pretty = []
    for cand_scores in scores:
        pretty.append(['{0:.2f}'.format(s) for s in cand_scores])
    return pretty

def formated_all_scores(scores):
    """Only for pretty-printing: renders every score of every candidate's
       score tuple with two decimals
    """
    pretty = []
    for cand_scores in scores:
        row = []
        for sub_scores in cand_scores:
            row.append(tuple('{0:.2f}'.format(s) for s in sub_scores))
        pretty.append(row)
    return pretty

def get_all_scores(S, M, C):
    """Computes every individual score and groups them per candidate
        Inputs:
            S: segmented sentence [w1, ..., wn]
            M: mentions [m1, ... , mj]
            C: candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(c1m, p1m)]]

        Output:
            One list per mention holding one tuple per candidate; the k-th
            element of a tuple is that candidate's score under the k-th
            method, in the order popularity, keydisamb, entitycontext,
            mention2entity, context2context, context2profile
    """
    methods = ['popularity', 'keydisamb', 'entitycontext',
               'mention2entity', 'context2context', 'context2profile']
    per_method = [get_scores(S, M, C, name) for name in methods]
    # Transpose twice: method-major -> mention-major -> candidate-major
    per_mention = zip(*per_method)
    return [zip(*mention_scores) for mention_scores in per_mention]



def keyentity_disambiguate(candslist, direction=DIR_OUT, method='rvspagerank'):
    '''Disambiguate a sentence using key-entity method
       Inputs:
           candslist: candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(c1m, p1m)]]
           direction: embedding direction
           method: similarity method
       Returns:
           a list of entity ids (the candidate closest to the key entity,
           for every mention) and a list of their titles
    '''
    per_mention_scores = keyentity_candidate_scores (candslist, direction, method)

    true_entities = []
    for cands, cands_scores in zip(candslist, per_mention_scores):
        # On ties the first candidate with the top score wins
        best_index = max(enumerate(cands_scores), key=lambda pair: pair[1])[0]
        true_entities.append(cands[best_index][0])

    return true_entities, ids2title(true_entities)

def learned_scores (S, M, candslist):
    '''returns entity scores predicted by the learned (learning-to-rank) model
       Inputs:
           S: Sentence
           M: Mentions
           candslist: candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(c1m, p1m)]]
       Returns:
           Scores [[s11,...s1k],...[sn1,...s1m]] where sij is what
           disam_model.predict returns for cij's feature tuple
    '''
    predictions = []
    # One feature matrix per mention: a row of individual scores per candidate
    for cand_features in get_all_scores(S,M,candslist):
        predictions.append(disam_model.predict(cand_features))
    return predictions

def wsd(S, M, C, method='learned'):
    '''Disambiguates the mentions of a sentence
       Inputs:
           S: Sentence
           M: Mentions
           C: candidate list [[(c11, p11),...(c1k, p1k)],...[(cn1, pn1),...(c1m, p1m)]]
           method: scoring method, forwarded to get_scores
       Returns:
           A disambiguated list in the form of (true_entities, titles)

    '''
    # Score every candidate, then keep the best one per mention
    return find_max(C, get_scores(S, M, C, method))


In [2]:
%%writefile gen_trainrep.py 
""" Create a train-set 
    entity_id, query_id, scores1, score2, ..., scoren, true/false (is it a correct entity)
"""
from __future__ import division
from wsd import *
sys.stdout.flush()

# max_t: maximum number of candidates generated per mention
# max_count: number of dataset lines processed before stopping
max_t=20
max_count=15000

# NOTE(review): outdir is not used in the visible part of this script.
outdir = os.path.join(baseresdir, 'wsd')
# NOTE(review): this expands to 'trainrepository.15000.30000.tsv', whereas
# train_ltr.py reads 'trainrepository.30000.tsv' -- confirm which is intended.
outfile = os.path.join(home,'backup/datasets/ner/trainrepository.%s.30000.tsv'%(max_count,))

# Input dataset: one JSON object per line, with "text" and "mentions" entries
dsname = os.path.join(home,'backup/datasets/ner/wiki-mentions.30000.json')

count = 0          
with open(dsname,'r') as ds, open(outfile,'w') as outf:
    # qid numbers the mentions (queries) across the whole dataset
    qid=0
    for line in ds:                           
        js = json.loads(line.decode('utf-8').strip());
        S = js["text"]
        M = js["mentions"]
        count +=1        
        print "%s:\tS=%s\n\tM=%s" % (count, json.dumps(S, ensure_ascii=False).encode('utf-8'),json.dumps(M, ensure_ascii=False).encode('utf-8'))        
        # The gold entity is not forced into the candidates (enforce=False)
        C = generate_candidates(S, M, max_t=max_t, enforce=False)
        all_scores=get_all_scores(S,M,C)
        for i in  range(len(C)):
            m=M[i]
            cands = C[i]
            cand_scores = all_scores[i]
            # id of the gold entity of this mention
            wid = title2id(m[1]) 
            for (eid,_),scores in zip (cands, cand_scores):
                is_true_eid = (wid == eid)
                string_scores=[str(s) for s in scores]
                # Row layout: entity id, query id, score1..scoren, 1/0 label
                outf.write("\t".join([str(eid), str(qid)]+string_scores+[str(int(is_true_eid))])+"\n")
            qid += 1
        if count >= max_count:
            break
print "Done"             
        

        

Overwriting gen_trainrep.py


In [3]:
%%writefile train_ltr.py 
""" Train a LambdaMart (LTR) Method
"""
from __future__ import division
import pyltr
import pandas as pd
import os
from wsd import *
# NOTE(review): nrows and outdir are not used in the visible part of this script.
nrows=50

outdir = os.path.join(baseresdir, 'wikify')
# Tab-separated training repository without a header row.
# NOTE(review): gen_trainrep.py writes 'trainrepository.<max_count>.30000.tsv',
# which does not match this file name -- confirm which is intended.
tr_file_name = os.path.join(home,'backup/datasets/ner/trainrepository.30000.tsv')
data=pd.read_table(tr_file_name, header=None)
#data=data.head(100)
num_cols = len(data.columns)

# Column 1 is used as the query id, columns 2..num_cols-2 as the features and
# the last column as the label (see the X_/y_/qid_ slices below).
grouped=data.groupby(1)
total_len=len(grouped)
# Split by query id: [0, 60%) train, [60%, 80%) validation, [80%, 100%) test.
# This assumes the query ids run from 0 to total_len-1 -- TODO confirm.
group = grouped.filter(lambda x:x.iloc[0,1] >= 0 and x.iloc[0,1] < 0.6*total_len)
X_train = group.iloc[:,2:num_cols-1].as_matrix()
y_train = group.iloc[:,num_cols-1].as_matrix()
qid_train = group.iloc[:,1].as_matrix()

group=grouped.filter(lambda x:x.iloc[0,1] >= 0.6*total_len and x.iloc[0,1] < 0.8*total_len)
X_validate = group.iloc[:,2:num_cols-1].as_matrix()
y_validate = group.iloc[:,num_cols-1].as_matrix()
qid_validate = group.iloc[:,1].as_matrix()


group=grouped.filter(lambda x:x.iloc[0,1] >= 0.8*total_len and x.iloc[0,1] < 1.0*total_len)
X_test = group.iloc[:,2:num_cols-1].as_matrix()
y_test = group.iloc[:,num_cols-1].as_matrix()
qid_test = group.iloc[:,1].as_matrix()

# The validation set is monitored with NDCG@10 while fitting
monitor = pyltr.models.monitors.ValidationMonitor(
     X_validate, y_validate, qid_validate, metric=pyltr.metrics.NDCG(k=10), stop_after=250)
model = pyltr.models.LambdaMART(n_estimators=300, learning_rate=0.1, verbose = 0)
#lmart.fit(TX, TY, Tqid, monitor=monitor)
model.fit(X_train, y_train, qid_train, monitor=monitor)

# Report NDCG@10 on the held-out test split, next to a random-ranking baseline
metric = pyltr.metrics.NDCG(k=10)
Ts_pred = model.predict(X_test)
print 'Random ranking:', metric.calc_mean_random(qid_test, y_test)
print 'Our model:', metric.calc_mean(qid_test, y_test, Ts_pred)

import cPickle as pickle
# Same path that wsd.py loads its disam_model from
model_file_name = os.path.join(home,'backup/datasets/ner/ltr.pkl')

# NOTE(review): the file object opened here is never closed explicitly.
pickle.dump(model, open(model_file_name, 'wb'))

print 'Model saved'

Overwriting train_ltr.py


In [None]:
%%writefile mention_detection.py 

from wsd import *

# Constants
# NOTE(review): presumably selectors for the mention-detection back-end
# (CoreNLP vs. a learned mention detector); neither is used in the visible
# part of this file -- confirm against the callers.
CORE_NLP=0
LEARNED_MENTION=1


def tokenize_stanford(text):
    """Tokenizes a text with the server listening on localhost:9001
        Input:
            text: the text to tokenize; it is sent utf-8 encoded
        Output:
            The 'originalText' of every token of the JSON reply
    """
    server = 'http://localhost:9001'
    options = {'annotators': 'tokenize', 'outputFormat': 'json'}
    reply = requests.post(server, params=options, data=text.encode('utf-8'))

    tokens = []
    for token in reply.json()['tokens']:
        tokens.append(token['originalText'])
    return tokens

def encode_solrtexttagger_result(text,tags):
    """ Convert the solrtexttagger output to our S,M format
        input:
            text: The original text
            tags: The result of the solrtexttagger; for every tag, tag[1]
                  and tag[3] are the start and end offsets in text and
                  tag[5] is the matched text
        output:
            S,M
            S: segmented sentence [w1, ..., wn]; every tagged span is a
               single entry with its whitespace collapsed
            M: mentions [m1, ... , mj], each [position in S, 'UNKNOWN']
    """
    S = []
    M = []
    cursor = 0
    for tag in tags:
        begin, end = tag[1], tag[3]
        matched = text[begin:end]
        assert matched == tag[5]
        # Untagged text between the previous tag and this one
        S.extend(text[cursor:begin].split())
        M.append([len(S), 'UNKNOWN'])
        S.append(" ".join(matched.split()))
        cursor = end

    # Whatever follows the last tag
    S.extend(text[cursor:].split())
    return S, M

def annotate_with_solrtagger(text):
    ''' Annotate a text using solrtexttagger
        Input:
            text: The input text *must be unicode*
        Output:
            S,M as built by encode_solrtexttagger_result from the escaped
            text and the 'tags' entry of the tagger's JSON reply
    '''
    endpoint = 'http://localhost:8983/solr/enwikianchors20160305/tag'
    query = {'overlaps':'LONGEST_DOMINANT_RIGHT', 'tagsLimit':'5000', 'fl':'id','wt':'json','indent':'on','matchText':'true'}
    # The tagger's offsets refer to the escaped text, so that is the text
    # that gets segmented below.
    escaped = solr_escape(text)
    response = requests.post(endpoint, params=query, data=escaped.encode('utf-8'))

    return encode_solrtexttagger_result(escaped, response.json()['tags'])


def encode_corenlp_result(text,annotated):
    """ Convert the corenlp output to our S,M,P format
        input:
            text: The original text (not used by the conversion)
            annotated: The parsed JSON reply of corenlp; its 'sentences' each
                       carry 'tokens' and 'entitymentions'
        output:
            S,M,P
            S: segmented sentence [w1, ..., wn]; every entity mention is a
               single element holding the mention text
            M: mentions [[index_in_S, 'UNKNOWN'], ...]
            P: [token, pos] pairs for every token of every sentence
    """
    #****** Important ****
    #* The indices are not correct if the text contains unicode;
    #* in case you need to work with the indices, decode to utf-8
    #******
    S, M, P = [], [], []

    for sentence in annotated['sentences']:
        mentions = sentence['entitymentions']
        tokens = sentence['tokens']
        cursor = 0  # index of the first token of this sentence not yet copied to S

        for mention in mentions:
            # Ordinary tokens that precede the mention.
            S.extend([tok['originalText'] for tok in tokens[cursor:mention['tokenBegin']]])
            M.append([len(S), 'UNKNOWN'])
            S.append(mention['text'])
            cursor = mention['tokenEnd']

        # Tokens after the last mention, then the POS pairs of the sentence.
        S.extend([tok['originalText'] for tok in tokens[cursor:]])
        P.extend([[tok['originalText'], tok['pos']] for tok in tokens])
    return S, M, P

def annotate_with_corenlp(text):
    ''' Annotate a text using coreNLP
        Input:
            text: The input text (it is sent UTF-8 encoded)
        Output:
            S,M,P as built by encode_corenlp_result from the JSON reply
    '''
    server_url = 'http://localhost:9001'
    query = {'annotators': 'entitymentions', 'outputFormat': 'json'}
    response = requests.post(server_url, params=query, data=text.encode('utf-8'))

    return encode_corenlp_result(text, response.json())

def solrtagger_pos(S,M,P):
    ''' Aligns the POS tags from corenlp with solrtagger's mentions
        Input:
            S: Sentence (segmented, as produced for solrtagger)
            M: Mentions; M[i][0] is the index of the mention in S
            P: [token, pos] pairs of the whole text, from corenlp
        Output:
            Q: One string per mention: the space-joined POS tags of the
               corenlp tokens aligned with the mention, or "NA" when no
               token could be aligned
    '''
    Q=[]
    # j is a cursor into P; it only moves forward across mentions, so the
    # mentions are matched in text order.
    j=0
    for i in range(len(M)):
        # Re-tokenize the (unescaped) mention so it can be compared with P token by token.
        # NOTE(review): m[0] below raises IndexError if the tokenizer returns no tokens.
        m=tokenize_stanford(solr_unescape(S[M[i][0]])) 
        j_backup=j
        q=[]
        while j<len(P):
            # Look for the first corenlp token that resembles the mention's first token.
            if similar(P[j][0], m[0])> .8:
                k=0
                # Consume consecutive tokens for as long as they keep matching the mention's tokens.
                while similar(P[j][0], m[k])>0.8:
                    #q.append(P[j]) #good for debugging
                    q.append(P[j][1]) # keep only the POS tag
                    k=k+1
                    j=j+1
                    if j >= len(P) or k>=len(m):
                        break

                Q.append(" ".join(q))
                break
            j=j+1
        if not q:
            # Nothing aligned: record "NA" and rewind the cursor so the next
            # mention is searched from the same position.
            Q.append("NA")
            j=j_backup
    return Q

def get_mention_count(s):
    """
    Description:
        Returns the amount of times that the given string appears as a mention in wikipedia.
    Args:
        s: the string (can contain AND, OR, ..)
    Return:
        The sum of the counts of all (concept, count) pairs that
        anchor2concept returns for s; 0 when there are none.
    """

    # generate_candidates (wsd_util) guards this same call with `if not clist`,
    # so anchor2concept may hand back an empty or None result for an unknown
    # anchor; treat both as "no mentions" instead of failing on None.
    return sum(count for _, count in (anchor2concept(s) or ()))

def mention_prob(text):
    """
    Description:
        Returns the probability that the text is a mention in Wikipedia.
    Args:
        text: the surface string to look up
    Return:
        get_mention_count(text) divided by get_solr_count of the text with
        its dots removed, or 0 when that solr count is 0.
    """

    anchor_uses = get_mention_count(text)
    occurrences = get_solr_count(text.replace(".", ""))
    if occurrences != 0:
        return float(anchor_uses)/occurrences
    # a mention never used probably is not a good link
    return 0

def get_mention_probs(S,M):
    """Return mention_prob of the surface string S[m[0]] for every mention m in M."""
    probs = []
    for mention in M:
        surface = S[mention[0]]
        probs.append(mention_prob(surface))
    return probs


def boil_down_candidate_score(score_list):
    """Collapse each list of scores into its arithmetic mean.

        Input:
            score_list: a list of non-empty lists of numbers
        Output:
            A list with the mean of every inner list, in the same order
        Raises:
            ZeroDivisionError if one of the inner lists is empty
    """
    # The body used to read an undefined name `scores_list`, so every call
    # raised NameError. float() keeps the mean exact for integer scores even
    # without `from __future__ import division`.
    return [float(sum(scores))/len(scores) for scores in score_list]
        
    
from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib SequenceMatcher ratio of a and b (no junk filter)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

def mention_overlap(S1, M1, S2,M2):
    '''Calculates the overlap between two sets of detected mentions
        Input:
            S1: Source Sentence
            M1: Source Mentions
            S2: Destination Sentence
            M2: Destination Mentions
        Output: A 0/1 vector of size M1; element i is 1 when the surface
                string of M1[i] is more than 0.8 similar to the surface
                string of at least one mention in M2
    '''
    is_detected = []
    for m1 in M1:
        # Compare against every destination mention (no early exit).
        hits = [similar(S1[m1[0]], S2[m2[0]])>0.8 for m2 in M2]
        is_detected.append(1 if any(hits) else 0)
    return is_detected

def detect_and_score_mentions(text, max_t=5):
    """Uses solrtagger to detect mentions, and scores them
        Inputs:
            text: Given text (must be a str)
            max_t: maximum number of candidates generated per mention
        Output:
            solr_S, solr_M and the scores, one tuple per mention, holding in
            this order: the average candidate score for each of the five wsd
            methods, the mention probability, the 0/1 overlap with the
            corenlp mentions and the POS string of the mention
    """
    assert type(text) is str
    solr_S, solr_M = annotate_with_solrtagger(text)
    # max_t does not have to equal the number of candidates in wsd, it's just to
    # get an average relevancy
    solr_C = generate_candidates(solr_S, solr_M, max_t=max_t, enforce=False)

    # Every entry of feature_columns is one feature: a list with one value per mention.
    feature_columns = []
    for method in ['popularity','entitycontext','mention2entity','context2context','context2profile']:
        per_mention = get_scores(solr_S, solr_M, solr_C, method)
        feature_columns.append([sum(sc)/len(sc) for sc in per_mention])

    feature_columns.append(get_mention_probs(solr_S, solr_M))

    core_S, core_M, core_P = annotate_with_corenlp(text)
    feature_columns.append(mention_overlap(solr_S, solr_M, core_S,core_M))
    feature_columns.append(solrtagger_pos(solr_S, solr_M,core_P))

    # Transpose the columns into one row of features per mention.
    return solr_S, solr_M, zip(*feature_columns)

def get_learned_mentions(text):
    """Detect mentions with the learned mention model (not implemented yet).

        Input:
            text: Given text
        Raises:
            NotImplementedError: always. The previous body returned two
            undefined names and therefore failed with a NameError.
    """
    # Planned implementation (from the original sketch):
    #   1. S_solr, M_solr, scores = detect_and_score_mentions(text)
    #   2. classify every mention's scores with the trained mention model
    #   3. keep the mentions classified as 1 and return them with S_solr
    raise NotImplementedError("get_learned_mentions: the learned mention model is not implemented yet")
    
def detect_mentions(line, mentionmethod=CORE_NLP):
    """Detect the mentions of a line with the requested method.

        Input:
            line: The text to annotate
            mentionmethod: CORE_NLP or LEARNED_MENTION
        Output:
            For CORE_NLP: S,M from annotate_with_corenlp (its POS list is dropped).
            For LEARNED_MENTION: whatever get_learned_mentions returns.
        Raises:
            ValueError if mentionmethod is not one of the known constants
    """
    # The body used to call get_learned_mentions with an undefined name
    # (`text`) whatever the method was; dispatch on the method instead.
    if mentionmethod == CORE_NLP:
        S, M, _ = annotate_with_corenlp(line)
        return S, M
    if mentionmethod == LEARNED_MENTION:
        return get_learned_mentions(line)
    raise ValueError("Unknown mention detection method: %r" % (mentionmethod,))
    

In [None]:
%%writefile gen_trainrep_for_mention.py 
""" Create a train-set 
    Each output row: mention_id, score1, score2, ..., scoren, 0/1 (does the detected mention match a gold mention)
"""
from __future__ import division
from mention_detection import *
from wsd import *
sys.stdout.flush()

max_count=2
skip_lines=0

outdir = os.path.join(baseresdir, 'wsd')
outfile = os.path.join(home,'backup/datasets/ner/mentiontrainrepository.%s.30000.tsv'%(max_count,))

dsname = os.path.join(home,'backup/datasets/ner/wiki-mentions.30000.json')

count = 0  
mention_id = 0
with open(dsname,'r') as ds, open(outfile,'w') as outf:
    for line in ds:                           
        count +=1  
        if count <= skip_lines:
            continue
        js = json.loads(line.decode('utf-8').strip());
        S = js["text"]
        M = js["mentions"]
        text= " ".join(S)
        text = solr_encode(text)
        print "%s:\tS=%s\n\tM=%s\ttext=%s" % (count, json.dumps(S, ensure_ascii=False).encode('utf-8'),json.dumps(M, ensure_ascii=False).encode('utf-8'),text)        
        
        solr_S, solr_M, scores = detect_and_score_mentions(text)
        correct_mention = mention_overlap(solr_S, solr_M, S, M)
        for i in  range(len(solr_M)):
            string_scores=[str(s) for s in scores[i]]
            outf.write("\t".join([str(mention_id)] + string_scores+[str(correct_mention[i])])+"\n")
            mention_id += 1
        if count >= max_count:
            break
print "Done"             
        

        

In [None]:
#%%writefile wikify.py 
from __future__ import division
from wsd import *




def wikify_string(line, mentionmethod=CORE_NLP):
    """Detect the mentions of a line and disambiguate them.

        Input:
            line: The text to wikify
            mentionmethod: The mention detection method, passed to detect_mentions
        Output:
            S,M,E
            S: segmented sentence
            M: mentions
            E: the result of wsd(..., method='learned') for the mentions
    """
    # Fixes two syntax errors: a stray ':' after the detect_mentions call and
    # a missing ')' on the wsd call.
    S, M = detect_mentions(line, mentionmethod)
    C = generate_candidates(S, M, max_t=20, enforce=False)
    E = wsd(S, M, C, method='learned')
    return S, M, E

def wikify_a_line(line, mentionmethod=CORE_NLP):
    """Return the line with every detected mention wrapped in a Wikipedia link.

        Input:
            line: The text to wikify
            mentionmethod: The mention detection method
        Output:
            The segments of the line joined with single spaces, where each
            mention is replaced by an <a> element that links to its entity
    """
    # Forward the caller's method (CORE_NLP used to be hard-coded here).
    S, M, E = wikify_string(line, mentionmethod=mentionmethod)
    # NOTE(review): E[1] is presumably the list of entity titles returned by wsd -- confirm.
    for m, e in zip(M, E[1]):
        # The entity goes into the URL and the original mention text stays
        # visible (the two arguments used to be swapped).
        S[m[0]] = "<a href=https://en.wikipedia.org/wiki/%s>%s</a>" % (e, S[m[0]])
    return " ".join(S)
def wikify_api(text, mentionmethod=CORE_NLP):
    """Wikify a multi-line text line by line.

        Input:
            text: The text to wikify
            mentionmethod: The mention detection method
        Output:
            The wikified lines joined with newlines
    """
    # `outlist` was never initialised, so the first append raised NameError.
    outlist = [wikify_a_line(line, mentionmethod) for line in text.splitlines()]
    return "\n".join(outlist)

def wikify_from_file_api(infilename, outfilename, mentionmethod=CORE_NLP):
    """Wikify a file line by line and write the result to another file.

        Input:
            infilename: Path of the file to read
            outfilename: Path of the file to write (it is overwritten)
            mentionmethod: The mention detection method
    """
    with open(infilename) as infile, open(outfilename, 'w') as outfile:
        # Iterate over the file object: the old code called readlines() on the
        # file *name* and passed an undefined `text` to wikify_a_line.
        for line in infile:
            # Drop the line terminator; exactly one is written back below.
            wikified = wikify_a_line(line.rstrip("\r\n"), mentionmethod)
            outfile.write(wikified + "\n")

            