In [None]:
# %load_ext autoreload
# %autoreload

import os
import time;
import json 
import requests
import numpy as np
        

# %aimport wikipedia
# %aimport calcsim
import sys
sys.path.insert(0,'..')

from wikisim.config import *
from wikisim.calcsim import *


def generate_candidates(S, M, max_t=10, enforce=True):
    """Build a normalised candidate list for every gold mention.

    For each mention ``m`` in ``M`` the gold title ``m[1]`` is resolved with
    ``title2id`` and the surface form ``S[m[0]]`` is looked up with
    ``anchor2concept``.  The candidates are ranked by descending frequency
    and cut to ``max_t`` entries; the gold concept is always kept (it takes
    the place of the last kept candidate when it is missing or ranked at
    ``max_t`` or later).  When the gold concept is not among the anchor's
    candidates at all, it enters with frequency 0 and every kept frequency
    is add-one smoothed.  Frequencies are finally divided by their sum.

    Parameters:
        S: sequence of surface forms, indexed by ``m[0]``.
        M: iterable of mentions; ``m[0]`` is an index into ``S`` and
            ``m[1]`` the gold title.
        max_t: maximum number of candidates kept per mention.
        enforce: accepted for interface compatibility; it is not read by
            this function.

    Returns:
        A list with one entry per mention, each a list of
        ``(concept_id, probability)`` tuples.

    Raises:
        Exception: if a gold title cannot be resolved by ``title2id``.
    """
    all_candidates = []
    for mention in M:
        gold_title = mention[1]
        gold_id = title2id(gold_title)
        if gold_id is None:
            raise Exception(gold_title.encode('utf-8') + ' not found')

        # Most frequent concept first (the sort is stable for ties).
        ranked = sorted(anchor2concept(S[mention[0]]), key=lambda pair: -pair[1])

        gold_hits = [(rank, (cid, freq))
                     for rank, (cid, freq) in enumerate(ranked)
                     if cid == gold_id]
        gold_missing = not gold_hits
        if gold_missing:
            # Unknown for this anchor: add it with zero count and smooth.
            gold_rank, gold_entry = len(ranked), (gold_id, 0)
        else:
            gold_rank, gold_entry = gold_hits[0]
        smooth = 1 if gold_missing else 0

        kept = ranked[:max_t]
        if gold_missing or gold_rank >= max_t:
            # The gold concept fell outside the cut: swap it in for the tail.
            if kept:
                kept.pop()
            kept.append(gold_entry)

        total = sum(entry[1] + smooth for entry in kept)
        all_candidates.append(
            [(cid, float(freq + smooth) / total) for cid, freq in kept])
    return all_candidates

def disambiguate(C, method, direction, op_method):
    """Dispatch a candidate list to the disambiguation routine named by ``op_method``.

    Parameters:
        C: candidate lists, passed through unchanged to the chosen routine.
        method: similarity method name, forwarded to the routines that take it.
        direction: graph direction, forwarded to the chosen routine.
        op_method: name of the routine to run (see the table below).

    Returns:
        Whatever the selected routine returns, or ``None`` when ``op_method``
        matches none of the known names.
    """
    # (name, thunk) pairs; the lambdas keep the target names unresolved until
    # the matching route is actually taken.
    routes = (
        ('ilp', lambda: disambiguate_ilp(C, method, direction)),
        ('ilp2', lambda: disambiguate_ilp_2(C, method, direction)),
        ('context1', lambda: contextdisamb_1(C, direction)),
        ('context2', lambda: contextdisamb_2(C, direction)),
        ('context3', lambda: contextdisamb_3(C, direction)),
        ('keyq', lambda: key_quad(C, method, direction)),
        ('context4_1', lambda: contextdisamb_4(C, direction, 1)),
        ('context4_2', lambda: contextdisamb_4(C, direction, 2)),
        ('context4_3', lambda: contextdisamb_4(C, direction, 3)),
        ('context4_4', lambda: contextdisamb_4(C, direction, 4)),
        # 'pcontext4_4' runs the same call as 'context4_4'.
        ('pcontext4_4', lambda: contextdisamb_4(C, direction, 4)),
        ('tagme', lambda: tagme(C, method, direction)),
        ('tagme2', lambda: tagme(C, method, direction, True)),
    )
    for name, run in routes:
        if op_method == name:
            return run()
    return None



def disambiguate_driver(C, ws, method, direction, op_method):
    """Disambiguate ``C`` window by window and concatenate the results.

    ``C`` is cut into consecutive windows of ``ws`` mentions.  When there is
    more than one window and the last one holds fewer than 3 mentions, it is
    merged into the window before it.  Each window is passed to
    ``disambiguate`` and the returned ids and titles are appended in order.

    Parameters:
        C: list of candidate lists, one per mention.
        ws: window size (step of the split).
        method, direction, op_method: forwarded to ``disambiguate``.

    Returns:
        ``(ids, titles)`` - two lists covering all windows.
    """
    total = len(C)
    spans = []
    for begin in range(0, total, ws):
        spans.append([begin, min(begin + ws, total)])

    if len(spans) > 1:
        tail_begin, tail_end = spans[-1]
        if tail_end - tail_begin < 3:
            # Too short to stand alone: extend the previous window to the end.
            spans.pop()
            spans[-1][1] = total

    all_ids = []
    all_titles = []
    for begin, end in spans:
        chunk_ids, chunk_titles = disambiguate(C[begin:end], method, direction, op_method)
        all_ids += chunk_ids
        all_titles += chunk_titles
    return all_ids, all_titles

def get_tp(gold_titles, ids):
    """Count predictions that match the gold annotation.

    Parameters:
        gold_titles: gold mentions; element ``[1]`` of each is the gold title,
            which is resolved with ``title2id`` before comparison.
        ids: predicted concept ids, aligned with ``gold_titles``.

    Returns:
        ``[true_positives, len(ids)]``.  Pairs are formed with ``zip``, so
        comparison stops at the shorter of the two inputs.
    """
    hits = sum(1 for mention, predicted in zip(gold_titles, ids)
               if title2id(mention[1]) == predicted)
    return [hits, len(ids)]

def get_prec(tp_list):
    """Compute micro- and macro-averaged precision.

    Parameters:
        tp_list: iterable of ``(tp, count)`` pairs, one per document, as
            produced by ``get_tp``.

    Returns:
        ``(micro_prec, macro_prec)``.  Micro precision is the total number of
        true positives over the total count.  Macro precision is the mean of
        the per-document ``tp / count`` ratios; documents whose ``count`` is 0
        have no defined precision and are left out of that mean instead of
        raising ``ZeroDivisionError``.  ``(0, 0)`` is returned when there is
        nothing to score (empty list, or every count is 0).
    """
    if not tp_list:
        return 0, 0

    overall_tp = 0
    overall_count = 0
    precision_sum = 0.0
    scored_docs = 0
    for tp, count in tp_list:
        overall_tp += tp
        overall_count += count
        if count:
            precision_sum += float(tp) / count
            scored_docs += 1

    # Every document was empty: nothing to average.
    if not scored_docs or not overall_count:
        return 0, 0

    micro_prec = float(overall_tp) / overall_count
    macro_prec = precision_sum / scored_docs
    return micro_prec, macro_prec



In [None]:
import random
from itertools import chain
from itertools import product
from itertools import combinations

# Seed Python's global RNG so that any use of `random` in this notebook
# (e.g. the commented-out `random.random()` fallback in getscore) is repeatable.
random.seed(7)
# Toy candidate lists kept for manual experiments with get_sim_matrix:
#C = [('a','b','c'), ('h', 'i'),('h', 'i','j','j'),('g','l','a'),('o','o','p')]

def getscore(x, y, method, direction):
    """Return the similarity of concepts ``x`` and ``y``.

    Thin wrapper that forwards all four arguments to ``getsim`` so the
    scoring function can be swapped in one place.
    """
    score = getsim(x, y, method, direction)
    return score
    #return random.random()

def get_sim_matrix(candslist, method, direction):
    """Build a symmetric concept-by-concept similarity table.

    Parameters:
        candslist: list of candidate lists; element ``[0]`` of every
            candidate is used as the concept label.
        method, direction: forwarded to ``getscore``.

    Returns:
        A ``pandas.DataFrame`` indexed (rows and columns) by the distinct
        concepts.  Cells are filled only for pairs of concepts coming from two
        different candidate lists; all other cells keep the DataFrame's
        default missing value.
    """
    concepts = list(set(c[0] for c in chain(*candslist)))
    sims = pd.DataFrame(index=concepts, columns=concepts)
    for cands1, cands2 in combinations(candslist, 2):
        for c1, c2 in product(cands1, cands2):
            score = getscore(c1[0], c2[0], method, direction)
            # Label-based scalar assignment.  The previous chained form
            # (sims[a][b] = ...) assigns through an intermediate column
            # object, which pandas does not guarantee to write back.
            sims.at[c1[0], c2[0]] = score
            sims.at[c2[0], c1[0]] = score
    return sims

#simmatrix = get_sim_matrix(C, 'method', 'direction')


In [None]:
# normal
def key_criteria(x):
    """Relative margin between the best and second-best score of one mention.

    Parameters:
        x: an ``(index, ranked)`` pair where ``ranked`` is a list of
            ``(candidate_index, score)`` tuples sorted best-first.

    Returns:
        ``(best - second) / second``; ``inf`` when there is only one
        candidate or when the second-best score is 0.
    """
    ranked = x[1]
    if len(ranked) == 1:
        return float("inf")

    runner_up = ranked[1][1]
    if runner_up == 0:
        return float("inf")

    best = ranked[0][1]
    return (best - runner_up) / runner_up

def find_key_concept(candslist, cveclist_bdrs, cvec_arr, ver):
    """Pick the mention (and its candidate) that best agrees with the context.

    For every mention the context vector is the sum of the candidate vectors
    of all *other* mentions.  Each candidate of the mention is scored by
    ``1 - cosine distance`` to that context (NaN scores become 0) and the
    candidates are ranked best-first.  The "key" mention is then the one that
    maximises a criterion selected by ``ver``:

    * 1 - the best score;
    * 2 - best score minus second-best score;
    * 3 - that difference divided by the sum of the two scores
      (treated as 0 when the sum is 0);
    * 4 - ``key_criteria`` (difference relative to the second-best score).

    For ``ver`` 1-3 a mention with fewer than two candidates gets the key -1.

    Parameters:
        candslist: list of candidate lists (only its length is used here).
        cveclist_bdrs: per mention, the ``(begin, end)`` row range of its
            candidates inside ``cvec_arr``.
        cvec_arr: 2-D array with one row per candidate vector.
        ver: criterion selector, one of 1, 2, 3, 4.

    Returns:
        ``(max_concept, max_candidate)``: index of the key mention and the
        index (within that mention's rows) of its best-scoring candidate.

    Raises:
        ValueError: if ``ver`` is not one of the supported values (also
            raised by ``max`` when ``candslist`` is empty).
    """
    if ver not in (1, 2, 3, 4):
        raise ValueError('unknown ver: %r (expected 1, 2, 3 or 4)' % (ver,))

    # One aggregated vector per mention: the sum of its candidate vectors.
    aggr_cveclist = np.zeros(shape=(len(candslist), cvec_arr.shape[1]))
    for i, (b, e) in enumerate(cveclist_bdrs):
        aggr_cveclist[i] = cvec_arr[b:e].sum(axis=0)

    Dlist = []
    for i in range(len(candslist)):
        b, e = cveclist_bdrs[i]
        # Context = every mention except the current one.
        convec = aggr_cveclist[:i].sum(axis=0) + aggr_cveclist[i+1:].sum(axis=0)
        D = []
        for v in cvec_arr[b:e]:
            d = 1 - sp.spatial.distance.cosine(convec, v)
            if np.isnan(d):
                d = 0
            D.append(d)
        Dlist.append(sorted(enumerate(D), key=lambda x: -x[1]))

    def _normalised_margin(x):
        # ver 3 criterion, guarded against a zero denominator (both top
        # scores 0), which would otherwise divide by zero.
        if len(x[1]) <= 1:
            return -1
        best, second = x[1][0][1], x[1][1][1]
        denom = best + second
        if denom == 0:
            return 0
        return (best - second) / denom

    if ver == 1:
        criterion = lambda x: x[1][0][1] if len(x[1]) > 1 else -1
    elif ver == 2:
        criterion = lambda x: (x[1][0][1] - x[1][1][1]) if len(x[1]) > 1 else -1
    elif ver == 3:
        criterion = _normalised_margin
    else:
        criterion = key_criteria

    max_concept, _ = max(enumerate(Dlist), key=criterion)
    max_candidate = Dlist[max_concept][0][0]
    return max_concept, max_candidate


def contextdisamb_4(candslist, direction=DIR_OUT, ver=1):
    """Disambiguate all mentions against a single "key" candidate vector.

    Every candidate is turned into a vector with ``conceptrep``.
    ``find_key_concept`` selects the key mention and its best candidate; each
    mention is then resolved to the candidate whose vector has the highest
    ``1 - cosine distance`` to that key candidate's vector (the first
    candidate when no score exceeds -1, e.g. when all scores are NaN).

    Parameters:
        candslist: list of candidate lists; element ``[0]`` of each candidate
            is the concept id.  The list is not modified.
        direction: graph direction forwarded to ``conceptrep``.
        ver: key-selection criterion forwarded to ``find_key_concept``.

    Returns:
        ``(res, titles)``: the chosen concept id per mention and the result of
        ``ids2title(res)``.  ``([], [])`` for an empty ``candslist``.
    """
    if not candslist:
        # pd.concat cannot concatenate an empty list.
        return [], []

    cframelist = []
    cveclist_bdrs = []
    for cands in candslist:
        cands_rep = [conceptrep(c[0], direction, get_titles=False) for c in cands]
        # Row range this mention's candidates will occupy in cvec_arr.
        cveclist_bdrs.append((len(cframelist), len(cframelist) + len(cands_rep)))
        cframelist += cands_rep

    cvec_fr = pd.concat(cframelist, join='outer', axis=1)
    cvec_fr = cvec_fr.fillna(0)
    # `.values` replaces DataFrame.as_matrix(), which newer pandas no longer has.
    cvec_arr = cvec_fr.values.T
    # Put an all-zero row at the position of every empty representation so
    # that row i corresponds to cframelist[i] (presumably concat contributes
    # no column for an empty frame - confirm against the pandas in use).
    for i, cframe in enumerate(cframelist):
        if cframe.empty:
            cvec_arr = np.insert(cvec_arr, i, 0, axis=0)

    # Vector of the key mention's best candidate: the reference for everyone.
    max_concept, max_candidate = find_key_concept(candslist, cveclist_bdrs, cvec_arr, ver)
    b, e = cveclist_bdrs[max_concept]
    convec = cvec_arr[b:e][max_candidate]

    res = []
    for cands, (b, e) in zip(candslist, cveclist_bdrs):
        maxd = -1
        index = 0  # fall back to the first candidate when nothing beats -1
        for mi, v in enumerate(cvec_arr[b:e]):
            d = 1 - sp.spatial.distance.cosine(convec, v)
            if d > maxd:
                maxd = d
                index = mi
        res.append(cands[index][0])

    titles = ids2title(res)
    return res, titles





In [None]:
#parallel
def key_criteria(x):
    """Relative margin between the best and second-best score of one mention.

    Parameters:
        x: an ``(index, ranked)`` pair where ``ranked`` is a list of
            ``(candidate_index, score)`` tuples sorted best-first.

    Returns:
        ``(best - second) / second``; ``inf`` when there is only one
        candidate or when the second-best score is 0.
    """
    if len(x[1])==1 or x[1][1][1]==0:
        return float("inf")

    return (x[1][0][1]-x[1][1][1]) / x[1][1][1]


# A stray leading "q" used to make this line (and so the whole cell) a SyntaxError.
from functools import partial

def find_key_concept(candslist, cveclist_bdrs, cvec_arr, ver):
    """Pick the mention (and its candidate) that best agrees with the context.

    For every mention the context vector is the sum of the candidate vectors
    of all *other* mentions.  Each candidate of the mention is scored by
    ``1 - cosine distance`` to that context (NaN scores become 0) and the
    candidates are ranked best-first.  The "key" mention is then the one that
    maximises a criterion selected by ``ver``:

    * 1 - the best score;
    * 2 - best score minus second-best score;
    * 3 - that difference divided by the sum of the two scores
      (treated as 0 when the sum is 0);
    * 4 - ``key_criteria`` (difference relative to the second-best score).

    For ``ver`` 1-3 a mention with fewer than two candidates gets the key -1.

    Parameters:
        candslist: list of candidate lists (only its length is used here).
        cveclist_bdrs: per mention, the ``(begin, end)`` row range of its
            candidates inside ``cvec_arr``.
        cvec_arr: 2-D array with one row per candidate vector.
        ver: criterion selector, one of 1, 2, 3, 4.

    Returns:
        ``(max_concept, max_candidate)``: index of the key mention and the
        index (within that mention's rows) of its best-scoring candidate.

    Raises:
        ValueError: if ``ver`` is not one of the supported values (also
            raised by ``max`` when ``candslist`` is empty).
    """
    if ver not in (1, 2, 3, 4):
        raise ValueError('unknown ver: %r (expected 1, 2, 3 or 4)' % (ver,))

    # One aggregated vector per mention: the sum of its candidate vectors.
    aggr_cveclist = np.zeros(shape=(len(candslist), cvec_arr.shape[1]))
    for i, (b, e) in enumerate(cveclist_bdrs):
        aggr_cveclist[i] = cvec_arr[b:e].sum(axis=0)

    Dlist = []
    for i in range(len(candslist)):
        b, e = cveclist_bdrs[i]
        # Context = every mention except the current one.
        convec = aggr_cveclist[:i].sum(axis=0) + aggr_cveclist[i+1:].sum(axis=0)
        D = []
        for v in cvec_arr[b:e]:
            d = 1 - sp.spatial.distance.cosine(convec, v)
            if np.isnan(d):
                d = 0
            D.append(d)
        Dlist.append(sorted(enumerate(D), key=lambda x: -x[1]))

    def _normalised_margin(x):
        # ver 3 criterion, guarded against a zero denominator (both top
        # scores 0), which would otherwise divide by zero.
        if len(x[1]) <= 1:
            return -1
        best, second = x[1][0][1], x[1][1][1]
        denom = best + second
        if denom == 0:
            return 0
        return (best - second) / denom

    if ver == 1:
        criterion = lambda x: x[1][0][1] if len(x[1]) > 1 else -1
    elif ver == 2:
        criterion = lambda x: (x[1][0][1] - x[1][1][1]) if len(x[1]) > 1 else -1
    elif ver == 3:
        criterion = _normalised_margin
    else:
        criterion = key_criteria

    max_concept, _ = max(enumerate(Dlist), key=criterion)
    max_candidate = Dlist[max_concept][0][0]
    return max_concept, max_candidate


def contextdisamb_4(candslist, direction=DIR_OUT, ver=1):
    """Disambiguate all mentions against a single "key" candidate vector.

    Every candidate is turned into a vector with ``conceptrep``.
    ``find_key_concept`` selects the key mention and its best candidate; each
    mention is then resolved to the candidate whose vector has the highest
    ``1 - cosine distance`` to that key candidate's vector (the first
    candidate when no score exceeds -1, e.g. when all scores are NaN).

    Parameters:
        candslist: list of candidate lists; element ``[0]`` of each candidate
            is the concept id.  The list is not modified.
        direction: graph direction forwarded to ``conceptrep``.
        ver: key-selection criterion forwarded to ``find_key_concept``.

    Returns:
        ``(res, titles)``: the chosen concept id per mention and the result of
        ``ids2title(res)``.  ``([], [])`` for an empty ``candslist``.
    """
    if not candslist:
        # pd.concat cannot concatenate an empty list.
        return [], []

    cframelist = []
    cveclist_bdrs = []
    for cands in candslist:
        cands_rep = [conceptrep(c[0], direction, get_titles=False) for c in cands]
        # Row range this mention's candidates will occupy in cvec_arr.
        cveclist_bdrs.append((len(cframelist), len(cframelist) + len(cands_rep)))
        cframelist += cands_rep

    cvec_fr = pd.concat(cframelist, join='outer', axis=1)
    cvec_fr = cvec_fr.fillna(0)
    # `.values` replaces DataFrame.as_matrix(), which newer pandas no longer has.
    cvec_arr = cvec_fr.values.T
    # Put an all-zero row at the position of every empty representation so
    # that row i corresponds to cframelist[i] (presumably concat contributes
    # no column for an empty frame - confirm against the pandas in use).
    for i, cframe in enumerate(cframelist):
        if cframe.empty:
            cvec_arr = np.insert(cvec_arr, i, 0, axis=0)

    # Vector of the key mention's best candidate: the reference for everyone.
    # (This call previously went to an undefined name, `to`.)
    max_concept, max_candidate = find_key_concept(candslist, cveclist_bdrs, cvec_arr, ver)
    b, e = cveclist_bdrs[max_concept]
    convec = cvec_arr[b:e][max_candidate]

    res = []
    for cands, (b, e) in zip(candslist, cveclist_bdrs):
        maxd = -1
        index = 0  # fall back to the first candidate when nothing beats -1
        for mi, v in enumerate(cvec_arr[b:e]):
            d = 1 - sp.spatial.distance.cosine(convec, v)
            if d > maxd:
                maxd = d
                index = mi
        res.append(cands[index][0])

    titles = ids2title(res)
    return res, titles



# NOTE(review): neither `Pkey_quad` nor `simmatrix` is defined in any cell
# visible in this notebook (the get_sim_matrix call that would build
# `simmatrix` is commented out, as is the toy `C`), so this line depends on
# kernel state from elsewhere - confirm before running on a fresh kernel.
ids = Pkey_quad(C, 'method', 'direction', simmatrix)
print ids


In [None]:
# %load_ext autoreload
# %autoreload

# %aimport wsd
# import sys
from wsd import *
import time
# Smoke test on one hand-written example.
# S holds the surface forms; each M entry is [index into S, gold Wikipedia title].
S=["Major League Baseball", "New York City", "Major League Baseball", "American League",
            "Boston Red Sox", "Seattle Mariners", "Cleveland Indians", "Milwaukee Brewers", "Baltimore Orioles",
            "Oakland Athletics", "New York Yankees", "Chicago White Sox", "Toronto Blue Jays", "Texas Rangers", 
            "Minnesota Twins", "Detroit Tigers", "Kansas City Royals"]
M=[[0, "Major_League_Baseball"], [1, "New_York_City"], [2, "Major_League_Baseball"], [3, "American_League"],
   [4, "Boston_Red_Sox"], [5, "Seattle_Mariners"], [6, "Cleveland_Indians"], [7, "Milwaukee_Brewers"],
   [8, "Baltimore_Orioles"], [9, "Oakland_Athletics"], [10, "New_York_Yankees"], [11, "Chicago_White_Sox"],
   [12, "Toronto_Blue_Jays"], [13, "Texas_Rangers_(baseball)"], [14, "Minnesota_Twins"], [15, "Detroit_Tigers"],
   [16, "Kansas_City_Royals"]]

# Time candidate generation and disambiguation together.
start = time.time()

# Third positional argument is the per-mention candidate cap (max_t in the
# generate_candidates defined in this notebook; `from wsd import *` may
# provide a different definition - confirm).
C = generate_candidates(S, M, 5)
#print C
#try:
# Window size 5, similarity method 'rvspagerank', direction 2, op_method 'context4_4'.
ids, titles = disambiguate_driver(C, 5, 'rvspagerank', 2, 'context4_4')
#except:
    #print "Error"

elapsed = str(timeformat(int(time.time()-start)));
#print ids
#
print titles

# [number of correct ids, number of predicted ids]
tp = get_tp(M, ids) 
print tp
print elapsed
# prec = get_prec(tp)
# print prec

# Cripples wsd

In [None]:
%load_ext autoreload
%autoreload

%aimport wsd

import sys

from wsd import *



# Experiment configuration.  `dsnames` and `methods` are assigned several
# times below; only the LAST assignment of each is in effect when the
# evaluation loop runs.  The earlier ones are kept as quick toggles.
# `home`, `baseresdir` and the DIR_* constants are not defined in this cell;
# presumably they come from the star imports - confirm.
dsnames = [os.path.join(home,'backup/datasets/ner/kore.json'),
          os.path.join(home,'backup/datasets/ner/aida.json'), 
          os.path.join(home,'backup/datasets/ner/wiki-mentions.5000.json')]

#dsnames = [os.path.join(home,'backup/datasets/ner/wiki-mentions.json'), 
#           os.path.join(home,'backup/datasets/ner/kore.json')]

dsnames = [os.path.join(home,'backup/datasets/ner/kore.json')]

# Each entry is (similarity method, graph direction, op_method).
methods = (('wlm', DIR_IN,'ilp'), ('rvspagerank', DIR_OUT, 'ilp'))

methods = (
           ('rvspagerank', DIR_OUT, 'context3'),
           ('rvspagerank', DIR_OUT, 'context3'),
           ('rvspagerank', DIR_OUT, 'context1'))

# methods = (('wlm', DIR_IN,'ilp'), ('rvspagerank', DIR_OUT, 'ilp'))
#methods = (('wlm', DIR_IN, 'tagme'),)
#methods = (('rvspagerank', DIR_BOTH, 'ilp2'), )
# NOTE(review): 'pkeyq' is not one of the op_method names handled by the
# `disambiguate` defined earlier in this notebook; presumably the version
# imported from `wsd` supports it - confirm.
methods = (('rvspagerank', DIR_BOTH, 'pkeyq'), )

max_t=5          # candidate cap per mention, passed to generate_candidates
max_count=-1     # stop after this many dataset lines; -1 means no limit
ws=5             # window size passed to disambiguate_driver
verbose=True     # print every document's text and mentions
restart = True   # True: delete an existing per-run result file instead of resuming from it

outdir = os.path.join(baseresdir, 'wsd')
if not os.path.exists(outdir):
    os.makedirs(outdir)
    
resname =  os.path.join(outdir, 'reslog.txt')
#clearlog(resname)

detailedresname=  os.path.join(outdir, 'detailedreslog.txt')
#clearlog(detailedresname)


for method, direction, op_method in methods:
    for dsname in dsnames:
        start = time.time()
        
        print "dsname: %s, method: %s, op_method: %s, direction: %s, max_t: %s, ws: %s ..."  % (dsname,
                method, op_method, direction, max_t, ws)
        sys.stdout.flush()
        tmpfilename = os.path.join(outdir, 
                                   '-'.join([method, str(direction), op_method, str(max_t), str(ws), os.path.basename(dsname)]))
        
        overall=[]
        start_count=-1
        if os.path.isfile(tmpfilename):
            if restart:
                os.remove(tmpfilename)
            else:
                with open(tmpfilename,'r') as tmpf:
                    for line in tmpf:
                        js = json.loads(line.strip())
                        start_count = js['no']
                        if js['tp'] is not None:
                            overall.append(js['tp'])
        
        if start_count !=-1:
            print "Continuing from\t", start_count
        count=0
        with open(dsname,'r') as ds, open(tmpfilename,'a') as tmpf:
            for line in ds:
                js = json.loads(line.decode('utf-8').strip());
                S = js["text"]
                M = js["mentions"]
                count +=1
                if count <= start_count:
                    continue
                if verbose:
                    print "%s:\tS=%s\n\tM=%s" % (count, json.dumps(S, ensure_ascii=False),json.dumps(M, ensure_ascii=False))
                    sys.stdout.flush()
                    
                C = generate_candidates(S, M, max_t=max_t, enforce=True)
                try:
                    ids, titles = disambiguate_driver(C, ws, method, direction, op_method)
                except:
                    print "Error"
                    tmpf.write(json.dumps({"no":count, "tp":None})+"\n")
                    continue
                    
                tp = get_tp(M, ids) 
                overall.append(tp)
                tmpf.write(json.dumps({"no":count, "tp":tp})+"\n")
                if (max_count !=-1) and (count >= max_count):
                    break
                    

        elapsed = str(timeformat(int(time.time()-start)));
        print "done"
        detailedres ={"dsname":dsname, "method": method, "op_method": op_method, "driection": direction,
                      "max_t": max_t, "tp":overall, "elapsed": elapsed, "ws": ws}
        
        
        #logres(detailedresname, '%s',  json.dumps(detailedres))
        #print('%s',  json.dumps(detailedres))
        
        micro_prec, macro_prec = get_prec(overall)        
        #logres(resname, '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s', method, op_method, graphtype(direction), max_t , ws, 
               #dsname, micro_prec, macro_prec, elapsed)
        print '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s'% (method, op_method, graphtype(direction), max_t , ws, 
               dsname, micro_prec, macro_prec, elapsed)

print "done"

wlm	tagme	in	20	5	/home/sajadi/backup/datasets/ner/wiki-mentions.json	0.607142857143	0.58427045177	0:05:47

In [None]:
# %load_ext autoreload
# %autoreload

# %aimport wsd
# import sys
from wsd import *
import time
# Same hand-written example as the earlier smoke-test cell.
# S holds the surface forms; each M entry is [index into S, gold Wikipedia title].
S=["Major League Baseball", "New York City", "Major League Baseball", "American League",
            "Boston Red Sox", "Seattle Mariners", "Cleveland Indians", "Milwaukee Brewers", "Baltimore Orioles",
            "Oakland Athletics", "New York Yankees", "Chicago White Sox", "Toronto Blue Jays", "Texas Rangers", 
            "Minnesota Twins", "Detroit Tigers", "Kansas City Royals"]
M=[[0, "Major_League_Baseball"], [1, "New_York_City"], [2, "Major_League_Baseball"], [3, "American_League"],
   [4, "Boston_Red_Sox"], [5, "Seattle_Mariners"], [6, "Cleveland_Indians"], [7, "Milwaukee_Brewers"],
   [8, "Baltimore_Orioles"], [9, "Oakland_Athletics"], [10, "New_York_Yankees"], [11, "Chicago_White_Sox"],
   [12, "Toronto_Blue_Jays"], [13, "Texas_Rangers_(baseball)"], [14, "Minnesota_Twins"], [15, "Detroit_Tigers"],
   [16, "Kansas_City_Royals"]]

start = time.time()

# NOTE(review): `candslist`, `direction`, `method` and `ver` are not assigned
# in this cell, and the call passes different arguments and unpacks more
# values than the find_key_concept defined in this notebook (which takes
# candslist, cveclist_bdrs, cvec_arr, ver and returns two values) -
# presumably it targets the version imported from `wsd`; confirm.
# NOTE(review): `C` is used below but is not rebuilt from this cell's S/M
# (no generate_candidates call here), so it comes from earlier kernel state.
cvec_arr, cveclist_bdrs, key_concept, key_entity, key_entity_vector = find_key_concept(candslist, direction, method, ver)
#print C
#try:
ids, titles = disambiguate_driver(C, 5, 'rvspagerank', 2, 'context4_4')
#except:
    #print "Error"

elapsed = str(timeformat(int(time.time()-start)));
#print ids
#
print titles

# [number of correct ids, number of predicted ids]
tp = get_tp(M, ids) 
print tp
print elapsed
# prec = get_prec(tp)
# print prec

In [None]:
# Example record with the same "text" / "mentions" keys that the evaluation
# loop reads from each dataset line; each mention is [index into "text", gold title].
{"text": ["Tiger", "lost", "the", "US Open", "."], "mentions": [[0, "Tiger_Woods"], [3, "U.S._Open_(golf)"]]}