# Preparing wsd datasets
## A bunch of scripts to preprocess and unify the structure of several datasets we use in the Wikifier evaluations

Each annotated sentence is converted to the following format:

`S = [w1,w2,...,wn]`

`M = [[m1,e1], [m2,e2],...[mk, ek]]`

where `[wi]`s are the tokens and `[mi,ei]` means `S[mi]` is linked to entity `ei` 

Example: 
A sentence such as: `David and Victoria named their children Brooklyn, Romeo, Cruz, and Harper Seven.`

is converted to 

`S=["David", "and", "Victoria", "named", "their", "children", "Brooklyn", ",", "Romeo", ",", "Cruz", ",", "and", "Harper Seven", "."]
M=[[0, "David_Beckham"], [2, "Victoria_Beckham"]]`



In [1]:
S=["David", "and", "Victoria", "named", "their", "children", "Brooklyn", ",", "Romeo", ",", "Cruz", ",", "and", "Harper Seven", "."]
" ".join(S)

'David and Victoria named their children Brooklyn , Romeo , Cruz , and Harper Seven .'

# Preparing wiki-mentions

In [None]:
%load_ext autoreload
%autoreload


import re
import json
import sys
import urllib
import sys
import requests
import MySQLdb

from HTMLParser import HTMLParser
sys.path.insert(0,'../..')
from wikisim.wikipedia import *

def qualify(r, min_mention=2, min_abmig=2):
    """Decide whether a Solr document is usable for the wiki-mentions dataset.

    Parameters:
        r: dict for one document; its 'opening_annotation' value is a JSON
           string encoding a list of annotations, each with 'surface_form'
           and 'url' keys.
        min_mention: minimum number of annotations the document must have.
        min_abmig: minimum number of candidates a surface form must have for
           the document to count as containing an ambiguous mention (the
           misspelled name is kept so keyword callers keep working).

    Returns:
        None when the document is rejected; otherwise a bool that is True
        when at least one annotation links to something other than the
        highest-scored candidate of its surface form.

    NOTE(review): title2id and anchor2concept come from the wikisim star
    import; anchor2concept is assumed to return (id, score) pairs -- confirm.
    """
    raw = r.get('opening_annotation')
    if not raw:
        # Documents without the field cannot qualify.
        return None
    annotations = json.loads(raw)

    # condition 1: enough annotated mentions. The count has to be taken on the
    # parsed list; the raw JSON string is virtually always longer than
    # min_mention characters, so measuring it never rejects anything.
    if len(annotations) < min_mention:
        return None

    nottop = False
    found_min_ambig = False
    for ann in annotations:
        surf = ann['surface_form']
        linktitle = ann['url']

        # condition 2: the link target must resolve to a known id
        wid = title2id(linktitle.encode('utf-8').replace(' ', '_'))
        if wid is None:
            return None

        # condition 3: the surface form must have at least one candidate
        allids = anchor2concept(surf)
        if not allids:
            return None

        # condition 4: at least one mention in the document must be ambiguous
        if len(allids) >= min_abmig:
            found_min_ambig = True

        # condition 5: remember whether any link is not the top candidate
        # (max returns the first maximal item, like a stable descending sort)
        if max(allids, key=lambda k: k[1])[0] != wid:
            nottop = True

    if not found_min_ambig:
        return None
    return nottop
    
    
    
    

In [None]:
wanted=36000
wanted_nottop=10000

reopen()
qstr = 'http://localhost:8983/solr/enwiki20160305/select'
q='*:*'
rows=1000
params={'indent':'on', 'wt':'json', 'fl':'title opening_text opening_annotation', 'q':q, "start":0,
        "rows":rows}
home = os.path.expanduser("~");
output = open(os.path.join(home,'backup/datasets/ner/wiki.%s.json'%(wanted, )), 'w')

start=0
count=0
count_nottop=0
enough=False
while True:
    params["start"] = start
    start += rows
    r = requests.get(qstr, params=params)
    #print r.json()['response']['docs']
    for d in r.json()['response']['docs']:
        #print d["id"]; enough=True; break
        if not (count < wanted or count_nottop < wanted_nottop):
            enough = True
            break
        qd = qualify(d) 
        if qd is None:
            continue
            
        if count_nottop < wanted_nottop and qd :
            output.write(json.dumps(d, ensure_ascii=False).encode('utf-8')+'\n')
            count_nottop += 1
            count += 1
            if ((count) % int(wanted/10) ==0) or (count_nottop) % int(wanted_nottop/10) ==0:
                print "count: ", count, ", count_nottop: ", count_nottop
            continue
        if count < wanted:
            count +=1
            output.write(json.dumps(d, ensure_ascii=False).encode('utf-8')+'\n')
            if (count) % int(wanted/10) ==0:   
                print "count: ", count
            
    #break
    if enough:
        break
print 'done'        
output.close()    
#r=json.loads(r['opening_annotation'])


## Converting json to input format

In [None]:
import sys, os, json
import string
# Locations for the conversion step: the harvested JSON-lines dump is read
# from jsname and the tokenised sentences are written to outjsname.
wanted = 36000
home = os.path.expanduser("~")
jsname = os.path.join(home, 'backup/datasets/ner/wiki.%s.json' % (wanted,))
outjsname = os.path.join(home, 'backup/datasets/ner/wiki-mentions.%s.json' % (wanted,))


    
def splittext(text,url):
    """Tokenise text so that every annotated span becomes exactly one token.

    text is the raw string; url is a sequence of dicts carrying 'from' and
    'to' character offsets plus a 'url' link target. The spans are consumed
    in the order given, so they are expected to be ascending and
    non-overlapping.

    Returns (tokens, mentions): text between spans is split on whitespace,
    each span is kept as a single token with its inner whitespace collapsed
    to single spaces, and mentions holds one [token_index, url] pair per span.
    """
    tokens = []
    mentions = []
    cursor = 0
    for span in url:
        # plain text preceding the mention
        tokens.extend(text[cursor:span['from']].split())
        # the mention lands at the next free token position
        mentions.append([len(tokens), span['url']])
        tokens.append(" ".join(text[span['from']:span['to']].split()))
        cursor = span['to']

    # whatever follows the last mention
    tokens.extend(text[cursor:].split())
    return tokens, mentions

with open(jsname) as jf, open(outjsname,'w') as outjs:
    for line in jf:
        line = line.strip().decode('utf-8')
        js = json.loads(line)
        text = js['opening_text'] 
        url = json.loads(js['opening_annotation'])
        t, m = splittext(text, url)
        outjs.write((json.dumps({"text":t, "mentions":m}, ensure_ascii=False)+'\n').encode('utf-8'))
print "done"

## Preparing KORE

In [None]:
%load_ext autoreload
%autoreload

import sys, os, json
import string
from HTMLParser import HTMLParser
hp = HTMLParser()
home = os.path.expanduser("~");

sys.path.append('../../cgi-bin/')
%aimport wikipedia
from wikipedia import *
reopen()


infname = os.path.join(home,'backup/datasets/ner/KORE50/AIDA.tsv')
outjsname = os.path.join(home,'backup/datasets/ner/kore.json')
url_col = 3

t=[]
m=[]
with open(infname) as inf, open(outjsname,'w') as outjs:
    state = 0
    for line in inf:
        line = line.strip()
        if not line:
            continue
        try:
            line = line.decode('unicode-escape')
        except:
            pass            
        if line.startswith('-DOCSTART-'):
            if m and t:
                outjs.write (json.dumps({"text":t, "mentions":m}, ensure_ascii=False).encode('utf-8')+'\n')
            t = []
            m = []
            continue
                
        w = line.split('\t')
        if len(w) == 1:
            t.append(w[0])
            continue
            
        if w[1] == 'B':
            if title2id(w[3]) is None:
                print w[3], " not found"
            if w[url_col] != '--NME--' and (title2id(w[3]) is not None):
                m.append((len(t), w[3]))
            t.append(w[2])
            
        if w[1] == 'I':
            continue
    if m and t:
        outjs.write (json.dumps({"text":t, "mentions":m}, ensure_ascii=False).encode('utf-8')+'\n')
        
            



In [None]:
x="Rudi_V\u00f6ller"
print x.decode('unicode-escape')

# Preparing Aida

In [None]:
%load_ext autoreload
%autoreload


import sys, os, json
import string
import re

from HTMLParser import HTMLParser
hp = HTMLParser()
home = os.path.expanduser("~");

sys.path.append('../../cgi-bin/')
%aimport wikipedia
from wikipedia import *
reopen()

pattern = re.compile("http://en.wikipedia.org/wiki/(.*)")


infname = os.path.join(home,'backup/datasets/ner/Aida01/aida-yago2-dataset/AIDA-YAGO2-annotations.tsv')
#infname = os.path.join(home,'backup/datasets/ner/aida-shalam.tsv')
outjsname = os.path.join(home,'backup/datasets/ner/aida.json')
#outjsname = os.path.join(home,'backup/datasets/ner/aida-shalam.json')
# set it to 3 for AIDA, 
url_col = 4
pattern = re.compile("http://en.wikipedia.org/wiki/(.*)")

t=[]
m=[]
with open(infname) as inf, open(outjsname,'w') as outjs:
    state = 0
    for line in inf:
        line = line.strip().decode('utf-8')
        #line = line.strip()
        if not line:
            continue
        if line.startswith('-DOCSTART-'):
            if m and t:
                outjs.write (json.dumps({"text":t, "mentions":m}, ensure_ascii=False).encode('utf-8')+'\n')
                #print (json.dumps({"text":t, "mentions":m}, ensure_ascii=False).encode('utf-8')+'\n')
            t = []
            m = []
            continue
                
        w = line.split('\t')
            
        if w[1] == '--NME--':
            continue
        title = w[2]
        r = re.match(pattern, title)
        title = r.group(1)
        rwd = title2id(title)
            
        rwd = title2id(title)
            
        anchors = id2anchor(rwd)
        if not anchors:
            print "no anchor for: ", title
            continue
                            
        c=max(anchors, key=lambda k:k[1])[0]    
        m.append((len(t), title))
        t.append(c.decode('utf-8'))
            
    if m and t:
        outjs.write (json.dumps({"text":t, "mentions":m}, ensure_ascii=False).encode('utf-8')+'\n')
        #print (json.dumps({"text":t, "mentions":m}, ensure_ascii=False).encode('utf-8')+'\n')
                
print "done"        
        
            



# Preparing MSNBC/AQUAINT (Ignoring the text (for WSD-only evaluation))

In [None]:
import xml.etree.ElementTree as ET
from os import listdir
from os.path import isfile, join, expanduser
import json
import sys
sys.path.append('../../wikisim/')
from wikipedia import *
home = expanduser("~");
mypath = join(home, 'backup/datasets/WikificationACL2011Data/AQUAINT/Problems/')
outjsname = join(home,'backup/datasets/ner/aquaint.json')
outjs=open(outjsname,'w')
dsnames = [f for f in listdir(mypath) if isfile(join(mypath, f))]
for ds in dsnames:
    
    with open(join(mypath, ds)) as f:
        filestr=f.read()
    #print filestr
    filestr=filestr.decode(encoding='cp1252').replace('&','&amp;').encode(encoding='utf-8')
    root=ET.fromstring(filestr)
    
    
    t=[]
    m=[]
    i=0
    for child in root.findall('ReferenceInstance'):
        anchor=child.find('SurfaceForm').text.strip()
        url = child.find('ChosenAnnotation').text.strip()
        if url == '*null*':
            continue
        if url[:29] !='http://en.wikipedia.org/wiki/':
            print url[:29]
            raise Exception('bad format')
        url=url[29:]
        if title2id(url) is None:
            print ds,':',url, " not found"
            continue
        
        t.append(anchor)
        m.append([i,url])
        i +=1
        
    outjs.write (json.dumps({"text":t, "mentions":m}, ensure_ascii=False).encode('utf-8')+'\n')
outjs.close()
print "done"

# Preparing MSNBC/AQUAINT with Text


In [12]:
import xml.etree.ElementTree as ET
from os import listdir
from os.path import isfile, join, expanduser
import json
import re
import sys
sys.path.append('../../wikisim/')
from wikipedia import *
# Dataset selection and derived locations for the with-text conversion.
home = expanduser("~")

# Switch the dataset by changing ds_dir.
#ds_dir = 'AQUAINT'
ds_dir = 'MSNBC'

# Source layout: <p_path>/RawTexts holds the documents and <p_path>/Problems
# holds the annotation files.
p_path = join('../../datasets/ner/source/WikificationACL2011Data', ds_dir )
text_path = join(p_path,'RawTexts' )
problem_path = join(p_path,'Problems' )

# Output: one JSON object per document, named after the lower-cased dataset.
out_jsname = join('../../datasets/ner/', ds_dir.lower()+'.txt.json')
outjs = open(out_jsname,'w')
def replace_bad_chars(s):
    """Return s with a fixed set of single characters swapped for ASCII.

    Every replacement is one character for one character, so character
    offsets into the string stay valid.
    NOTE(review): the codes look like cp1252 punctuation (ellipsis, curly
    quotes, dashes) plus 0xfc -- confirm against the raw data.
    """
    substitutions = (
        ("\x85", '.'),
        ("\x91", "'"),
        ("\x92", "'"),
        ("\x93", '"'),
        ("\x94", '"'),
        ("\x96", '-'),
        ("\x97", '-'),
        ("\xfc", 'x'),
    )
    for bad, plain in substitutions:
        s = s.replace(bad, plain)
    return s

# Annotation files to process: regular files in problem_path whose name ends
# in .htm or .txt.
dsnames = [
    f for f in listdir(problem_path)
    if isfile(join(problem_path, f))
    and any(re.match(ending, f) for ending in (r'.*\.htm$', r'.*\.txt$'))
]
def splittext(text,url):
    """Tokenise text so that every annotated span becomes exactly one token.

    text is the raw string; url is a sequence of dicts carrying 'from' and
    'to' character offsets plus a 'url' link target. The spans are consumed
    in the order given, so they are expected to be ascending and
    non-overlapping.

    Returns (tokens, mentions): text between spans is split on whitespace,
    each span is kept as a single token with its inner whitespace collapsed
    to single spaces, and mentions holds one [token_index, url] pair per span.
    """
    tokens = []
    mentions = []
    cursor = 0
    for span in url:
        # plain text preceding the mention
        tokens.extend(text[cursor:span['from']].split())
        # the mention lands at the next free token position
        mentions.append([len(tokens), span['url']])
        tokens.append(" ".join(text[span['from']:span['to']].split()))
        cursor = span['to']

    # whatever follows the last mention
    tokens.extend(text[cursor:].split())
    return tokens, mentions

def get_annotation(filename):
    with open(join(problem_path, filename)) as f:
        annotstr=f.read()
    #print filestr
    #annotstr=annotstr.decode(encoding='cp1252').replace('&','&amp;').encode(encoding='utf-8')
    
    annotstr=replace_bad_chars(annotstr).replace('&','&amp;').strip()
    root=ET.fromstring(annotstr)
    
    
    t=[]
    for child in root.findall('ReferenceInstance'):
        anchor=child.find('SurfaceForm').text.strip()
        url = child.find('ChosenAnnotation').text.strip()
        if url == '*null*':
            continue
        if url[:29] !='http://en.wikipedia.org/wiki/':
            print url[:29]
            raise Exception('bad format')
        url=url[29:]
        if title2id(url) is None:
            #print ds,':',url, " not found"
            continue
        ufrom = int(child.find('Offset').text.strip()) 
        uto = ufrom + int(child.find('Length').text.strip())
        t.append({'from':ufrom, 'to':uto, 
                'url':url, 'anchor': anchor})
    return t

for ds in dsnames:
    print ds
    url = get_annotation(ds)
    with open(join(text_path, ds)) as f:
        text=f.read()
    text=replace_bad_chars(text).decode('utf-8')
    
    #break
    t, m = splittext(text, url)
    outjs.write((json.dumps({"text":t, "mentions":m}, ensure_ascii=False)+'\n').encode('utf-8'))
    #print filestr

        
outjs.close()
print "done"

Bus3683270.txt
TvN16442342.txt
Tec16451635.txt
Spo16417540.txt
Tec16454435.txt
Ent16453733.txt
Tra16444229.txt
Tra16454203.txt
Pol16447720.txt
Wor16447201.txt
Hea16384904.txt
Wor13259309.txt
Bus16451112.txt
Spo16455207.txt
Hea16451212.txt
Ent16444023.txt
USN16443053.txt
USN16444287.txt
TvN16442287.txt
Pol16452612.txt
done


# Testing datasets

In [None]:
count = 0
with open ('/home/sajadi/backup/datasets/ner/MSNBC.txt.json') as f:
    for line in f.readlines():
        line = line.decode('utf-8')
        jline = json.loads(line)
        for ann in jline['mentions']:
            print jline['text'][ann[0]],'-->', ann[1]
        count +=1
        if count >= 10:
            break        