# Creating train/test sets for sense embedding

# Materialization of the datasets

In [13]:
%%writefile materialize.py 

import os
from collections import defaultdict
import sys
import time
import requests
import random
import json
import math
from multiprocessing import Pool, Process, Manager 
import functools
import thread
from requests.packages.urllib3 import Retry
sys.path.insert(0,'../..')
home = os.path.expanduser("~");
from wikisim.wikipedia import *

# Select endpoint of the Solr core queried by get_context().
qstr = 'http://localhost:8983/solr/enwiki20160305_context/select'
# Only referenced by the commented-out Pool variant further down.
process_no=25
# Fraction of each entity's contexts that goes to the training split.
tr_percent=0.8

# When True, mater_anchor() keeps at most example_per_anchor contexts per
# (anchor, entity) pair and stops once written_sofar reaches max_anchor.
down_sample = True
max_anchor = 100
# -1 selects the tr_percent cut-point split; any other value routes contexts
# whose 'paragraph_no' contains it to the test split instead.
skip_line=-1

# Fixed seed so the shuffling in mater_anchor() is repeatable.
random.seed(3)
# Running counter updated by mater_anchor() while down-sampling.
written_sofar=0
example_per_anchor=10

# One shared HTTP session; requests to the Solr prefix are retried up to
# 20 times with a backoff factor of 0.1.
session = requests.Session()
http_retries = Retry(total=20,
                backoff_factor=.1)
http = requests.adapters.HTTPAdapter(max_retries=http_retries)
session.mount('http://localhost:8983/solr', http)

def solr_escape(s):
    """Backslash-escape quotes and backslashes in *s*.

    Every single quote, double quote and backslash is prefixed with one
    backslash; all other characters are returned unchanged.
    """
    quote_or_backslash = re.compile(r'''['"\\]''')
    escaped = quote_or_backslash.sub(r'\\\g<0>', s)
    return escaped

def get_context(anchor, eid):
    """Fetch the Solr context documents for one (anchor, entity) pair.

    Sends a select query to ``qstr`` through the shared ``session`` for
    documents whose ``anchor`` field matches *anchor* as a quoted phrase and
    whose ``entityid`` field equals *eid*, requesting up to 50000 rows as JSON.

    Returns the list stored under ``response -> docs`` in the reply.

    If the reply carries no ``response`` key, the reply is printed and the
    whole process is terminated on the spot with ``os._exit``.
    """
    params = {
        'wt': 'json',
        'rows': '50000',
        # The anchor is escaped because it is embedded in a quoted phrase.
        'q': 'anchor:"%s" AND entityid:%s' % (solr_escape(anchor), eid),
    }

    r = session.get(qstr, params=params).json()
    if 'response' not in r:
        # Interpolate the reply into the message; passing it as a second
        # print argument would print a tuple with a literal "%s" in it.
        print("[terminating]\t%s" % (str(r),))
        sys.stdout.flush()
        # NOTE(review): exit status 0 is kept as-is even though this is a
        # failure path -- confirm whether callers rely on it.
        os._exit(0)

    # An empty reply cannot reach this point (it has no 'response' key and is
    # handled above), so no separate emptiness check is needed.
    return r['response']['docs']

def loadanchors(min_count=5):
    """Group the rows of the 'anchors' table by their first column.

    Rows whose third column is below *min_count* are skipped. Every kept row
    contributes the pair ``(row[1], row[2])`` to the list stored under
    ``row[0]``.

    Returns the ``items()`` of the resulting mapping.

    NOTE(review): rows appear to be (anchor text, entity id, frequency),
    judging from how mater_anchor() consumes them -- confirm against
    load_table().
    """
    grouped = defaultdict(list)
    for row in load_table('anchors'):
        frequency = row[2]
        if frequency >= min_count:
            grouped[row[0]].append((row[1], frequency))
    return grouped.items()


def mater_anchor(anchor_entry, trq, tsq, lgq):
    """Materialize train/test examples for every candidate of one anchor.

    anchor_entry -- an ``(anchor, candidates)`` pair as produced by
                    ``loadanchors()``; *candidates* is a list of
                    ``(entity id, frequency)`` tuples.
    trq, tsq     -- queues receiving the serialized train / test examples.
    lgq          -- queue receiving one tab-separated log line per outcome
                    (tag, optional reason, JSON payload).

    For each candidate the contexts returned by ``get_context`` are shuffled,
    optionally truncated to ``example_per_anchor`` when ``down_sample`` is
    set, split into train and test, and handed to ``mater_sample`` together
    with the remaining candidates as negatives.

    While down-sampling, the global ``written_sofar`` is incremented once per
    candidate that produced contexts; the ``max_anchor`` limit is only checked
    on entry, so an anchor that has been started is always finished.
    """
    global written_sofar
    # Unpacked here rather than in the signature: tuple parameters are not
    # valid syntax in Python 3, and positional callers are unaffected.
    a, l = anchor_entry

    if down_sample and written_sofar >= max_anchor:
        return
    if (not a) or len(l) < 2:
        # NOTE(review): the "length" field carries the candidate list itself,
        # not its length -- confirm what log consumers expect.
        # The payload is plain JSON; a stray trailing "]" used to be appended
        # to it, which made the third log column unparsable.
        lgq.put("[Error]\tanchor_empty_or_not_ambig\t%s" % json.dumps({"anchor": a, "length": l}))
        return

    for i, (wid, f) in enumerate(l):
        # Every other candidate of this anchor serves as a negative.
        neg = l[:i] + l[i+1:]
        contexts = get_context(a, wid)

        random.shuffle(contexts)
        if down_sample:
            contexts = contexts[:example_per_anchor]
        n = len(contexts)

        if not contexts:
            lgq.put("[Error]\tcontext_empty\t%s" % json.dumps({"wid": wid, "frq": f}))
            continue

        if skip_line == -1:
            # Proportional split: the first tr_percent of the shuffled
            # contexts (rounded up) train, the rest test.
            cutpoint = int(math.ceil(tr_percent*n))
            train = contexts[:cutpoint]
            test = contexts[cutpoint:]
        else:
            # Contexts whose 'paragraph_no' contains skip_line are held out.
            train = [c for c in contexts if skip_line not in c['paragraph_no']]
            test = [c for c in contexts if skip_line in c['paragraph_no']]

        lgq.put("[success]\t%s" % json.dumps({"anchor": a, "wid": wid, "freq": f, "context_length": n,
                                             "train_size": len(train), "test_size": len(test)}))

        mater_sample(train, neg, trq)
        mater_sample(test, neg, tsq)
        if down_sample:
            written_sofar += 1
def mater_sample(context, neg, q):
    """Serialize each context document with its negatives and put it on *q*.

    The 'id' and '_version_' keys are removed from every document in place.
    Each queued item is a UTF-8 encoded JSON object with the keys "context"
    (the stripped document), "neg" (*neg* as given) and "freq".

    NOTE(review): "freq" is the number of keys left in the document, not a
    frequency taken from the anchor table -- confirm this is intended.
    """
    for doc in context:
        for solr_field in ('id', '_version_'):
            doc.pop(solr_field, None)
        record = {"context": doc, "neg": neg, "freq": len(doc)}
        serialized = json.dumps(record, ensure_ascii=False)
        q.put(serialized.encode('utf-8'))
        
def worker(fname, q):
    """Drain *q* into the file *fname*, one item per line.

    Items are taken from the queue and written followed by a newline until
    the sentinel string 'kill' is received; the sentinel itself is not
    written. Progress markers are printed when the writer starts and stops.

    fname -- path of the output file (truncated on open).
    q     -- queue providing the strings to write.
    """
    # The context manager closes the file even if q.get() or write() raises.
    with open(fname, 'w') as w:
        print("[Writer started]")
        sys.stdout.flush()
        # iter(callable, sentinel) keeps calling q.get() until it returns 'kill'.
        for s in iter(q.get, 'kill'):
            w.write(s + "\n")
        print("[Writer worker closing]")
        sys.stdout.flush()

    
# ---- Driver: load anchors, start the writer processes, materialize. ----

startTime = time.time()
anchors = loadanchors()
print('[anchors loaded to memory]')
print(time.time()-startTime)
sys.stdout.flush()

startTime = time.time()

manager = Manager()

# Output files are tagged with the sampling settings that produced them.
extension = '%s.%s.json' % (down_sample, skip_line)
if down_sample:
    extension = "%s.%s" % (max_anchor, extension)

train_name = os.path.join(home, 'backup/datasets/cmod/train.%s' % (extension))
test_name = os.path.join(home, 'backup/datasets/cmod/test.%s' % (extension))
log_name = os.path.join(home, 'backup/datasets/cmod/log.%s' % (extension))

train_q = manager.Queue()
test_q = manager.Queue()
log_q = manager.Queue()

# One dedicated writer process per output file, each fed by its own queue.
train_proc = Process(target=worker, args=(train_name, train_q))
train_proc.start()

test_proc = Process(target=worker, args=(test_name, test_q))
test_proc.start()

log_proc = Process(target=worker, args=(log_name, log_q))
log_proc.start()


#pool = Pool(process_no) 
#pool.map(functools.partial(mater_anchor, trq=train_q, tsq=test_q ), anchors)
try:
    # Anchors are processed sequentially in this process.
    for anchor_entry in anchors:
        mater_anchor(anchor_entry, trq=train_q, tsq=test_q, lgq=log_q)
finally:
    # Always release the writers: without the sentinels they would block on
    # q.get() forever and keep the script from exiting after a failure.
    train_q.put('kill')
    test_q.put('kill')
    log_q.put('kill')

    train_proc.join()
    test_proc.join()
    log_proc.join()

print('Done')
print(time.time()-startTime)
sys.stdout.flush()


Overwriting materialize.py


# Integizing (mapping tokens to integer ids)

In [14]:
%%writefile utils.py
import collections
import json

def build_vocab(words, min_count=5):
    """Count *words* and assign consecutive integer ids to the frequent ones.

    Returns ``(count, vocab)``:

    count -- a list that starts with the placeholder ``['UNK', -1]`` followed
             by one ``(word, occurrences)`` tuple per distinct word.
    vocab -- dict mapping every entry of *count* whose count is at least
             *min_count* to its id; ids are handed out in *count* order
             starting at 0. The 'UNK' placeholder has count -1, so it only
             receives an id when *min_count* is -1 or lower.
    """
    count = [['UNK', -1]]
    count += list(collections.Counter(words).items())

    vocab = {}
    for token, occurrences in count:
        if occurrences >= min_count:
            vocab[token] = len(vocab)
    return count, vocab

def getwords(*filenames):
    """Collect every vocabulary token found in the given example files.

    Each file holds one JSON example per line. For every example the result
    receives, in this order: the stringified first element of each "neg"
    entry, the whitespace-separated tokens of the context's "left" and
    "right" texts (when present), and the context's "entityid".

    Returns a single flat list covering all files, in reading order.
    """
    tokens = []
    for path in filenames:
        with open(path) as handle:
            for raw in handle:
                example = json.loads(raw.decode('utf-8').strip())
                tokens.extend([str(candidate[0]) for candidate in example["neg"]])
                for side in ("left", "right"):
                    if side in example["context"]:
                        tokens.extend(example["context"][side].split())
                tokens.append(example["context"]["entityid"])
    return tokens
        
def integize(infile_name, outfile_name, vocab):
    """Rewrite a file of JSON examples with tokens replaced by vocabulary ids.

    infile_name  -- input path, one JSON example per line.
    outfile_name -- output path (overwritten), one JSON example per line.
    vocab        -- mapping from token to integer id.

    For every input example:
      * "neg" entries whose stringified first element is not in *vocab* are
        dropped; the others become ``[id, second element]``;
      * the example is skipped entirely when no "neg" entry survives or when
        the context's "entityid" is not in *vocab*;
      * "left" / "right" context texts are split on whitespace and mapped to
        ids, silently dropping out-of-vocabulary tokens. A missing side
        yields an empty list.

    Each output object has the keys "neg" and "context", the latter holding
    "left", "entityid", "right" and the example's "freq".
    """
    # Binary mode makes the explicit UTF-8 decode/encode below the only text
    # conversion that takes place.
    with open(infile_name, 'rb') as infile, open(outfile_name, 'wb') as outfile:
        for line in infile:
            ex = json.loads(line.decode('utf-8').strip())
            context = ex["context"]

            neg = [[vocab[str(n[0])], n[1]] for n in ex["neg"] if str(n[0]) in vocab]
            if not neg or context["entityid"] not in vocab:
                continue

            # Default to an empty text so that a missing side neither raises
            # NameError nor reuses the previous example's tokens.
            left = [vocab[w] for w in context.get("left", "").split() if w in vocab]
            right = [vocab[w] for w in context.get("right", "").split() if w in vocab]

            ex_id = {"neg": neg,
                     "context": {"left": left, "entityid": vocab[context["entityid"]],
                                 "right": right, "freq": ex["freq"]},
                     }
            outfile.write(json.dumps(ex_id, ensure_ascii=False).encode('utf-8') + b'\n')
        

Overwriting utils.py


In [19]:
#%%writefile integize.py
import os
from utils import *
# Build the vocabulary from the word-level train/test files, store it, then
# write integer-id versions of both files.
home = os.path.expanduser("~")
filepattern = '10000.True.0'

# "*_w" names are the word-level inputs; the plain names are the id outputs.
train_name_w = os.path.join(home, 'backup/datasets/cmod/train.%s.json' % (filepattern,))
train_name = os.path.join(home, 'backup/datasets/cmod/train.id.%s.json' % (filepattern,))
test_name_w = os.path.join(home, 'backup/datasets/cmod/test.%s.json' % (filepattern,))
test_name = os.path.join(home, 'backup/datasets/cmod/test.id.%s.json' % (filepattern,))
vocab_name = os.path.join(home, 'backup/datasets/cmod/vocab.%s.tsv' % (filepattern,))

words = getwords(train_name_w, test_name_w)
count, vocab = build_vocab(words, min_count=5)

# The vocabulary file holds three JSON lines: sizes, raw counts, token->id map.
sizes_line = json.dumps({"orig_size": len(count), "size": len(vocab)})
with open(vocab_name, 'w') as out:
    out.write(sizes_line + '\n')
    for table in (count, vocab):
        out.write(json.dumps(table, ensure_ascii=False).encode('utf-8') + '\n')


for word_file, id_file in ((train_name_w, train_name), (test_name_w, test_name)):
    integize(word_file, id_file, vocab)
print("done")

done


# Prepare for word2vec
## modifying word2vec/word2vec.ipynb

In [None]:
%%writefile replace_surface.py
import re
import sys
import urllib
import sys
from HTMLParser import HTMLParser
import time

sys.path.insert(0,'..')
from memapi import memwiki as wiki


# Input stream consumed by process(); the text to convert is read from stdin.
fileinput = sys.stdin

def title2id(title):
    """Resolve a page title to its id, returned as a string.

    The title is looked up through ``wiki.title2id`` as given and, if that
    yields None, once more with its first character upper-cased.

    Returns "NA1" for an empty title and "NA2" when neither lookup succeeds.
    """
    if not title:
        return "NA1"

    for candidate in (title, title[0].upper() + title[1:]):
        wid = wiki.title2id(candidate)
        if wid is not None:
            return str(wid)
    return "NA2"
    

def url2id(antext, url):
    """Turn a link target into an ``id_<page id>`` token.

    antext -- the link's anchor text; accepted but not used.
    url    -- the href value (unicode), possibly percent-encoded and
              HTML-escaped.

    The target is percent-decoded, HTML-unescaped twice, has non-breaking
    spaces replaced by plain spaces and is cut at the first '#'. The result
    is resolved with ``title2id`` and prefixed with "id_".
    """
    parser = HTMLParser()

    # Percent-decoding works on the UTF-8 bytes; the result goes back to unicode.
    target = urllib.unquote(url.encode('utf-8')).decode('utf-8')

    # Two unescape passes, so doubly escaped entities are resolved as well.
    for _ in range(2):
        target = parser.unescape(target)

    # Drop the fragment: keep only what precedes the first '#'.
    page = target.replace(u"\xA0", " ").partition("#")[0]
    return "id_" + title2id(page)
    
    
def replacelinks(text):
    """Replace every ``<a href="...">...</a>`` link in *text* by its id token.

    Each match is substituted with the value ``url2id`` returns for the
    link's anchor text and href; text outside links is left unchanged.
    """
    hrefreg = r'<a href="([^"]+)">([^>]+)</a>'

    def _to_id(link):
        target, anchor_text = link.group(1), link.group(2)
        return url2id(anchor_text, target)

    return re.sub(hrefreg, _to_id, text)


def process():
    """Stream the documents on ``fileinput`` to stdout with links replaced.

    Input lines are decoded as UTF-8 and stripped; blank lines are ignored.
    For each document:
      * the opening ``<doc id=... url=... title=...>`` line is dropped;
      * the first non-blank line after it is dropped as well;
      * the closing ``</doc>`` line is replaced by a blank separator;
      * every other line is printed, UTF-8 encoded, after ``replacelinks``.
    """
    rstart = r'<doc id="(.*)" url="(.*)" title="(.*)">'
    rend = r'</doc>'

    # -1: no document seen yet, 0: header just read, 1: inside the body.
    line_no = -1
    # Iterate the stream lazily; readlines() would hold the entire input in
    # memory before the first line is processed.
    for line in fileinput:
        line = line.decode('utf-8').strip()
        if not line:
            continue

        if re.match(rstart, line) is not None:
            # The header's id and title are not emitted.
            line_no = 0
            continue
        if line_no == 0:
            # Skip the line that directly follows the header.
            line_no = 1
            continue
        if re.match(rend, line):
            print("\n")
            continue

        print(replacelinks(line).encode('utf-8'))
    
# Script entry point: load the wiki tables, then convert stdin to stdout.
if __name__ == "__main__": 
    #startTime = time.time()
    # Must run before process(): title2id() resolves titles through `wiki`.
    wiki.load_tables()
    #print 'wiki loaded to memory'    
    #print time.time()-startTime
    #sys.stdout.flush()
    
    process()

In [3]:
# Notebook copy of the materialize.py settings.
# NOTE(review): relies on `random`, `requests` and `Retry` already being
# imported in the running kernel -- this cell does not import them.

# Select endpoint of the Solr context core.
qstr = 'http://localhost:8983/solr/enwiki20160305_context/select'
process_no=25
# Fraction of the contexts assigned to the training split.
tr_percent=0.8

# Down-sampling switch and its limits.
down_sample = True
max_anchor = 100
# -1 means "split by tr_percent" in mater_anchor().
skip_line=-1

# Fixed seed for repeatable shuffling.
random.seed(3)
written_sofar=0
example_per_anchor=10

# Shared HTTP session; requests to the Solr prefix are retried up to 20 times.
session = requests.Session()
http_retries = Retry(total=20,
                backoff_factor=.1)
http = requests.adapters.HTTPAdapter(max_retries=http_retries)
session.mount('http://localhost:8983/solr', http)

3