In [None]:
# preparation of dataset for company similarity

In [3]:
import os
from collections import defaultdict, Counter
from text_utils import remove_reg, tokenise, get_provenance
import json
import numpy as np
from tqdm.notebook import tqdm
from wiki_labels import qid_lab_get
from wiki_location import q2cc
from marisa_trie import Trie
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

qid_lab_get(37156, lang='en', include_alt=True).keys(), q2cc['Q90'], q2cc['Q142'], 

(dict_keys(['ibm', '18m', 'i.b.m.', 'big blue', 'ibm corp.', 'ibm corporation', 'international business machines', 'international business machines corporation']),
 'FR',
 'FR')

In [23]:
stat = Counter()
all_names = set()
fo = open('/backup/wikidata/wiki_org_names.tmp', 'w')
for l in open('/backup/wikidata/wiki_org.jsonl'):
    j = json.loads(l)
    names = list(qid_lab_get(int(j['id'][1:]), lang='en', include_alt=True).keys())
    #names.update(j['names'])
    ccs = set([q2cc[a] for a in j['location'] if a in q2cc])
    if not j['location']:
        stat['no_loc'] += 1
    elif 'US' in ccs:
        fo.write('\t'.join(names)+'\n')
        stat['us'] += 1
    else:
        stat['non-us'] += 1
    all_names.update([a for a in names if len(a)>2])
fo.close()
all_names = list(all_names)
trie = Trie(all_names)
print(len(all_names), len(trie), stat)

469135 469135 Counter({'non-us': 298917, 'no_loc': 109671, 'us': 77498})


In [46]:
# rearange all_names
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
all_names = [trie.restore_key(i) for i in range(len(trie))]
vecs = model.encode(all_names)
print(all_names[trie['international business machines']])

international business machines


In [54]:
np.inner(vecs[trie['international business machines']], vecs[trie['international business machines corporation']])
np.inner(vecs[trie['ibm']], vecs[trie['i.b.m.']])


0.95381683

In [68]:
import faiss                   
dimensionality = vecs.shape[1]
quantizer = faiss.IndexFlatL2(dimensionality)
index = faiss.IndexIVFFlat(quantizer, dimensionality, 100, faiss.METRIC_L2)

index.train(vecs)
print(index.is_trained)
index.add(vecs)                  
print(index.ntotal)

True
469135


In [89]:
def find(s, n=10, onlyint=False, returnq=False):
    sind = trie[s] if s in trie else None
    if sind:
        vec = vecs[sind]
    else:
        vec = model.encode([s])[0]        
    D, I = index.search(vec.reshape(1, dimensionality), n)
    ret = []
    for dist, i in zip(D[0], I[0]):
        if returnq or i != sind:
            r = {'ind': i, 'dist': dist}
            if not onlyint:
                r['name'] = trie.restore_key(i)
            ret.append(r)
    return ret
    

find('international business machines')

[{'ind': 465953,
  'dist': 0.09236614,
  'name': 'international business machines corporation'},
 {'ind': 467404, 'dist': 0.5085323, 'name': 'international computers limited'},
 {'ind': 158242, 'dist': 0.6014789, 'name': 'innosoft international'},
 {'ind': 382576, 'dist': 0.6094106, 'name': 'business network international'},
 {'ind': 90693, 'dist': 0.6357165, 'name': 'iq infotech & co'},
 {'ind': 444981,
  'dist': 0.63672066,
  'name': 'integrated technologies & services international'},
 {'ind': 345783, 'dist': 0.6473044, 'name': 'intercorp'},
 {'ind': 88532, 'dist': 0.65288514, 'name': 'insoft inc.'},
 {'ind': 89963, 'dist': 0.65344036, 'name': 'idom inc.'}]

In [118]:
from itertools import combinations
from sklearn.model_selection import train_test_split
br = 0
n_negative = 10

examples = []
for l in tqdm(open('/backup/wikidata/wiki_org_names.tmp'), total=77498):
    ls = [a for a in l.strip('\n').split('\t') if a]
    if len(ls) > 1:
        for n1, n2 in combinations(ls, 2):
            examples.append((trie[n1], trie[n2], 1))
    # hard negative
    for n in ls:
        uk = n_negative
        for r in find(n, n=n_negative+5):
            if r['name'] not in ls and r['name']:
                examples.append((trie[n], trie[r['name']], 0))
            uk -= 1
            if uk == 0:
                break
    br += 1
    #if br>100:
    #    break

ts = np.zeros((len(examples), 3))
for i, (i1, i2, lab) in enumerate(examples):
    ts[i] = (i1, i2, lab)
    
np.random.shuffle(ts)
test_size = int(min(len(ts)*.1, 3000))

np.savez_compressed('/backup/wikidata/name_similarity_ds', train=ts[test_size*2:], dev=ts[:test_size], test=ts[test_size:test_size*2])
trie.save('/backup/wikidata/name_similarity_ds.trie')

  0%|          | 0/77498 [00:00<?, ?it/s]