In [15]:
import bz2
import json
import ujson
import gensim
import gensim.models.word2vec

About this notebook:
We are interested in finding more entity mappings from NELL to Freebase 
(because Google's w2v pre-trained vectors have the Freebase entity name)
This notebook reads all wikidata at wikidata-20160502-all.json.bz2 and 
filters out irrelevant data fields. The original json data schema is 
described here: https://www.mediawiki.org/wiki/Wikibase/DataModel/JSON 

The value sought in this exercise: 
1) the alias / wikipedia page title fields in Wikidata could possibly overlap 
with NELL's "Entity literalStrings" thus providing a mapping to Freebase enitites.
2) If the NELL triple has relation concept:haswikipediaurl, the value literal
is a wikipedia page for the entity. That could provide a mapping to the Freebase IF

What we're eventually looking for is:
Nell Entity -> Freebase ID

<img src="New%20Doc%2011_1.jpg">

In [15]:
json_didnt_work = set()
multiple_freebase_values = set()

num_freebase_exists = 0
num_wiki_exists = 0
total_num_tried = 0


'''
{
    wikidata_id: "Q1234",                  #d['id']
    en_label:       "somelabel",           #d['labels']['en']['value'] 
    en_aliases:     ["alias1","alias2"],   #d['aliases']['en'][0-n]['value']
    freebase_id:    ["m/som1","m/som2"],   #d['claims']['P646'][0-n]['mainsnak']['datavalue']['value']
    en_wikipedia_page: {
                        title:  ""         #d['sitelinks']['enwiki']['title']
                        url:    ""         #d['sitelinks']['enwiki']['url']
                    }
}
'''

with open("latest-all.json", "r") as f, open('wikidata-id-label-alias-freebase-wiki-only.json-ish.txt','w') as fw:
    for line in f:
        total_num_tried += 1
        relevant_properties = {'en_wikipedia_page':{'title':'','url':''},'freebase_ids':[],'en_label':'','en_aliases':[]}
        try:
            d = json.loads(line.replace(',\n',''))
            #print( d['labels'].keys())
            #print(d.keys())

            relevant_properties['wikidata_id']    = d['id']
            if 'en' in d['labels']:
                relevant_properties['en_label']       = d['labels']['en']['value']
            if 'en' in d['aliases']:
                relevant_properties['en_aliases']     = [alias['value'] for alias in d['aliases']['en']]

            if 'P646' in d['claims']:
                num_freebase_exists += 1
                if len(d['claims']['P646']) > 1:
                    multiple_freebase_values.add(d['id'])
                relevant_properties['freebase_ids'] =  [fb_prop['mainsnak']['datavalue']['value'] for fb_prop in d['claims']['P646'] if 'mainsnak' in fb_prop and 'datavalue' in fb_prop['mainsnak'] and 'value' in fb_prop['mainsnak']['datavalue']]

            if 'sitelinks' in d:
                if 'enwiki' in d['sitelinks']:
                    num_wiki_exists += 1
                    if 'title' in d['sitelinks']['enwiki']:
                        relevant_properties['en_wikipedia_page']['title'] = d['sitelinks']['enwiki']['title']
                    if 'url' in d['sitelinks']['enwiki']:
                        relevant_properties['en_wikipedia_page']['url'] = d['sitelinks']['enwiki']['url']                

            fw.write(ujson.dumps(relevant_properties)+'\n')

        except ValueError:
            json_didnt_work.add(line)

        if total_num_tried % 10000 == 0:
            print(total_num_tried,"records processed")
            print(total_num_tried)
            print(len(json_didnt_work))
            print(num_freebase_exists)
            print(len(multiple_freebase_values))
            print(num_wiki_exists)
            
print(total_num_tried)
print(len(json_didnt_work))
print(num_freebase_exists)
print(len(multiple_freebase_values))
print(num_wiki_exists)

10000 records processed
10000
1
6824
4
8836
20000 records processed
20000
1
14941
4
18074
30000 records processed
30000
1
18722
5
25111
40000 records processed
40000
1
25369
6
33462
50000 records processed
50000
1
32125
7
41761
60000 records processed
60000
1
37455
8
50093
70000 records processed
70000
1
43731
9
58870
80000 records processed
80000
1
50381
10
67204
90000 records processed
90000
1
55594
10
75519
100000 records processed
100000
1
59569
11
83868
110000 records processed
110000
1
66431
12
92421
120000 records processed
120000
1
72989
12
100351
130000 records processed
130000
1
79233
12
108701
140000 records processed
140000
1
85011
12
116382
150000 records processed
150000
1
90985
13
124316
160000 records processed
160000
1
97521
13
132508
170000 records processed
170000
1
104494
14
140904
180000 records processed
180000
1
109521
14
147847
190000 records processed
190000
1
112458
14
152686
200000 records processed
200000
1
118062
15
159807
210000 records processed
210000
1


In [None]:
# Total statistics
#print(total_num_tried)                    #21,176,336
#print(len(json_didnt_work))               #2
#print(num_freebase_exists)                #1,157,849
#print(len(multiple_freebase_values))      #146
#print(num_wiki_exists)                    #6,544,763

In [16]:
json_didnt_work
#This is good, all json objects parsed as expected

{'[\n', ']\n'}

In [18]:
for mid in multiple_freebase_values:
    print(mid,end=' ')
#Strangely, there were wikidata entities that mapped to multiple freebase entities
#Choose <any id> from below and visit https://www.wikidata.org/wiki/<any id> to see the entity
#e.g. https://www.wikidata.org/wiki/Q2143665

Q2143665 Q157443 Q2049109 Q1378447 Q1074472 Q252689 Q128758 Q1150666 Q838382 Q3692102 Q19367312 Q3128196 Q23925117 Q68 Q7044447 Q7042855 Q284657 Q888890 Q2553148 Q2650238 Q842256 Q1080794 Q2152587 Q19799931 Q18205446 Q706364 Q689475 Q185415 Q283673 Q6742 Q333173 Q7130098 Q1824203 Q621261 Q11775280 Q3244156 Q450699 Q1257444 Q226730 Q8054 Q17988962 Q3893044 Q1278338 Q20980384 Q18123744 Q1342372 Q1202247 Q200782 Q79899 Q93196 Q4046379 Q7720527 Q2635268 Q1867183 Q18207711 Q1193055 Q19600521 Q2450000 Q17444603 Q4867386 Q458 Q959790 Q424709 Q19637462 Q1103870 Q1292421 Q3341780 Q13473 Q7130077 Q963674 Q510236 Q193537 Q708340 Q19775715 Q1860 Q1788030 Q19839246 Q19119517 Q4986245 Q18150290 Q11401 Q54111 Q331321 Q459290 Q1783270 Q15056579 Q452681 Q1662673 Q2166646 Q1248233 Q1198887 Q2163817 Q21623978 Q35690 Q1525611 Q2685999 Q8811 Q18630946 Q186195 Q1064904 Q1846261 Q4910873 Q1370848 Q555759 Q1035226 Q3054637 Q1571829 Q19263668 Q430034 Q2778850 Q857579 Q3673891 Q624745 Q1080374 Q3745670 Q844709 

In [8]:
'''
{
    wikidata_id: "Q1234",                  #d['id']
    en_label:       "somelabel",           #d['labels']['en']['value'] 
    en_aliases:     ["alias1","alias2"],   #d['aliases']['en'][0-n]['value']
    freebase_id:    ["m/som1","m/som2"],   #d['claims']['P646'][0-n]['mainsnak']['datavalue']['value']
    en_wikipedia_page: {
                        title:  ""         #d['sitelinks']['enwiki']['title']
                        url:    ""         #d['sitelinks']['enwiki']['url']
                    }
}
'''

errors = 0

with open('10000-working-copy-wikidata-id-label-alias-freebase-wiki-only.json-ish.txt') as f:
    for line in f:
        d = json.loads(line.strip('\n'))
        if 'wikidata_id' not in d \
           or 'en_label' not in d  \
           or 'en_aliases' not in d  \
           or 'freebase_ids' not in d \
           or 'en_wikipedia_page' not in d:
            errors += 1
    
print(errors)


everything_exists = 0
with open('10000-working-copy-wikidata-id-label-alias-freebase-wiki-only.json-ish.txt') as f:
    for line in f:
        d = json.loads(line.strip('\n'))
        #{'en_wikipedia_page':{'title':'','url':''},'freebase_ids':[],'en_label':'','en_aliases':[]}
        if d['en_wikipedia_page']['title'] != '' and d['en_aliases']!=[] and d['freebase_ids'] != [] and d['en_label'] != '':
            everything_exists += 1

print(everything_exists)

0
2434


In [3]:
num_empty_alias = num_empty_label = num_empty_freebase_id = num_empty_wikipedia = 0

keysset = set(['wikidata_id','en_label','en_aliases','freebase_ids','en_wikipedia_page'])

#Compacting the reduced data schema further by removing empty keys
with open('wikidata-id-label-alias-freebase-wiki-only.json-ish.txt') as f, open('wikidata-relevant-schema.json.txt','w') as fw:
    for line in f:
        d = json.loads(line.strip('\n'))
        if 'sitelinks' in d:
            del d['sitelinks']
        if set(d.keys()) ^ keysset:
            print("well, that's odd... weird keyssets")
            break
        if not d['wikidata_id']:
            print("well, that's odd... no wikidata id")
        if not d['en_label']:
            del d['en_label']
            num_empty_label += 1
        if not d['en_aliases']:
            del d['en_aliases']
            num_empty_alias += 1
        if not d['freebase_ids']:
            del d['freebase_ids']
            num_empty_freebase_id += 1
        if d['en_wikipedia_page'] == {'title':'','url':''}:
            del d['en_wikipedia_page']
            num_empty_wikipedia += 1
        fw.write(ujson.dumps(d)+'\n')

In [5]:
# creating a candidate set for entity literals in wikidata
# possibly a label, alias, or even wikipedia title

with open('wikidata-relevant-schema.json.txt') as f, open('candidate-wikidata-entity-literals.txt','w') as fw:
    for line in f:
        d = json.loads(line.strip('\n'))
        if 'en_label' in d:
            fw.write(d['en_label']+"\n")
        if 'en_aliases' in d:
            for alias in d['en_aliases']:
                fw.write(alias+"\n")
        if 'en_wikipedia_page' in d:
            fw.write(d['en_wikipedia_page']['title']+"\n")

# Note that the strings are then lower cased and uniq-d at the terminal
# saved to this file: candidate-wikidata-entity-literals-lower-sort-uniq.txt
# It has over 12 million candidates. Clearly this isn't doing us much good.

In [None]:
# But let's see how well we do just out of the box, without any post processing
# Wait. I've got an idea, which really should have happened earlier. 
# If a wikidata record doesn't have an English label, don't add it to the file at all.

In [7]:
json_didnt_work = set()

num_freebase_exists = 0
num_wiki_exists = 0
total_num_tried = 0
num_with_en_label = 0

with open("latest-all.json", "r") as f, open('wikidata-relevant-schema-only-en-label.json.txt','w') as fw, open('error-log','w') as fw2:
    for line in f:
        
        relevant_properties = {}
        try:
            d = json.loads(line.replace(',\n',''))
            total_num_tried += 1
            
            if 'en' in d['labels']:
                relevant_properties['wikidata_id']    = d['id']
                relevant_properties['en_label']   = d['labels']['en']['value']
                num_with_en_label += 1
            else:
                continue
            if 'en' in d['aliases']:
                relevant_properties['en_aliases'] = [alias['value'] for alias in d['aliases']['en']]

            if 'P646' in d['claims']:
                num_freebase_exists += 1
                relevant_properties['freebase_ids'] =  [fb_prop['mainsnak']['datavalue']['value']
                                                          for fb_prop in d['claims']['P646'] 
                                                            if 'mainsnak' in fb_prop and
                                                                'datavalue' in fb_prop['mainsnak'] and
                                                                'value' in fb_prop['mainsnak']['datavalue']]
                if relevant_properties['freebase_ids'] == []:
                    fw2.write(str(d['claims']['P646'])+"\n")

            if 'sitelinks' in d:
                if 'enwiki' in d['sitelinks']:
                    num_wiki_exists += 1
                    relevant_properties['en_wikipedia_page'] = {}
                    if 'title' in d['sitelinks']['enwiki']:
                        relevant_properties['en_wikipedia_page']['title'] = d['sitelinks']['enwiki']['title']
                    if 'url' in d['sitelinks']['enwiki']:
                        relevant_properties['en_wikipedia_page']['url'] = d['sitelinks']['enwiki']['url']                

            fw.write(ujson.dumps(relevant_properties)+'\n')

        except ValueError:
            json_didnt_work.add(line)

        if total_num_tried % 10000 == 0:
            print(total_num_tried,"records processed")
            #print(total_num_tried)
            #print(len(json_didnt_work))
            #print(num_freebase_exists)
            #print(num_wiki_exists)
            
print(total_num_tried)         #21,176,334
print(len(json_didnt_work))    #2
print(num_freebase_exists)     #1,156,695
print(num_wiki_exists)         #6,538,695
print(num_with_en_label)       #12,047,322 (as opposed to 21176334 (well, all entries, in the previous iteration)

10000 records processed
30000 records processed
40000 records processed
60000 records processed
70000 records processed
80000 records processed
90000 records processed
100000 records processed
110000 records processed
120000 records processed
130000 records processed
140000 records processed
150000 records processed
160000 records processed
170000 records processed
180000 records processed
190000 records processed
200000 records processed
220000 records processed
230000 records processed
240000 records processed
270000 records processed
290000 records processed
300000 records processed
310000 records processed
320000 records processed
340000 records processed
350000 records processed
360000 records processed
370000 records processed
380000 records processed
400000 records processed
410000 records processed
420000 records processed
440000 records processed
450000 records processed
460000 records processed
470000 records processed
480000 records processed
490000 records processed
520000 

In [9]:
# Ok, back to creating a candidate set for entity literals in wikidata
# possibly a label, alias, or even wikipedia title

with open('wikidata-relevant-schema-only-en-label.json.txt') as f, open('candidate-wikidata-entity-literals-only-en-label.txt','w') as fw:
    for line in f:
        d = json.loads(line.strip('\n'))
        if 'en_label' in d:
            fw.write(d['en_label']+"\n")
        if 'en_aliases' in d:
            for alias in d['en_aliases']:
                fw.write(alias+"\n")
        if 'en_wikipedia_page' in d:
            fw.write(d['en_wikipedia_page']['title']+"\n")

## Ummm, strangely that made very little difference. 20459951 vs 20467788. A difference of 7837. I don't quite get it.
## I still have Chinese letters and smileys and numbers appearing.

In [11]:
#Ok let me try writing to separate files.
with open('wikidata-relevant-schema-only-en-label.json.txt') as f, \
     open('cweloel-lab.txt','w') as fw1,\
     open('cweloel-ali.txt','w') as fw2,\
     open('cweloel-wik.txt','w') as fw3:
    for line in f:
        d = json.loads(line.strip('\n'))
        if 'en_label' in d:
            fw1.write(d['en_label']+"\n")
        if 'en_aliases' in d:
            for alias in d['en_aliases']:
                fw2.write(alias+"\n")
        if 'en_wikipedia_page' in d:
            fw3.write(d['en_wikipedia_page']['title']+"\n")

In [19]:
#Also, another way to filter is only choose those entities whose freebase ids exist in the word2vec model

model = gensim.models.Word2Vec.load_word2vec_format('freebase-vectors-skipgram1000.bin.gz',binary=True)

In [20]:
import pickle
vocab = set(model.vocab.keys())
pickle.dump(vocab,open('freebase-vectors-skipgram1000.bin.gz-vocab.pkl','wb'))

In [25]:
mids_in_w2v_and_wd = set([])
with open('wikidata-relevant-schema-only-en-label.json.txt') as f:
    for line in f:
        d = json.loads(line.strip('\n'))
        if 'freebase_ids' in d:
            for fid in d['freebase_ids']:
                if fid in vocab:
                    mids_in_w2v_and_wd.add(fid)

In [27]:
pickle.dump(mids_in_w2v_and_wd,open('mids_in_w2v_and_wd.pkl','wb'))

In [None]:
#generate unique lower case candidate entities with:
#cat candidate-wikidata-entity-literals-only-en-label.txt | tr '[:upper:]' '[:lower:]' | sort | uniq > candidate-wikidata-entity-literals-only-en-label-lower-sort-uniq.txt

In [None]:
# creating a candidate set of entity literals in NELL
# Currently using names.txt's 2nd column with some post processing

# TODO
# one set is the column 'Best Entity literalString' in NELL.08m.990.esv.csv.gz. This was created using:
# cut -d' ' -f2,9,10 NELL.08m.990.esv.csv > NELL.08m.990.esv.minim.csv
# followed by 
# this was done using gnu tools as follows: