In [1]:
import spacy
import pickle

### Load NER model

In [2]:
model = pickle.load(open("NER.pickle", "rb"))

### Text to test the entity_linking model

In [3]:
text = "Bambang merupakan pemain bola yang menjadi andalan indonesia"
doc = model(text)
for ent in doc.ents:
    print(f"Named entity '{ent.text}' with label '{ent.label_}'")

Named entity 'Bambang' with label 'PERSON'
Named entity 'pemain bola' with label 'JOB_TITLE'


### Create function to load initial Knowledge Base data from csv

In [4]:
import csv
from pathlib import Path
def load_entities():
    name_loc = Path.cwd() / "dict_name.csv"
    desc_loc = Path.cwd()/"dict_desc.csv"
    names = dict()
    description = dict()
    with name_loc.open("r", encoding="utf8") as csvname:
        namereader = csv.reader(csvname, delimiter=",")
        for row in namereader:
            qid = row[0]
            name = row[1]
            names[qid] = name
    
    with desc_loc.open("r", encoding="utf8") as csvdesc:
        descreader = csv.reader(csvdesc, delimiter=",")
        for row in descreader:
            qid = row[0]
            desc = row[1]
            description[qid] = desc
    return names,description

### Define name dictionary and description dict for wikidata

In [5]:
name_dict, desc_dict = load_entities()
for QID in name_dict.keys():
    print(f"{QID}, name={name_dict[QID]}, desc={desc_dict[QID]}")

Q221310, name=Anggun C. Sasmi, desc=penyanyi-penulis lagu Indonesia-Prancis
Q1362169, name=Agnez Mo, desc=penyanyi dan artis Internasional
Q76127, name=Soekarno, desc=Presiden pertama Republik Indonesia
Q3318231, name=Joko Widodo, desc=presiden ke-7 Indonesia
Q44819, name=Soeharto, desc=Presiden ke-2 Republik Indonesia dari 1967 hingga 1998
Q76179, name=Megawati Soekarnoputri, desc=Presiden ke-5 dan Wakil Presiden ke-8 Republik Indonesia
Q57405, name=Susilo Bambang Yudhoyono, desc=Presiden ke-6 Republik Indonesia dari 2004 sampai 2014
Q189488, name=Willem Einthoven, desc=dosen
Q76167, name=Abdurrahman Wahid, desc=Presiden ke-4 Republik Indonesia dari 1999 hingga 2001
Q76156, name=Bacharuddin Jusuf Habibie, desc=presiden ke-3 dan wakil presiden ke-7 Indonesia
Q326935, name=Pramoedya Ananta Toer, desc=penulis Indonesia
Q29050, name=Mohammad Hatta, desc=Wakil Presiden Republik Indonesia ke-1
Q42297483, name=Hary Gunarto, desc=ilmuwan Jepang
Q315689, name=Andrés Scotti, desc=Uruguayan foot

### create list of people name "Bambang" for test

In [6]:
test_data_name = {qid:name for qid,name in name_dict.items() if qid in ["Q975434", "Q57405", "Q805927"]}
test_data_desc = {qid:desc for qid,desc in desc_dict.items() if qid in ["Q975434", "Q57405", "Q805927"]}
print(test_data_desc)

{'Q57405': 'Presiden ke-6 Republik Indonesia dari 2004 sampai 2014', 'Q975434': 'pesepak bola Indonesia', 'Q805927': 'badminton player'}


### Create Knowledge Base

In [7]:
from spacy.kb import KnowledgeBase
def create_kb(vocab):
    return KnowledgeBase(vocab=vocab, entity_vector_length=96)

kb = create_kb(model.vocab)

#### add each record to knowledge base by encode its description using built-in vector of nlp model

In [8]:
vector = model.vocab
print(vector)

<spacy.vocab.Vocab object at 0x000001F1693541F0>


In [9]:
for id,teks in test_data_desc.items():
    print(teks)
    pred_desc = model(teks)
    print([(tes.text, tes.label_) for tes in pred_desc.ents])

Presiden ke-6 Republik Indonesia dari 2004 sampai 2014
[('Presiden', 'JOB_TITLE'), ('Indonesia', 'GPE')]
pesepak bola Indonesia
[('Indonesia', 'GPE')]
badminton player
[]


In [10]:
for qid,desc in test_data_desc.items():
    desc_doc = model(desc)
    print(type(desc_doc))
    print([(tes.text, tes.has_vector, tes.vector_norm) for tes in desc_doc.ents])
    desc_enc = desc_doc.vector
    print(desc_enc)
    kb.add_entity(entity=qid, entity_vector=desc_enc, freq=342)

<class 'spacy.tokens.doc.Doc'>
[('Presiden', True, 24.466875), ('Indonesia', True, 22.693472)]
[ 0.75242615 -1.0240536   0.25938946  1.298917    0.5808679  -0.7929794
 -0.63040495 -0.10847893 -1.0475309   0.6115074  -0.7011555   0.09636406
 -1.648802    0.9305185   0.70654714 -0.16597208 -1.6405128  -0.16758516
 -0.51248276 -0.1905116  -0.6443113  -0.07427826  0.9403927  -0.80078924
  0.04926576  0.8441375  -0.618108    0.9908975  -0.0538023   0.5456103
  0.02248681 -1.2348497   1.1346684  -1.0934504   0.00873392  0.8128322
 -0.00238701  0.36538142 -1.0543642   0.6252912   0.2132651   0.36654904
  0.66815495 -0.55302393  0.37822193  0.16564529 -0.51167506 -1.1713943
 -1.0999238  -0.5293921   0.9970094   1.3657238  -0.62867385  0.4122885
  0.9147053  -0.09679216 -0.91339463  0.4513821   0.19641992 -0.3173883
  0.11974125 -0.05921207 -0.27352542 -2.1573308  -1.5245266   0.73461616
 -1.393272   -0.25371832 -0.8425559   0.9648364  -0.0079657   0.9440492
  0.65751916  0.71331644  0.20244043

### add aliases for each QID

In [11]:
for qid, name in test_data_name.items():
    kb.add_alias(alias=name, entities=[qid], probabilities=[1])

In [12]:
qids = test_data_name.keys()
probs = [0.3 for qid in qids]
kb.add_alias(alias="Bambang", entities=qids, probabilities=probs)

4734843956947139246

In [13]:
print(f"Entities in the KB: {kb.get_entity_strings()}")
print(f"Aliases in the KB: {kb.get_alias_strings()}")

Entities in the KB: ['Q975434', 'Q57405', 'Q805927']
Aliases in the KB: ['Susilo Bambang Yudhoyono', 'Bambang Suprianto', 'Bambang Pamungkas', 'Bambang']


In [14]:
print(f"Candidates for SBY: {[c.entity_ for c in kb.get_alias_candidates('Susilo Bambang Yudhoyono')]}")
print(f"Candidates for Bambang: {[c.entity_ for c in kb.get_alias_candidates('Bambang')]}")

Candidates for SBY: ['Q57405']
Candidates for Bambang: ['Q57405', 'Q975434', 'Q805927']


### Save KnowledgeBase

In [15]:
import os
output_dir = Path.cwd()/"kb_test"
kb.to_disk(output_dir / "my_kb")

### Create Training Data for KnowledgeBase

In [16]:
with open("text.txt") as file:
    training_text = file.readlines()

In [17]:
import pprint
pprint.pprint(training_text)

['Bambang merupakan presiden ke-6 indonesia.\n',
 'Bambang Pamungkas menjadi pemain bola yang diandalkan.\n',
 'Pertandingan Bambang di thomas cup sangat dinantikan.\n',
 'Kinerja yang ditunjukan oleh Bambang untuk negara sangat luar biasa.\n',
 'Bambang atau yang biasa disebut bepe itu menjadi kapten timnas yang '
 'dipercaya rekan timnya.\n',
 'Raket yang digunakan oleh Bambang dijual dengan harga yang fantastis.\n',
 'Pertemuan antara Bambang dengan perdana menteri inggris menghasilkan sebuah '
 'kolaborasi.\n',
 'Tendangan yang dilakukan Bambang berhasil mencetok skor untuk indonesia.\n',
 'Smash yang dilakukan oleh Bambang sangat ditakuti oleh lawan-lawannya.\n',
 'Bambang yang dulunya seorang jenderal TNI tentunya memiliki kedisiplinan '
 'yang tinggi.\n',
 'Bambang sebagai kapten kesulitan untuk membimbing rekan satu timnya di '
 'lapangan.\n',
 'Sebagai atlet bulu tangkis, Bambang selalu tekun melatih tekniknya.\n',
 'Susilo Bambang Yudhoyono merupakan Presiden Indonesia ke-6.\

In [18]:
print(training_text[0][-2])

.


In [19]:
# train_doc = model(kalimat)
# print(train_doc)
# for tes in train_doc.ents:
#     print(tes.text, tes.label_)
# span = train_doc[0:3]
# print(span)
import re

def get_offset(text, list_ent):
    token_list = []
    for token in list_ent:
        for match in re.finditer(re.escape(token[0]), text):
            token_list.append(
                {
                    "start" : match.start(),
                    "end" : match.end(),
                    "word" : token[0]
                }
            )
    return token_list

def predict_word():
    list_ent_link = []
    for sentence in training_text:
        train_doc = model(sentence)
        list_token = [(ent.text, ent.label_) for ent in train_doc.ents]
        list_offset = get_offset(sentence, list_token)
        list_offset = [(word["start"],word["end"]) for word in list_offset if  word["word"]=="Susilo Bambang Yudhoyono" or word["word"]=="Bambang Pamungkas" or word["word"]=="Bambang Suprianto" or word["word"]=="Bambang"]
        for offset in list_offset:
            links_dict = {" ":1.0}
            list_ent_link.append((sentence, {"links":{offset: links_dict}}))
    return list_ent_link

In [20]:
list_ent = predict_word()
print(len(list_ent))
for cek in list_ent:
    pprint.pprint(cek)

23
('Bambang merupakan presiden ke-6 indonesia.\n',
 {'links': {(0, 7): {' ': 1.0}}})
('Bambang Pamungkas menjadi pemain bola yang diandalkan.\n',
 {'links': {(0, 17): {' ': 1.0}}})
('Kinerja yang ditunjukan oleh Bambang untuk negara sangat luar biasa.\n',
 {'links': {(29, 36): {' ': 1.0}}})
('Bambang atau yang biasa disebut bepe itu menjadi kapten timnas yang '
 'dipercaya rekan timnya.\n',
 {'links': {(0, 7): {' ': 1.0}}})
('Raket yang digunakan oleh Bambang dijual dengan harga yang fantastis.\n',
 {'links': {(26, 33): {' ': 1.0}}})
('Pertemuan antara Bambang dengan perdana menteri inggris menghasilkan sebuah '
 'kolaborasi.\n',
 {'links': {(17, 24): {' ': 1.0}}})
('Tendangan yang dilakukan Bambang berhasil mencetok skor untuk indonesia.\n',
 {'links': {(25, 32): {' ': 1.0}}})
('Smash yang dilakukan oleh Bambang sangat ditakuti oleh lawan-lawannya.\n',
 {'links': {(26, 33): {' ': 1.0}}})
('Bambang yang dulunya seorang jenderal TNI tentunya memiliki kedisiplinan '
 'yang tinggi.\n',
 

In [21]:
print(test_data_name)

{'Q57405': 'Susilo Bambang Yudhoyono', 'Q975434': 'Bambang Pamungkas', 'Q805927': 'Bambang Suprianto'}


In [22]:
def insert_QID():
    list_ent[0][1]["links"][(0, 7)]["Q57405"] = list_ent[0][1]["links"][(0, 7)][" "]
    del list_ent[0][1]["links"][(0, 7)][" "]

    list_ent[1][1]["links"][(0, 17)]["Q975434"] = list_ent[1][1]["links"][(0, 17)][" "]
    del list_ent[1][1]["links"][(0, 17)][" "]

    list_ent[2][1]["links"][(29, 36)]["Q57405"] = list_ent[2][1]["links"][(29, 36)][" "]
    del list_ent[2][1]["links"][(29, 36)][" "]

    list_ent[3][1]["links"][(0, 7)]["Q975434"] = list_ent[3][1]["links"][(0, 7)][" "]
    del list_ent[3][1]["links"][(0, 7)][" "]

    list_ent[4][1]["links"][(26, 33)]["Q805927"] = list_ent[4][1]["links"][(26, 33)][" "]
    del list_ent[4][1]["links"][(26, 33)][" "]

    list_ent[5][1]["links"][(17, 24)]["Q57405"] = list_ent[5][1]["links"][(17, 24)][" "]
    del list_ent[5][1]["links"][(17, 24)][" "]

    list_ent[6][1]["links"][(25, 32)]["Q975434"] = list_ent[6][1]["links"][(25, 32)][" "]
    del list_ent[6][1]["links"][(25, 32)][" "]

    list_ent[7][1]["links"][(26, 33)]["Q805927"] = list_ent[7][1]["links"][(26, 33)][" "]
    del list_ent[7][1]["links"][(26, 33)][" "]

    list_ent[8][1]["links"][(0, 7)]["Q57405"] = list_ent[8][1]["links"][(0, 7)][" "]
    del list_ent[8][1]["links"][(0, 7)][" "]

    list_ent[9][1]["links"][(0, 7)]["Q975434"] = list_ent[9][1]["links"][(0, 7)][" "]
    del list_ent[9][1]["links"][(0, 7)][" "]

    list_ent[10][1]["links"][(28, 35)]["Q805927"] = list_ent[10][1]["links"][(28, 35)][" "]
    del list_ent[10][1]["links"][(28, 35)][" "]

    list_ent[11][1]["links"][(0, 24)]["Q57405"] = list_ent[11][1]["links"][(0, 24)][" "]
    del list_ent[11][1]["links"][(0, 24)][" "]

    list_ent[12][1]["links"][(0, 17)]["Q975434"] = list_ent[12][1]["links"][(0, 17)][" "]
    del list_ent[12][1]["links"][(0, 17)][" "]

    list_ent[13][1]["links"][(0, 17)]["Q805927"] = list_ent[13][1]["links"][(0, 17)][" "]
    del list_ent[13][1]["links"][(0, 17)][" "]

    list_ent[14][1]["links"][(21, 28)]["Q57405"] = list_ent[14][1]["links"][(21, 28)][" "]
    del list_ent[14][1]["links"][(21, 28)][" "]

    list_ent[15][1]["links"][(0, 7)]["Q975434"] = list_ent[15][1]["links"][(0, 7)][" "]
    del list_ent[15][1]["links"][(0, 7)][" "]

    list_ent[16][1]["links"][(35, 52)]["Q975434"] = list_ent[16][1]["links"][(35, 52)][" "]
    del list_ent[16][1]["links"][(35, 52)][" "]

    list_ent[17][1]["links"][(0, 24)]["Q57405"] = list_ent[17][1]["links"][(0, 24)][" "]
    del list_ent[17][1]["links"][(0, 24)][" "]

    list_ent[18][1]["links"][(0, 7)]["Q975434"] = list_ent[18][1]["links"][(0, 7)][" "]
    del list_ent[18][1]["links"][(0, 7)][" "]

    list_ent[19][1]["links"][(0, 7)]["Q805927"] = list_ent[19][1]["links"][(0, 7)][" "]
    del list_ent[19][1]["links"][(0, 7)][" "]

    list_ent[20][1]["links"][(0, 7)]["Q975434"] = list_ent[20][1]["links"][(0, 7)][" "]
    del list_ent[20][1]["links"][(0, 7)][" "]

    list_ent[21][1]["links"][(0, 7)]["Q805927"] = list_ent[21][1]["links"][(0, 7)][" "]
    del list_ent[21][1]["links"][(0, 7)][" "]

    list_ent[22][1]["links"][(55, 62)]["Q57405"] = list_ent[22][1]["links"][(55, 62)][" "]
    del list_ent[22][1]["links"][(55, 62)][" "]
    
    return list_ent

In [23]:
if " " in list_ent[0][1]["links"][(0, 7)]:
    new_ent = insert_QID()

In [24]:
print(new_ent[0])

('Bambang merupakan presiden ke-6 indonesia.\n', {'links': {(0, 7): {'Q57405': 1.0}}})


### Check proportion

In [25]:
gold_ids = []
for text, annot in new_ent:
    for span, links_dict in annot["links"].items():
        for link, value in links_dict.items():
            if value:
                gold_ids.append(link)
                
from collections import Counter
print(Counter(gold_ids))

Counter({'Q975434': 9, 'Q57405': 8, 'Q805927': 6})


In [26]:
import random

train_dataset = []
test_dataset = []

for QID in qids:
    indices = [i for i,j in enumerate(gold_ids) if j==QID]
    if QID == "Q975434":
        train_dataset.extend(new_ent[index] for index in indices[0:7])
        test_dataset.extend(new_ent[index] for index in indices[7:9])
    elif QID == "Q57405":
        train_dataset.extend(new_ent[index] for index in indices[0:6])
        test_dataset.extend(new_ent[index] for index in indices[6:8])
    elif QID == "Q805927":
        train_dataset.extend(new_ent[index] for index in indices[0:5])
        test_dataset.extend(new_ent[index] for index in indices[5:6])
        
random.shuffle(train_dataset)
random.shuffle(test_dataset)

In [27]:
TRAIN_DOC = []
for text, annotation in train_dataset:
    doc = model(text)
    TRAIN_DOC.append((doc,annotation))

In [28]:
entity_linker = model.add_pipe("entity_linker")
entity_linker.set_kb(create_kb)
# model.add_pipe("entity_linker", last=True)
# entity_linker.set_kb(create_kb)

In [29]:
from spacy.util import minibatch, compounding

other_pipes = [pipe for pipe in model.pipe_names if pipe!="entity_linker"]
with model.disable_pipes(*other_pipes):
    optimizer = model.begin_training()
    for itn in range(500):
        random.shufle(TRAIN_DOC)
        batches = minibatch(TRAIN_DOC, size=compounding(4.0, 32.0, 1.001))
        losses = {}
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,
                annotations,
                drop=0.2,
                losses=losses,
                sgd=optimizer,
            )
        if itn % 50 == 0:
            print(itn, "Losses", losses)
print(itn, "Losses", losses)

ValueError: [E139] Knowledge base for component 'entity_linker' is empty. Use the methods `kb.add_entity` and `kb.add_alias` to add entries.