# Knowledge extraction from text

Erik Tjong Kim Sang

e.tjongkimsang(a)esciencecenter.nl

Python package requirements:
* regex
* nltk
* transformers
* pytorch
* rdflib
* pandas
* requests
* networkx
* matplotlib

In [None]:
from IPython.display import clear_output

In [None]:
def squeal(text=None):
    """Replace the current cell output with an optional status message.

    The previous output is cleared via ``clear_output(wait=True)``; when
    *text* is ``None`` nothing is printed afterwards.
    """
    clear_output(wait=True)
    if text is None:
        return
    print(text)

## 1. Get texts

In [None]:
import regex
import urllib.request

In [None]:
# Location of the plain-text book that get_text_from_url downloads.
DATA_URL = "https://gutenberg.org/cache/epub/2591/pg2591.txt"
# Line after which get_stories_from_text starts collecting stories.
LAST_PREAMBLE_LINE = "THE BROTHERS GRIMM FAIRY TALES"
# Line at which get_stories_from_text stops collecting stories.
# NOTE: "POSTCRIPT" is a misspelling of "POSTSCRIPT"; the name is kept
# because get_stories_from_text uses it as a default argument.
FIRST_POSTCRIPT_LINE = "*****" 

In [None]:
def convert_bytes_to_string(string):
    """Decode a UTF-8 encoded bytes object and return the resulting text."""
    decoded = string.decode("utf-8")
    return decoded

In [None]:
def get_text_from_url(data_url):
    """Download the resource at *data_url* and return it as text.

    The raw response body is decoded with convert_bytes_to_string
    (UTF-8). Errors raised by ``urllib.request.urlopen`` are not caught.
    """
    # The with-statement closes the response on exit, so no explicit
    # close() call is needed.
    with urllib.request.urlopen(data_url) as response:
        data_text = response.read()
    return convert_bytes_to_string(data_text)

In [None]:
def get_stories_from_text(data_text, 
                          last_preamble_line=LAST_PREAMBLE_LINE,
                          first_postscript_line=FIRST_POSTCRIPT_LINE):
    """Split a book text into stories, keyed by their heading line.

    Lines are stripped before being inspected. Collection is switched on
    by a line equal to *last_preamble_line* and switched off by a line
    equal to *first_postscript_line*. While collecting, a line whose part
    before the first "[" consists only of the characters A-Z, comma,
    space and hyphen starts a new story (the whole line becomes the key);
    every other line is appended, followed by a space, to the most recent
    story. Lines seen before the first heading are dropped.

    Returns a dict mapping heading line to story text.
    """
    stories = {}
    current_title = ""
    inside_body = False
    for raw_line in data_text.split("\n"):
        line = raw_line.strip()
        # Marker lines only toggle the collecting state.
        if line == last_preamble_line:
            inside_body = True
            continue
        if line == first_postscript_line:
            inside_body = False
            continue
        if not inside_body:
            continue
        # Ignore a bracketed alternative title when testing for a heading.
        heading_part = line.split("[")[0]
        if regex.search("^[A-Z, -]+$", heading_part):
            current_title = line
            stories[current_title] = ""
        elif stories:
            stories[current_title] += line + " "
    return stories

In [None]:
# Download the book and split it into a dict of heading line -> story text.
data_text = get_text_from_url(DATA_URL)
texts = get_stories_from_text(data_text)

## 2. Extract relation triples from texts

Uses the [REBEL](https://github.com/Babelscape/rebel) system, following a blog post by Fabio Chiusano: https://medium.com/nlplanet/building-a-knowledge-base-from-texts-a-full-practical-example-8dbbffb912fa

In [None]:
import pandas as pd
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from nltk.tokenize import sent_tokenize

In [None]:
def extract_relations_from_model_output(text):
    """Parse a decoded model output string into relation dicts.

    The markers "<s>", "<pad>" and "</s>" are removed first. The rest is
    read token by token: the tokens after "<triplet>" form the head, the
    tokens after "<subj>" form the tail and the tokens after "<obj>" form
    the relation type. Tokens that appear before any marker are ignored.

    Returns a list of dicts with the keys 'head', 'type' and 'tail'.
    """
    cleaned = text.strip()
    for special in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(special, "")

    found = []
    # Text collected so far for each part of the triple currently being read.
    parts = {'head': '', 'type': '', 'tail': ''}
    # Name of the part that ordinary tokens are appended to (None = none yet).
    active = None

    def snapshot():
        # Key order head/type/tail is kept for downstream tabular display.
        return {
            'head': parts['head'].strip(),
            'type': parts['type'].strip(),
            'tail': parts['tail'].strip(),
        }

    for token in cleaned.split():
        if token == "<triplet>":
            # A new head starts: flush the pending triple, if any.
            if parts['type']:
                found.append(snapshot())
                parts['type'] = ''
            parts['head'] = ''
            active = 'head'
        elif token == "<subj>":
            # Another tail for the same head: flush the pending triple but
            # leave the type buffer alone until the next "<obj>".
            if parts['type']:
                found.append(snapshot())
            parts['tail'] = ''
            active = 'tail'
        elif token == "<obj>":
            parts['type'] = ''
            active = 'type'
        elif active is not None:
            parts[active] += ' ' + token

    # Emit the last triple only when all three parts were filled.
    if parts['head'] and parts['type'] and parts['tail']:
        found.append(snapshot())
    return found

# https://gist.githubusercontent.com/fabiochiusano/934ad5ff318626befbdd20c72e074186/raw/e3e44110a0db5408d17fba52be559ecaf676b6d2/kb_4.py

In [None]:
class KB():
    """Simple knowledge base: a list of relation dicts without duplicates.

    Two relations count as the same when their "head", "type" and "tail"
    values are equal.
    """

    def __init__(self):
        self.relations = []

    def are_relations_equal(self, r1, r2):
        """Return True when r1 and r2 agree on head, type and tail."""
        for attr in ("head", "type", "tail"):
            if not (r1[attr] == r2[attr]):
                return False
        return True

    def exists_relation(self, r1):
        """Return True when a relation equal to r1 is already stored."""
        for known in self.relations:
            if self.are_relations_equal(r1, known):
                return True
        return False

    def add_relation(self, r):
        """Store r unless an equal relation is already present."""
        if self.exists_relation(r):
            return
        self.relations.append(r)

    def print(self):
        """Print all stored relations, one per line, under a header."""
        print("Relations:")
        for stored in self.relations:
            print(f"  {stored}")

# source: https://gist.githubusercontent.com/fabiochiusano/e64d5250371e18f7a6cc02ac0cdc64c5/raw/24af0f7f23b313591fe91fc9f8826cf216ca4568/kb_5.py

In [None]:
def from_small_text_to_kb(text, verbose=False, prefix=""):
    """Extract relations from a short text and return them in a KB.

    The text is tokenized with the module-level ``tokenizer`` (truncated
    to 512 tokens), passed to the module-level ``model`` with beam search
    (5 beams, 5 returned sequences), and every decoded sequence is parsed
    with extract_relations_from_model_output. A relation is kept only
    when its head occurs literally in *text*.

    Parameters:
        text: the text to analyse.
        verbose: when true, report the number of input tokens via squeal.
        prefix: string put in front of the verbose message.

    Returns:
        A KB holding the unique relations that were kept.
    """
    kb = KB()

    # Tokenize text
    model_inputs = tokenizer(text, max_length=512, padding=True, truncation=True,
                             return_tensors='pt')
    if verbose:
        squeal(f"{prefix}Num tokens: {len(model_inputs['input_ids'][0])}")

    # Generate
    gen_kwargs = {
        "max_length": 216,
        "length_penalty": 0,
        "num_beams": 5,
        "num_return_sequences": 5
    }
    generated_tokens = model.generate(
        **model_inputs,
        **gen_kwargs,
    )
    # Special tokens are kept because the relation parser relies on the
    # <triplet>/<subj>/<obj> markers.
    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

    # create kb
    for sentence_pred in decoded_preds:
        for r in extract_relations_from_model_output(sentence_pred):
            # Block hallucinations: the head must appear in the input text.
            # A plain substring test is used because the head is model
            # output, not a regular expression; metacharacters in it must
            # not be interpreted (or raise a pattern error).
            if r["head"] in text:
                kb.add_relation(r)

    return kb

# source: https://gist.githubusercontent.com/fabiochiusano/ceec4d9ff1ce2ad25c40fbd8412aa9e4/raw/796771f88776fca9d7c4c84bd1b3a52d9ef5b5c1/kb_6.py

In [None]:
def extract_relations_per_text(text):
    """Extract relations from *text* one sentence at a time.

    The text is split with ``sent_tokenize``; each sentence is processed
    by from_small_text_to_kb (in verbose mode, with a
    "sentence i/n " progress prefix). The relations found per sentence
    are concatenated, so the same relation can occur more than once when
    it is found in different sentences.

    Returns a list of relation dicts.
    """
    relations = []
    sentences = sent_tokenize(text)
    for sentence_index, sentence in enumerate(sentences):
        clear_output(wait=True)
        prefix = f"sentence {1+sentence_index}/{len(sentences)} "
        kb = from_small_text_to_kb(sentence, verbose=True, prefix=prefix)
        # Use the public attribute rather than poking into __dict__.
        relations.extend(kb.relations)
    return relations

# source for line 6: https://gist.githubusercontent.com/fabiochiusano/a720da218ee8d19de3130fa36c23a69b/raw/a9b94a3ddbad61cfb3713234476423fffbfdca41/kb_7.py

In [None]:
# Load tokenizer and seq2seq model from the "Babelscape/rebel-large"
# checkpoint; from_small_text_to_kb reads both as module-level globals.
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

In [None]:
# Extract relations from a single story; the key must match a heading line
# found by get_stories_from_text exactly.
relations = extract_relations_per_text(texts["LITTLE RED-CAP [LITTLE RED RIDING HOOD]"])

In [None]:
# Show the extracted relations as a table (columns come from the dict keys).
pd.DataFrame(relations)

## 3. Look up relation parts on Wikidata

In [None]:
import requests

In [None]:
# Search results per looked-up string, filled by get_wikidata_info:
# entity_cache for heads/tails, property_cache for relation types.
entity_cache = {}
property_cache = {}

In [None]:
def get_wikidata_info(entity_name, cache=entity_cache, find_property=False):
    """Search Wikidata for *entity_name* and return the list of matches.

    Results are memoised in *cache* (by default the shared module-level
    ``entity_cache``, so lookups persist across calls); a cached name is
    returned without a new request.

    Parameters:
        entity_name: the string to search for.
        cache: dict used to store and look up earlier results.
        find_property: when true, search for properties instead of items.

    Returns:
        The "search" list from the API response (at most 10 entries), or
        an empty list when the response holds no "search" key.

    Raises:
        requests.exceptions.RequestException: when the request fails or
            takes longer than the timeout.
    """
    if entity_name in cache:
        return cache[entity_name]
    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbsearchentities",
        "format": "json",
        "language": "en",
        "limit": 10,
        "uselang": "en",
        "search": entity_name
    }
    if find_property:
        params["type"] = "property"
    # Without a timeout an unresponsive server would block the notebook
    # indefinitely.
    response = requests.get(url, params=params, timeout=30)
    data = response.json()
    cache[entity_name] = data.get("search", [])
    return cache[entity_name]

In [None]:
# Fill the caches: look up every head and tail as an entity and every
# relation type as a property. The return values are not used here.
for relation_index, relation in enumerate(relations):
    clear_output(wait=True)
    print(f"processing relation {relation_index + 1}/{len(relations)}")
    get_wikidata_info(relation["head"])
    get_wikidata_info(relation["type"], cache=property_cache, find_property=True)
    get_wikidata_info(relation["tail"])

## 4. Convert relation triples to RDF

In [None]:
from rdflib import Graph, URIRef, Namespace

In [None]:
# Namespaces used to build URIs: wd for entities (heads and tails),
# wdt for properties (relation types).
wd = Namespace("http://www.wikidata.org/entity/")
wdt = Namespace("http://www.wikidata.org/prop/direct/")

In [None]:
def store_relations_in_knowledge_graph(relations):
    """Build an RDF graph from the relations that could be fully resolved.

    A relation is added only when its head and tail have at least one
    match in ``entity_cache`` and its type has at least one match in
    ``property_cache``; the first match of each is used. Other relations
    are skipped silently.

    Returns the resulting ``Graph``.
    """
    knowledge_graph = Graph()
    for relation in relations:
        # Missing keys and empty match lists are both falsy here.
        resolved = (entity_cache.get(relation["head"]) and
                    entity_cache.get(relation["tail"]) and
                    property_cache.get(relation["type"]))
        if not resolved:
            continue
        head_id = entity_cache[relation['head']][0]['id']
        type_id = property_cache[relation['type']][0]['id']
        tail_id = entity_cache[relation['tail']][0]['id']
        triple = (URIRef(wd[head_id]), URIRef(wdt[type_id]), URIRef(wd[tail_id]))
        knowledge_graph.add(triple)
    return knowledge_graph

In [None]:
# Build the graph from the relations whose parts were all found on Wikidata.
knowledge_graph = store_relations_in_knowledge_graph(relations)

## 5. Extract information from knowledge graph with SPARQL queries

For tips on building SPARQL queries, see the Wikidata SPARQL tutorial: https://www.wikidata.org/wiki/Wikidata:SPARQL_tutorial

In [None]:
def get_property_id(property_label, cache=property_cache):
    """Return the id of the first cached match for *property_label*.

    Parameters:
        property_label: the label that was looked up earlier.
        cache: dict mapping labels to lists of search results; defaults
            to the module-level ``property_cache``.

    Raises:
        ValueError: when the label is not in *cache* or has no matches.
    """
    # Honour the cache argument instead of always reading the global.
    matches = cache.get(property_label)
    if not matches:
        raise ValueError(f"unknown property label: {property_label}")
    return matches[0]["id"]

In [None]:
# Prefix that get_entity_label puts in front of a cached id to compare it
# with an entity URL.
url_prefix = "http://www.wikidata.org/entity/"
# Separate cache for lookups by id done in get_entity_label.
search_cache = {}

def get_entity_label(entity_url):
    """Return a human-readable label for a Wikidata entity URL.

    The label is taken from the first entry in ``entity_cache`` whose id,
    prefixed with ``url_prefix``, equals *entity_url*. When no cached
    entry matches, the last path segment of the URL is searched on
    Wikidata (results kept in ``search_cache``) and the label of the
    first hit is returned. When that search yields nothing, the id itself
    is returned instead of raising IndexError.
    """
    entity_id = entity_url.split("/")[-1]
    for candidates in entity_cache.values():
        for entity in candidates:
            if url_prefix + entity["id"] == entity_url:
                return entity["label"]
    matches = get_wikidata_info(entity_id, cache=search_cache)
    if not matches:
        # Nothing found: fall back to the bare id so callers still get a
        # usable string.
        return entity_id
    return matches[0]["label"]

In [None]:
# Show the property id cached for the label "mother"; raises ValueError
# when that label was not found during the lookup step.
get_property_id("mother")

In [None]:
# Select all head/tail pairs connected by property P25.
# NOTE(review): P25 is hard-coded; presumably it is the id returned by
# get_property_id("mother") -- confirm when changing the relation.
query = """
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>

SELECT ?head ?tail
WHERE { 
    ?head wdt:P25 ?tail. 
}
"""

In [None]:
# Run the query on the local graph and show the results with entity URLs
# replaced by their labels.
pd.DataFrame([(get_entity_label(str(relation[0])),
               get_entity_label(str(relation[1])))
              for relation in knowledge_graph.query(query)], columns=["head", "tail"])

## 6. Visualize knowledge graph

In [None]:
from rdflib.extras.external_graph_libs import rdflib_to_networkx_multidigraph
import networkx as nx
import matplotlib.pyplot as plt

In [None]:
# Convert the RDF graph to a networkx graph and draw it.
G = rdflib_to_networkx_multidigraph(knowledge_graph)
pos = nx.spring_layout(G, scale=2)
# NOTE(review): this only yields labels if the edges carry an 'r'
# attribute -- confirm against the rdflib converter's edge attributes.
edge_labels = nx.get_edge_attributes(G, 'r')
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
# Reuse the same layout for the nodes; without pos, nx.draw computes a new
# layout and the nodes no longer line up with the edge labels.
nx.draw(G, pos=pos, with_labels=True, font_size=6)
plt.show()