In [None]:
### FOR THIS NOTEBOOK WE NEED:

# Python 3.7
# Spacy 2.1
# Neuralcoref

In [1]:
import requests
response = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'format': 'json',
        'titles': 'Decentralized finance',
        'prop': 'extracts',
        'exintro': True,
        'explaintext': True,
    }
).json()
page = next(iter(response['query']['pages'].values()))
text = page['extract']

In [2]:
print(text)

Decentralized finance (DeFi) offers financial instruments without relying on  intermediaries such as brokerages, exchanges, or banks. Instead, it uses smart contracts on a blockchain. DeFi platforms allow people to lend or borrow funds from others, speculate on price movements on assets using derivatives, trade cryptocurrencies, insure against risks, and earn interest in savings-like accounts. DeFi uses a layered architecture and highly composable building blocks. Some applications promote high interest rates but are subject to high risk. As of October 2021,  the value of assets used in decentralized finance amounted to $100 billion.


In [5]:
import pandas as pd
import re
import spacy
import neuralcoref

nlp = spacy.load('en_core_web_lg')
neuralcoref.add_to_pipe(nlp)


def get_entity_pairs(text, coref=True):
    # preprocess text
    text = re.sub(r'\n+', '.', text)  # replace multiple newlines with period
    text = re.sub(r'\[\d+\]', ' ', text)  # remove reference numbers
    text = nlp(text)
    if coref:
        text = nlp(text._.coref_resolved)  # resolve coreference clusters

    def refine_ent(ent, sent):
        unwanted_tokens = (
            'PRON',  # pronouns
            'PART',  # particle
            'DET',  # determiner
            'SCONJ',  # subordinating conjunction
            'PUNCT',  # punctuation
            'SYM',  # symbol
            'X',  # other
        )
        ent_type = ent.ent_type_  # get entity type
        if ent_type == '':
            ent_type = 'NOUN_CHUNK'
            ent = ' '.join(str(t.text) for t in
                           nlp(str(ent)) if t.pos_
                           not in unwanted_tokens and t.is_stop == False)
        elif ent_type in ('NOMINAL', 'CARDINAL', 'ORDINAL') and str(ent).find(' ') == -1:
            refined = ''
            for i in range(len(sent) - ent.i):
                if ent.nbor(i).pos_ not in ('VERB', 'PUNCT'):
                    refined += ' ' + str(ent.nbor(i))
                else:
                    ent = refined.strip()
                    break

        return ent, ent_type

    sentences = [sent.string.strip() for sent in text.sents]  # split text into sentences
    ent_pairs = []
    for sent in sentences:
        sent = nlp(sent)
        spans = list(sent.ents) + list(sent.noun_chunks)  # collect nodes
        spans = filter_spans(spans)
        with sent.retokenize() as retokenizer:
            [retokenizer.merge(span, attrs={'tag': span.root.tag,
                                            'dep': span.root.dep}) for span in spans]
        deps = [token.dep_ for token in sent]

        # limit our example to simple sentences with one subject and object
        if (deps.count('obj') + deps.count('dobj')) != 1\
                or (deps.count('subj') + deps.count('nsubj')) != 1:
            continue

        for token in sent:
            if token.dep_ not in ('obj', 'dobj'):  # identify object nodes
                continue
            subject = [w for w in token.head.lefts if w.dep_
                       in ('subj', 'nsubj')]  # identify subject nodes
            if subject:
                subject = subject[0]
                # identify relationship by root dependency
                relation = [w for w in token.ancestors if w.dep_ == 'ROOT']
                if relation:
                    relation = relation[0]
                    # add adposition or particle to relationship
                    if relation.nbor(1).pos_ in ('ADP', 'PART'):
                        relation = ' '.join((str(relation), str(relation.nbor(1))))
                else:
                    relation = 'unknown'

                subject, subject_type = refine_ent(subject, sent)
                token, object_type = refine_ent(token, sent)

                ent_pairs.append([str(subject), str(relation), str(token),
                                  str(subject_type), str(object_type)])

    ent_pairs = [sublist for sublist in ent_pairs
                          if not any(str(ent) == '' for ent in sublist)]
    pairs = pd.DataFrame(ent_pairs, columns=['subject', 'relation', 'object',
                                             'subject_type', 'object_type'])
    print('Entity pairs extracted:', str(len(ent_pairs)))

    return pairs

def filter_spans(spans):
    # Filter a sequence of spans so they don't contain overlaps
    # For spaCy 2.1.4+: this function is available as spacy.util.filter_spans()
    get_sort_key = lambda span: (span.end - span.start, -span.start)
    sorted_spans = sorted(spans, key=get_sort_key, reverse=True)
    result = []
    seen_tokens = set()
    for span in sorted_spans:
        # Check for end - 1 here because boundaries are inclusive
        if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
            result.append(span)
        seen_tokens.update(range(span.start, span.end))
    result = sorted(result, key=lambda span: span.start)
    return result

In [6]:
get_entity_pairs(text)

Entity pairs extracted: 4


Unnamed: 0,subject,relation,object,subject_type,object_type
0,Decentralized finance,offers,financial instruments,NOUN_CHUNK,NOUN_CHUNK
1,Decentralized finance,uses,smart contracts,NOUN_CHUNK,NOUN_CHUNK
2,DeFi,uses,layered architecture,ORG,NOUN_CHUNK
3,applications,promote,high interest rates,NOUN_CHUNK,NOUN_CHUNK


In [19]:
with open("../whitepapers/ethereum.txt") as f:
    data = f.read()
get_entity_pairs(data)

Entity pairs extracted: 27


Unnamed: 0,subject,relation,object,subject_type,object_type
0,mechanism,solved,two problems,NOUN_CHUNK,CARDINAL
1,Alice,send,70,NOUN_CHUNK,MONEY
2,Alice,creates,transaction,PERSON,NOUN_CHUNK
3,application,returns,error,NOUN_CHUNK,NOUN_CHUNK
4,protocol,facilitate,weak version,NOUN_CHUNK,NOUN_CHUNK
5,UTXO script,is,fine grained control,NOUN_CHUNK,NOUN_CHUNK
6,Ethereum account,contains,four fields:.The nonce,NOUN_CHUNK,CARDINAL
7,transaction sender,Check that,at least 2000,NOUN_CHUNK,CARDINAL
8,Ethereum,apply,block diagram,NOUN_CHUNK,NOUN_CHUNK
9,party B,Wait for,1000 ether,NOUN_CHUNK,CARDINAL
