# Extract triples using OpenIE 5 and OLLIE

In [1]:
import json
from pyopenie import OpenIE5

def read_text(filename):
    """Return the full contents of the text file *filename* as a single string."""
    with open(filename) as handle:
        return handle.read()

def write_text(text, filename):
    """Write each item obtained by iterating *text* to *filename*, in order.

    *text* may be a list of strings or a single string (iterated character
    by character); no separators or newlines are added. Any existing file
    is overwritten.
    """
    with open(filename, 'w') as handle:
        handle.writelines(text)
            
def read_json(filename):
    """Parse the JSON document stored in *filename* and return the result."""
    with open(filename) as handle:
        return json.load(handle)

def write_json(data, filename):
    """Serialize *data* as JSON into *filename*, overwriting any existing file."""
    with open(filename, 'w') as handle:
        json.dump(data, handle)

In [6]:
# Convert OLLIE's tab-separated triples file into JSON, grouping the
# extractions by column 6 (the `text` column in OLLIE's header row).
ollie_output = read_text('../data/ollie_output.txt')
triples = {}

# The first line is the column header; every following line is one extraction.
title, *extractions = ollie_output.split('\n')
print(title)

for row in extractions:
    # skip empty line
    if row == '':
        continue
    columns = row.split('\t')
    # The literal string 'None' in the enabler/attribution columns becomes a real None.
    enabler = None if columns[4] == 'None' else columns[4]
    attribution = None if columns[5] == 'None' else columns[5]
    triples.setdefault(columns[6], []).append({
        'subject': columns[1],
        'relation': columns[2],
        'object': [columns[3]],
        'enabler': enabler,
        'attribution': attribution,
    })

write_json(triples, '../data/ollie_triples.json')

confidence	arg1	rel	arg2	enabler	attribution	text	pattern	dependencies


In [3]:
# Load the preprocessed text; each line is treated as one sentence.
text = read_text('../data/preprocessed_text.txt')
sentences = text.split('\n')
# A file that ends with a newline yields one trailing empty string; drop only
# that, so the last sentence is kept when the file has no final newline.
if sentences[-1] == '':
    sentences.pop()

# Get triples from OpenIE5 for the preprocessed text
# edges: flat list of [sentence node, label, text] lists, where the sentence
# node is '$' followed by the sentence's index.
edges = []

# triples: sentence -> list of extraction dicts (subject/relation/object/negated).
triples = {}

extractor = OpenIE5('http://localhost:8000')
for s_no, sentence in enumerate(sentences):
    sentence_node = '$' + str(s_no)
    extractions = extractor.extract(sentence)
    triples[sentence] = []
    for extraction in extractions:
        extraction = extraction['extraction']
        subject = extraction['arg1']['text']
        relation = extraction['rel']['text']
        objects = [arg['text'] for arg in extraction['arg2s']]
        negated = extraction['negated']

        # An extraction is flagged when its subject or any object is longer
        # than five words.
        # NOTE(review): nothing reads this flag, so flagged extractions are
        # still stored below — confirm whether they were meant to be skipped.
        bad_extraction = (
            len(subject.split()) > 5
            or any(len(obj.split()) > 5 for obj in objects)
        )

        triples[sentence].append({
            'subject': subject,
            'relation': relation,
            'object': objects,
            'negated': negated
        })

        # Every edge is a list: the original set literal here lost element
        # order, collapsed equal relation/subject texts, and is not
        # JSON-serializable.
        edges.append([sentence_node, relation, subject])
        for obj in objects:
            edges.append([sentence_node, 'object', obj])
        if negated:
            edges.append([sentence_node, 'type', 'Don\'t'])

In [4]:
# Save the current `triples` mapping as JSON for the OpenIE5 run.
write_json(triples, '../data/openie5_triples.json')