# Extracting triples
The model outputs relation triples in the form of structured text. This notebook shows how those relation triples are parsed back out of that text.

In [20]:
from datasets import load_dataset
import yaml
import re
import pandas as pd

## Load Config

In [2]:
# Load the run configuration; the dataset cell reads its 'dataset_vars' section.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of with the locale-dependent default.
# NOTE(review): yaml.FullLoader accepts Python-specific tags that the safe loader
# rejects. If this config only holds plain scalars/lists/mappings, prefer
# yaml.safe_load(f).
with open('config/config_testing.yaml', encoding='utf-8') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

## Load dataset

In [3]:
# Dataset type, directory and column names all come from the 'dataset_vars' config section.
dataset_vars = config['dataset_vars']
dataset = load_dataset(
    dataset_vars['type'],
    data_dir=dataset_vars['dir'],
    column_names=dataset_vars['column_names'],
)

# Row 0 of each split contains the column names, so it is skipped;
# rows 1..500 are kept for both the training and the validation subset.
kept_rows = range(1, 501)
dataset_train = dataset['train'].select(kept_rows)
dataset_eval = dataset['validation'].select(kept_rows)

In [4]:
# The "relations" column of the training subset (structured-text strings).
relations_in_structerd_text = dataset_train["relations"]

# Marker tokens that tag entity types inside the structured text.
entity_types = {
    "chemical": "@CHEMICAL@",
    "disease": "@DISEASE@",
}

# Marker tokens that tag relation types inside the structured text.
rel_types = {
    "chemical induced disease": "@CID@",
}

In [5]:
# Example: the first structured-text string of the training subset
relations_in_structerd_text[0]

'alpha-methyldopa @CHEMICAL@ hypotensive @DISEASE@ @CID@'

In [None]:
def split_on_labels(input_text, labels):
    """Split ``input_text`` on every occurrence of any label, keeping the labels.

    Args:
        input_text: Text to split, e.g. ``'a @CHEMICAL@ b @DISEASE@ @CID@'``.
        labels: Iterable of literal label strings to split on, e.g. ``['@CID@']``.
            Labels are matched literally (regex metacharacters are escaped);
            empty strings are ignored.

    Returns:
        The non-empty segments of ``input_text`` in their original order. Each
        matched label is its own item, placed between the pieces of text it
        separated. When there is no usable label the whole text is returned as a
        single segment (or ``[]`` if the text itself is empty).
    """
    # Drop empty labels: an empty alternative matches at every position, which
    # would make re.split() cut the text into single characters.
    # Longest labels go first so that a label which is a prefix of another one
    # (e.g. '@AB@' vs '@AB@C') cannot shadow the longer label in the alternation.
    usable_labels = sorted(
        {label for label in labels if label},
        key=lambda label: (-len(label), label),
    )
    if not usable_labels:
        return [input_text] if input_text else []

    # Escape each label so it is treated as a literal, and wrap the alternation
    # in a capturing group so that re.split() keeps the delimiters in its result.
    pattern = '(' + '|'.join(re.escape(label) for label in usable_labels) + ')'

    # re.split() yields '' next to a delimiter at the start/end of the text or
    # between two adjacent delimiters; those carry no information.
    return [segment for segment in re.split(pattern, input_text) if segment]

In [19]:
import re

def extract_relation_triples(input_text, labels):
    """Parse structured relation text into a list of relation dictionaries.

    The text is expected to be a sequence of ``<entities> <relation label>``
    groups. Inside a group every entity is written as ``<mention> @TYPE@`` and
    coreferent mentions of the same entity are separated by ``;``.

    Args:
        input_text: Structured text, e.g.
            ``'lithium ; li @CHEMICAL@ proteinuria @DISEASE@ @CID@'``.
        labels: Relation labels that close each group, e.g. ``['@CID@']``.

    Returns:
        One ``{relation: {entity_type: mention}}`` dict per group, in text order.
        ``relation`` is the label without its surrounding ``@`` and
        ``entity_type`` is the name between the ``@`` markers. ``mention`` is a
        string, or a list of strings when the entity has several coreferent
        mentions. If an entity type occurs more than once in a group, the last
        occurrence wins. Empty or whitespace-only text yields ``[]``.

    Raises:
        ValueError: If entity text and relation labels do not strictly
            alternate, e.g. text after the last label or a label that is not
            preceded by entity text.
    """
    # Materialise once: the labels are used for splitting and for validation.
    labels = list(labels)
    relation_segments = split_on_labels(input_text, labels)

    # Ignore whitespace after the final relation label. The emptiness check
    # comes first so that empty input does not raise IndexError on [-1].
    if relation_segments and not relation_segments[-1].strip():
        relation_segments = relation_segments[:-1]
    if not relation_segments:
        return []

    # Segments are expected to alternate: entity text, label, entity text, ...
    entity_texts = relation_segments[::2]
    relation_labels = relation_segments[1::2]
    if len(entity_texts) != len(relation_labels):
        raise ValueError('Amount of relation labels in the text does not equal to amount of entities pairs')

    # Equal counts alone do not prove alternation ('@CID@ some text' also fills
    # one slot of each kind), so verify labels sit exactly in the label slots.
    known_labels = set(labels)
    misplaced_label = any(text in known_labels for text in entity_texts)
    missing_label = any(label not in known_labels for label in relation_labels)
    if misplaced_label or missing_label:
        raise ValueError('Relation labels and entity text do not alternate in the input text')

    relations = []
    for entity_text, rel_label in zip(entity_texts, relation_labels):
        relation_dict = {}

        # Each entity is '<mention(s)> @TYPE@'; the lazy group stops at the
        # first whitespace-preceded type marker that follows.
        for mentions, entity_type in re.findall(r'(.+?)\s@(\w+)@', entity_text):
            # Coreferent mentions of one entity are separated by ';'.
            coreferents = [coref.strip() for coref in mentions.split(';')]
            # A single mention is stored as a plain string, several as a list.
            relation_dict[entity_type] = coreferents if len(coreferents) > 1 else coreferents[0]

        # '@CID@' -> 'CID'. strip() also copes with labels that lack the '@'
        # delimiters, where split('@')[1] would raise IndexError.
        relations.append({rel_label.strip('@'): relation_dict})

    return relations

# Example usage: three @CID@ groups for the same chemical, which has two coreferent mentions.
input_text = (
    'lithium ; li @CHEMICAL@ chronic renal failure @DISEASE@ @CID@ '
    'lithium ; li @CHEMICAL@ proteinuria @DISEASE@ @CID@ '
    'lithium ; li @CHEMICAL@ hypertension @DISEASE@ @CID@'
)
relation_triples = extract_relation_triples(input_text, ['@CID@'])
print(relation_triples)


[{'CID': {'CHEMICAL': ['lithium', 'li'], 'DISEASE': 'chronic renal failure'}}, {'CID': {'CHEMICAL': ['lithium', 'li'], 'DISEASE': 'proteinuria'}}, {'CID': {'CHEMICAL': ['lithium', 'li'], 'DISEASE': 'hypertension'}}]


In [22]:
%%time
relation_triples = [extract_relation_triples(i, ['@CID@']) for i in relations_in_structerd_text]

CPU times: user 4.31 ms, sys: 381 µs, total: 4.69 ms
Wall time: 4.59 ms


In [23]:
# Side-by-side view: each structured-text string next to the triples parsed from it.
pd.DataFrame({'structerd_text': relations_in_structerd_text, 'Relation triples': relation_triples})

Unnamed: 0,structerd_text,Relation triples
0,alpha-methyldopa @CHEMICAL@ hypotensive @DISEA...,"[{'CID': {'CHEMICAL': 'alpha-methyldopa', 'DIS..."
1,lidocaine @CHEMICAL@ cardiac asystole @DISEASE...,"[{'CID': {'CHEMICAL': 'lidocaine', 'DISEASE': ..."
2,suxamethonium ; suxamethonium chloride ; sch @...,"[{'CID': {'CHEMICAL': ['suxamethonium', 'suxam..."
3,scopolamine ; hyoscine @CHEMICAL@ overdosage @...,"[{'CID': {'CHEMICAL': ['scopolamine', 'hyoscin..."
4,lithium ; li @CHEMICAL@ chronic renal failure ...,"[{'CID': {'CHEMICAL': ['lithium', 'li'], 'DISE..."
...,...,...
495,zonisamide @CHEMICAL@ visual hallucinations @D...,"[{'CID': {'CHEMICAL': 'zonisamide', 'DISEASE':..."
496,pan ; puromycin aminonucleoside @CHEMICAL@ nep...,"[{'CID': {'CHEMICAL': ['pan', 'puromycin amino..."
497,ticlopidine @CHEMICAL@ aplastic anemia @DISEAS...,"[{'CID': {'CHEMICAL': 'ticlopidine', 'DISEASE'..."
498,scopolamine @CHEMICAL@ amnesia @DISEASE@ @CID@...,"[{'CID': {'CHEMICAL': 'scopolamine', 'DISEASE'..."
