In [1]:
import gcsfs
import json
import ast
import pandas as pd
import numpy as np
import re
import spacy
from spacy.util import filter_spans 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_path = 'gs://doccano_annotation_2/data/doccano_export_10_10_2022.jsonl'
drop_labels = ['BIOVERB']
display = False
write_all = True
write_only_test = False

In [3]:
# Read data
def read_jsonl(data_path):
    gcs_file_system = gcsfs.GCSFileSystem()
    with gcs_file_system.open(data_path, 'r', encoding="utf-8") as f:
        json_list = list(f)
    data = []
    for j in json_list:
        data.append(json.loads(j))
    return pd.DataFrame(data)

def read_parquet(data_path):
    return pd.read_parquet(data_path)

def doccano_doc_to_df(df):
    data = df[['doccano_doc']].copy()
    data['entities'] = data['doccano_doc'].apply(lambda x: x['entities'])
    data['text'] = data['doccano_doc'].apply(lambda x: x['text'])
    data['relations'] = data['doccano_doc'].apply(lambda x: x['relations'])
    return data[['text', 'entities', 'relations']]

In [4]:
if data_path.split('.')[-1] == 'jsonl':
    our_data = read_jsonl(data_path)
elif data_path.split('.')[-1] == 'parquet':
    our_data = read_parquet(data_path)
    our_data = doccano_doc_to_df(our_data)
else:
    raise Error('Please provide jsonl or paquet format')

In [5]:
our_data.head()

Unnamed: 0,id,text,Comments,entities,relations
0,110,Infection of the cell lines with M. arginini r...,[],"[{'id': 17390, 'label': 'TARGET', 'start_offse...","[{'id': 9805, 'from_id': 34791, 'to_id': 17392..."
1,111,Our results show that RA-NP inhibited LPS-indu...,[],"[{'id': 17395, 'label': 'BIOVERB', 'start_offs...","[{'id': 2250, 'from_id': 17400, 'to_id': 17396..."
2,112,"PF (10 μM) inhibited IL-33 production, Ca infl...","[{'id': 2, 'comment': 'Impossible a Tagger !!!...","[{'id': 17402, 'label': 'TARGET', 'start_offse...","[{'id': 2253, 'from_id': 17401, 'to_id': 17406..."
3,113,We further showed that 4.1B inhibited the prol...,[],"[{'id': 17409, 'label': 'TARGET', 'start_offse...","[{'id': 2259, 'from_id': 17418, 'to_id': 17412..."
4,114,"Y(1) receptor antagonists, BIBP3226 and BIBO33...",[],"[{'id': 17420, 'label': 'TARGET', 'start_offse...","[{'id': 9821, 'from_id': 17425, 'to_id': 17424..."


In [6]:
len_relations = our_data.relations.apply(lambda x: len(x))
our_data = our_data.loc[len_relations > 0]

In [7]:
our_data.shape

(386, 5)

In [8]:
ner_labels = our_data.entities.apply(lambda x: [x[i]['label'] for i in range(len(x))])
print('Entities labels : ', list(set(np.sum([ner_labels.iloc[k] for k in range(ner_labels.shape[0])]))))

Entities labels :  ['DISEASE', 'TISSUE', 'CELL_LINE', 'SMALL_MOLECULE', 'BIOVERB', 'CHEMICAL', 'PATHWAY', 'UNKNOWN', 'CELL LINE', 'TARGET']


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [9]:
classes = our_data.relations.apply(lambda x: [x[i]['type'] for i in range(len(x))])
print('Relations labels : ', list(set(np.sum([classes.iloc[k] for k in range(classes.shape[0])]))))

Relations labels :  ['!induc', 'regul', 'suppress', 'interact', '!bind', 'inhibit', 'reduc', 'elevate', '!decreas', '!regul', 'induc', 'promote', 'imped', 'decreas', 'bind', '!increas', 'target', '!reduc', '!block', 'enhance', 'block', 'express', 'activ', 'increas', 'stimulate', '!activ', '!express', '!inhibit']


In [10]:
# Drop labels that we are not interested in
def drop_label(x, labels):
    return [entity_dict for entity_dict in x if entity_dict['label'] not in labels]

# Preprocess text input
def preprocess_text(string):
    return string.lower()

In [11]:
# Get tokens
def get_tokens(x, i):
    doc = x['doc'].copy()
    text = str(doc[i])
    start = doc[i:i+1].start_char
    end = doc[i:i+1].end_char
    if len(str(doc)) > end:
        next_char = str(doc)[end]
    else:
        next_char = ''
    if next_char == ' ':
        ws = True
    else:
        ws = False
    return {'text': text, 
            'start': start, 
            'end': end, 
            'id': i, 
            'ws': ws,
            'disabled': True}

In [12]:
# Get spans
def get_spans(x):
    spans = []
    doc = x['doc'].copy()
    for d in x.entities:
        start = d['start_offset']
        end = d['end_offset']
        substring = x['text'][start: end]
        lspaces = len(substring) - len(substring.lstrip())
        rspaces = len(substring) - len(substring.rstrip())
        start += lspaces
        end -= rspaces
        span = doc.char_span(start, end, d['label'], kb_id=d['id'])
        if span is not None:
            spans.append(span)
    filtered_spans = filter_spans(spans)
    return [{'id': span.kb_id,
            'text': str(span), 
            'start': span.start_char, 
            'token_start': span.start,
            'token_end': span.end, 
            'end': span.end_char, 
            'type': 'span',
            'label': span.label_}
           for span in filtered_spans]

In [13]:
# Get relations
def get_head_child_attributes(spans, head_entity, child_entity, label):
    head_dicts = [span_dict for span_dict in spans if (span_dict['id']==head_entity)]
    child_dicts = [span_dict for span_dict in spans if (span_dict['id']==child_entity)]
    
    assert len(head_dicts) <= 1
    assert len(child_dicts) <= 1

    if (len(head_dicts) > 0) & (len(child_dicts) > 0):
        head_start = head_dicts[0]['start']
        head_end = head_dicts[0]['end']
        head_token_start = head_dicts[0]['token_start']
        head_token_end = head_dicts[0]['token_end']
        head_label = head_dicts[0]['label']

        child_start = child_dicts[0]['start']
        child_end = child_dicts[0]['end']
        child_token_start = child_dicts[0]['token_start']
        child_token_end = child_dicts[0]['token_end']
        child_label = child_dicts[0]['label']

        head = head_token_end
        child = child_token_end
        label = label

        relations_dict = {'head': head,
                          'child': child,
                          'head_span': {'start': head_start, 
                                       'end': head_end, 
                                       'token_start': head_token_start, 
                                       'token_end': head_token_end, 
                                       'label': head_label}, 
                            'child_span': {'start': child_start, 
                                       'end': child_end, 
                                       'token_start': child_token_start, 
                                       'token_end': child_token_end, 
                                       'label': child_label}, 
                            'label': label}
    else:
        relations_dict = {}
    return relations_dict


def get_relations(x, directed=True, add_no_rel=False, labels_to_drop=['target']):
    relations_form = []
    relations = x['relations'].copy()
    spans = x['spans'].copy()
    remaining_combinations = [(spans[i]['id'], spans[j]['id']) 
                        for i in range(len(spans))
                        for j in range(len(spans))
                        if spans[i]['id'] != spans[j]['id']]
    for i in range(len(relations)):
        head_entity = relations[i]['from_id'] 
        child_entity = relations[i]['to_id']
        rel_label = x['relations'][i]['type'].split('!')[-1]
        if rel_label not in labels_to_drop:
            relations_dict = get_head_child_attributes(spans, head_entity, child_entity, rel_label)
            if relations_dict != {}:
                relations_form.append(relations_dict)
                if not directed:
                    relations_form.append(get_head_child_attributes(spans, child_entity, head_entity, 
                                                                    x['relations'][i]['type'].split('!')[-1]))
            try:
                remaining_combinations.remove((head_entity, child_entity))
                if not directed:
                    remaining_combinations.remove((child_entity, head_entity))
            except ValueError:
                pass
    if add_no_rel:
        for c in range(len(remaining_combinations)):
            head_entity = remaining_combinations[c][0]
            child_entity = remaining_combinations[c][1]
            relations_dict = get_head_child_attributes(spans, head_entity, child_entity, 'No-Rel')
            relations_form.append(relations_dict)
    return relations_form

In [14]:
def delete_rows_no_rel(x):
    to_drop = False
    labels = [x['relations_form'][i]['label'] for i in range(len(x['relations_form']))]
    if (list(set(labels)) == ['No-Rel']) or (len(labels) == 0):
        to_drop = True
    return to_drop

In [15]:
def preprocess(our_data, drop_labels=[], directed=True, add_no_rel=False, rel_labels_to_drop=['target']):
    nlp = spacy.load('en_core_web_sm')
    # nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(r'\S+').match)
    if len(drop_labels) > 0:
        our_data['entities'] = our_data['entities'].apply(lambda x: drop_label(x, drop_labels))
    our_data['text_preprocessed'] = our_data['text'].str.lower()
    our_data['doc'] = our_data['text_preprocessed'].apply(lambda x: nlp(x))
    our_data['tokens'] = our_data.apply(lambda x: [get_tokens(x, i) for i in range(len(x.doc))], axis=1)
    our_data['spans'] = our_data.apply(lambda x: get_spans(x), axis=1)
    our_data['relations_form'] = our_data.apply(lambda x: get_relations(x, 
                                                                        directed=directed, 
                                                                        add_no_rel=add_no_rel, 
                                                                        labels_to_drop=rel_labels_to_drop), 
                                                axis=1)
    our_data['to_drop'] = our_data.apply(lambda x: delete_rows_no_rel(x), axis=1)
    our_data = our_data[our_data.to_drop==False].copy()
    our_data['answer'] = 'accept'
    our_data['meta'] = None
    our_data['meta'] = our_data['meta'].apply(lambda x: {'source': 'unkown'})
    formatted_data = our_data[['text_preprocessed', 'spans', 'tokens', 'relations_form', 'answer', 'meta']]
    formatted_data.columns = ['text', 'spans', 'tokens', 'relations', 'answer', 'meta']
    return formatted_data

In [16]:
formatted_data = preprocess(our_data, drop_labels=drop_labels)

In [17]:
formatted_data.shape

(379, 6)

In [18]:
# Display spans
from spacy import displacy

def display_entities(text, entities, entity_type=None):
    nlp = spacy.blank("en")
    doc = nlp.make_doc(text)
    ents = []
    for entity in entities:
        if (entity["label"] == entity_type) or (entity_type is None):
            span_start = entity["start"]
            span_end = entity["end"]
            label = entity["label"]
            ent = doc.char_span(span_start, span_end, label=label)
            if ent is None:
                continue
            ents.append(ent)
    doc.ents = ents
    displacy.render(doc, style="ent", jupyter=True)

if display:
    not_working = []
    for k in range(formatted_data.shape[0]):
        row = formatted_data.iloc[k]
        try:
            display_entities(row.text, row.spans, None)
        except ValueError:
            not_working.append(k)

In [19]:
def write_jsonl(df, path):
    our_jsonl = df.to_dict(orient='records')
    with open(path, 'w') as outfile:
        for json_line in our_jsonl:
            json.dump(json_line, outfile)
            outfile.write('\n')

In [20]:
if write_only_test:
    assert write_all == False
    write_jsonl(formatted_data, '../NER/assets/annotations_test.jsonl')
    write_jsonl(formatted_data, '../RE/assets/annotations_test.jsonl')

In [21]:
if write_all:
    np.random.seed(0)
    n1 = int(formatted_data.shape[0] * 65/100)
    n2 = int(formatted_data.shape[0] * 85/100) 
    df_shuffled = formatted_data.sample(frac=1)
    annotations_train = df_shuffled[:n1]
    annotations_dev = df_shuffled[n1:n2]
    annotations_test = df_shuffled[n2:]
    # Write for NER
    write_jsonl(annotations_train, '../NER/assets/annotations_train.jsonl')
    write_jsonl(annotations_dev, '../NER/assets/annotations_dev.jsonl')
    write_jsonl(annotations_test, '../NER/assets/annotations_test.jsonl')
    # Write for RE
    write_jsonl(annotations_train, '../RE/assets/annotations_train.jsonl')
    write_jsonl(annotations_dev, '../RE/assets/annotations_dev.jsonl')
    write_jsonl(annotations_test, '../RE/assets/annotations_test.jsonl')

In [22]:
labels = formatted_data.relations.apply(lambda x: [x[i]['label'] for i in range(len(x))])

In [23]:
total_labels = list(np.sum([labels.iloc[k] for k in range(labels.shape[0])]))

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [24]:
for label in np.unique(total_labels):
    print(label, total_labels.count(label) / len(total_labels))

activ 0.18608058608058609
bind 0.07472527472527472
block 0.04322344322344322
decreas 0.0652014652014652
elevate 0.0029304029304029304
enhance 0.011721611721611722
express 0.12747252747252746
imped 0.007326007326007326
increas 0.0695970695970696
induc 0.08937728937728938
inhibit 0.13553113553113552
interact 0.0029304029304029304
promote 0.008791208791208791
reduc 0.07692307692307693
regul 0.07179487179487179
stimulate 0.014652014652014652
suppress 0.011721611721611722


Pb : sur-représentation de la classe No-Rel

Solutions possibles : 

    - Modifier la fonction de perte en pondérant le score en fonction de la classe
    - Considérer des relations non dirigées : relations(head, child) = relation(child, head) ==> moins de No-Rel 
    - Concaténer deux modèles : cf https://doc.rero.ch/record/327157/files/cud_tci.pdf
        - un premier modèle qui prédit l'absence de relation
        - un deuxième qui prédit la classe de relation
        
        
Remarque : 
- Pas de classe No-Rel !! Dans le modèle, on considère qu'il y a une relation A de e1 vers e2 ssi proba(A, e1, e2) > s=0.5. Si toutes les probas sont inférieures à s, alors pas de relation