In [1]:
import json
import ast
import pandas as pd
import numpy as np
import re
import spacy
from spacy.util import filter_spans 
from spacy.tokenizer import Tokenizer

In [2]:
data_path = 'gs://doccano_annotation/data/preannoated_data_with_bioverbs.parquet'
drop_labels = ['BIOVERB', 'CHEMICAL', 'CELL LINE', 'UNKNOWN']
display = False

In [3]:
# Read data
def read_jsonl(data_path):
    with open(data_path, 'r') as f:
        json_list = list(f)
    data = []
    for j in json_list:
        data.append(json.loads(j))
    return pd.DataFrame(data)

def read_parquet(data_path):
    return pd.read_parquet(data_path)

def doccano_doc_to_df(df):
    data = df[['doccano_doc']].copy()
    data['entities'] = data['doccano_doc'].apply(lambda x: x['entities'])
    data['text'] = data['doccano_doc'].apply(lambda x: x['text'])
    data['relations'] = data['doccano_doc'].apply(lambda x: x['relations'])
    return data[['text', 'entities', 'relations']]

In [4]:
if data_path.split('.')[-1] == 'jsonl':
    our_data = read_jsonl(data_path)
elif data_path.split('.')[-1] == 'parquet':
    our_data = read_parquet(data_path)
    our_data = doccano_doc_to_df(our_data)
else:
    raise Error('Please provide jsonl or paquet format')

In [5]:
our_data.head()

Unnamed: 0,text,entities,relations
0,1 Effects of phorbol esters on the evoked nor...,"[{'end_offset': 28, 'id': 22692, 'label': 'CHE...","[{'from_id': 22699, 'to_id': 22700, 'type': 'i..."
1,Polymorphisms in NFKB1 that diminish its expr...,"[{'end_offset': 23, 'id': 22811, 'label': 'TAR...","[{'from_id': 22811, 'to_id': 22812, 'type': 'i..."
2,A rabbit brain beta-galactosidase catalyzes t...,"[{'end_offset': 34, 'id': 22847, 'label': 'TAR...","[{'from_id': 22854, 'to_id': 22849, 'type': 'i..."
3,The Saccharomyces cerevisiae CRY1 and CRY2 ge...,"[{'end_offset': 34, 'id': 22942, 'label': 'TAR...","[{'from_id': 22946, 'to_id': 22942, 'type': 'e..."
4,The multifunctional cytokine interleukin-6 (I...,"[{'end_offset': 29, 'id': 23054, 'label': 'TAR...","[{'from_id': 23054, 'to_id': 23060, 'type': 'r..."


In [6]:
ner_labels = our_data.entities.apply(lambda x: [x[i]['label'] for i in range(len(x))])
print('Entities labels : ', list(set(np.sum([ner_labels.iloc[k] for k in range(ner_labels.shape[0])]))))

Entities labels :  ['CHEMICAL', 'CELL_LINE', 'TARGET', 'UNKNOWN', 'DISEASE', 'BIOVERB']


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [7]:
classes = our_data.relations.apply(lambda x: [x[i]['type'] for i in range(len(x))])
print('Relations labels : ', list(set(np.sum([classes.iloc[k] for k in range(classes.shape[0])]))))

Relations labels :  ['!increas', '!reduc', 'activ', '!decreas', '!bind', '!express', 'bind', 'regul', '!block', 'induc', 'decreas', 'reduc', 'inhibit', 'increas', '!activ', '!regul', '!inhibit', 'express', 'block', '!induc']


In [9]:
# Drop labels that we are not interested in
def drop_label(x, labels):
    return [entity_dict for entity_dict in x if entity_dict['label'] not in labels]

# Preprocess text input
def preprocess_text(string):
    return string.lower()

In [10]:
# Get tokens
def get_tokens(x, i):
    doc = x['doc'].copy()
    text = str(doc[i])
    start = doc[i:i+1].start_char
    end = doc[i:i+1].end_char
    return {'text': text, 
            'start': start, 
            'end': end, 
            'id': i, 
            'ws': True if end<len(x.text_preprocessed) else False,
            'disabled': True}

In [11]:
# Get spans
def get_spans(x):
    spans = []
    doc = x['doc'].copy()
    for d in x.entities:
        span = doc.char_span(d['start_offset'], d['end_offset'], d['label'], kb_id=d['id'])
        if str(span)[-1] == ' ':
            span = doc.char_span(d['start_offset'], d['end_offset']-1, d['label'], kb_id=d['id']) # delete ' ' at the beginning of the entity
        if str(span)[0] == ' ':
            span = doc.char_span(d['start_offset']+1, d['end_offset'], d['label'], kb_id=d['id']) # delete ' ' at the end of the entity
        if span is not None:
            spans.append(span)
    filtered_spans = filter_spans(spans)
    return [{'id': span.kb_id,
            'text': str(span), 
            'start': span.start_char, 
            'token_start': span.start,
            'token_end': span.end, 
            'end': span.end_char, 
            'type': 'span',
            'label': span.label_}
           for span in filtered_spans]

In [12]:
# Get relations
def get_head_child_attributes(spans, head_entity, child_entity, label):
    head_dicts = [span_dict for span_dict in spans if (span_dict['id']==head_entity)]
    child_dicts = [span_dict for span_dict in spans if (span_dict['id']==child_entity)]

    assert len(head_dicts) <= 1
    assert len(child_dicts) <= 1

    if (len(head_dicts) > 0) & (len(child_dicts) > 0):
        head_start = head_dicts[0]['start']
        head_end = head_dicts[0]['end']
        head_token_start = head_dicts[0]['token_start']
        head_token_end = head_dicts[0]['token_end']
        head_label = head_dicts[0]['label']

        child_start = child_dicts[0]['start']
        child_end = child_dicts[0]['end']
        child_token_start = child_dicts[0]['token_start']
        child_token_end = child_dicts[0]['token_end']
        child_label = child_dicts[0]['label']

        head = head_token_end
        child = child_token_end
        label = label

        relations_dict = {'head': head,
                          'child': child,
                          'head_span': {'start': head_start, 
                                       'end': head_end, 
                                       'token_start': head_token_start, 
                                       'token_end': head_token_end, 
                                       'label': head_label}, 
                            'child_span': {'start': child_start, 
                                       'end': child_end, 
                                       'token_start': child_token_start, 
                                       'token_end': child_token_end, 
                                       'label': child_label}, 
                            'label': label}
    else:
        relations_dict = {}
    return relations_dict


def get_relations(x, directed=True, add_no_rel=False):
    relations_form = []
    relations = x['relations'].copy()
    spans = x['spans'].copy()
    remaining_combinations = [(spans[i]['id'], spans[j]['id']) 
                        for i in range(len(spans))
                        for j in range(len(spans))
                        if spans[i]['id'] != spans[j]['id']]
    for i in range(len(relations)):
        head_entity = relations[i]['from_id'] 
        child_entity = relations[i]['to_id']
        relations_dict = get_head_child_attributes(spans, head_entity, child_entity, x['relations'][i]['type'].split('!')[-1])
        if relations_dict != {}:
            relations_form.append(relations_dict)
            if not directed:
                relations_form.append(get_head_child_attributes(spans, child_entity, head_entity, 
                                                                x['relations'][i]['type'].split('!')[-1]))
        try:
            remaining_combinations.remove((head_entity, child_entity))
            if not directed:
                remaining_combinations.remove((child_entity, head_entity))
        except ValueError:
            pass
    if add_no_rel:
        for c in range(len(remaining_combinations)):
            head_entity = remaining_combinations[c][0]
            child_entity = remaining_combinations[c][1]
            relations_dict = get_head_child_attributes(spans, head_entity, child_entity, 'No-Rel')
            relations_form.append(relations_dict)
    return relations_form

In [13]:
def delete_rows_no_rel(x):
    to_drop = False
    labels = [x['relations_form'][i]['label'] for i in range(len(x['relations_form']))]
    if (list(set(labels)) == ['No-Rel']) or (len(labels) == 0):
        to_drop = True
    return to_drop

In [14]:
def preprocess(our_data, drop_labels=[], directed=True, add_no_rel=False):
    nlp = spacy.blank("en")
    nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(r'\S+').match)
    if len(drop_labels) > 0:
        our_data['entities'] = our_data['entities'].apply(lambda x: drop_label(x, drop_labels))
    our_data['text_preprocessed'] = our_data['text'].str.lower()
    our_data['doc'] = our_data['text_preprocessed'].apply(lambda x: nlp.make_doc(x))
    our_data['tokens'] = our_data.apply(lambda x: [get_tokens(x, i) for i in range(len(x.doc))], axis=1)
    our_data['spans'] = our_data.apply(lambda x: get_spans(x), axis=1)
    our_data['relations_form'] = our_data.apply(lambda x: get_relations(x, 
                                                                        directed=directed, 
                                                                        add_no_rel=add_no_rel), 
                                                axis=1)
    our_data['to_drop'] = our_data.apply(lambda x: delete_rows_no_rel(x), axis=1)
    our_data = our_data[our_data.to_drop==False].copy()
    our_data['answer'] = 'accept'
    our_data['meta'] = None
    our_data['meta'] = our_data['meta'].apply(lambda x: {'source': 'unkown'})
    formatted_data = our_data[['text_preprocessed', 'spans', 'tokens', 'relations_form', 'answer', 'meta']]
    formatted_data.columns = ['text', 'spans', 'tokens', 'relations', 'answer', 'meta']
    return formatted_data

In [15]:
formatted_data = preprocess(our_data, drop_labels=drop_labels)

In [17]:
formatted_data.spans

1        [{'id': 22811, 'text': 'nfkb1', 'start': 18, '...
4        [{'id': 23054, 'text': 'multifunctional cytoki...
6        [{'id': 23112, 'text': 'lrp5', 'start': 23, 't...
20       [{'id': 24657, 'text': 'apetala2', 'start': 1,...
24       [{'id': 24911, 'text': 'thrombin', 'start': 31...
                               ...                        
26000    [{'id': 1424994, 'text': 'anti-tumor necrosis ...
26001    [{'id': 1425028, 'text': 'cbl-b', 'start': 31,...
26002    [{'id': 1425045, 'text': 's-100 protein', 'sta...
26004    [{'id': 1425186, 'text': 'labor', 'start': 41,...
26005    [{'id': 1425218, 'text': 'tpo', 'start': 33, '...
Name: spans, Length: 8500, dtype: object

In [18]:
# Display spans
from spacy import displacy

def display_entities(text, entities, entity_type=None):
    nlp = spacy.blank("en")
    doc = nlp.make_doc(text)
    ents = []
    for entity in entities:
        if (entity["label"] == entity_type) or (entity_type is None):
            span_start = entity["start"]
            span_end = entity["end"]
            label = entity["label"]
            ent = doc.char_span(span_start, span_end, label=label)
            if ent is None:
                continue
            ents.append(ent)
    doc.ents = ents
    displacy.render(doc, style="ent", jupyter=True)

if display:
    not_working = []
    for k in range(formatted_data.shape[0]):
        row = formatted_data.iloc[k]
        try:
            display_entities(row.text, row.spans, None)
        except ValueError:
            not_working.append(k)

In [19]:
np.random.seed(0)
n1 = int(formatted_data.shape[0] * 65/100)
n2 = int(formatted_data.shape[0] * 85/100) 
df_shuffled = formatted_data.sample(frac=1)
annotations_train = df_shuffled[:n1]
annotations_dev = df_shuffled[n1:n2]
annotations_test = df_shuffled[n2:]

In [20]:
len(annotations_train.iloc[0].spans)

2

In [21]:
len(annotations_train.iloc[0].relations)

1

In [22]:
annotations_train.iloc[0].relations

[{'head': 18,
  'child': 22,
  'head_span': {'start': 100,
   'end': 106,
   'token_start': 17,
   'token_end': 18,
   'label': 'DISEASE'},
  'child_span': {'start': 126,
   'end': 134,
   'token_start': 21,
   'token_end': 22,
   'label': 'TARGET'},
  'label': 'increas'}]

In [23]:
def write_jsonl(df, path):
    our_jsonl = df.to_dict(orient='records')
    with open(path, 'w') as outfile:
        for json_line in our_jsonl:
            json.dump(json_line, outfile)
            outfile.write('\n')

In [24]:
write_jsonl(annotations_train, '../NER/assets/annotations_train.jsonl')
write_jsonl(annotations_dev, '../NER/assets/annotations_dev.jsonl')
write_jsonl(annotations_test, '../NER/assets/annotations_test.jsonl')

In [25]:
write_jsonl(annotations_train, '../RE/assets/annotations_train.jsonl')
write_jsonl(annotations_dev, '../RE/assets/annotations_dev.jsonl')
write_jsonl(annotations_test, '../RE/assets/annotations_test.jsonl')

In [26]:
labels = annotations_test.relations.apply(lambda x: [x[i]['label'] for i in range(len(x))])

In [27]:
total_labels = list(np.sum([labels.iloc[k] for k in range(labels.shape[0])]))

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [28]:
for label in np.unique(total_labels):
    print(label, total_labels.count(label) / len(total_labels))

activ 0.048323471400394474
bind 0.019230769230769232
block 0.05374753451676528
decreas 0.10157790927021697
express 0.08579881656804733
increas 0.22830374753451677
induc 0.15483234714003946
inhibit 0.1222879684418146
reduc 0.1203155818540434
regul 0.0655818540433925


Pb : sur-représentation de la classe No-Rel

Solutions possibles : 

    - Modifier la fonction de perte en pondérant le score en fonction de la classe
    - Considérer des relations non dirigées : relations(head, child) = relation(child, head) ==> moins de No-Rel 
    - Concaténer deux modèles : cf https://doc.rero.ch/record/327157/files/cud_tci.pdf
        - un premier modèle qui prédit l'absence de relation
        - un deuxième qui prédit la classe de relation
        
        
Remarque : 
- Pas de classe No-Rel !! Dans le modèle, on considère qu'il y a une relation A de e1 vers e2 ssi proba(A, e1, e2) > s=0.5. Si toutes les probas sont inférieures à s, alors pas de relation

In [5]:
annotations_test = read_jsonl('../RE/assets/annotations_test.jsonl')

In [6]:
for i in range()
x = annotations_test.iloc[0]
text = x.text
text

'twenty-four hours after vibration, muscle hyperalgesia was observed, concomitant to increased levels of il-6 in the gastrocnemius muscle and decreased expression of kv1.4 in the dorsal root ganglia'

In [8]:
spans = [(x.spans[i]['start'], x.spans[i]['end'], x.spans[i]['label']) for i in range(len(x.spans))]

In [9]:
spans

[(35, 54, 'DISEASE'),
 (104, 108, 'TARGET'),
 (165, 170, 'TARGET'),
 (178, 197, 'TARGET')]

In [38]:
x.relations

[{'head': 23,
  'child': 6,
  'head_span': {'start': 165,
   'end': 170,
   'token_start': 22,
   'token_end': 23,
   'label': 'TARGET'},
  'child_span': {'start': 35,
   'end': 54,
   'token_start': 4,
   'token_end': 6,
   'label': 'DISEASE'},
  'label': 'increas'},
 {'head': 28,
  'child': 6,
  'head_span': {'start': 178,
   'end': 197,
   'token_start': 25,
   'token_end': 28,
   'label': 'TARGET'},
  'child_span': {'start': 35,
   'end': 54,
   'token_start': 4,
   'token_end': 6,
   'label': 'DISEASE'},
  'label': 'increas'}]

In [38]:
nlp = spacy.blank('en')
doc = nlp(text)

In [43]:
doc.char_span(*spans[3])

insulin