In [1]:
import os
os.chdir('..')

In [15]:
import gcsfs
import json
import ast
import pandas as pd
import numpy as np
import re
import spacy
from spacy.util import filter_spans 

from utils.read_data import read_jsonl, read_jsonl_from_gcs, read_parquet
from utils.preprocess_data import preprocess
from utils.write_jsonl import write_jsonl

In [3]:
data_path = 'gs://doccano_annotation_2/data/doccano_export_10_10_2022.jsonl'
drop_labels = ['BIOVERB']
display = False
write_all = False
write_only_test = False

In [5]:
if data_path.split('.')[-1] == 'jsonl':
    our_data = read_jsonl_from_gcs(data_path)
elif data_path.split('.')[-1] == 'parquet':
    our_data = read_parquet(data_path)
    our_data = doccano_doc_to_df(our_data)
else:
    raise Error('Please provide jsonl or paquet format')

In [6]:
our_data.head()

Unnamed: 0,id,text,Comments,entities,relations
0,110,Infection of the cell lines with M. arginini r...,[],"[{'id': 17390, 'label': 'TARGET', 'start_offse...","[{'id': 9805, 'from_id': 34791, 'to_id': 17392..."
1,111,Our results show that RA-NP inhibited LPS-indu...,[],"[{'id': 17395, 'label': 'BIOVERB', 'start_offs...","[{'id': 2250, 'from_id': 17400, 'to_id': 17396..."
2,112,"PF (10 μM) inhibited IL-33 production, Ca infl...","[{'id': 2, 'comment': 'Impossible a Tagger !!!...","[{'id': 17402, 'label': 'TARGET', 'start_offse...","[{'id': 2253, 'from_id': 17401, 'to_id': 17406..."
3,113,We further showed that 4.1B inhibited the prol...,[],"[{'id': 17409, 'label': 'TARGET', 'start_offse...","[{'id': 2259, 'from_id': 17418, 'to_id': 17412..."
4,114,"Y(1) receptor antagonists, BIBP3226 and BIBO33...",[],"[{'id': 17420, 'label': 'TARGET', 'start_offse...","[{'id': 9821, 'from_id': 17425, 'to_id': 17424..."


In [7]:
len_relations = our_data.relations.apply(lambda x: len(x))
our_data = our_data.loc[len_relations > 0]

In [8]:
our_data.shape

(386, 5)

In [9]:
ner_labels = our_data.entities.apply(lambda x: [x[i]['label'] for i in range(len(x))])
print('Entities labels : ', list(set(np.sum([ner_labels.iloc[k] for k in range(ner_labels.shape[0])]))))

Entities labels :  ['TARGET', 'SMALL_MOLECULE', 'PATHWAY', 'CELL_LINE', 'CHEMICAL', 'CELL LINE', 'UNKNOWN', 'TISSUE', 'DISEASE', 'BIOVERB']


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [10]:
classes = our_data.relations.apply(lambda x: [x[i]['type'] for i in range(len(x))])
print('Relations labels : ', list(set(np.sum([classes.iloc[k] for k in range(classes.shape[0])]))))

Relations labels :  ['activ', 'stimulate', '!increas', 'inhibit', 'suppress', 'enhance', '!regul', 'induc', 'imped', 'reduc', 'express', '!express', 'increas', '!decreas', 'decreas', '!bind', 'regul', 'elevate', 'block', 'target', '!reduc', '!block', 'interact', '!activ', '!inhibit', 'promote', 'bind', '!induc']


In [12]:
formatted_data = preprocess(our_data, drop_labels=drop_labels)

In [13]:
formatted_data.shape

(379, 6)

In [14]:
# Display spans
from spacy import displacy

def display_entities(text, entities, entity_type=None):
    nlp = spacy.blank("en")
    doc = nlp.make_doc(text)
    ents = []
    for entity in entities:
        if (entity["label"] == entity_type) or (entity_type is None):
            span_start = entity["start"]
            span_end = entity["end"]
            label = entity["label"]
            ent = doc.char_span(span_start, span_end, label=label)
            if ent is None:
                continue
            ents.append(ent)
    doc.ents = ents
    displacy.render(doc, style="ent", jupyter=True)

if display:
    not_working = []
    for k in range(formatted_data.shape[0]):
        row = formatted_data.iloc[k]
        try:
            display_entities(row.text, row.spans, None)
        except ValueError:
            not_working.append(k)

In [16]:
if write_only_test:
    assert write_all == False
    write_jsonl(formatted_data, '../NER/assets/annotations_test.jsonl')
    write_jsonl(formatted_data, '../RE/assets/annotations_test.jsonl')

In [17]:
if write_all:
    np.random.seed(0)
    n1 = int(formatted_data.shape[0] * 65/100)
    n2 = int(formatted_data.shape[0] * 85/100) 
    df_shuffled = formatted_data.sample(frac=1)
    annotations_train = df_shuffled[:n1]
    annotations_dev = df_shuffled[n1:n2]
    annotations_test = df_shuffled[n2:]
    # Write for NER
    write_jsonl(annotations_train, '../NER/assets/annotations_train.jsonl')
    write_jsonl(annotations_dev, '../NER/assets/annotations_dev.jsonl')
    write_jsonl(annotations_test, '../NER/assets/annotations_test.jsonl')
    # Write for RE
    write_jsonl(annotations_train, '../RE/assets/annotations_train.jsonl')
    write_jsonl(annotations_dev, '../RE/assets/annotations_dev.jsonl')
    write_jsonl(annotations_test, '../RE/assets/annotations_test.jsonl')

In [18]:
labels = formatted_data.relations.apply(lambda x: [x[i]['label'] for i in range(len(x))])

In [19]:
total_labels = list(np.sum([labels.iloc[k] for k in range(labels.shape[0])]))

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [20]:
for label in np.unique(total_labels):
    print(label, total_labels.count(label) / len(total_labels))

activ 0.18608058608058609
bind 0.07472527472527472
block 0.04322344322344322
decreas 0.0652014652014652
elevate 0.0029304029304029304
enhance 0.011721611721611722
express 0.12747252747252746
imped 0.007326007326007326
increas 0.0695970695970696
induc 0.08937728937728938
inhibit 0.13553113553113552
interact 0.0029304029304029304
promote 0.008791208791208791
reduc 0.07692307692307693
regul 0.07179487179487179
stimulate 0.014652014652014652
suppress 0.011721611721611722
