# Explore data

In [1]:
import os

# Later cells use paths relative to the working directory (e.g. the
# 'additional_data/...' annotation file). An unconditional os.chdir('..')
# climbs one more level every time this cell is re-run, so only move up
# while the data directory is not yet visible from the current directory.
if not os.path.isdir('additional_data'):
    os.chdir('..')

In [2]:
import json
import ast
import pandas as pd
import numpy as np
import re
from spacy.lang.en import English

from utils.read_data import read_jsonl

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Relative path to the annotation file (JSON Lines); it is resolved against
# the current working directory, which the first cell changes with os.chdir.
data_path = r'additional_data/example_assets/annotations_test.jsonl'

In [6]:
# Load the annotations with the project helper. Later cells treat the result
# as a pandas DataFrame and read its `tokens`, `meta`, `spans`, `relations`
# and `text` columns -- presumably one row per annotated sentence; verify
# against utils.read_data.read_jsonl.
ner_df = read_jsonl(data_path)

In [7]:
ner_df.tokens.iloc[0][0]

{'text': 'The', 'start': 0, 'end': 3, 'id': 0, 'ws': True, 'disabled': True}

In [8]:
# For every row, collect each token's `ws` flag and `disabled` flag into a list;
# each result is wrapped in a one-column DataFrame (column name: 'tokens').
ws = pd.DataFrame(ner_df.tokens.apply(lambda toks: [tok['ws'] for tok in toks]))
disabled = pd.DataFrame(ner_df.tokens.apply(lambda toks: [tok['disabled'] for tok in toks]))

In [9]:
# Share of tokens flagged `disabled` in each row. The column is called 'sum'
# but holds a fraction (count of True flags divided by the number of tokens).
disabled['sum'] = disabled['tokens'].apply(
    lambda flags: np.sum(flags) / len(flags)
)

In [10]:
np.mean(disabled['sum'])

0.8357883267469727

In [11]:
ner_df.meta.iloc[0]

{'source': 'BioNLP 2011 Genia Shared Task, PMC-2806624-07-DISCUSSION.txt'}

In [12]:
# Surface text of every annotated span, one list per row ...
ner_df['entities'] = ner_df['spans'].apply(lambda spans: [span['text'] for span in spans])
# ... and the set of distinct entity strings across the whole frame.
entities = {mention for mentions in ner_df['entities'] for mention in mentions}

In [13]:
# NER labels: the `label` of every annotated span, one list per row
ner_df['labels'] = ner_df['spans'].apply(lambda spans: [span['label'] for span in spans])

In [14]:
# Distinct NER labels found anywhere in the frame
labels = {tag for row_tags in ner_df['labels'] for tag in row_tags}

In [15]:
labels

{'GGP'}

In [16]:
ner_df.text.iloc[1]

'It was shown that TGF-beta is mandatory for the maintenance of peripheral T reg cells and their expression of Foxp3 (Marie et al., 2005; Rubtsov and Rudensky, 2007).'

In [17]:
len(ner_df.spans.iloc[1])

2

In [18]:
len(ner_df.relations.iloc[0])

1

In [19]:
# Relation labels: the `label` of every annotated relation, one list per row
ner_df['relation_labels'] = ner_df['relations'].apply(lambda rels: [rel['label'] for rel in rels])

In [20]:
# Flatten the per-row relation labels into one list (duplicates kept, so
# frequencies can be counted from it).
total_labels = [
    rel_label
    for row_labels in ner_df['relation_labels']
    for rel_label in row_labels
]

In [21]:
# Print each distinct relation label with its relative frequency
for label in np.unique(total_labels):
    share = total_labels.count(label) / len(total_labels)
    print(label, share)

Binds 0.12582781456953643
Neg-Reg 0.08609271523178808
No-rel 0.09271523178807947
Pos-Reg 0.44370860927152317
Reg 0.25165562913907286


In [22]:
def print_relations(relation, df=None, random_state=None):
    """Print one random sentence containing `relation` and its matching pairs.

    A row whose `relation_labels` list contains `relation` is sampled, its text
    is printed, and every relation in that row carrying the requested label is
    printed as ``<head text> <relation> <child text>``. The head/child texts
    are sliced from the sentence using each span's `start`/`end` offsets.

    Parameters
    ----------
    relation : str
        Relation label to look for.
    df : pandas.DataFrame, optional
        Frame with `text`, `relations` and `relation_labels` columns. When
        omitted, the notebook-level `ner_df` is used (the original behaviour).
    random_state : optional
        Forwarded to `DataFrame.sample` so the chosen example can be made
        reproducible. The default ``None`` keeps the original unseeded sampling.

    Raises
    ------
    ValueError
        If no row contains `relation`.
    """
    frame = ner_df if df is None else df
    has_relation = frame['relation_labels'].apply(lambda row_labels: relation in row_labels)
    candidates = frame[has_relation]
    if candidates.empty:
        # Sampling from an empty frame fails with an unrelated-looking error;
        # say what is actually wrong instead.
        raise ValueError(f"No row contains the relation label {relation!r}")

    row = candidates.sample(random_state=random_state).iloc[0]
    sentence = row.text
    print('Text :', sentence)
    print('')
    for rel in row.relations:
        if rel['label'] == relation:
            head, child = rel['head_span'], rel['child_span']
            print(sentence[head['start']:head['end']], relation,
                  sentence[child['start']:child['end']])

In [23]:
print_relations('Binds')

Text : We speculate that estrogens exert effects on erythropoiesis by modulating GATA-1 activity through protein-protein interaction with the ER.

GATA-1 Binds ER


In [24]:
print_relations('Neg-Reg')

Text : Initiation binding repressor [corrected] (IBR) is a chicken erythrocyte factor (apparent molecular mass, 70 to 73 kDa) that binds to the sequences spanning the transcription initiation site of the histone h5 gene, repressing its transcription.

Initiation binding repressor Neg-Reg histone h5
IBR Neg-Reg histone h5


In [25]:
print_relations('No-rel')

Text : As an additional support for this concept, the overexpression of RUNX1 induced increased FOXP3 protein expression without any requirement of TGF-beta and anti-CD3 and anti-CD28 stimulation in human primary CD4+ cells.

TGF-beta No-rel FOXP3


In [26]:
print_relations('Pos-Reg')

Text : It was shown that TGF-beta is mandatory for the maintenance of peripheral T reg cells and their expression of Foxp3 (Marie et al., 2005; Rubtsov and Rudensky, 2007).

TGF-beta Pos-Reg Foxp3


In [27]:
print_relations('Reg')

Text : We examined alpha A1 (an alpha A-gene product) and alpha B1 and alpha B2 (two alpha B-encoded isomers) for their effects on the GM-CSF promoter.

alpha A1 Reg GM-CSF
alpha B1 Reg GM-CSF
alpha B2 Reg GM-CSF
