# Explore data

In [1]:
import json
import ast
import pandas as pd
import numpy as np
import re
from spacy.lang.en import English

In [34]:
data_path = r'../example_assets/annotations_test.jsonl'

In [36]:
def read_jsonl(data_path):
    with open(data_path, 'r') as f:
        json_list = list(f)
    data = []
    for j in json_list:
        data.append(json.loads(j))
    return pd.DataFrame(data)

In [37]:
ner_df = read_jsonl(data_path)

In [38]:
ner_df.tokens.iloc[0]

[{'text': 'The', 'start': 0, 'end': 3, 'id': 0, 'ws': True, 'disabled': True},
 {'text': 'RUNX',
  'start': 4,
  'end': 8,
  'id': 1,
  'ws': True,
  'disabled': False},
 {'text': 'protein',
  'start': 9,
  'end': 16,
  'id': 2,
  'ws': True,
  'disabled': True},
 {'text': 'that',
  'start': 17,
  'end': 21,
  'id': 3,
  'ws': True,
  'disabled': True},
 {'text': 'actually',
  'start': 22,
  'end': 30,
  'id': 4,
  'ws': True,
  'disabled': True},
 {'text': 'induces',
  'start': 31,
  'end': 38,
  'id': 5,
  'ws': True,
  'disabled': True},
 {'text': 'the',
  'start': 39,
  'end': 42,
  'id': 6,
  'ws': True,
  'disabled': True},
 {'text': 'expression',
  'start': 43,
  'end': 53,
  'id': 7,
  'ws': True,
  'disabled': True},
 {'text': 'of', 'start': 54, 'end': 56, 'id': 8, 'ws': True, 'disabled': True},
 {'text': 'FOXP3',
  'start': 57,
  'end': 62,
  'id': 9,
  'ws': True,
  'disabled': False},
 {'text': 'might',
  'start': 63,
  'end': 68,
  'id': 10,
  'ws': True,
  'disabled': Tru

In [6]:
ws = pd.DataFrame(ner_df.tokens.apply(lambda x: [x[i]['ws'] for i in range(len(x))]))
disabled = pd.DataFrame(ner_df.tokens.apply(lambda x: [x[i]['disabled'] for i in range(len(x))]))

In [7]:
disabled['sum'] = disabled['tokens'].apply(lambda x: np.sum(x) / len(x))

In [8]:
np.mean(disabled['sum'])

0.8380485562474074

In [9]:
len(ws.iloc[0].tokens)

25

In [10]:
ner_df.iloc[1].text

'BMP-6 can signal through the ligation of the type I receptors Act-RIA, BMP-RIA, and BMP-RIB and the type II receptors BMP-RII, Act-RIIA and Act-RIIB, which lead to the phosphorylation of the receptor Smads (Smad-1, Smad-5, and Smad-8).'

In [11]:
splits = ner_df.iloc[1].text.split(' ')
for i in range(len(splits)):
    if ws.iloc[1]['tokens'][i]:
        print(splits[i])

BMP-6
can
signal
through
the
ligation
of
the
type
I
receptors
BMP-RIB
II
receptors
and
Act-RIIB,
which
lead
to
the
receptor
Smad-5,
and


In [12]:
splits = ner_df.iloc[1].text.split(' ')
for i in range(len(splits)):
    if not ws.iloc[1]['tokens'][i]:
        print(splits[i])

Act-RIA,
BMP-RIA,
and
and
the
type
BMP-RII,
Act-RIIA
phosphorylation
of
the
Smads
(Smad-1,
Smad-8).


In [13]:
ws['sum'] = ws['tokens'].apply(lambda x: np.sum(x) / len(x))

In [14]:
ws['sum'].values.mean()

0.7881091715649026

In [15]:
print(ner_df.shape)

(149, 10)


In [16]:
ner_df.iloc[0]

text           Furthermore, Smad-phosphorylation was followed...
spans          [{'text': 'Smad', 'start': 13, 'token_start': ...
meta           {'source': 'BioNLP 2011 Genia Shared Task, PMC...
_input_hash                                            -13216227
_task_hash                                           -1944660624
tokens         [{'text': 'Furthermore', 'start': 0, 'end': 11...
_session_id                                                 None
_view_id                                               relations
relations      [{'head': 2, 'child': 10, 'head_span': {'start...
answer                                                    accept
Name: 0, dtype: object

In [17]:
ner_df.meta.iloc[0]

{'source': 'BioNLP 2011 Genia Shared Task, PMC-1134658-00-TIAB.txt'}

In [18]:
ner_df['entities'] = ner_df['spans'].apply(lambda x: [x[i]['text'] for i in range(len(x))])
entities = set([item for sublist in list(ner_df['entities'].values) for item in sublist])

In [19]:
# Ner labels
ner_df['labels'] = ner_df['spans'].apply(lambda x: [x[i]['label'] for i in range(len(x))])

In [20]:
labels = set([item for sublist in list(ner_df['labels'].values) for item in sublist])

In [21]:
labels

{'GGP'}

In [22]:
ner_df.text.iloc[1]

'BMP-6 can signal through the ligation of the type I receptors Act-RIA, BMP-RIA, and BMP-RIB and the type II receptors BMP-RII, Act-RIIA and Act-RIIB, which lead to the phosphorylation of the receptor Smads (Smad-1, Smad-5, and Smad-8).'

In [23]:
len(ner_df.spans.iloc[1])

10

In [24]:
len(ner_df.relations.iloc[0])

4

In [25]:
# Rel labels
ner_df['relation_labels'] = ner_df['relations'].apply(lambda x: [x[i]['label'] for i in range(len(x))])

In [26]:
total_labels = [item for sublist in list(ner_df['relation_labels'].values) for item in sublist]

In [27]:
for label in np.unique(total_labels):
    print(label, total_labels.count(label) / len(total_labels))

Binds 0.13108614232209737
Neg-Reg 0.15730337078651685
No-rel 0.12734082397003746
Pos-Reg 0.40074906367041196
Reg 0.18352059925093633


In [28]:
def print_relations(relation):
    row = ner_df[ner_df['relation_labels'].apply(lambda x: relation in x)].sample().iloc[0]
    sentence = row.text
    print('Text :', sentence)
    print('')
    for rel in row.relations:
        if rel['label'] == relation:
            print(sentence[rel['head_span']['start']:rel['head_span']['end']], relation, 
                  sentence[rel['child_span']['start']:rel['child_span']['end']])

In [29]:
print_relations('Binds')

Text : Transcriptional activation of the human TF gene in monocytic cells exposed to bacterial lipopolysaccharide (LPS) is mediated by binding of c-Rel/p65 heterodimers to a kappa B site in the TF promoter.

c-Rel Binds TF
p65 Binds TF


In [30]:
print_relations('Neg-Reg')

Text : We demonstrated that interleukin 4 (IL-4) present at the time of T cell priming inhibits FOXP3.

interleukin 4 Neg-Reg FOXP3
IL-4 Neg-Reg FOXP3


In [31]:
print_relations('No-rel')

Text : We observed no change in GATA3 expression in Th2 cells, T-bet expression in Th1 cells, or RORC2 mRNA expression in Th17 cells in which RUNX1 and RUNX3 were knocked down compared with control cells.

RUNX1 No-rel RORC2
RUNX1 No-rel T-bet
RUNX1 No-rel GATA3
RUNX3 No-rel RORC2
RUNX3 No-rel T-bet
RUNX3 No-rel GATA3


In [32]:
print_relations('Pos-Reg')

Text : Of note, GATA3 was also induced in the presence of TGF-beta at high IL-4 concentration (Figure 5A and 5B).

TGF-beta Pos-Reg GATA3
IL-4 Pos-Reg GATA3


In [33]:
print_relations('Reg')

Text : The transactivation of multiple cis elements, especially S and X2, of the DR alpha proximal promoter in group II CID cells is CIITA dependent.

CIITA Reg DR alpha


In [44]:
ner_df.iloc[2].text

'TGF-beta can activate RUNX genes at the transcriptional level, and at the posttranscriptional level through activation or stabilization of RUNX proteins (Jin et al., 2004).'

In [45]:
ner_df.iloc[2].spans

[{'text': 'TGF-beta',
  'start': 0,
  'token_start': 0,
  'token_end': 2,
  'end': 8,
  'type': 'span',
  'label': 'GGP'},
 {'text': 'RUNX',
  'start': 22,
  'token_start': 5,
  'token_end': 5,
  'end': 26,
  'type': 'span',
  'label': 'GGP'},
 {'text': 'RUNX',
  'start': 139,
  'token_start': 22,
  'token_end': 22,
  'end': 143,
  'type': 'span',
  'label': 'GGP'}]

In [46]:
ner_df.iloc[2].relations

[{'head': 2,
  'child': 5,
  'head_span': {'start': 0,
   'end': 8,
   'token_start': 2,
   'token_end': 2,
   'label': 'GGP'},
  'child_span': {'start': 22,
   'end': 26,
   'token_start': 5,
   'token_end': 5,
   'label': 'GGP'},
  'color': '#ffd882',
  'label': 'Pos-Reg'},
 {'head': 2,
  'child': 22,
  'head_span': {'start': 0,
   'end': 8,
   'token_start': 2,
   'token_end': 2,
   'label': 'GGP'},
  'child_span': {'start': 139,
   'end': 143,
   'token_start': 22,
   'token_end': 22,
   'label': 'GGP'},
  'color': '#ffd882',
  'label': 'Pos-Reg'}]