In [1]:
from google.cloud import bigquery
import spacy
from spacy.util import filter_spans
from tqdm import tqdm
import numpy as np
import json
import re

tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Switch read by the final cell: when True, the data is shuffled, split and
# written out as train/dev/test JSONL files.
write_all = True

In [3]:
# BigQuery client created with default arguments (no project or credentials
# are passed explicitly here).
client = bigquery.Client()

In [4]:
# Join the tagged target and disease tables so that each row carries one
# sentence plus both entity lists; rows are matched on field, sentence_id
# and servier_doc_id.
query = """Select targets.entities as target_ents, targets.sentence, diseases.entities as disease_ents
            From tagged_data.targets
            Inner join tagged_data.diseases
            On targets.field=diseases.field
            And targets.sentence_id=diseases.sentence_id
            And targets.servier_doc_id=diseases.servier_doc_id"""

In [5]:
# Run the query and load the complete result set into a pandas DataFrame.
tagged = client.query(query).to_dataframe()

In [6]:
# Size check of the joined result: (rows, columns).
tagged.shape

(2131481, 3)

In [7]:
# Down-sample to at most 50,000 rows. The fixed random_state makes the subset
# identical on every Restart & Run All (the only other seed in this notebook
# is set much later, right before the train/dev/test shuffle), and the min()
# guard avoids a ValueError when the query returns fewer rows than requested.
tagged = tagged.sample(min(50000, len(tagged)), random_state=0)

In [8]:
# Concatenate, row by row, the target entities followed by the disease entities.
tagged['all_entities'] = [
    list(target_entities) + list(disease_entities)
    for target_entities, disease_entities in zip(tagged['target_ents'], tagged['disease_ents'])
]

In [9]:
nlp = spacy.load('en_core_web_sm')
# nlp.pipe streams the sentences through the pipeline in batches instead of
# invoking the full pipeline once per row; wrapping the generator in tqdm
# keeps the progress bar. The resulting list is assigned positionally, so
# each Doc lands on the row of the sentence it was built from.
tagged['doc'] = list(tqdm(nlp.pipe(tagged['sentence']), total=len(tagged)))

100%|██████████| 50000/50000 [06:42<00:00, 124.37it/s]


In [11]:
# Get tokens
def get_tokens(x, i):
    """Describe token ``i`` of a row's parsed document as a plain dict.

    Args:
        x: Row (or any mapping) whose ``'doc'`` entry is the parsed document.
        i: Index of the token inside that document.

    Returns:
        dict with the keys ``text`` (token text), ``start`` / ``end``
        (character offsets of the token in the document text), ``id`` (the
        token index ``i``), ``ws`` (True only when the character immediately
        after the token is a single space ``' '``) and ``disabled`` (always
        True).
    """
    # The document is only read here, so it is used directly; copying the
    # whole Doc on every per-token call made tokenising a row quadratic.
    doc = x['doc']
    token_span = doc[i:i + 1]
    end = token_span.end_char
    return {'text': str(doc[i]),
            'start': token_span.start_char,
            'end': end,
            'id': i,
            # Slicing past the end of the text yields '', so the last token
            # naturally gets ws=False without a separate length check.
            'ws': str(doc)[end:end + 1] == ' ',
            'disabled': True}

In [12]:
# For every row, build the list of token dicts in document order.
tagged['tokens'] = tagged.progress_apply(
    lambda row: [get_tokens(row, token_idx) for token_idx in range(len(row.doc))],
    axis=1,
)

100%|██████████| 50000/50000 [01:45<00:00, 473.35it/s]


In [19]:
# Get spans
def get_spans(x):
    """Convert a row's character-offset entities into span dicts.

    Each entity in ``x.all_entities`` is read through its ``'start'``,
    ``'end'`` and ``'origin'`` keys. Leading and trailing whitespace inside
    the ``[start, end)`` slice of ``x['sentence']`` is trimmed off, the label
    is derived from ``'origin'``, and the offsets are mapped onto the row's
    ``'doc'`` with ``char_span``. Entities for which ``char_span`` returns
    None are skipped, and the remaining spans go through ``filter_spans``
    before being serialised.

    Args:
        x: Row exposing ``doc``, ``sentence`` and ``all_entities``.

    Returns:
        list of dicts with the keys ``text``, ``start``, ``token_start``,
        ``token_end``, ``end``, ``type`` (always ``'span'``) and ``label``.
    """
    # The Doc is only read (char_span builds Span views on top of it), so the
    # per-row copy the function used to make is unnecessary.
    doc = x['doc']
    sentence = x['sentence']
    spans = []
    for d in x.all_entities:
        start = d['start']
        end = d['end']
        substring = sentence[start: end]
        # Shrink the offsets so they exclude surrounding whitespace.
        start += len(substring) - len(substring.lstrip())
        end -= len(substring) - len(substring.rstrip())
        if start >= end:
            # Empty or whitespace-only entity: after trimming there is nothing
            # left to annotate, and the crossed offsets must not reach
            # char_span.
            continue
        # Label rule: use the last '_'-separated piece of 'origin' when it is
        # upper-case, otherwise fall back to the first piece.
        origin_parts = d['origin'].split('_')
        label = origin_parts[-1] if origin_parts[-1].isupper() else origin_parts[0]
        span = doc.char_span(start, end, label)
        if span is not None:
            spans.append(span)
    filtered_spans = filter_spans(spans)
    return [{
            'text': str(span),
            'start': span.start_char,
            'token_start': span.start,
            'token_end': span.end,
            'end': span.end_char,
            'type': 'span',
            'label': span.label_}
           for span in filtered_spans]

In [20]:
# get_spans already takes a single row, so it is handed to apply directly.
tagged['spans'] = tagged.progress_apply(get_spans, axis=1)

100%|██████████| 50000/50000 [00:04<00:00, 11356.67it/s]


In [32]:
# Every example is exported with the answer 'accept'.
tagged['answer'] = 'accept'
# One fresh dict per row (rows never share a mutable object). The source
# value was previously misspelled as 'unkown' and written that way into
# every exported record.
tagged['meta'] = [{'source': 'unknown'} for _ in range(len(tagged))]

In [33]:
# Keep only the columns that go into the exported records, in export order.
tagged = tagged.loc[:, ['sentence', 'tokens', 'spans', 'answer', 'meta']]

In [34]:
# The exported records use 'text' for the sentence; the other column names
# are already final.
tagged = tagged.rename(columns={'sentence': 'text'})

In [35]:
def write_jsonl(df, path):
    """Write ``df`` to ``path`` as JSON Lines: one JSON object per row.

    Args:
        df: DataFrame whose rows are serialised as column-name -> value dicts.
        path: Destination file; it is opened in ``'w'`` mode, so existing
            content is replaced.
    """
    # Rows are converted before the file is opened, exactly one dict per row.
    records = df.to_dict(orient='records')
    with open(path, 'w') as sink:
        for record in records:
            json.dump(record, sink)
            sink.write('\n')

In [36]:
if write_all:
    # Seed NumPy's global RNG right before shuffling; sample() below is called
    # without a random_state of its own.
    np.random.seed(0)
    total_rows = len(tagged)
    # Cut points: first 65% -> train, next 20% -> dev, final 15% -> test.
    n1 = int(total_rows * 65/100)
    n2 = int(total_rows * 85/100)
    df_shuffled = tagged.sample(frac=1)
    annotations_train = df_shuffled.iloc[:n1]
    annotations_dev = df_shuffled.iloc[n1:n2]
    annotations_test = df_shuffled.iloc[n2:]
    # Write for NER
    for split_df, split_path in (
        (annotations_train, '../NER/assets/annotations_train.jsonl'),
        (annotations_dev, '../NER/assets/annotations_dev.jsonl'),
        (annotations_test, '../NER/assets/annotations_test.jsonl'),
    ):
        write_jsonl(split_df, split_path)