The purpose of this notebook is to generate train and validation JSON files compatible with the run_ner script, using the sentences retrieved by the retrieve-sentences notebook.

In [1]:
import json

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import transformers

In [4]:
# Directory prefix prepended to the input CSV name and to the ner_*.json files written with save_json.
DATA_PATH = './'

In [7]:
# Load the cleaned sentences; the first CSV column becomes the index.
sentences_csv = DATA_PATH + 'train-with-cleaned-sentences.csv'
df = pd.read_csv(sentences_csv, index_col=0)

In [8]:
# Display the loaded frame to check its columns and row count.
df

Unnamed: 0,Id,section_title,sentence,dataset_label,label_length
0,0007f880-0a9b-492d-9a58-76eb0b0e0bd7,Introduction,in fact organizations are now identifying digi...,program for the international assessment of ad...,62
1,0008656f-0ba2-4632-8602-3017b44c2e90,LITERATURE REVIEW,international studies on student achievement s...,trends in international mathematics and scienc...,53
2,000e04d6-d6ef-442f-b070-4309493221ba,Example: Farm Income and Farm Household Wealth,the agricultural resources management survey a...,agricultural resources management survey,40
3,000e04d6-d6ef-442f-b070-4309493221ba,Highlights,1 manages access to results of the agricultura...,agricultural resources management survey,40
4,000efc17-13d8-433d-8f62-a3932fe4f3b8,Study subjects,the adni data set is from a multicenter longit...,adni,4
...,...,...,...,...,...
51752,ffd4d86a-0f26-44cc-baed-f0e209cc22af,II.1. MRI Brain Image database,data used in the preparation of this article w...,alzheimer s disease neuroimaging initiative adni,49
51753,ffe7f334-245a-4de7-b600-d7ff4e28bfca,Characterization of the SARS-CoV-2 virus,interestingly the genome sequences of sars cov...,genome sequences of sars cov 2,30
51754,ffeb3568-7aed-4dbe-b177-cbd7f46f34af,Polish research on the perception of mathematics,as part of the program for international stude...,trends in international mathematics and scienc...,53
51755,ffee2676-a778-4521-b947-e1e420b126c5,,analysis considered first time beginning posts...,beginning postsecondary students,32


# Generate tokens and NER tags

In [5]:
def generate_tokens(sentence, label):
    """Build one 0/1 flag per whitespace token of ``sentence``.

    Tokens covered by the first occurrence of ``label`` in ``sentence`` are
    flagged ``1``; every other token is flagged ``0``.

    NOTE: a later cell redefines ``generate_tokens`` with string tags
    ('O'/'D'); that later definition is the one in effect when the cells are
    run in order.

    Args:
        sentence: text to tag; tokens are produced by ``str.split()``.
        label: dataset mention, located by substring search.

    Returns:
        A list of ints with exactly ``len(sentence.split())`` entries. It is
        all zeros when ``label`` is empty, whitespace-only, or absent from
        ``sentence``.
    """
    tokens_sequence = [0] * len(sentence.split())
    label_len = len(label.split())
    start_char = sentence.find(label)
    # str.find returns -1 on a miss; slicing with -1 would silently pick a
    # bogus start token, so return the all-zero sequence instead.
    if start_char == -1 or label_len == 0:
        return tokens_sequence

    start_token = len(sentence[:start_char].split())
    # When the match begins in the middle of a token, the partial prefix was
    # counted as a full token above; step back to the token that contains it.
    if start_char > 0 and not sentence[start_char - 1].isspace() and not label[0].isspace():
        start_token -= 1

    # Clamp the end: a slice assignment running past the end would grow the
    # list and break the one-flag-per-token invariant.
    end_token = min(start_token + label_len, len(tokens_sequence))
    tokens_sequence[start_token:end_token] = [1] * (end_token - start_token)
    return tokens_sequence

In [9]:
def generate_tokens(sentence, label):
    """Build one NER tag per whitespace token of ``sentence``.

    Tokens covered by the first occurrence of ``label`` in ``sentence`` are
    tagged ``'D'``; every other token is tagged ``'O'``.

    Args:
        sentence: text to tag; tokens are produced by ``str.split()``.
        label: dataset mention, located by substring search.

    Returns:
        A list of ``'O'``/``'D'`` strings with exactly
        ``len(sentence.split())`` entries. It is all ``'O'`` when ``label``
        is empty, whitespace-only, or absent from ``sentence``.
    """
    tokens_sequence = ['O'] * len(sentence.split())
    label_len = len(label.split())
    start_char = sentence.find(label)
    # str.find returns -1 on a miss; slicing with -1 would silently pick a
    # bogus start token, so return the all-'O' sequence instead.
    if start_char == -1 or label_len == 0:
        return tokens_sequence

    start_token = len(sentence[:start_char].split())
    # When the match begins in the middle of a token, the partial prefix was
    # counted as a full token above; step back to the token that contains it.
    if start_char > 0 and not sentence[start_char - 1].isspace() and not label[0].isspace():
        start_token -= 1

    # Clamp the end: a slice assignment running past the end would grow the
    # list and leave more tags than tokens.
    end_token = min(start_token + label_len, len(tokens_sequence))
    tokens_sequence[start_token:end_token] = ['D'] * (end_token - start_token)
    return tokens_sequence

In [10]:
# Whitespace-tokenise every sentence and build the matching tag sequence,
# walking the two columns directly instead of materialising each row.
texts = [sentence.split() for sentence in df['sentence']]
tags = [
    generate_tokens(sentence, dataset_label)
    for sentence, dataset_label in zip(df['sentence'], df['dataset_label'])
]

In [11]:
# Attach the token lists and tag lists as new columns; this mutates df in place.
df['tokens'] = texts
df['ner_tags'] = tags
# Display the augmented frame.
df

Unnamed: 0,Id,section_title,sentence,dataset_label,label_length,tokens,ner_tags
0,0007f880-0a9b-492d-9a58-76eb0b0e0bd7,Introduction,in fact organizations are now identifying digi...,program for the international assessment of ad...,62,"[in, fact, organizations, are, now, identifyin...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,0008656f-0ba2-4632-8602-3017b44c2e90,LITERATURE REVIEW,international studies on student achievement s...,trends in international mathematics and scienc...,53,"[international, studies, on, student, achievem...","[O, O, O, O, O, O, O, D, D, D, D, D, D, D, O, ..."
2,000e04d6-d6ef-442f-b070-4309493221ba,Example: Farm Income and Farm Household Wealth,the agricultural resources management survey a...,agricultural resources management survey,40,"[the, agricultural, resources, management, sur...","[O, D, D, D, D, O, O, O, O, O, O, O, O, O, O, ..."
3,000e04d6-d6ef-442f-b070-4309493221ba,Highlights,1 manages access to results of the agricultura...,agricultural resources management survey,40,"[1, manages, access, to, results, of, the, agr...","[O, O, O, O, O, O, O, D, D, D, D, O, O, O, O, ..."
4,000efc17-13d8-433d-8f62-a3932fe4f3b8,Study subjects,the adni data set is from a multicenter longit...,adni,4,"[the, adni, data, set, is, from, a, multicente...","[O, D, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...,...,...,...,...,...
51752,ffd4d86a-0f26-44cc-baed-f0e209cc22af,II.1. MRI Brain Image database,data used in the preparation of this article w...,alzheimer s disease neuroimaging initiative adni,49,"[data, used, in, the, preparation, of, this, a...","[O, O, O, O, O, O, O, O, O, O, O, O, D, D, D, ..."
51753,ffe7f334-245a-4de7-b600-d7ff4e28bfca,Characterization of the SARS-CoV-2 virus,interestingly the genome sequences of sars cov...,genome sequences of sars cov 2,30,"[interestingly, the, genome, sequences, of, sa...","[O, O, D, D, D, D, D, D, O, O, O, O, O, O, O, ..."
51754,ffeb3568-7aed-4dbe-b177-cbd7f46f34af,Polish research on the perception of mathematics,as part of the program for international stude...,trends in international mathematics and scienc...,53,"[as, part, of, the, program, for, internationa...","[O, O, O, O, O, O, O, O, O, O, O, O, D, D, D, ..."
51755,ffee2676-a778-4521-b947-e1e420b126c5,,analysis considered first time beginning posts...,beginning postsecondary students,32,"[analysis, considered, first, time, beginning,...","[O, O, O, O, D, D, D, O, O, O, O, O, O, O, O, ..."


In [12]:
# Keep only the rows whose sentence has at most 256 tokens
# (despite its name, long_df holds the rows that fit the limit).
token_counts = df['tokens'].map(len)
long_df = df.loc[token_counts <= 256]

In [14]:
# Fixed seed so the 80/20 train/validation split is reproducible.
seed = 6
ner_columns = ['Id', 'section_title', 'sentence', 'tokens', 'ner_tags']
train_df, val_df = train_test_split(
    long_df[ner_columns],
    test_size=.2,
    random_state=seed,
)
print(len(train_df), len(val_df))

41314 10329


# Writing data to file
To be used with the run_ner script

In [12]:
# Display the training split.
# NOTE(review): this cell's execution count (12) is lower than the split cell's (14),
# so the saved output may come from an earlier run. Confirm by re-running in order.
train_df

Unnamed: 0,Id,section_title,sentence,tokens,ner_tags
1677,0813d6a6-f14c-44f8-a791-def8e4e86cd6,Applications of geospatial technologies for hu...,among the most widely used geospatial tools ar...,"[among, the, most, widely, used, geospatial, t...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, D, ..."
1601,07d8261f-7678-41ac-a598-6ceeedba0f5f,Phylogenetic analysis,alignment of the complete genome sequences of ...,"[alignment, of, the, complete, genome, sequenc...","[O, O, O, O, D, D, D, D, D, D, O, O, O, O, O, ..."
832,03771ca7-69b6-4d55-a47d-1fba6101a118,Level 6 tests and England's strong accountabil...,whilst the debate has most recently decisively...,"[whilst, the, debate, has, most, recently, dec...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
994,0516bb14-5228-4cf6-b535-ae8de095a3da,Results,3 chs cs n 113 critical uncorrected p value 0 ...,"[3, chs, cs, n, 113, critical, uncorrected, p,...","[O, O, O, O, O, O, O, O, O, O, O, D, O, O, O, ..."
674,02fc3630-fab1-4198-980c-3ffb4ed864b3,À5,using a permutation based approach there was l...,"[using, a, permutation, based, approach, there...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...,...,...,...
849,038e65c0-3dce-4fe7-bbaa-3712fa28c4c8,Tropical cyclones' best tracks,we have downloaded the tcs best tracks from th...,"[we, have, downloaded, the, tcs, best, tracks,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
336,01ff79c7-bb0f-4172-b7e4-401b7aedd986,Abstract,the devised method has been assessed using 818...,"[the, devised, method, has, been, assessed, us...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
619,02e588e6-cf88-4e9a-ba16-7dfef6057447,Discussion,however the lack of subjects with high levels ...,"[however, the, lack, of, subjects, with, high,...","[O, O, O, O, O, O, O, O, O, O, O, O, D, O, O, ..."
2015,09e03074-0995-4ac4-be5f-0ad73ae4964b,Abstract,here we analyzed adni3 data from 317 participa...,"[here, we, analyzed, adni3, data, from, 317, p...","[O, O, O, D, O, O, O, O, O, O, O, O, O, O, O, ..."


In [15]:
# Write each split as JSON lines (one record per line).
ner_exports = (
    (train_df, '../input/coleridge-sentences/ner_train-256.json'),
    (val_df, '../input/coleridge-sentences/ner_val-256.json'),
)
for split_df, out_path in ner_exports:
    split_df.to_json(out_path, orient='records', lines=True)

# Old: earlier export path (128-token cutoff, hand-written JSON lines)

In [13]:
# Flag each sentence that fits the 128-token limit, then split by that flag.
is_short = [len(text) <= 128 for text in texts]
short_texts = [text for text, keep in zip(texts, is_short) if keep]
short_tags = [tag for tag, keep in zip(tags, is_short) if keep]
# Positions of the sentences that exceed the limit.
long_texts_idx = [idx for idx, keep in enumerate(is_short) if not keep]

# From here on, texts/tags refer to the filtered lists only.
texts, tags = short_texts, short_tags

In [20]:
def save_json(filename, texts, tags):
    """Write paired token/tag sequences to ``filename`` as JSON lines.

    Each output line is one JSON object of the form
    ``{"tokens": [...], "tags": [...]}``. Pairs are taken with ``zip``, so
    iteration stops at the shorter of ``texts`` and ``tags``.

    Args:
        filename: path of the file to create or overwrite.
        texts: iterable of token lists.
        tags: iterable of tag lists, aligned with ``texts``.
    """
    with open(filename, 'w') as out_file:
        for tokens, labels in zip(texts, tags):
            record = json.dumps({'tokens': tokens, 'tags': labels})
            out_file.write(record + '\n')

In [21]:
# Same fixed seed as the DataFrame split, applied to the parallel token/tag lists.
seed = 6
split_parts = train_test_split(texts, tags, test_size=.2, random_state=seed)
train_texts, val_texts, train_tags, val_tags = split_parts
print(len(train_texts), len(val_texts))

1851 463


In [22]:
# Write the train and validation splits next to the input data.
ner_outputs = [
    (DATA_PATH + 'ner_train.json', train_texts, train_tags),
    (DATA_PATH + 'ner_val.json', val_texts, val_tags),
]
for out_path, split_texts, split_tags in ner_outputs:
    save_json(out_path, split_texts, split_tags)

In [28]:
# Write capped subsets: at most 4000 training and 2000 validation examples.
capped_outputs = [
    ('cleaned_ner_train_small.json', train_texts, train_tags, 4000),
    ('cleaned_ner_val.json', val_texts, val_tags, 2000),
]
for out_path, split_texts, split_tags, limit in capped_outputs:
    save_json(out_path, split_texts[:limit], split_tags[:limit])