# Transformer on the full-sentences extracts

In [16]:
import json

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [2]:
df = pd.read_csv('train_full-sentences_cleaned.csv', index_col=0)

In [3]:
df

Unnamed: 0,Id,section_title,sentence,dataset_label,label_length
0,0007f880-0a9b-492d-9a58-76eb0b0e0bd7,Introduction,in fact organizations are now identifying digi...,program for the international assessment of ad...,62
1,0008656f-0ba2-4632-8602-3017b44c2e90,LITERATURE REVIEW,international studies on student achievement s...,trends in international mathematics and scienc...,53
2,000e04d6-d6ef-442f-b070-4309493221ba,Example: Farm Income and Farm Household Wealth,the agricultural resources management survey a...,agricultural resources management survey,40
3,000e04d6-d6ef-442f-b070-4309493221ba,Highlights,1 manages access to results of the agricultura...,agricultural resources management survey,40
4,000efc17-13d8-433d-8f62-a3932fe4f3b8,Study subjects,the adni data set is from a multicenter longit...,adni,4
...,...,...,...,...,...
51752,ffd4d86a-0f26-44cc-baed-f0e209cc22af,II.1. MRI Brain Image database,data used in the preparation of this article w...,alzheimer s disease neuroimaging initiative adni,49
51753,ffe7f334-245a-4de7-b600-d7ff4e28bfca,Characterization of the SARS-CoV-2 virus,interestingly the genome sequences of sars cov...,genome sequences of sars cov 2,30
51754,ffeb3568-7aed-4dbe-b177-cbd7f46f34af,Polish research on the perception of mathematics,as part of the program for international stude...,trends in international mathematics and scienc...,53
51755,ffee2676-a778-4521-b947-e1e420b126c5,,analysis considered first time beginning posts...,beginning postsecondary students,32


# Generate tokens

In [4]:
def generate_tokens(sentence, label):
    tokens_sequence = ['O'] * len(sentence.split())
    start_char = sentence.find(label)
    start_token = len(sentence[:start_char].split())

    label_len = len(label.split())
    tokens_sequence[start_token:start_token+label_len] = ['D']*label_len
    return tokens_sequence

In [5]:
texts = []
tags = []
for i, row in df.iterrows():
    texts.append(row.sentence.split())
    tags.append(generate_tokens(row.sentence, row.dataset_label))

In [6]:
short_texts = [text for text in texts if len(text) <= 128]
short_tags = [tag for text, tag in zip(texts, tags) if len(text) <= 128]

In [7]:
long_texts_idx = [i for i in range(len(texts)) if len(texts[i]) > 128]

In [8]:
texts = short_texts
tags = short_tags

# Writing data to file
To be used with the run_ner script

In [20]:
def save_json(filename, texts, tags):
    with open(filename, 'w') as f:
        for text, tag in zip(texts, tags):
            json_el = {'tokens': text, 'tags': tag}
            json.dump(json_el, f)
            f.write('\n')

In [21]:
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)
print(len(train_texts), len(val_texts))

In [22]:
save_json('cleaned_ner_train.json', train_texts, train_tags)
save_json('cleaned_ner_val.json', val_texts, val_tags)

In [28]:
save_json('cleaned_ner_train_small.json', train_texts[:4000], train_tags[:4000])
save_json('cleaned_ner_val.json', val_texts[:2000], val_tags[:2000])

# Training in the notebook

In [9]:
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)

In [10]:
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [11]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

max_length = 512
train_encodings = tokenizer(train_texts, max_length=max_length, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, max_length=max_length, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

In [12]:
import numpy as np

def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)
        #print(doc_enc_labels.shape, len(doc_labels))
        # set labels whose first offset position is 0 and the second is not 0
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

ValueError: NumPy boolean array indexing assignment cannot assign 12 input values to the 11 output values where the mask is true

In [None]:
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )
        labels = []
        for i, label in enumerate(examples[label_column_name]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label_to_id[label[word_idx]])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else -100)
                previous_word_idx = word_idx

            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs