In [None]:
import pandas as pd
from transformers import TrainingArguments, Trainer

In [None]:
file_path_1 = './i2b2_parsing/parsed_gold_set_1.jsonl'
test_path = './i2b2_parsing/parsed_test.jsonl'
data = pd.read_json(file_path_1, lines=True)
test_data = pd.read_json(test_path, lines=True)

In [None]:
data

In [None]:
texts = data['text'].tolist() # type: ignore
test_texts = test_data['text'].tolist() # type: ignore
texts[0]

In [None]:
spans = data['spans'].tolist() # type: ignore
test_spans = test_data['spans'].tolist() # type: ignore
spans[0]

In [None]:
model_nm = 'microsoft/deberta-v3-small'

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, convert_slow_tokenizer
tokz = AutoTokenizer.from_pretrained(model_nm, use_fast=True)

In [None]:
def tok_function(x): return tokz(x['text'])

In [None]:
from datasets import Dataset
ds = Dataset.from_pandas(data)
test_ds = Dataset.from_pandas(test_data)

In [None]:
tok_ds = ds.map(tok_function, batched=True)
test_tok_ds = test_ds.map(tok_function, batched=True)
tok_ds

In [None]:
tok_ds['text'][0]
tok_ds['input_ids'][0]


In [None]:
data['spans']

In [None]:
input_ids = tok_ds['input_ids'][0]
tokens = tokz.convert_ids_to_tokens(input_ids)
token_positions = tokz(tok_ds['text'][0], return_offsets_mapping=True)['offset_mapping']

for token, input_id, token_pos in zip(tokens, input_ids, token_positions):
    print(f'{token}: {input_id}: {token_pos}')

In [30]:
label_map = {
    "Padding": -100,
    "O": 0,
    "B-DATE": 1,
    "I-DATE": 2,
    "B-PATIENT": 3,
    "I-PATIENT": 4,
    "B-AGE": 5,
    "I-AGE": 6,
    "B-STAFF": 7,
    "I-STAFF": 8,
    "B-PHONE": 9,
    "I-PHONE": 10,
    "B-EMAIL": 11,
    "I-EMAIL": 12,
    "B-ID": 13,
    "I-ID": 14,
    "B-HOSP": 15,
    "I-HOSP": 16,
    "B-PATORG": 17,
    "I-PATORG": 18,
    "B-LOC": 19,
    "I-LOC": 20,
    "B-OTHERPHI": 21,
    "I-OTHERPHI": 22,
}

In [None]:
def pre_process_data(text, spans, input_ids):
    tokens = tokz.convert_ids_to_tokens(input_ids)
    labels = ['O'] * len(input_ids)
    token_positions = tokz(text, return_offsets_mapping=True)['offset_mapping']


    for span in spans:
        start, end, label = span['start'], span['end'], span['label']

        token_start, token_end = None, None

        for idx, (char_start, char_end) in enumerate(token_positions):
            if tokens[idx].startswith('▁'):
                char_start += 1
            # print(tokens[idx], char_start, char_end, start, end)
            if char_start == int(start):
                token_start = idx
            if char_end == int(end):
                token_end = idx
                break
        
        if token_start is not None and token_end is not None:
            # print(token_start, token_end, label)
            labels[token_start] = f'B-{label}'
            for idx in range(token_start + 1, token_end + 1):
                labels[idx] = f'I-{label}'

    # input_ids = tokz.convert_tokens_to_ids(tokens)
    label_ids = [label_map[label] for label in labels]

    return label_ids

In [None]:
tok_ds_processed = tok_ds.add_column('labels', [pre_process_data(text, spans, input_ids) for text, spans, input_ids in zip(tok_ds['text'], tok_ds['spans'], tok_ds['input_ids'])])

In [22]:
def find_max_len(x):
    return max([len(input_ids) for input_ids in x['input_ids']])

max_len = find_max_len(tok_ds_processed)
max_len

4995

In [23]:
def pad_data(sequences, max_length, padding_value):
    padded_sequences = []
    for seq in sequences:
        if len(seq) > max_length:
            padded_seq = seq[:max_length]
        else:
            padded_seq = seq + [padding_value] * (max_length - len(seq))
        padded_sequences.append(padded_seq)
    return padded_sequences


In [24]:
input_ids = pad_data(tok_ds_processed['input_ids'], max_len, 0)
labels = pad_data(tok_ds_processed['labels'], max_len, -100)
attention_mask = pad_data(tok_ds_processed['attention_mask'], max_len, 0)

In [25]:
tok_ds_processed = tok_ds_processed.remove_columns(['input_ids', 'labels', 'attention_mask', 'text', 'spans', 'meta'])
tok_ds_processed = tok_ds_processed.add_column('input_ids', input_ids)
tok_ds_processed = tok_ds_processed.add_column('labels', labels)
tok_ds_processed = tok_ds_processed.add_column('attention_mask', attention_mask)

In [None]:
test_tok_ds_processed = test_tok_ds.add_column('labels', [pre_process_data(text, spans, input_ids) for text, spans, input_ids in zip(test_tok_ds['text'], test_tok_ds['spans'], test_tok_ds['input_ids'])])

In [None]:
# Print out mapping of the tokens to the labels
tokens = tokz.convert_ids_to_tokens(tok_ds_processed['input_ids'][1])
labels = [list(label_map.keys())[list(label_map.values()).index(label_id)] for label_id in tok_ds_processed['labels'][1]]

for token, label in zip(tokens, labels):
    print(f'{token} - {label}')

In [26]:
## Create test and validation sets
## test_tok_ds_processed is the test set
tok_dds = tok_ds_processed.train_test_split(test_size=0.25, seed=42)
tok_dds

DatasetDict({
    train: Dataset({
        features: ['token_type_ids', 'input_ids', 'labels', 'attention_mask'],
        num_rows: 592
    })
    test: Dataset({
        features: ['token_type_ids', 'input_ids', 'labels', 'attention_mask'],
        num_rows: 198
    })
})

### Training the model

In [27]:
bs = 64
epochs = 4

In [28]:
lr = 8e-5

In [29]:
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none')

In [31]:
from transformers import AutoModelForTokenClassification, AutoConfig


In [32]:
config = AutoConfig.from_pretrained(model_nm, num_labels=len(label_map))


In [33]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

NameError: name 'evaluate' is not defined

In [None]:
model = AutoModelForTokenClassification.from_pretrained(model_nm, config=config)

In [None]:
trainer = Trainer(model, args, train_dataset=tok_dds['train'], eval_dataset=tok_dds['test'], tokenizer=tokz)

In [None]:
trainer.train()

In [None]:
# tok_dds['test']['labels'][0]
tok_dds['train']['attention_mask'][0]

In [None]:
for i in range(len(tok_dds['test']['input_ids'])):
    print(len(tok_dds['test']['input_ids'][i]))