# In [None]:
import datasets
from transformers import AutoTokenizer, DataCollatorForTokenClassification, TFAutoModelForTokenClassification, create_optimizer
from transformers.keras_callbacks import KerasMetricCallback
import evaluate
import numpy as np
import tensorflow as tf
import pandas as pd


# Load the labelled corpus. Column 0 is split on whitespace into tokens and
# column 1 is iterated as a sequence of tag names per sentence.
# NOTE(review): unpickling can execute arbitrary code -- only load files from
# a trusted source.
df = pd.read_pickle('/content/labeled_sentences.pkl')

# Tag name -> class id.
label2id = {
    "O": 0,  # Outside of a named entity
    "B-drug": 1,
}

# Class id -> tag name (the inverse of the mapping above).
id2label = {class_id: tag_name for tag_name, class_id in label2id.items()}

raw_sentences = df[0].values.tolist()
raw_tag_sequences = df[1].values.tolist()

# One list of whitespace-separated tokens per sentence.
tokens_train = [sentence.split() for sentence in raw_sentences]

# One list of integer class ids per sentence.
# NOTE(review): assumes each tag sequence has one tag per whitespace token of
# the matching sentence -- confirm against how the pickle was produced.
ner_tags_train = [
    [label2id[tag_name] for tag_name in tag_sequence]
    for tag_sequence in raw_tag_sequences
]

# Pair tokens with their tags, one record per sentence.
train_records = [
    {'tokens': sentence_tokens, 'ner_tags': sentence_tags}
    for sentence_tokens, sentence_tags in zip(tokens_train, ner_tags_train)
]
drugs_train = datasets.Dataset.from_pandas(pd.DataFrame(train_records))

# Only a 'train' split is created here.
drugs = datasets.DatasetDict({'train': drugs_train})


def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Uses the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with a ``"tokens"`` entry (one list of words
            per sentence) and a ``"ner_tags"`` entry (one list of tag ids per
            sentence, indexed by word position).

    Returns:
        The tokenizer output with an added ``"labels"`` entry. For every
        sentence it holds one id per produced token: the word's tag on the
        first token of each word, and -100 on special tokens and on the
        remaining tokens of a word.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        # word_ids maps each produced token to the index of its source word
        # (None for special tokens).
        token_word_ids = encoded.word_ids(batch_index=batch_idx)

        sentence_labels = []
        last_seen_word = None
        for current_word in token_word_ids:
            # Only the first token of a real word carries that word's tag.
            starts_word = current_word is not None and current_word != last_seen_word
            sentence_labels.append(word_tags[current_word] if starts_word else -100)
            last_seen_word = current_word

        aligned_labels.append(sentence_labels)

    encoded["labels"] = aligned_labels
    return encoded


def compute_metrics(p):
    """Score predictions with seqeval, skipping positions labelled -100.

    Uses the module-level ``label_list`` (class id -> tag name) and
    ``seqeval`` metric object.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores with the class dimension on axis 2; ``labels`` holds the
            reference class ids, with -100 marking positions to ignore.

    Returns:
        A dict with the overall ``precision``, ``recall``, ``f1`` and
        ``accuracy`` reported by seqeval.
    """
    class_scores, reference_ids = p
    predicted_ids = np.argmax(class_scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        # Keep only the positions that carry a real reference label.
        kept_pairs = [
            (predicted_id, reference_id)
            for predicted_id, reference_id in zip(predicted_row, reference_row)
            if reference_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept_pairs])
        true_labels.append([label_list[reference_id] for _, reference_id in kept_pairs])

    scores = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": scores["overall_precision"],
        "recall": scores["overall_recall"],
        "f1": scores["overall_f1"],
        "accuracy": scores["overall_accuracy"],
    }



# ---------------------------------------------------------------------------
# Training script: tokenize the corpus, build the model and fine-tune it.
# ---------------------------------------------------------------------------

# Tag names ordered by class id, so label_list[i] is the name of class i
# (compute_metrics indexes this list with predicted / reference ids).
label_list = [id2label[class_id] for class_id in range(len(id2label))]

# Single source of truth for the hyper-parameters used below.
model_checkpoint = "bert-base-uncased"
batch_size = 32
num_train_epochs = 3

# tokenize_and_align_labels reads this module-level tokenizer, so it has to
# exist before the dataset is mapped.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenized_drugs = drugs.map(tokenize_and_align_labels, batched=True)

# compute_metrics reads this module-level metric object.
seqeval = evaluate.load("seqeval")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

tf_train_set = model.prepare_tf_dataset(
    tokenized_drugs["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
    prefetch=False,
)

# The learning-rate schedule decays over exactly num_train_steps, so this must
# match the number of optimizer steps that fit() really performs; otherwise
# any extra epochs run after the decay has finished.
# NOTE(review): the floor division assumes the incomplete last batch is
# dropped (prepare_tf_dataset's documented default when shuffle=True).
num_train_steps = (len(tokenized_drugs["train"]) // batch_size) * num_train_epochs
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-4,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

# No loss argument: Transformers TF models fall back to their built-in task
# loss when compiled without one.
model.compile(optimizer=optimizer)

# NOTE(review): metrics are computed on the training set because no
# validation split exists in `drugs`; they do not measure generalization.
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_train_set)

# NOTE: pushing to the Hub is intentionally not wired in. To enable it, import
# PushToHubCallback from transformers.keras_callbacks and append an instance
# to this list.
callbacks = [metric_callback]

model.fit(x=tf_train_set, epochs=num_train_epochs, callbacks=callbacks)