# DistilBert model training code

## This code is based on the Hugging Face tutorial on [sequence classification](https://huggingface.co/docs/transformers/tasks/sequence_classification)

In [None]:
# import the data
# Each subset name states the minimum annotator agreement required for a sentence.
from datasets import load_dataset

training_data = load_dataset("financial_phrasebank", "sentences_allagree")
agree75_data = load_dataset("financial_phrasebank", "sentences_75agree")

# "sentences_75agree" (>= 75% agreement) also contains the sentences of
# "sentences_allagree" (100% agreement). Remove them so that validation only
# uses sentences the model was not trained on; otherwise the validation
# metrics are inflated by training examples.
training_sentences = set(training_data["train"]["sentence"])
valitdation_data = agree75_data.filter(
    lambda example: example["sentence"] not in training_sentences
)

In [38]:
# tokenize training and validation set
# (first step: load the tokenizer that belongs to the DistilBERT checkpoint)
from transformers import AutoTokenizer

tokenizer_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

def preprocess_function(examples):
    """Tokenize the ``sentence`` field of the given examples.

    Args:
        examples: Mapping with a ``"sentence"`` entry holding the text to
            tokenize.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text, with
        ``truncation=True`` applied.
    """
    sentences = examples["sentence"]
    encoded = tokenizer(sentences, truncation=True)
    return encoded

# Run the tokenizer over both datasets in batches (training data first).
train_tokens, val_tokens = (
    dataset.map(preprocess_function, batched=True)
    for dataset in (training_data, valitdation_data)
)

train_tokens

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2264
    })
})

In [39]:
# import the data collator, which is used to create batches out of the training set
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    return_tensors="tf",  # the batches feed a TensorFlow model
)

In [40]:
# set and calculate the model's hyperparameters
from transformers import create_optimizer
import tensorflow as tf

batch_size = 16
num_epochs = 3

# Only full batches are counted; integer division discards a trailing partial batch.
num_train_examples = len(train_tokens["train"])
batches_per_epoch = num_train_examples // batch_size
total_train_steps = num_epochs * batches_per_epoch

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=total_train_steps,
    num_warmup_steps=0,
)

batches_per_epoch

141

In [41]:
# Mapping from class index to class name.
id2label = {
    0: "negative",
    2: "positive",
    1: "neutral",
}
# Inverse mapping (class name -> class index), derived from the one above.
label2id = {label: index for index, label in id2label.items()}

In [42]:
# import the model: DistilBERT checkpoint with a 3-label sequence-classification setup
from transformers import TFDistilBertForSequenceClassification

label_config = dict(num_labels=3, id2label=id2label, label2id=label2id)
model = TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    **label_config,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [43]:
# Wrap the tokenized datasets as batched TensorFlow datasets for `model.fit`.
# `batch_size` is the same value that was used to size the learning-rate
# schedule, so the step count and the real batches stay in sync.

# Training batches are shuffled.
tf_train_set = model.prepare_tf_dataset(
    train_tokens["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Validation data is NOT shuffled: order does not matter for evaluation, and
# `prepare_tf_dataset` defaults `drop_remainder` to the value of `shuffle`, so
# shuffling would silently drop the final partial batch from every evaluation.
tf_validation_set = model.prepare_tf_dataset(
    val_tokens["train"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [44]:
# Only the optimizer is configured; no `loss` is passed.
# NOTE(review): this presumably relies on the Transformers TF model falling
# back to its built-in task loss — confirm.
model.compile(optimizer=optimizer)

In [45]:
import evaluate
import numpy as np

# Load the three classification metrics used by the compute_* helpers.
accuracy, recall, precision = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "recall", "precision")
)

def compute_acc(eval_pred):
    """Compute accuracy for a ``(predictions, labels)`` pair.

    Args:
        eval_pred: Two-item sequence ``(predictions, labels)``. The predicted
            class of each row is the argmax of ``predictions`` along axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted classes.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)
    
def compute_recall(eval_pred, average="macro"):
    """Compute multi-class recall for a ``(predictions, labels)`` pair.

    The recall metric defaults to ``average="binary"``, which is rejected for
    targets with more than two classes (this model has three), so an
    averaging mode is always passed explicitly.

    Args:
        eval_pred: Two-item sequence ``(predictions, labels)``. The predicted
            class of each row is the argmax of ``predictions`` along axis 1.
        average: Averaging mode forwarded to the metric (for example
            ``"macro"``, ``"micro"`` or ``"weighted"``). Defaults to
            ``"macro"``.

    Returns:
        The result of ``recall.compute`` for the predicted classes.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return recall.compute(predictions=predictions, references=labels, average=average)
    
def compute_pre(eval_pred, average="macro"):
    """Compute multi-class precision for a ``(predictions, labels)`` pair.

    The precision metric defaults to ``average="binary"``, which is rejected
    for targets with more than two classes (this model has three), so an
    averaging mode is always passed explicitly.

    Args:
        eval_pred: Two-item sequence ``(predictions, labels)``. The predicted
            class of each row is the argmax of ``predictions`` along axis 1.
        average: Averaging mode forwarded to the metric (for example
            ``"macro"``, ``"micro"`` or ``"weighted"``). Defaults to
            ``"macro"``.

    Returns:
        The result of ``precision.compute`` for the predicted classes.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return precision.compute(predictions=predictions, references=labels, average=average)

In [46]:
# used for authenticating so that a trained model can be pushed to the Hugging Face Hub
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [49]:
from transformers.keras_callbacks import KerasMetricCallback
from transformers import EarlyStoppingCallback
from transformers import PushToHubCallback

# evaluates `compute_acc` on the validation set
metric_callback = KerasMetricCallback(
    metric_fn=compute_acc,
    eval_dataset=tf_validation_set,
)

# callback used to upload the model to Hugging Face at the end of each epoch
# please comment it out (and remove it from `callbacks`) if you want to run this code
push_to_hub_callback = PushToHubCallback(
    output_dir="fsentiment",
    tokenizer=tokenizer,
    hub_model_id="Dave12121/Fsentiment",
)

# NOTE(review): early stopping watches the training loss with patience=3, and
# training runs for 3 epochs, so it cannot stop the run early — confirm this
# is intended.
callbacks = [
    metric_callback,
    tf.keras.callbacks.EarlyStopping(monitor="loss", patience=3),
    push_to_hub_callback,
    # additional set of metrics not used here
    # KerasMetricCallback(metric_fn=compute_recall, eval_dataset=tf_validation_set),
    # KerasMetricCallback(metric_fn=compute_pre, eval_dataset=tf_validation_set),
]

Cloning https://huggingface.co/Dave12121/Fsentiment into local empty directory.


In [50]:
# Train for `num_epochs`, the same value that was used to size the
# learning-rate schedule, so the two cannot get out of sync.
model.fit(
    x=tf_train_set,
    validation_data=tf_validation_set,
    epochs=num_epochs,
    callbacks=callbacks,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fb4341ca890>