# Digital Methods - BERT Model
_____

## Table of Content

1. [Libraries](#libraries)
2. [Processing](#preprocessing)
3. [BERT model](#bert-model)
_____

## Libraries

All libraries which are needed to execute the code are listed here. Install the packages by using the `requirements.txt` file. 

The documentation can be found in the [README.md](README.md) file.

In [1]:
# load required libraries
import pandas as pd
from datasets import Dataset
import numpy as np
import logging
from sklearn.model_selection import (
    train_test_split)
from sklearn.metrics import (
    accuracy_score, precision_score, 
    recall_score, f1_score)
from transformers import (
    pipeline, Trainer, TrainingArguments, 
    AutoTokenizer, AutoModelForSequenceClassification)
logging.getLogger("datasets").setLevel(logging.WARNING)

## Preprocessing 

splitting into test and train data

In [2]:
#keep columns for text and label
columns_to_keep = ['text', 'super_claim']

#load data
dataset = pd.read_excel('immersion_journal/immersion_journal.xlsx', 
                        header = 2, usecols=columns_to_keep)

#convert to numeric
dataset['super_claim'] = pd.to_numeric(dataset['super_claim'], 
                                       errors='coerce', downcast='integer')
dataset.dropna(inplace=True)

#rename columns
dataset = dataset.rename(columns={'super_claim': 'label'})

#preprocess single labels
dataset = dataset[~dataset['label'].isin([7, 8])]
dataset['label'] = dataset['label'].fillna(0).astype(int)


In [3]:
#transfer to data dictionary
dataset = Dataset.from_pandas(dataset)

In [4]:
# Split the dataset into train and test sets
train_test_split = dataset.train_test_split(test_size=0.3, 
                                            shuffle= False)  

# Separate train and test sets
test = train_test_split['test']
train = train_test_split['train']

## BERT Model

- Create BERT Model from [Hugging Face](https://huggingface.co/prajjwal1/bert-medium)
- Train the predict 6 categories

In [5]:
#load pretrained BERT model
model_name = "prajjwal1/bert-medium"
# 8 classes in the model
model = AutoModelForSequenceClassification.from_pretrained(model_name, 
                                                           num_labels=6)
model.to("cpu")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-medium and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 512, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-7): 8 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): LayerNorm((512,), eps=1e-1

In [6]:
# tokenizer that fits with the BERT model we use currently
tokenizer = AutoTokenizer.from_pretrained(model_name)

#function to apply that tokenizer once
#only the first 512 tokens with only_first
def tokenize(dataset):
    return tokenizer(dataset["text"], padding=True, max_length=512, 
                     truncation='only_first')

#apply the tokenizer to each row in the dataset
tokenized_train = train.map(tokenize, batched=True)
tokenized_test = test.map(tokenize, batched=True)



Map:   0%|          | 0/74 [00:00<?, ? examples/s]

Map:   0%|          | 0/33 [00:00<?, ? examples/s]

In [7]:
#compute metrics of model
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    
    # Calculate accuracy, precision, recall and f1
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [8]:
training_args = TrainingArguments(output_dir="my_trainer",
                                  eval_strategy="steps",
                                  num_train_epochs=3.0, 
                                  per_device_train_batch_size=12,
                                  eval_steps=500)

In [9]:
# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

In [10]:
#fine-tune the model
trainer.train()

  0%|          | 0/21 [00:00<?, ?it/s]

{'train_runtime': 51.3165, 'train_samples_per_second': 4.326, 'train_steps_per_second': 0.409, 'train_loss': 1.5228848230271113, 'epoch': 3.0}


TrainOutput(global_step=21, training_loss=1.5228848230271113, metrics={'train_runtime': 51.3165, 'train_samples_per_second': 4.326, 'train_steps_per_second': 0.409, 'total_flos': 17380928286720.0, 'train_loss': 1.5228848230271113, 'epoch': 3.0})

In [11]:
#evaluate model
trainer.evaluate()

  0%|          | 0/5 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6678895950317383,
 'eval_accuracy': 0.6363636363636364,
 'eval_precision': 0.4049586776859504,
 'eval_recall': 0.6363636363636364,
 'eval_f1': 0.4949494949494949,
 'eval_runtime': 1.4304,
 'eval_samples_per_second': 23.07,
 'eval_steps_per_second': 3.495,
 'epoch': 3.0}