# Training

In [2]:
# Read the tagging-classification CSV that the rest of the notebook trains on
import pandas as pd

TAG_DATASET_CSV = 'dataset/tagging_classification.csv'
tag_df = pd.read_csv(TAG_DATASET_CSV)

In [3]:
# Download HuggingFace's transformers and datasets library
!pip install transformers
!pip install datasets



In [4]:
# Turn the string tags into integer class ids so they can be used as labels
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the 'Tag' column, then store the encoded ids as 'label'
label_encoder = LabelEncoder().fit(tag_df['Tag'])
tag_df['label'] = label_encoder.transform(tag_df['Tag'])

# Number of distinct tags; passed to the classifier as its output size
num_labels = len(label_encoder.classes_)

In [7]:
# Build the tokenizer that belongs to the BioBERT checkpoint
from transformers import AutoTokenizer

BIOBERT_CHECKPOINT = 'dmis-lab/biobert-base-cased-v1.1'
tokenizer = AutoTokenizer.from_pretrained(BIOBERT_CHECKPOINT)

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/dmis-lab/biobert-base-cased-v1.1/resolve/main/config.json from cache at C:\Users\Reactive/.cache\huggingface\transformers\78e6e8ece5b58501028ce314273009ad7707ef4c5ba44251914fd6bca8a05eff.e4a2e693122d98b8b56b7dc1f0d89b644226aacef228afb5030ee3621b2829d3
Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.8.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file https://huggingface.co/dmis-lab/biobert-base-cased

In [8]:
# Wrap the DataFrame in a HuggingFace Dataset; the 'Value' column becomes 'text',
# which is the column the tokenization step reads
from datasets import Dataset

raw_datasets = Dataset.from_pandas(tag_df)
raw_datasets = raw_datasets.rename_column('Value', 'text')

# Tokenization callback (applied to the dataset by the map call below)
def tokenize_function(examples):
    """Tokenize the "text" entry of ``examples`` with the notebook's tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens
    (``padding="max_length"``, ``truncation=True``, ``max_length=512``).
    Returns whatever the tokenizer call returns.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )
# Run the tokenization callback over the dataset in batches
tokenized_datasets = raw_datasets.map(function=tokenize_function, batched=True)

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [9]:
# Perform a reproducible 70:30 split into train and test datasets
SPLIT_SEED = 67
TEST_FRACTION = .3

# Keep the split result under its own name. Rebinding `tokenized_datasets` to
# the returned DatasetDict made this cell fail when re-run, because the second
# run would call train_test_split on the DatasetDict instead of the Dataset.
split_datasets = tokenized_datasets.train_test_split(test_size=TEST_FRACTION, seed=SPLIT_SEED)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

In [None]:
# Write the train and test splits to CSV so other model trainings can reuse them
TRAIN_SPLIT_CSV = 'dataset/tagging_classification_train.csv'
TEST_SPLIT_CSV = 'dataset/tagging_classification_test.csv'

train_dataset.to_csv(path_or_buf=TRAIN_SPLIT_CSV)
eval_dataset.to_csv(path_or_buf=TEST_SPLIT_CSV)

In [None]:
# Load the pre-trained BioBERT weights with a sequence-classification head
# sized to the number of tags, ready for fine-tuning.
# The warning printed while loading is expected and can be ignored.
from transformers import AutoModelForSequenceClassification

BIOBERT_CHECKPOINT = 'dmis-lab/biobert-base-cased-v1.1'
model = AutoModelForSequenceClassification.from_pretrained(
    BIOBERT_CHECKPOINT,
    num_labels=num_labels,
)

In [None]:
# Define evaluation metrics for each evaluation checkpoint during training
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
def compute_metrics(p, average="macro"):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds one row of
            per-class scores per example (the argmax over axis 1 is taken as
            the predicted class); ``labels`` holds the true class ids.
        average: Averaging mode forwarded to the precision/recall/F1 scorers.
            Defaults to ``"macro"``: with exactly one label per example,
            ``"micro"`` averaging makes precision, recall and F1 all equal to
            accuracy, so the four reported numbers were redundant.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    # zero_division=0 matches the classification report later in the notebook:
    # a class that is never predicted scores 0 instead of raising a warning.
    prf_kwargs = dict(y_true=labels, y_pred=pred, average=average, zero_division=0)

    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=pred),
        "precision": precision_score(**prf_kwargs),
        "recall": recall_score(**prf_kwargs),
        "f1": f1_score(**prf_kwargs),
    }

In [None]:
# Configure the training run and build the Trainer that will execute it
from transformers import TrainingArguments, Trainer

# Batch size is kept at 1 because the GPU only has 4GB of CUDA memory (1050TI)
BATCH_SIZE = 1
# Directory passed to TrainingArguments as the output location
OUTPUT_PATH = 'biobert-cased-classification-klikdokter'

training_args = TrainingArguments(
    output_dir=OUTPUT_PATH,
    evaluation_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Fine-tune model: run training with the `trainer` configured in the previous cell.
# Left as the cell's last expression so the notebook displays what train() returns.
trainer.train()

# Model evaluation

In [10]:
from transformers import Trainer
from transformers import BertForSequenceClassification

# Fine-tuned checkpoint to evaluate; it must already exist on disk
CHECKPOINT_PATH = './biobert-cased-classification-klikdokter/checkpoint-5500'

model = BertForSequenceClassification.from_pretrained(CHECKPOINT_PATH, num_labels=num_labels)
test_trainer = Trainer(model)

# Read `.predictions` instead of unpacking into `_`: assigning to `_` overrides
# IPython's last-output variable for the rest of the session.
raw_pred = test_trainer.predict(eval_dataset).predictions

loading configuration file ./biobert-cased-classification-klikdokter/checkpoint-5500\config.json
Model config BertConfig {
  "_name_or_path": "dmis-lab/biobert-base-cased-v1.1",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_10": 10,
    "LABEL_11": 11,
    "LABEL_12": 12,
    "LABEL_13": 13,
    "LABEL_14": 14,
    "LABEL_15": 15,
    "L

In [12]:
# Derive the predicted class id per example, then map ids back to tag names
import numpy as np

# The highest-scoring column of each row is the predicted class
y_pred = np.asarray(raw_pred).argmax(axis=1)
y_pred_labeled = label_encoder.inverse_transform(y_pred)

In [13]:
# Pull the true class ids out of the evaluation split and map them back to tag names
y_actual = eval_dataset.to_pandas().loc[:, 'label']
y_actual_labeled = label_encoder.inverse_transform(y_actual)

In [14]:
# Show per-tag precision/recall/F1 for the evaluation split
from sklearn.metrics import classification_report

# zero_division=0: tags that are never predicted score 0 instead of warning
report_text = classification_report(
    y_true=y_actual_labeled,
    y_pred=y_pred_labeled,
    zero_division=0,
)
print(report_text)

              precision    recall  f1-score   support

     Artikel       0.00      0.00      0.00         2
      Gejala       0.40      0.41      0.41        56
       Kapan       0.60      0.50      0.55        30
       Objek       0.72      0.65      0.68        85
   Observasi       0.00      0.00      0.00         6
     Outcome       0.46      0.55      0.50        94
      Pasien       0.95      1.00      0.97        18
     Pembuka       0.72      0.91      0.81        34
     Penutup       1.00      0.73      0.85        45
    Penyakit       0.56      0.64      0.60        53
    Penyebab       0.68      0.57      0.62        76
     Periode       0.65      0.71      0.68        21
  Pertanyaan       0.80      0.70      0.74        23
  Prakondisi       0.16      0.17      0.17        52
   Referensi       0.00      0.00      0.00         2
    Tindakan       0.73      0.76      0.74       192
 Usia pasien       1.00      1.00      1.00        18

    accuracy              