<a href="https://colab.research.google.com/github/mahopman/IEBM-Net/blob/main/intervention_classifier/intervention_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install accelerate transformers==4.27.4 keras

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.27.4
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.27.4)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.10

In [None]:
# Root folder holding the project data and the pretrained checkpoint.
# NOTE(review): this points into Google Drive, but no cell shown here mounts
# Drive — confirm it is mounted before running the cells that read from it.
local_path = '/content/drive/MyDrive/MS_DataScience/DS595/IEBM-Net_Data'

# Sub-folder with the classifier's train/test JSON files and its outputs.
classifier_path = local_path + '/intervention_classifier'

In [None]:
import random
import numpy as np
from transformers.file_utils import is_torch_available, is_tf_available

def set_seed(seed: int):
    """Seed every random number generator this notebook may rely on.

    Seeds Python's ``random`` module and NumPy unconditionally, then PyTorch
    and TensorFlow when the corresponding framework is reported as available.

    Args:
        seed: Integer seed applied to each generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        # Imported locally so the function works on a fresh kernel even when
        # it is called before any cell has imported torch.
        import torch

        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # ^^ safe to call this function even if cuda is not available
    if is_tf_available():
        import tensorflow as tf

        tf.random.set_seed(seed)

In [None]:
import torch

class InterventionDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    ``encodings`` is a mapping from field name to a per-example sequence, and
    ``labels`` is a sequence indexed by the same example positions.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is driven by the label list.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including a ``labels`` entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is wrapped in a list, so the resulting tensor has shape (1,).
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    """Score a prediction object with precision, recall, accuracy and F1.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        Dict with the keys ``precision``, ``recall``, ``accuracy`` and ``f1``,
        each computed by the matching scikit-learn scorer with its default
        settings.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    scorers = {
        'precision': precision_score,
        'recall': recall_score,
        'accuracy': accuracy_score,
        'f1': f1_score,
    }
    return {name: scorer(y_true, y_pred) for name, scorer in scorers.items()}

In [None]:
import json

# Load the train/test splits; the context managers close each file handle as
# soon as its contents have been parsed.
with open(f'{classifier_path}/train.json', encoding='utf-8') as train_file:
    train = json.load(train_file)
with open(f'{classifier_path}/test.json', encoding='utf-8') as test_file:
    test = json.load(test_file)

# Model inputs: the 'passage' field of every entry, in file order.
X_train = [example['passage'] for example in train.values()]
X_test = [example['passage'] for example in test.values()]

# Binary targets: 1 when 'DRUG' occurs in the entry's 'intervention' field, else 0.
y_train = [1 if 'DRUG' in example['intervention'] else 0 for example in train.values()]
y_test = [1 if 'DRUG' in example['intervention'] else 0 for example in test.values()]

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Directory holding the pretrained checkpoint and its vocabulary.
model_path = f'{local_path}/biobert-v1.1'

# NOTE(review): do_lower_case=True lower-cases input text before tokenizing;
# if this checkpoint ships a cased vocabulary that would be a mismatch — confirm.
tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)

# Explicit truncation cap so over-long passages are cut even when the local
# tokenizer files do not declare a maximum length.
# Assumes the checkpoint uses the standard BERT limit of 512 positions — TODO confirm.
max_length = 512

train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(X_test, truncation=True, padding=True, max_length=max_length)

# The held-out test split doubles as the evaluation set during training.
train_dataset = InterventionDataset(train_encodings, y_train)
valid_dataset = InterventionDataset(valid_encodings, y_test)

# Use the GPU when one is present, but fall back to CPU instead of crashing.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2).to(device)

In [None]:
from transformers import TrainingArguments

# Training configuration for the fine-tuning run.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=f'{classifier_path}/biobert-v1.1_results',
    logging_dir=f'{classifier_path}/biobert-v1.1_logs',
    # Optimisation schedule.
    num_train_epochs=1,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=20,
    # Step-based evaluation, with logging and checkpointing every 400 steps;
    # the best checkpoint is reloaded once training finishes.
    evaluation_strategy="steps",
    logging_steps=400,
    save_steps=400,
    load_best_model_at_end=True,
)

In [None]:
from transformers import Trainer

# Bundle the model, its training configuration, both datasets and the metric
# callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [None]:
# Run fine-tuning with the trainer built in the previous cell.
trainer.train()

In [None]:
import pandas as pd

# Evaluate the trainer on its evaluation dataset and collect the metrics dict.
results = trainer.evaluate()

# Persist the metrics as a single-row CSV next to the classifier data.
results_df = pd.DataFrame(results, index=[0])
results_df.to_csv(f'{classifier_path}/results.csv')

# Print the headline metrics, one per line.
for metric_label, metric_key in (
    ('Precision', 'eval_precision'),
    ('Recall', 'eval_recall'),
    ('Accuracy', 'eval_accuracy'),
    ('F1', 'eval_f1'),
):
    print(f'{metric_label}: {results[metric_key]}')