<a href="https://colab.research.google.com/github/mahopman/IEBM-Net/blob/main/intervention_classifier/intervention_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
# Project root (presumably on a mounted Google Drive -- confirm) and the
# sub-directory that holds the intervention-classifier data and outputs.
local_path = '/content/drive/MyDrive/MS_DataScience/DS595/CTP'
classifier_path = local_path + '/intervention_classifier'

In [None]:
import random
import numpy as np
from transformers.file_utils import is_torch_available, is_tf_available

def set_seed(seed: int) -> None:
    """Seed the random number generators used by this notebook.

    Python's ``random`` module and NumPy are always seeded. PyTorch and
    TensorFlow are seeded only when ``transformers`` reports them as
    available.

    Args:
        seed: Value handed to each library's seeding function.
    """
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        # Imported locally (like TensorFlow below) so this function does not
        # depend on another cell having already executed ``import torch``.
        import torch

        torch.manual_seed(seed)
        # Safe to call even if CUDA is not available.
        torch.cuda.manual_seed_all(seed)
    if is_tf_available():
        import tensorflow as tf

        tf.random.set_seed(seed)

In [None]:
import torch

class InterventionDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with per-example labels.

    ``encodings`` is a mapping from field name to an indexable collection
    holding one entry per example; ``labels`` is an indexable sequence with
    one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is taken from the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including "labels"."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # The label is wrapped in a list, so the resulting tensor has shape (1,).
        sample["labels"] = torch.tensor([self.labels[idx]])
        return sample

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(pred):
    """Compute precision, recall, accuracy and F1 for a set of predictions.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores, reduced to a class index with
            ``argmax`` over the last axis).

    Returns:
        A dict with the keys ``'precision'``, ``'recall'``, ``'accuracy'``
        and ``'f1'``, each computed with the scikit-learn function of the
        same name using its default settings.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    return {
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
    }

In [None]:
import json

def _load_split(name):
    """Parse ``<classifier_path>/<name>.json`` and return its content."""
    # A context manager closes the file handle as soon as parsing finishes,
    # instead of leaving it open until garbage collection.
    with open(f'{classifier_path}/{name}.json', encoding='utf-8') as fh:
        return json.load(fh)


def _is_drug(example):
    """Return 1 when 'DRUG' occurs in the example's 'label' field, else 0."""
    # NOTE(review): `in` is a substring test if 'label' is a string and a
    # membership test if it is a list -- confirm which the data uses.
    return 1 if 'DRUG' in example['label'] else 0


# Each split is a sequence of records with at least 'text' and 'label' keys.
train = _load_split('train')
test = _load_split('test')
val = _load_split('val')

# Model inputs: the raw text of every record.
X_train = [x['text'] for x in train]
X_test = [x['text'] for x in test]
X_val = [x['text'] for x in val]

# Binary targets: 1 for records labelled with 'DRUG', 0 otherwise.
y_train = [_is_drug(x) for x in train]
y_test = [_is_drug(x) for x in test]
y_val = [_is_drug(x) for x in val]

In [None]:
from transformers import BertTokenizerFast, BertForSequenceClassification

# Seed the random number generators before building the tokenizer and model.
set_seed(1)

model_name = 'bert-base-uncased'

tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

# X_train / X_val are plain Python lists (they have no `.to_list()` method),
# so they are passed to the tokenizer directly.
# NOTE(review): validation now uses the `val` split instead of `test`, so the
# test split stays held out; revert to X_test / y_test if that was intended.
train_encodings = tokenizer(X_train, truncation=True, padding=True)
valid_encodings = tokenizer(X_val, truncation=True, padding=True)

# The dataset's second argument is the label list (y_*), not the raw texts.
train_dataset = InterventionDataset(train_encodings, y_train)
valid_dataset = InterventionDataset(valid_encodings, y_val)

# Two output classes, matching the 0/1 targets; the model is moved to the GPU.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to("cuda")

In [None]:
from transformers import TrainingArguments

# Hyper-parameters and bookkeeping settings handed to the Trainer.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir=f'{classifier_path}/results',
    logging_dir=f'{classifier_path}/logs',
    # Optimisation schedule.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=20,
    # Step-based evaluation, with logging and checkpointing every 400 steps.
    evaluation_strategy="steps",
    logging_steps=400,
    save_steps=400,
    load_best_model_at_end=True,
)

In [None]:
from transformers import Trainer

# Bundle the model, its training configuration, both datasets and the metric
# function into a single Trainer instance.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Start training with the configured Trainer.
trainer.train()

In [None]:
# Evaluate the model on the Trainer's evaluation dataset. The keys read below
# are the names returned by `compute_metrics` with an "eval_" prefix.
results = trainer.evaluate()

precision = results['eval_precision']
recall = results['eval_recall']
accuracy = results['eval_accuracy']
# Previously this line re-read 'eval_precision', so F1 was never extracted.
f1 = results['eval_f1']