### Load Dataset

In [None]:
from datasets import load_dataset

# Map each split name to the JSON Lines file that stores it.
data_files = dict(
    train="drug-reviews-train.jsonl",
    validation="drug-reviews-validation.jsonl",
    test="drug-reviews-test.jsonl",
)
drug_dataset_reloaded = load_dataset("json", data_files=data_files)

# Peek at the first training record.
first_train_record = drug_dataset_reloaded['train'][0]
print(first_train_record)


### Labelling

In [None]:
# Step 1: Get all unique, non-empty conditions from the train set.
# Only the `condition` column is read, so full examples are not materialised.
conditions = list(
    {condition for condition in drug_dataset_reloaded['train']['condition'] if condition}
)

# Step 2: Create label mappings (sorted, so the ids do not depend on set order)
label2id = {label: idx for idx, label in enumerate(sorted(conditions))}
id2label = {idx: label for label, idx in label2id.items()}

# Step 3: Apply the mapping
def encode_labels(example):
    """Attach an integer ``label`` for the example's ``condition``.

    Args:
        example: a single dataset row containing a ``condition`` field.

    Returns:
        The same row with ``label`` set to the id from ``label2id``, or -1
        when the condition is missing or was never seen in the train split.
    """
    example['label'] = label2id.get(example['condition'], -1)  # -1 for unknown/missing
    return example

# Map to all splits, then drop rows that received the -1 sentinel: -1 is not a
# valid class index, so keeping those rows would make the classification loss
# fail as soon as one of them reaches the model.
encoded_dataset = drug_dataset_reloaded.map(encode_labels).filter(
    lambda example: example['label'] != -1
)
print(encoded_dataset['train'][0])


### Tokenize Dataset

In [None]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Load the DistilBERT tokenizer. This checkpoint is used to build our own
# classifier: predicting the condition from a drug review (multiclass
# classification).
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenization function
def tokenize(example):
    """Tokenize the ``review`` text of a batch of examples.

    Only truncation is applied here. Padding is deliberately left out: with
    ``map(..., batched=True)`` it would pad every review to the longest one in
    its map batch and store those pad tokens, while ``DataCollatorWithPadding``
    already pads each training batch to its own longest sequence.

    Args:
        example: a batch (dict of lists) with a ``review`` column.

    Returns:
        The tokenizer output for the reviews, truncated to the model's
        maximum length.
    """
    return tokenizer(example["review"], truncation=True)

# Tokenize every split in batches.
tokenized_dataset = encoded_dataset.map(tokenize, batched=True)

# Drop every column of the raw dataset except "label".
raw_columns = drug_dataset_reloaded["train"].column_names
columns_to_drop = [name for name in raw_columns if name != "label"]
tokenized_dataset = tokenized_dataset.remove_columns(columns_to_drop)

# Collator that pads the examples of a batch and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [None]:
# Inspect the first tokenized training example.
first_tokenized_example = tokenized_dataset["train"][0]
print(first_tokenized_example)

In [None]:
# Display the tokenized train split as the cell output.
train_split = tokenized_dataset["train"]
train_split

In [None]:
# Rename the target column from "label" to "labels".
tokenized_dataset = tokenized_dataset.rename_column(
    original_column_name="label",
    new_column_name="labels",
)
# Make the dataset return PyTorch tensors.
tokenized_dataset.set_format(type="torch")

In [None]:
# Show the columns that are actually left in the train split (a hard-coded
# list here would display the same thing no matter what the dataset contains).
tokenized_dataset["train"].column_names

In [None]:
from torch.utils.data import DataLoader

# Settings shared by both loaders: batches of 8, collated with data_collator.
loader_kwargs = dict(batch_size=8, collate_fn=data_collator)

# Only the training loader shuffles its examples.
train_dataloader = DataLoader(tokenized_dataset["train"], shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(tokenized_dataset["validation"], **loader_kwargs)

In [None]:
# Pull a single batch from the training loader and show each tensor's shape.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

### Setup Model

In [None]:
from accelerate import Accelerator
from transformers import AutoModelForSequenceClassification, get_scheduler
from torch.optim import AdamW
from tqdm.auto import tqdm

accelerator = Accelerator()
print("Using device:", accelerator.device)

# One class per entry in the label mapping.
num_labels = len(label2id)

# Load DistilBERT with a classification head sized to num_labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    label2id=label2id,
    id2label=id2label,
    num_labels=num_labels,
)
optimizer = AdamW(params=model.parameters(), lr=3e-5)

# Hand the loaders, model and optimizer to Accelerate; the prepared objects
# are unpacked in the same order they are passed in.
train_dl, eval_dl, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

# Linear schedule without warmup, spanning every step of every epoch.
num_epochs = 3
num_training_steps = len(train_dl) * num_epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
progress_bar = tqdm(range(num_training_steps))

In [None]:
import torch
import os
# NOTE(review): CUDA_VISIBLE_DEVICES presumably has no effect once CUDA has
# been initialised in this process (the Accelerator cell runs earlier) -
# confirm, and move this to the top of the notebook if GPU pinning is required.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    # Only query the device name when a GPU exists; calling it unconditionally
    # raises on CPU-only machines and the CPU fallback below is never reached.
    print("device name:"+torch.cuda.get_device_name(0))

device = torch.device("cuda") if cuda_available else torch.device("cpu")
print(device)
# Module.to returns the module itself; assigning the result keeps the cell
# from echoing the full model repr as its output.
model = model.to(device)

### Set Up Evaluation Metrics

In [None]:
import evaluate

# Accuracy metric loaded through the `evaluate` library.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for a ``(logits, labels)`` pair.

    The predicted class of each example is the argmax over the last axis of
    ``logits``; the result is whatever ``accuracy.compute`` returns.
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(axis=-1)
    return accuracy.compute(references=labels, predictions=predicted_ids)


### Training Arguments

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Output locations and checkpoint retention
    output_dir="./results",
    logging_dir="./logs",
    save_total_limit=2,
    # Evaluate and save once per epoch; reload the best model by accuracy
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Optimisation hyperparameters
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)


### Create a Trainer

In [None]:
from transformers import Trainer

# Gather the Trainer inputs first so each one is easy to inspect or swap.
trainer_inputs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_inputs)


### Train the Model

In [None]:
# This notebook sets up two independent training paths: the Hugging Face
# `Trainer` and a hand-written Accelerate loop. Running both back to back
# fine-tunes the same model twice, so exactly one of them is executed.
USE_TRAINER = True  # set to False to run the manual Accelerate loop instead

if USE_TRAINER:
    # The Trainer drives training and the per-epoch evaluation/checkpointing
    # configured in `training_args`.
    trainer.train()
else:
    progress_bar = tqdm(range(num_training_steps))
    # Models returned by `from_pretrained` start in evaluation mode, so switch
    # to training mode to make dropout active during fine-tuning.
    model.train()
    for epoch in range(num_epochs):
        for batch in train_dl:
            outputs = model(**batch)
            loss = outputs.loss
            # Backward pass goes through Accelerate rather than loss.backward().
            accelerator.backward(loss)

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)