### Load Dataset

In [1]:
from datasets import load_dataset

# Load the dataset
data_files = {
    "train": "drug-reviews-train.jsonl",
    "validation": "drug-reviews-validation.jsonl",
    "test": "drug-reviews-test.jsonl",
}
drug_dataset_reloaded = load_dataset("json", data_files=data_files)

# View one example
print(drug_dataset_reloaded['train'][0])


  from .autonotebook import tqdm as notebook_tqdm


{'patient_id': 89879, 'drugName': 'Cyclosporine', 'condition': 'keratoconjunctivitis sicca', 'review': '"I have used Restasis for about a year now and have seen almost no progress.  For most of my life I\'ve had red and bothersome eyes. After trying various eye drops, my doctor recommended Restasis.  He said it typically takes 3 to 6 months for it to really kick in but it never did kick in.  When I put the drops in it burns my eyes for the first 30 - 40 minutes.  I\'ve talked with my doctor about this and he said it is normal but should go away after some time, but it hasn\'t. Every year around spring time my eyes get terrible irritated  and this year has been the same (maybe even worse than other years) even though I\'ve been using Restasis for a year now. The only difference I notice was for the first couple weeks, but now I\'m ready to move on."', 'rating': 2.0, 'date': 'April 20, 2013', 'usefulCount': 69, 'review_length': 147}


### Labelling

In [2]:
# Step 1: Get all unique conditions from the train set
conditions = list(set(example['condition'] for example in drug_dataset_reloaded['train'] if example['condition']))

# Step 2: Create label mappings
label2id = {label: idx for idx, label in enumerate(sorted(conditions))}
id2label = {idx: label for label, idx in label2id.items()}

# Step 3: Apply the mapping
def encode_labels(example):
    example['label'] = label2id.get(example['condition'], -1)  # -1 for unknown/missing
    return example

# Map to all splits
encoded_dataset = drug_dataset_reloaded.map(encode_labels)
print(encoded_dataset['train'][0])


{'patient_id': 89879, 'drugName': 'Cyclosporine', 'condition': 'keratoconjunctivitis sicca', 'review': '"I have used Restasis for about a year now and have seen almost no progress.  For most of my life I\'ve had red and bothersome eyes. After trying various eye drops, my doctor recommended Restasis.  He said it typically takes 3 to 6 months for it to really kick in but it never did kick in.  When I put the drops in it burns my eyes for the first 30 - 40 minutes.  I\'ve talked with my doctor about this and he said it is normal but should go away after some time, but it hasn\'t. Every year around spring time my eyes get terrible irritated  and this year has been the same (maybe even worse than other years) even though I\'ve been using Restasis for a year now. The only difference I notice was for the first couple weeks, but now I\'m ready to move on."', 'rating': 2.0, 'date': 'April 20, 2013', 'usefulCount': 69, 'review_length': 147, 'label': 425}


### Tokenize Dataset

In [3]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Load DistilBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") #using this model is for building your own classifier, like predicting conditions from drug reviews (multiclass classification).

# Tokenization function
def tokenize(example):
    return tokenizer(example["review"], truncation=True, padding=True  )

# Apply tokenization to all splits
tokenized_dataset = encoded_dataset.map(tokenize, batched=True)

tokenized_dataset = tokenized_dataset.remove_columns(
    [col for col in drug_dataset_reloaded["train"].column_names if col not in ["review", "condition", "label"]]
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

Map: 100%|██████████| 27703/27703 [00:11<00:00, 2463.55 examples/s]


In [4]:
print(tokenized_dataset["train"][0])

{'condition': 'keratoconjunctivitis sicca', 'review': '"I have used Restasis for about a year now and have seen almost no progress.  For most of my life I\'ve had red and bothersome eyes. After trying various eye drops, my doctor recommended Restasis.  He said it typically takes 3 to 6 months for it to really kick in but it never did kick in.  When I put the drops in it burns my eyes for the first 30 - 40 minutes.  I\'ve talked with my doctor about this and he said it is normal but should go away after some time, but it hasn\'t. Every year around spring time my eyes get terrible irritated  and this year has been the same (maybe even worse than other years) even though I\'ve been using Restasis for a year now. The only difference I notice was for the first couple weeks, but now I\'m ready to move on."', 'label': 425, 'input_ids': [101, 1000, 1045, 2031, 2109, 2717, 21369, 2015, 2005, 2055, 1037, 2095, 2085, 1998, 2031, 2464, 2471, 2053, 5082, 1012, 2005, 2087, 1997, 2026, 2166, 1045, 10

In [5]:
tokenized_dataset["train"]

Dataset({
    features: ['condition', 'review', 'label', 'input_ids', 'attention_mask'],
    num_rows: 110811
})

In [6]:
# tokenized_dataset = tokenized_dataset.remove_columns( ['patient_id', 'drugName', 'rating', 'date', 'usefulCount', 'review_length','condition', 'review'])
tokenized_dataset = tokenized_dataset.remove_columns( ['condition', 'review'])

In [7]:
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

In [8]:
tokenized_dataset.set_format("torch")

In [9]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 46108
    })
})

In [10]:
["attention_mask", "input_ids", "labels"]

['attention_mask', 'input_ids', 'labels']

In [11]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_dataset["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_dataset["validation"], batch_size=8, collate_fn=data_collator
)

In [12]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 512]),
 'attention_mask': torch.Size([8, 512])}

### Setup Model

In [13]:
pip install accelerate


Note: you may need to restart the kernel to use updated packages.


In [22]:
from accelerate import Accelerator
from transformers import AutoModelForSequenceClassification, get_scheduler
from torch.optim import AdamW
from tqdm.auto import tqdm

accelerator = Accelerator()
print("Using device:", accelerator.device)

# How many classes?
num_labels = len(label2id)

# Load the model with correct output size
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)
optimizer = AdamW(model.parameters(), lr=3e-5)

train_dl, eval_dl, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

num_epochs = 3
num_training_steps = num_epochs * len(train_dl)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
progress_bar = tqdm(range(num_training_steps))

Using device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/41556 [20:40<?, ?it/s]


In [24]:
!nvidia-smi

Wed May  7 12:46:18 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 576.02                 Driver Version: 576.02         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 1050 Ti   WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   62C    P8            N/A  / 5001W |    3517MiB /   4096MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [31]:
import torch
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

print(torch.cuda.is_available())
print("device name:"+torch.cuda.get_device_name(0))

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
model.to(device)

True
device name:NVIDIA GeForce GTX 1050 Ti
cuda


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


### Setup Evaluation Computing Metrics

In [25]:
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)


### Training Arguments

In [26]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
)


### Create a Trainer

In [27]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


### Train then Model

In [None]:
progress_bar = tqdm(range(num_training_steps))
trainer.train()
for epoch in range(num_epochs):
    for batch in train_dl:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/41556 [05:12<?, ?it/s]


Epoch,Training Loss,Validation Loss
