# Installing Libraries

In [None]:
# Clean install dependencies (optional but safest)
# Uninstalls any existing transformers / huggingface_hub, reinstalls both
# without using the pip cache, then upgrades datasets.
# NOTE(review): no versions are pinned, so a later re-run may install
# different releases than the ones recorded in the output below.
!pip uninstall -y transformers huggingface_hub
!pip install --no-cache-dir transformers huggingface_hub
!pip install --upgrade datasets

Found existing installation: transformers 4.54.0
Uninstalling transformers-4.54.0:
  Successfully uninstalled transformers-4.54.0
Found existing installation: huggingface-hub 0.34.1
Uninstalling huggingface-hub-0.34.1:
  Successfully uninstalled huggingface-hub-0.34.1
Collecting transformers
  Downloading transformers-4.54.1-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface_hub
  Downloading huggingface_hub-0.34.3-py3-none-any.whl.metadata (14 kB)
Downloading transformers-4.54.1-py3-none-any.whl (11.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m193.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-0.34.3-py3-none-any.whl (558 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m558.8/558.8 kB[0m [31m339.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingfac

In [None]:
# If needed
#!pip uninstall -y torch torchvision torchaudio
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

# Importing Libraries and Loading data

In [None]:
# 🧠 Import packages
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Read the MedMCQA train and test parquet shards from the Hugging Face Hub
# into pandas DataFrames.
splits = dict(
    train='data/train-00000-of-00001.parquet',
    test='data/test-00000-of-00001.parquet',
)
train_df, test_df = (
    pd.read_parquet(f"hf://datasets/openlifescienceai/medmcqa/{splits[name]}")
    for name in ("train", "test")
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## We first fine-tuned BERT on a small sample of the training data

# BERT Model with 5% Sample

In [None]:
# Read the MedMCQA train and test parquet shards from the Hugging Face Hub.
splits = dict(
    train='data/train-00000-of-00001.parquet',
    test='data/test-00000-of-00001.parquet',
)
train_df, test_df = (
    pd.read_parquet(f"hf://datasets/openlifescienceai/medmcqa/{splits[name]}")
    for name in ("train", "test")
)

# Keep a seeded 5% sample of the training rows; every test row is kept
# (frac=1), only shuffled with the same seed.
train_df = train_df.sample(frac=0.05, random_state=42).copy()
test_df = test_df.sample(frac=1, random_state=42).copy()

# Integer-encode the subject names. The encoder is fitted on the sampled
# training rows only and then applied to the test rows.
label_encoder = LabelEncoder().fit(train_df["subject_name"])
train_df["label"] = label_encoder.transform(train_df["subject_name"])
test_df["label"] = label_encoder.transform(test_df["subject_name"])
id2label = dict(enumerate(label_encoder.classes_))
label2id = {subject: idx for idx, subject in id2label.items()}

# Wrap both frames as Hugging Face datasets.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

# Tokenizer that matches the bert-base-uncased checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def tokenize(batch):
    """Tokenize the ``question`` entries of a batch for ``Dataset.map``.

    Sequences are truncated to the tokenizer's maximum length but are not
    padded here. The Trainer in this cell is given the tokenizer, so its
    default collator pads each batch to its longest sequence, instead of
    every example being padded to the maximum length up front.

    Args:
        batch: Mapping whose ``"question"`` entry holds the question strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch["question"], truncation=True)
# Run the tokenizer over both splits in batches.
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Strip every column that is neither a tokenizer model input nor "label",
# then have each dataset return torch tensors.
keep_cols = [*tokenizer.model_input_names, "label"]
train_dataset = train_dataset.remove_columns(
    [name for name in train_dataset.column_names if name not in keep_cols]
)
train_dataset.set_format("torch")
test_dataset = test_dataset.remove_columns(
    [name for name in test_dataset.column_names if name not in keep_cols]
)
test_dataset.set_format("torch")

# bert-base-uncased with a sequence-classification head sized to the label
# set; both id/label mappings are passed to the model.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(id2label),
    label2id=label2id,
    id2label=id2label,
)

# Metrics hook for the Trainer: accuracy plus weighted precision, recall, F1
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    # zero_division=0 keeps the score of an undefined class at 0 (the value
    # the default "warn" setting also uses) without emitting an
    # UndefinedMetricWarning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the highest weighted F1 when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    logging_dir="./logs",
    load_best_model_at_end=True,
    report_to="none",
    metric_for_best_model="f1",
    greater_is_better=True,
)

# NOTE(review): eval_dataset is the test split, so the best-checkpoint choice
# configured above is made on test data; a separate validation split would
# keep the reported test metrics unbiased.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated in Trainer; `processing_class` replaces it.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/9141 [00:00<?, ? examples/s]

Map:   0%|          | 0/6150 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.2774,2.035584,0.46748,0.481667,0.46748,0.450845
2,1.5316,1.920889,0.500813,0.494763,0.500813,0.482427


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return forward_call(*args, **kwargs)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=1144, training_loss=1.8417393210884574, metrics={'train_runtime': 1934.3991, 'train_samples_per_second': 9.451, 'train_steps_per_second': 0.591, 'total_flos': 4811016901515264.0, 'train_loss': 1.8417393210884574, 'epoch': 2.0})

In [None]:
# Predict on the test set with the trained model
import numpy as np
from sklearn.metrics import classification_report
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

# Map label ids back to subject names
y_pred_labels = [id2label[i] for i in y_pred]
y_true_labels = [id2label[i] for i in y_true]

# Per-subject precision / recall / F1. zero_division=0 reports undefined
# scores (e.g. a subject with no true samples) as 0 without raising an
# UndefinedMetricWarning for each of them.
report = classification_report(
    y_true_labels, y_pred_labels, output_dict=False, zero_division=0
)
print("📊 Classification Report:\n")
print(report)

  return forward_call(*args, **kwargs)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


📊 Classification Report:

                              precision    recall  f1-score   support

                 Anaesthesia       0.42      0.22      0.29        59
                     Anatomy       0.45      0.71      0.55       259
                Biochemistry       0.63      0.71      0.66       352
                      Dental       0.85      0.56      0.67      1203
                         ENT       0.40      0.44      0.42        86
           Forensic Medicine       0.48      0.56      0.52       132
    Gynaecology & Obstetrics       0.74      0.58      0.65       532
                    Medicine       0.31      0.47      0.38       372
                Microbiology       0.34      0.59      0.43       167
               Ophthalmology       0.53      0.59      0.56       177
                Orthopaedics       0.00      0.00      0.00         0
                   Pathology       0.26      0.37      0.30       305
                  Pediatrics       0.49      0.45      0.47    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Write the fine-tuned model, then its tokenizer, to one Colab directory.
save_path = "/content/saved_First_Bert_model"
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(save_path)

print(f"✅ Model and tokenizer saved to: {save_path}")

✅ Model and tokenizer saved to: /content/saved_First_Bert_model


# BERT Model with 10% Sample

In [None]:
# Read the MedMCQA train and test parquet shards from the Hugging Face Hub.
splits = dict(
    train='data/train-00000-of-00001.parquet',
    test='data/test-00000-of-00001.parquet',
)
train_df, test_df = (
    pd.read_parquet(f"hf://datasets/openlifescienceai/medmcqa/{splits[name]}")
    for name in ("train", "test")
)

# Keep a seeded 10% sample of the training rows; every test row is kept
# (frac=1), only shuffled with the same seed.
train_df = train_df.sample(frac=0.1, random_state=42).copy()
test_df = test_df.sample(frac=1, random_state=42).copy()

# Integer-encode the subject names. The encoder is fitted on the sampled
# training rows only and then applied to the test rows.
label_encoder = LabelEncoder().fit(train_df["subject_name"])
train_df["label"] = label_encoder.transform(train_df["subject_name"])
test_df["label"] = label_encoder.transform(test_df["subject_name"])
id2label = dict(enumerate(label_encoder.classes_))
label2id = {subject: idx for idx, subject in id2label.items()}

# Wrap both frames as Hugging Face datasets.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

# Tokenizer that matches the bert-base-uncased checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def tokenize(batch):
    """Tokenize the ``question`` entries of a batch for ``Dataset.map``.

    Sequences are truncated to the tokenizer's maximum length but are not
    padded here. The Trainer in this cell is given the tokenizer, so its
    default collator pads each batch to its longest sequence, instead of
    every example being padded to the maximum length up front.

    Args:
        batch: Mapping whose ``"question"`` entry holds the question strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch["question"], truncation=True)
# Run the tokenizer over both splits in batches.
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Strip every column that is neither a tokenizer model input nor "label",
# then have each dataset return torch tensors.
keep_cols = [*tokenizer.model_input_names, "label"]
train_dataset = train_dataset.remove_columns(
    [name for name in train_dataset.column_names if name not in keep_cols]
)
train_dataset.set_format("torch")
test_dataset = test_dataset.remove_columns(
    [name for name in test_dataset.column_names if name not in keep_cols]
)
test_dataset.set_format("torch")

# bert-base-uncased with a sequence-classification head sized to the label
# set; both id/label mappings are passed to the model.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(id2label),
    label2id=label2id,
    id2label=id2label,
)

# Metrics hook for the Trainer: accuracy plus weighted precision, recall, F1
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    # zero_division=0 keeps the score of an undefined class at 0 (the value
    # the default "warn" setting also uses) without emitting an
    # UndefinedMetricWarning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the highest weighted F1 when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    logging_dir="./logs",
    load_best_model_at_end=True,
    report_to="none",
    metric_for_best_model="f1",
    greater_is_better=True,
)

# NOTE(review): eval_dataset is the test split, so the best-checkpoint choice
# configured above is made on test data; a separate validation split would
# keep the reported test metrics unbiased.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated in Trainer; `processing_class` replaces it.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# Predict on the test set with the trained model
import numpy as np
from sklearn.metrics import classification_report
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

# Map label ids back to subject names
y_pred_labels = [id2label[i] for i in y_pred]
y_true_labels = [id2label[i] for i in y_true]

# Per-subject precision / recall / F1. zero_division=0 reports undefined
# scores (e.g. a subject with no true samples) as 0 without raising an
# UndefinedMetricWarning for each of them.
report = classification_report(
    y_true_labels, y_pred_labels, output_dict=False, zero_division=0
)
print("📊 Classification Report:\n")
print(report)


Map:   0%|          | 0/18282 [00:00<?, ? examples/s]

Map:   0%|          | 0/6150 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.8067,1.774357,0.516585,0.489524,0.516585,0.494835
2,1.2709,1.776543,0.533659,0.52494,0.533659,0.515925


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return forward_call(*args, **kwargs)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return forward_call(*args, **kwargs)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


📊 Classification Report:

                              precision    recall  f1-score   support

                 Anaesthesia       0.35      0.32      0.33        59
                     Anatomy       0.52      0.72      0.60       259
                Biochemistry       0.69      0.73      0.71       352
                      Dental       0.87      0.61      0.72      1203
                         ENT       0.45      0.48      0.46        86
           Forensic Medicine       0.51      0.64      0.57       132
    Gynaecology & Obstetrics       0.75      0.65      0.70       532
                    Medicine       0.37      0.43      0.40       372
                Microbiology       0.36      0.61      0.45       167
               Ophthalmology       0.57      0.68      0.62       177
                Orthopaedics       0.00      0.00      0.00         0
                   Pathology       0.27      0.40      0.32       305
                  Pediatrics       0.53      0.54      0.54    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# BERT Model with 50% Sample

### In the logistic model section, the confusion matrix showed that some subjects are frequently confused with each other and can be grouped

In [None]:
# Read the MedMCQA train and test parquet shards from the Hugging Face Hub.
splits = dict(
    train='data/train-00000-of-00001.parquet',
    test='data/test-00000-of-00001.parquet',
)
train_df, test_df = (
    pd.read_parquet(f"hf://datasets/openlifescienceai/medmcqa/{splits[name]}")
    for name in ("train", "test")
)

# Keep a seeded 50% sample of the training rows; every test row is kept
# (frac=1), only shuffled with the same seed.
train_df = train_df.sample(frac=0.5, random_state=42).copy()
test_df = test_df.sample(frac=1, random_state=42).copy()

# Integer-encode the subject names. The encoder is fitted on the sampled
# training rows only and then applied to the test rows.
label_encoder = LabelEncoder().fit(train_df["subject_name"])
train_df["label"] = label_encoder.transform(train_df["subject_name"])
test_df["label"] = label_encoder.transform(test_df["subject_name"])
id2label = dict(enumerate(label_encoder.classes_))
label2id = {subject: idx for idx, subject in id2label.items()}

# Wrap both frames as Hugging Face datasets.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

# Tokenizer that matches the bert-base-uncased checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def tokenize(batch):
    """Tokenize the ``question`` entries of a batch for ``Dataset.map``.

    Sequences are truncated to the tokenizer's maximum length but are not
    padded here. The Trainer in this cell is given the tokenizer, so its
    default collator pads each batch to its longest sequence, instead of
    every example being padded to the maximum length up front.

    Args:
        batch: Mapping whose ``"question"`` entry holds the question strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch["question"], truncation=True)
# Run the tokenizer over both splits in batches.
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Strip every column that is neither a tokenizer model input nor "label",
# then have each dataset return torch tensors.
keep_cols = [*tokenizer.model_input_names, "label"]
train_dataset = train_dataset.remove_columns(
    [name for name in train_dataset.column_names if name not in keep_cols]
)
train_dataset.set_format("torch")
test_dataset = test_dataset.remove_columns(
    [name for name in test_dataset.column_names if name not in keep_cols]
)
test_dataset.set_format("torch")

# bert-base-uncased with a sequence-classification head sized to the label
# set; both id/label mappings are passed to the model.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(id2label),
    label2id=label2id,
    id2label=id2label,
)

# Metrics hook for the Trainer: accuracy plus weighted precision, recall, F1
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    # zero_division=0 keeps the score of an undefined class at 0 (the value
    # the default "warn" setting also uses) without emitting an
    # UndefinedMetricWarning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the highest weighted F1 when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    logging_dir="./logs",
    load_best_model_at_end=True,
    report_to="none",
    metric_for_best_model="f1",
    greater_is_better=True,
)

# NOTE(review): eval_dataset is the test split, so the best-checkpoint choice
# configured above is made on test data; a separate validation split would
# keep the reported test metrics unbiased.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated in Trainer; `processing_class` replaces it.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# Predict on the test set with the trained model
import numpy as np
from sklearn.metrics import classification_report
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

# Map label ids back to subject names
y_pred_labels = [id2label[i] for i in y_pred]
y_true_labels = [id2label[i] for i in y_true]

# Per-subject precision / recall / F1. zero_division=0 reports undefined
# scores (e.g. a subject with no true samples) as 0 without raising an
# UndefinedMetricWarning for each of them.
report = classification_report(
    y_true_labels, y_pred_labels, output_dict=False, zero_division=0
)
print("📊 Classification Report:\n")
print(report)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/91411 [00:00<?, ? examples/s]

Map:   0%|          | 0/6150 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2776,1.592194,0.575122,0.566671,0.575122,0.55755
2,0.9026,1.545346,0.593008,0.57287,0.593008,0.572481


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return forward_call(*args, **kwargs)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  return forward_call(*args, **kwargs)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


📊 Classification Report:

                              precision    recall  f1-score   support

                 Anaesthesia       0.57      0.64      0.60        59
                     Anatomy       0.57      0.78      0.66       259
                Biochemistry       0.68      0.81      0.74       352
                      Dental       0.92      0.68      0.78      1203
                         ENT       0.46      0.57      0.51        86
           Forensic Medicine       0.54      0.74      0.62       132
    Gynaecology & Obstetrics       0.81      0.70      0.76       532
                    Medicine       0.50      0.50      0.50       372
                Microbiology       0.45      0.72      0.55       167
               Ophthalmology       0.72      0.77      0.75       177
                Orthopaedics       0.00      0.00      0.00         0
                   Pathology       0.36      0.43      0.39       305
                  Pediatrics       0.53      0.66      0.59    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# BERT Model with 50% Sample and Grouped Subjects

In [None]:
# Read the MedMCQA train and test parquet shards from the Hugging Face Hub.
splits = dict(
    train='data/train-00000-of-00001.parquet',
    test='data/test-00000-of-00001.parquet',
)
train_df, test_df = (
    pd.read_parquet(f"hf://datasets/openlifescienceai/medmcqa/{splits[name]}")
    for name in ("train", "test")
)

# Keep a seeded 50% sample of the training rows; every test row is kept
# (frac=1), only shuffled with the same seed.
train_df = train_df.sample(frac=0.5, random_state=42).copy()
test_df = test_df.sample(frac=1, random_state=42).copy()

# Merge selected subjects into combined groups; subjects not listed here
# keep their own name.
group_map = {
    subject: group
    for group, members in {
        "Medicine & Pathology": ("Medicine", "Pathology"),
        "Surgery & Orthopaedics": ("Orthopaedics", "Surgery"),
        "Skin & Dental": ("Skin", "Dental"),
    }.items()
    for subject in members
}
train_df["grouped_subject"] = train_df["subject_name"].replace(group_map)
test_df["grouped_subject"] = test_df["subject_name"].replace(group_map)

# Integer-encode the grouped subjects. The encoder is fitted on the sampled
# training rows only and then applied to the test rows.
label_encoder = LabelEncoder().fit(train_df["grouped_subject"])
train_df["label"] = label_encoder.transform(train_df["grouped_subject"])
test_df["label"] = label_encoder.transform(test_df["grouped_subject"])
id2label = dict(enumerate(label_encoder.classes_))
label2id = {subject: idx for idx, subject in id2label.items()}

# Wrap both frames as Hugging Face datasets.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

# Tokenizer that matches the bert-base-uncased checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def tokenize(batch):
    """Tokenize the ``question`` entries of a batch for ``Dataset.map``.

    Sequences are truncated to the tokenizer's maximum length but are not
    padded here. The Trainer in this cell is given the tokenizer, so its
    default collator pads each batch to its longest sequence, instead of
    every example being padded to the maximum length up front.

    Args:
        batch: Mapping whose ``"question"`` entry holds the question strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch["question"], truncation=True)
# Run the tokenizer over both splits in batches.
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Strip every column that is neither a tokenizer model input nor "label",
# then have each dataset return torch tensors.
keep_cols = [*tokenizer.model_input_names, "label"]
train_dataset = train_dataset.remove_columns(
    [name for name in train_dataset.column_names if name not in keep_cols]
)
train_dataset.set_format("torch")
test_dataset = test_dataset.remove_columns(
    [name for name in test_dataset.column_names if name not in keep_cols]
)
test_dataset.set_format("torch")

# bert-base-uncased with a sequence-classification head sized to the label
# set; both id/label mappings are passed to the model.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(id2label),
    label2id=label2id,
    id2label=id2label,
)

# Accuracy-only metrics hook for the Trainer
def compute_metrics(eval_pred):
    """Return the accuracy of the argmax predictions.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Dict with the single key ``accuracy``.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=-1)
    return {"accuracy": accuracy_score(gold, predicted)}

# Training configuration: evaluate and checkpoint once per epoch and reload
# the best checkpoint at the end. No metric_for_best_model is set, so the
# Trainer's default selection criterion applies.
# TODO: look into weight_decay, an explicit learning rate and early stopping.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    logging_dir="./logs",
    load_best_model_at_end=True,
    report_to="none",
)

# NOTE(review): eval_dataset is the test split, so the best-checkpoint choice
# configured above is made on test data; a separate validation split would
# keep the reported test metrics unbiased.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated in Trainer; `processing_class` replaces it.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Run training
trainer.train()

# Predict on the test set with the trained model
import numpy as np
from sklearn.metrics import classification_report
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

# Map label ids back to grouped subject names
y_pred_labels = [id2label[i] for i in y_pred]
y_true_labels = [id2label[i] for i in y_true]

# Per-subject precision / recall / F1. zero_division=0 reports any undefined
# score (a subject that is never predicted or has no true samples) as 0
# without raising an UndefinedMetricWarning.
report = classification_report(
    y_true_labels, y_pred_labels, output_dict=False, zero_division=0
)
print("📊 Classification Report:\n")
print(report)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/91411 [00:00<?, ? examples/s]

Map:   0%|          | 0/6150 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1536,1.474704,0.596585


  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.1536,1.474704,0.596585
2,0.8131,1.438585,0.622602


  return forward_call(*args, **kwargs)


📊 Classification Report:

                              precision    recall  f1-score   support

                 Anaesthesia       0.54      0.68      0.60        59
                     Anatomy       0.57      0.76      0.65       259
                Biochemistry       0.70      0.83      0.76       352
                         ENT       0.49      0.60      0.54        86
           Forensic Medicine       0.53      0.71      0.61       132
    Gynaecology & Obstetrics       0.81      0.68      0.74       532
        Medicine & Pathology       0.51      0.60      0.55       677
                Microbiology       0.47      0.69      0.56       167
               Ophthalmology       0.73      0.76      0.75       177
                  Pediatrics       0.55      0.66      0.60       190
                Pharmacology       0.53      0.81      0.64       317
                  Physiology       0.65      0.64      0.65       388
                  Psychiatry       0.06      0.67      0.11    

# 50% Sample with grouped subjects, removing unknowns and concatenating answer options

In [None]:
# 🧠 Import packages
import pandas as pd
import torch
from datasets import Dataset
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# Read the MedMCQA train and test parquet shards from the Hugging Face Hub.
splits = dict(
    train='data/train-00000-of-00001.parquet',
    test='data/test-00000-of-00001.parquet',
)
train_df, test_df = (
    pd.read_parquet(f"hf://datasets/openlifescienceai/medmcqa/{splits[name]}")
    for name in ("train", "test")
)

# Keep a seeded 50% sample of the training rows; every test row is kept
# (frac=1), only shuffled with the same seed.
train_df = train_df.sample(frac=0.5, random_state=42).copy()
test_df = test_df.sample(frac=1, random_state=42).copy()

# Merge selected subjects into combined groups; subjects not listed here
# keep their own name.
group_map = {
    subject: group
    for group, members in {
        "Medicine & Pathology": ("Medicine", "Pathology"),
        "Surgery & Orthopaedics": ("Orthopaedics", "Surgery"),
        "Skin & Dental": ("Skin", "Dental"),
    }.items()
    for subject in members
}
train_df["grouped_subject"] = train_df["subject_name"].replace(group_map)
test_df["grouped_subject"] = test_df["subject_name"].replace(group_map)

# Discard rows whose grouped subject is the literal 'Unknown'.
train_df = train_df.loc[train_df["grouped_subject"].ne("Unknown")].copy()
test_df = test_df.loc[test_df["grouped_subject"].ne("Unknown")].copy()

# Concatenate question + options + explanation into one text per row
def concat_q_with_options(df):
    """Join the question, the four options and the explanation of each row.

    Missing values in any of the six columns become empty strings and every
    value is cast to ``str``; the parts are then joined with the literal
    separator ``" [SEP] "``. The cleaning is done on a copy of the six
    columns, so the caller's DataFrame is left unmodified (the previous
    implementation overwrote those columns in place).

    Args:
        df: DataFrame with the columns ``question``, ``opa``, ``opb``,
            ``opc``, ``opd`` and ``exp``.

    Returns:
        pandas Series with one joined string per row, indexed like ``df``.
    """
    text_cols = ["question", "opa", "opb", "opc", "opd", "exp"]
    parts = df[text_cols].fillna("").astype(str)
    text = parts["question"]
    for col in text_cols[1:]:
        text = text + " [SEP] " + parts[col]
    return text

# Pool the sampled train rows and the test rows, build the model input text
# for every pooled row and collect the matching grouped subjects.
# NOTE(review): because both splits are pooled and re-split below, the
# evaluation set is no longer the original MedMCQA test split, and the text
# includes the `exp` column - keep this in mind when comparing with the
# question-only runs above.
X_all = concat_q_with_options(pd.concat([train_df, test_df], ignore_index=True))
y_all = pd.concat(
    [train_df["grouped_subject"], test_df["grouped_subject"]],
    ignore_index=True,
)

# Seeded 80/20 split, stratified on the grouped subject.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    stratify=y_all,
    test_size=0.2,
    random_state=42,
)

# Reassemble each side of the split as a two-column frame.
train_df = X_train.to_frame("text").assign(grouped_subject=y_train)
test_df = X_test.to_frame("text").assign(grouped_subject=y_test)

# Integer-encode the grouped subjects; the encoder is fitted on the training
# side and then applied to the test side.
label_encoder = LabelEncoder().fit(train_df["grouped_subject"])
train_df["label"] = label_encoder.transform(train_df["grouped_subject"])
test_df["label"] = label_encoder.transform(test_df["grouped_subject"])
id2label = dict(enumerate(label_encoder.classes_))
label2id = {subject: idx for idx, subject in id2label.items()}

# Wrap both frames as Hugging Face datasets.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

# Tokenizer that matches the bert-base-uncased checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def tokenize(batch):
    """Tokenize the ``text`` entries of a batch for ``Dataset.map``.

    Sequences are truncated to the tokenizer's maximum length but are not
    padded here. The Trainer in this cell is given the tokenizer, so its
    default collator pads each batch to its longest sequence, instead of
    every example being padded to the maximum length up front.

    Args:
        batch: Mapping whose ``"text"`` entry holds the input strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch["text"], truncation=True)
# Run the tokenizer over both splits in batches.
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Strip every column that is neither a tokenizer model input nor "label",
# then have each dataset return torch tensors.
keep_cols = [*tokenizer.model_input_names, "label"]
train_dataset = train_dataset.remove_columns(
    [name for name in train_dataset.column_names if name not in keep_cols]
)
train_dataset.set_format("torch")
test_dataset = test_dataset.remove_columns(
    [name for name in test_dataset.column_names if name not in keep_cols]
)
test_dataset.set_format("torch")

# bert-base-uncased with a sequence-classification head sized to the label
# set; both id/label mappings are passed to the model.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(id2label),
    label2id=label2id,
    id2label=id2label,
)

# Accuracy-only metrics hook for the Trainer
def compute_metrics(eval_pred):
    """Return the accuracy of the argmax predictions.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Dict with the single key ``accuracy``.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=-1)
    return {"accuracy": accuracy_score(gold, predicted)}

# Training configuration: evaluate and checkpoint once per epoch and reload
# the best checkpoint at the end. No metric_for_best_model is set, so the
# Trainer's default selection criterion applies.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    logging_dir="./logs",
    load_best_model_at_end=True,
    report_to="none",
)

# NOTE(review): eval_dataset is also the set the final report is computed on,
# so the best-checkpoint choice configured above is made on evaluation data;
# a separate validation split would keep the reported metrics unbiased.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated in Trainer; `processing_class` replaces it.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Predict on the held-out split with the trained model
import numpy as np
from sklearn.metrics import classification_report
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

# Map label ids back to grouped subject names
y_pred_labels = [id2label[i] for i in y_pred]
y_true_labels = [id2label[i] for i in y_true]

# Per-subject precision / recall / F1. zero_division=0 reports any undefined
# score (a subject that is never predicted or has no true samples) as 0
# without raising an UndefinedMetricWarning.
report = classification_report(
    y_true_labels, y_pred_labels, output_dict=False, zero_division=0
)
print("📊 Classification Report:\n")
print(report)


Map:   0%|          | 0/76275 [00:00<?, ? examples/s]

Map:   0%|          | 0/19069 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7018,0.644247,0.804657


  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7018,0.644247,0.804657
2,0.4369,0.564593,0.831664


  return forward_call(*args, **kwargs)


📊 Classification Report:

                              precision    recall  f1-score   support

                 Anaesthesia       0.83      0.82      0.82       334
                     Anatomy       0.85      0.81      0.83      1513
                Biochemistry       0.87      0.88      0.88       894
                         ENT       0.84      0.87      0.86       516
           Forensic Medicine       0.90      0.89      0.90       607
    Gynaecology & Obstetrics       0.88      0.87      0.87      1126
        Medicine & Pathology       0.79      0.80      0.80      3408
                Microbiology       0.83      0.83      0.83      1149
               Ophthalmology       0.88      0.93      0.90       734
                  Pediatrics       0.74      0.75      0.74       853
                Pharmacology       0.84      0.87      0.86      1412
                  Physiology       0.82      0.81      0.82       966
                  Psychiatry       0.89      0.85      0.87    

In [None]:
# 💾 Write the fine-tuned model and its tokenizer into one Colab directory
save_path = "/content/saved_model_with50%_Bert"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"✅ Model and tokenizer saved to: {save_path}")


✅ Model and tokenizer saved to: /content/saved_model_with50%_Bert


In [None]:
import os

# Directory the save cell wrote to
save_path = "/content/saved_model_with50%_Bert"
print("📁 Contents of saved_model folder:")
saved_files = os.listdir(save_path)
print(saved_files)


📁 Contents of saved_model folder:
['model.safetensors', 'special_tokens_map.json', 'vocab.txt', 'tokenizer.json', 'config.json', 'tokenizer_config.json']


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Reload the classifier and its tokenizer from the same saved directory
model, tokenizer = (
    loader.from_pretrained("/content/saved_model_with50%_Bert")
    for loader in (AutoModelForSequenceClassification, AutoTokenizer)
)


In [None]:
# Number of rows per grouped subject in the current training frame
train_df["grouped_subject"].value_counts()


Unnamed: 0_level_0,count
grouped_subject,Unnamed: 1_level_1
Medicine & Pathology,16362
Surgery & Orthopaedics,10001
Anatomy,7308
Pharmacology,6743
Social & Preventive Medicine,5942
Microbiology,5577
Skin & Dental,5347
Gynaecology & Obstetrics,5097
Physiology,4443
Biochemistry,4118


In [None]:
from sklearn.utils import resample

# ⚖️ Every class gets at least this many rows
min_samples = 5000

# 🔁 Upsample the classes that fall short of the floor
target_col = "grouped_subject"
dfs = []

for subject_name, subject_rows in train_df.groupby(target_col):
    if len(subject_rows) >= min_samples:
        # already at or above the floor: keep as-is
        dfs.append(subject_rows)
        continue
    # draw with replacement until the class reaches the floor
    dfs.append(
        resample(subject_rows, replace=True, n_samples=min_samples, random_state=42)
    )

# ✅ Stack the per-class frames, shuffle the rows, renumber the index
balanced_train_df = (
    pd.concat(dfs)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# 🧾 Show new class distribution
print(balanced_train_df["grouped_subject"].value_counts())


grouped_subject
Medicine & Pathology            16362
Surgery & Orthopaedics          10001
Anatomy                          7308
Pharmacology                     6743
Social & Preventive Medicine     5942
Microbiology                     5577
Skin & Dental                    5347
Gynaecology & Obstetrics         5097
Pediatrics                       5000
Psychiatry                       5000
Unknown                          5000
Physiology                       5000
Ophthalmology                    5000
ENT                              5000
Biochemistry                     5000
Anaesthesia                      5000
Forensic Medicine                5000
Radiology                        5000
Name: count, dtype: int64


In [None]:
# 🧠 Import packages
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 🔽 Load MedMCQA data from Hugging Face Hub (as DataFrame)
splits = dict(
    train="data/train-00000-of-00001.parquet",
    test="data/test-00000-of-00001.parquet",
)
medmcqa_root = "hf://datasets/openlifescienceai/medmcqa/"

# 🔥 Half of the training rows; frac=1 keeps every test row and only shuffles them
train_df = (
    pd.read_parquet(f"{medmcqa_root}{splits['train']}")
    .sample(frac=0.5, random_state=42)
    .copy()
)
test_df = (
    pd.read_parquet(f"{medmcqa_root}{splits['test']}")
    .sample(frac=1, random_state=42)
    .copy()
)

# ✅ Group similar subjects: each listed subject is renamed to its merged group
group_map = {
    member: merged_name
    for merged_name, members in {
        "Medicine & Pathology": ("Medicine", "Pathology"),
        "Surgery & Orthopaedics": ("Orthopaedics", "Surgery"),
        "Skin & Dental": ("Skin", "Dental"),
    }.items()
    for member in members
}
for frame in (train_df, test_df):
    frame["grouped_subject"] = frame["subject_name"].replace(group_map)

# 🔁 Encode grouped labels
label_encoder = LabelEncoder()
train_df["label"] = label_encoder.fit_transform(train_df["grouped_subject"])
test_df["label"] = label_encoder.transform(test_df["grouped_subject"])
label2id = {label: i for i, label in enumerate(label_encoder.classes_)}
id2label = {i: label for label, i in label2id.items()}

# ⚖️ Rebuild the oversampled frame from the train_df prepared in THIS cell.
# Training on a balanced_train_df left over from an earlier cell would ignore
# the data loaded above, and its `label` ids would come from a different
# encoder fit than the one that defines label2id / id2label here.
min_samples_per_class = 5000
balanced_train_df = (
    pd.concat(
        [
            rows
            if len(rows) >= min_samples_per_class
            # draw with replacement so small classes reach the floor
            else rows.sample(n=min_samples_per_class, replace=True, random_state=42)
            for _subject, rows in train_df.groupby("grouped_subject")
        ]
    )
    .sample(frac=1, random_state=42)  # shuffle so classes are interleaved
    .reset_index(drop=True)
)

# 📦 Convert to HuggingFace datasets
train_dataset = Dataset.from_pandas(balanced_train_df)
test_dataset = Dataset.from_pandas(test_df)

# ✂️ Tokenize questions
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def tokenize(batch):
    """Tokenize the "question" column of a batch of rows.

    Over-long sequences are truncated. No padding is applied here: padding
    every example to the model maximum makes each forward pass process mostly
    pad tokens. The Trainer built in this cell is given this tokenizer, so its
    default collator pads each batch to that batch's longest sequence instead.
    """
    return tokenizer(batch["question"], truncation=True)
# 🧹 Columns the model consumes: the tokenizer's inputs plus the label
keep_cols = [*tokenizer.model_input_names, "label"]

# Train split: tokenize in batches, drop every other column, expose torch tensors
train_dataset = train_dataset.map(tokenize, batched=True)
train_dataset = train_dataset.remove_columns(
    [col for col in train_dataset.column_names if col not in keep_cols]
)
train_dataset.set_format("torch")

# Test split: same three steps
test_dataset = test_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.remove_columns(
    [col for col in test_dataset.column_names if col not in keep_cols]
)
test_dataset.set_format("torch")

# 🤖 Load BERT with a classification head sized to the label set
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label2id),
)

# 🧮 Compute metrics: accuracy, precision, recall, f1
def compute_metrics(eval_pred):
    """Score arg-max predictions with accuracy and weighted precision/recall/F1.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class of
    each row is the index of its largest logit along the last axis.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=-1)
    weighted_p, weighted_r, weighted_f1, _support = precision_recall_fscore_support(
        gold, predicted, average="weighted"
    )
    return dict(
        accuracy=accuracy_score(gold, predicted),
        precision=weighted_p,
        recall=weighted_r,
        f1=weighted_f1,
    )

# 🏁 Set training arguments with learning rate
# Evaluation and checkpointing both happen once per epoch; the checkpoint with
# the highest `f1` returned by compute_metrics is reloaded when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=2,                # <-- Set to 2 epochs here
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
)

# 🏋️ Trainer with early stopping patience 1
# The tokenizer is passed as `processing_class`; the older `tokenizer=` keyword
# is deprecated in recent transformers releases and emits a warning.
# NOTE(review): eval_dataset is the test split, so best-checkpoint selection and
# early stopping both see the test data — consider a separate validation split
# before quoting these numbers as held-out performance.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

# 🚀 Fine-tune
trainer.train()

# 📊 Replay the logged history, one line per entry that carries an evaluation F1
print("\n📈 Metrics by Epoch:")
for log in trainer.state.log_history:
    if "eval_f1" not in log or "epoch" not in log:
        continue  # entry has no evaluation metrics to report
    print(f"Epoch {int(log['epoch'])}: "
          f"Accuracy = {log.get('eval_accuracy', 0):.4f}, "
          f"Precision = {log.get('eval_precision', 0):.4f}, "
          f"Recall = {log.get('eval_recall', 0):.4f}, "
          f"F1 = {log.get('eval_f1', 0):.4f}")


Map:   0%|          | 0/112377 [00:00<?, ? examples/s]

Map:   0%|          | 0/3075 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0759,1.458033,0.587967,0.569854,0.587967,0.567409
2,0.77,1.490104,0.605203,0.58265,0.605203,0.58437



📈 Metrics by Epoch:
Epoch 1: Accuracy = 0.5880, Precision = 0.5699, Recall = 0.5880, F1 = 0.5674
Epoch 2: Accuracy = 0.6052, Precision = 0.5827, Recall = 0.6052, F1 = 0.5844


In [None]:
# 🔮 Run the trained model over the test split
import numpy as np
from sklearn.metrics import classification_report
predictions = trainer.predict(test_dataset)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

# 🏷️ Map class ids back to subject names
y_pred_labels = list(map(id2label.__getitem__, y_pred))
y_true_labels = list(map(id2label.__getitem__, y_true))

# 📊 Per-subject precision / recall / F1 as plain text
report = classification_report(y_true_labels, y_pred_labels, output_dict=False)
print("📊 Classification Report:\n", report, sep="\n")

📊 Classification Report:

                              precision    recall  f1-score   support

                 Anaesthesia       0.42      0.62      0.50        24
                     Anatomy       0.58      0.74      0.65       122
                Biochemistry       0.72      0.82      0.77       169
                         ENT       0.42      0.64      0.50        44
           Forensic Medicine       0.51      0.70      0.59        66
    Gynaecology & Obstetrics       0.80      0.66      0.72       262
        Medicine & Pathology       0.49      0.60      0.54       319
                Microbiology       0.45      0.63      0.52        84
               Ophthalmology       0.65      0.74      0.69        88
                  Pediatrics       0.52      0.62      0.57       104
                Pharmacology       0.56      0.80      0.66       171
                  Physiology       0.68      0.67      0.67       211
                  Psychiatry       0.03      0.33      0.05    

In [None]:
# 💾 Write the fine-tuned model and its tokenizer into one Colab directory
save_path = "/content/saved_model_with50%_Bert_oversampled"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"✅ Model and tokenizer saved to: {save_path}")

✅ Model and tokenizer saved to: /content/saved_model_with50%_Bert_oversampled


In [None]:
import os

# Directory the save cell wrote to
save_path = "/content/saved_model_with50%_Bert_oversampled"
print("📁 Contents of saved_model folder:")
saved_files = os.listdir(save_path)
print(saved_files)

📁 Contents of saved_model folder:
['special_tokens_map.json', 'config.json', 'model.safetensors', 'tokenizer_config.json', 'tokenizer.json', 'vocab.txt']


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Reload the classifier and its tokenizer from the same saved directory
model, tokenizer = (
    loader.from_pretrained("/content/saved_model_with50%_Bert_oversampled")
    for loader in (AutoModelForSequenceClassification, AutoTokenizer)
)

## Re-running BERT after removing the `Unknown` subject class

In [None]:
# 🧠 Import packages
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 🔽 Load MedMCQA data from Hugging Face Hub (as DataFrame)
splits = dict(
    train="data/train-00000-of-00001.parquet",
    test="data/test-00000-of-00001.parquet",
)
medmcqa_root = "hf://datasets/openlifescienceai/medmcqa/"

# 🔥 Keep a reproducible random half of each split for faster training
train_df = (
    pd.read_parquet(f"{medmcqa_root}{splits['train']}")
    .sample(frac=0.5, random_state=42)
    .copy()
)
test_df = (
    pd.read_parquet(f"{medmcqa_root}{splits['test']}")
    .sample(frac=0.5, random_state=42)
    .copy()
)

# ✅ Group similar subjects: each listed subject is renamed to its merged group
group_map = {
    member: merged_name
    for merged_name, members in {
        "Medicine & Pathology": ("Medicine", "Pathology"),
        "Surgery & Orthopaedics": ("Orthopaedics", "Surgery"),
        "Skin & Dental": ("Skin", "Dental"),
    }.items()
    for member in members
}
for frame in (train_df, test_df):
    frame["grouped_subject"] = frame["subject_name"].replace(group_map)

# ❌ Drop rows whose grouped subject is 'Unknown'
train_df = train_df.loc[train_df["grouped_subject"].ne("Unknown")].copy()
test_df = test_df.loc[test_df["grouped_subject"].ne("Unknown")].copy()

# 🔁 Fit the encoder on the training subjects, then apply the same mapping to the test split
label_encoder = LabelEncoder()
train_df["label"] = label_encoder.fit_transform(train_df["grouped_subject"])
test_df["label"] = label_encoder.transform(test_df["grouped_subject"])
# Both lookup tables follow the encoder's fitted class order
id2label = dict(enumerate(label_encoder.classes_))
label2id = {subject: class_id for class_id, subject in id2label.items()}

# 📦 Wrap each pandas frame in a HuggingFace Dataset
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

# ✂️ Tokenizer matching the checkpoint that is fine-tuned below
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def tokenize(batch):
    """Tokenize the "question" column of a batch of rows.

    Over-long sequences are truncated. No padding is applied here: padding
    every example to the model maximum makes each forward pass process mostly
    pad tokens. The Trainer built in this cell is given this tokenizer, so its
    default collator pads each batch to that batch's longest sequence instead.
    """
    return tokenizer(batch["question"], truncation=True)
# 🧹 Columns the model consumes: the tokenizer's inputs plus the label
keep_cols = [*tokenizer.model_input_names, "label"]

# Train split: tokenize in batches, drop every other column, expose torch tensors
train_dataset = train_dataset.map(tokenize, batched=True)
train_dataset = train_dataset.remove_columns(
    [col for col in train_dataset.column_names if col not in keep_cols]
)
train_dataset.set_format("torch")

# Test split: same three steps
test_dataset = test_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.remove_columns(
    [col for col in test_dataset.column_names if col not in keep_cols]
)
test_dataset.set_format("torch")

# 🤖 Load BERT with a classification head sized to the label set
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label2id),
)

# 🧮 Compute metrics: accuracy, precision, recall, f1
def compute_metrics(eval_pred):
    """Score arg-max predictions with accuracy and weighted precision/recall/F1.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class of
    each row is the index of its largest logit along the last axis.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=-1)
    weighted_p, weighted_r, weighted_f1, _support = precision_recall_fscore_support(
        gold, predicted, average="weighted"
    )
    return dict(
        accuracy=accuracy_score(gold, predicted),
        precision=weighted_p,
        recall=weighted_r,
        f1=weighted_f1,
    )

# 🏁 Set training arguments with learning rate
# Evaluation and checkpointing both happen once per epoch; the checkpoint with
# the highest `f1` returned by compute_metrics is reloaded when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
)

# 🏋️ Trainer with early stopping patience 1
# The tokenizer is passed as `processing_class`; the older `tokenizer=` keyword
# is deprecated in recent transformers releases and emits a warning.
# NOTE(review): eval_dataset is the test split, so best-checkpoint selection and
# early stopping both see the test data — consider a separate validation split
# before quoting these numbers as held-out performance.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

# 🚀 Fine-tune
trainer.train()

# 📊 Replay the logged history, one line per entry that carries an evaluation F1
print("\n📈 Metrics by Epoch:")
for log in trainer.state.log_history:
    if "eval_f1" not in log or "epoch" not in log:
        continue  # entry has no evaluation metrics to report
    print(f"Epoch {int(log['epoch'])}: "
          f"Accuracy = {log.get('eval_accuracy', 0):.4f}, "
          f"Precision = {log.get('eval_precision', 0):.4f}, "
          f"Recall = {log.get('eval_recall', 0):.4f}, "
          f"F1 = {log.get('eval_f1', 0):.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/89876 [00:00<?, ? examples/s]

Map:   0%|          | 0/2730 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1292,1.123923,0.666667,0.708326,0.666667,0.675493


In [None]:
# 🧠 Import packages
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 🔽 Load MedMCQA data from Hugging Face Hub (as DataFrame)
splits = dict(
    train="data/train-00000-of-00001.parquet",
    test="data/test-00000-of-00001.parquet",
)
medmcqa_root = "hf://datasets/openlifescienceai/medmcqa/"

# 🔥 Keep a reproducible random half of each split for faster training
train_df = (
    pd.read_parquet(f"{medmcqa_root}{splits['train']}")
    .sample(frac=0.5, random_state=42)
    .copy()
)
test_df = (
    pd.read_parquet(f"{medmcqa_root}{splits['test']}")
    .sample(frac=0.5, random_state=42)
    .copy()
)

# ✅ Group similar subjects: each listed subject is renamed to its merged group
group_map = {
    member: merged_name
    for merged_name, members in {
        "Medicine & Pathology": ("Medicine", "Pathology"),
        "Surgery & Orthopaedics": ("Orthopaedics", "Surgery"),
        "Skin & Dental": ("Skin", "Dental"),
    }.items()
    for member in members
}
for frame in (train_df, test_df):
    frame["grouped_subject"] = frame["subject_name"].replace(group_map)

# ❌ Drop rows whose grouped subject is 'Unknown'
train_df = train_df.loc[train_df["grouped_subject"].ne("Unknown")].copy()
test_df = test_df.loc[test_df["grouped_subject"].ne("Unknown")].copy()

# 🔁 Fit the encoder on the training subjects, then apply the same mapping to the test split
label_encoder = LabelEncoder()
train_df["label"] = label_encoder.fit_transform(train_df["grouped_subject"])
test_df["label"] = label_encoder.transform(test_df["grouped_subject"])
# Both lookup tables follow the encoder's fitted class order
id2label = dict(enumerate(label_encoder.classes_))
label2id = {subject: class_id for class_id, subject in id2label.items()}

# 📦 Wrap each pandas frame in a HuggingFace Dataset
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

# ✂️ Tokenizer matching the checkpoint that is fine-tuned below
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def tokenize(batch):
    """Tokenize the "question" column of a batch of rows.

    Over-long sequences are truncated. No padding is applied here: padding
    every example to the model maximum makes each forward pass process mostly
    pad tokens. The Trainer built in this cell is given this tokenizer, so its
    default collator pads each batch to that batch's longest sequence instead.
    """
    return tokenizer(batch["question"], truncation=True)
# 🧹 Columns the model consumes: the tokenizer's inputs plus the label
keep_cols = [*tokenizer.model_input_names, "label"]

# Train split: tokenize in batches, drop every other column, expose torch tensors
train_dataset = train_dataset.map(tokenize, batched=True)
train_dataset = train_dataset.remove_columns(
    [col for col in train_dataset.column_names if col not in keep_cols]
)
train_dataset.set_format("torch")

# Test split: same three steps
test_dataset = test_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.remove_columns(
    [col for col in test_dataset.column_names if col not in keep_cols]
)
test_dataset.set_format("torch")

# 🤖 Load BERT with a classification head sized to the label set
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label2id),
)

# 🧮 Compute metrics: accuracy, precision, recall, f1
def compute_metrics(eval_pred):
    """Score arg-max predictions with accuracy and weighted precision/recall/F1.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class of
    each row is the index of its largest logit along the last axis.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=-1)
    weighted_p, weighted_r, weighted_f1, _support = precision_recall_fscore_support(
        gold, predicted, average="weighted"
    )
    return dict(
        accuracy=accuracy_score(gold, predicted),
        precision=weighted_p,
        recall=weighted_r,
        f1=weighted_f1,
    )

# 🏁 Set training arguments with learning rate
# Evaluation and checkpointing both happen once per epoch; the checkpoint with
# the highest `f1` returned by compute_metrics is reloaded when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to="none",
)

# 🏋️ Trainer with early stopping patience 1
# The tokenizer is passed as `processing_class`; the older `tokenizer=` keyword
# is deprecated in recent transformers releases and emits a warning.
# NOTE(review): eval_dataset is the test split, so best-checkpoint selection and
# early stopping both see the test data — consider a separate validation split
# before quoting these numbers as held-out performance.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

# 🚀 Fine-tune
trainer.train()

# 📊 Replay the logged history, one line per entry that carries an evaluation F1
print("\n📈 Metrics by Epoch:")
for log in trainer.state.log_history:
    if "eval_f1" not in log or "epoch" not in log:
        continue  # entry has no evaluation metrics to report
    print(f"Epoch {int(log['epoch'])}: "
          f"Accuracy = {log.get('eval_accuracy', 0):.4f}, "
          f"Precision = {log.get('eval_precision', 0):.4f}, "
          f"Recall = {log.get('eval_recall', 0):.4f}, "
          f"F1 = {log.get('eval_f1', 0):.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/89876 [00:00<?, ? examples/s]

Map:   0%|          | 0/2730 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
