In [6]:
!pip install -q --upgrade transformers datasets peft accelerate bitsandbytes sentencepiece


In [7]:
from datasets import load_dataset

ds = load_dataset("gretelai/symptom_to_diagnosis")
print(ds.keys())
print(len(ds["train"]))
print(ds["train"][0])

import json

# Load the training split of the dataset into a list of dictionaries
file = ds["train"]

dict_keys(['train', 'test'])
853
{'output_text': 'cervical spondylosis', 'input_text': "I've been having a lot of pain in my neck and back. I've also been having trouble with my balance and coordination. I've been coughing a lot and my limbs feel weak."}


In [8]:
def reformat_data(example):
  """Reformats a dataset example into the desired JSON structure."""
  return {
      "instruction": example["input_text"],
      "output": example["output_text"]
  }

# Apply the reformatting function to the training split and remove original columns
reformatted_train_data = ds["train"].map(reformat_data, remove_columns=["input_text", "output_text"])

In [9]:
import transformers
print(transformers.__version__)

4.55.3


In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset
import numpy as np # Import numpy for np.where in the evaluation cell
import torch # Import torch

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

# Load and reformat the dataset (assuming this is done in previous cells)
# If not, you would need to add the code from those cells here as well.
# For now, assuming 'reformatted_train_data' and 'reformatted_test_data' are available.

# Tokenize the dataset
def tokenize_function(examples):
    inputs = examples["instruction"] # inputs is a single string when batched=False
    # Tokenize the input string and return tensors
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length", return_tensors="pt")

    # Tokenize labels (output string) and return tensors
    labels = tokenizer(
        text_target=examples["output"],
        max_length=8,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    # When batched=False in map, tokenizer returns BatchEncoding with tensor values (shape [1, sequence_length]).
    # We need to remove the batch dimension [1].
    input_ids = model_inputs["input_ids"].squeeze(0)
    attention_mask = model_inputs["attention_mask"].squeeze(0)
    labels_input_ids = labels["input_ids"].squeeze(0)


    # Replace pad tokens in labels with -100 so loss ignores them
    # This needs to be done on a tensor.
    labels_with_ignored_padding = torch.where(labels_input_ids == tokenizer.pad_token_id, torch.tensor(-100, dtype=torch.long), labels_input_ids)

    # Return a dictionary with tensor values (shape [sequence_length]).
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels_with_ignored_padding}


# Tokenize the datasets (assuming reformatted_train_data and reformatted_test_data are defined)
# If not defined, you need to run the cells that define them first.
# For the purpose of fixing the NameError, I will add the code to load and reformat the data as well to make this cell self-contained for demonstration.

ds = load_dataset("gretelai/symptom_to_diagnosis")

def reformat_data(example):
  """Reformats a dataset example into the desired JSON structure."""
  return {
      "instruction": example["input_text"],
      "output": example["output_text"]
  }

# Apply the reformatting function to the training split and remove original columns
reformatted_train_data = ds["train"].map(reformat_data, remove_columns=["input_text", "output_text"])

# Create a validation split using select and then apply the reformatting function
ds["validation"] = ds["test"].select(range(100))
reformatted_test_data = ds["validation"].map(reformat_data, remove_columns=["input_text", "output_text"])


# Ensure batched=False here
tokenized_train_datasets = reformatted_train_data.map(tokenize_function, batched=False)
tokenized_test_datasets = reformatted_test_data.map(tokenize_function, batched=False)


# Define the data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-lora-samsum",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=1e-5,
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    eval_strategy="steps",  # ✅ correct for 4.55.2
    eval_steps=50,
    save_steps=100,
    fp16=False,
    report_to="none",
)



# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_test_datasets, # Add an evaluation dataset if you split your data
    #train_dataset=reformatted_train_data,
    #eval_dataset=reformatted_test_data,
    #tokenizer=tokenizer,
    processing_class=tokenizer,
    data_collator=data_collator,
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [12]:
# Start training
trainer.train()

Step,Training Loss,Validation Loss
50,2.4323,2.198553
100,1.8892,1.71256
150,1.9515,1.540012
200,1.8223,1.480714


TrainOutput(global_step=214, training_loss=2.248125965350142, metrics={'train_runtime': 98.304, 'train_samples_per_second': 8.677, 'train_steps_per_second': 2.177, 'total_flos': 584098021638144.0, 'train_loss': 2.248125965350142, 'epoch': 1.0})

In [14]:
# -------------------------
# Evaluation
# -------------------------
from tqdm import tqdm # Import tqdm
import torch
from torch.utils.data import DataLoader

predictions = []
references = []

# Create a DataLoader for the test dataset subset
# Assuming `tokenized_test_datasets` is available from previous steps
# Limit to the first 100 examples as indicated in the print statement
test_dataset_subset = tokenized_test_datasets.select(range(100))
test_dataloader_subset = DataLoader(test_dataset_subset, batch_size=4, collate_fn=data_collator)


print("Starting evaluation on 100 random test examples...")
for batch in tqdm(test_dataloader_subset):
    if torch.cuda.is_available():
        batch = {k: v.to("cuda") for k, v in batch.items()}

    with torch.no_grad():
        generated_tokens = model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            max_length=20  # match labels
        )

    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

    # Replace -100 in labels to decode
    labels = batch["labels"].cpu().numpy()
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels.tolist(), skip_special_tokens=True)

    predictions.extend(decoded_preds)
    references.extend(decoded_labels)

print("Evaluation complete.")

# -------------------------
# Simple Accuracy
# -------------------------
correct_predictions = sum([1 for pred, label in zip(predictions, references) if pred.strip() == label.strip()])
accuracy = correct_predictions / len(references) if len(references) > 0 else 0
print(f"Accuracy on 100 random test examples: {accuracy:.4f}")

Starting evaluation on 100 random test examples...


  0%|          | 0/25 [00:00<?, ?it/s]


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`instruction` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
# Inspect the first few examples of the tokenized training data
for i in range(3):
    print(f"Example {i+1}:")
    example = tokenized_train_datasets[i]
    print("Input IDs:", example["input_ids"][:20]) # Print first 20 tokens for brevity
    print("Labels:", example["labels"][:20]) # Print first 20 tokens for brevity
    # Optionally, decode to see the text representation
    print("Decoded Input:", tokenizer.decode(example["input_ids"], skip_special_tokens=True))
    # Need to handle -100 in labels before decoding
    labels = example["labels"].copy()
    labels = [label if label != -100 else tokenizer.pad_token_id for label in labels]
    print("Decoded Labels:", tokenizer.decode(labels, skip_special_tokens=True))
    print("-" * 30)