In [None]:
!pip install transformers datasets nltk torch wandb
!python -m nltk.downloader punkt


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
from datasets import load_dataset, Dataset
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
)
import os

In [None]:
dataset = load_dataset("squad")

# Preprocess dataset: group questions by context
def preprocess_dataset(dataset_split):
    grouped_data = {}

    for example in dataset_split:
        context = example["context"]
        question = example["question"]

        # Group questions by context
        if context not in grouped_data:
            grouped_data[context] = []
        grouped_data[context].append(question)

    # Prepare the dataset with source_text and target_text
    processed_data = {
        "source_text": [],
        "target_text": []
    }
    for context, questions in grouped_data.items():
        processed_data["source_text"].append(f"context: {context}")
        processed_data["target_text"].append(" <sep> ".join(questions))

    return processed_data

# Process the train and validation datasets
train_processed = preprocess_dataset(dataset["train"])
validation_processed = preprocess_dataset(dataset["validation"])

# Convert processed data into Dataset objects
train_dataset = Dataset.from_dict(train_processed)
validation_dataset = Dataset.from_dict(validation_processed)

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(validation_dataset)}")
print(train_dataset[0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Training set size: 18891
Validation set size: 2067
{'source_text': 'context: Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'target_text': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? <sep> What is in front of the Notre Dame Main Building? <sep> The Basilica of the Sacred heart at Notre Dame is beside to which structur

In [None]:
# Load T5 model and tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Add the custom <sep> token to the tokenizer and resize embeddings
tokenizer.add_special_tokens({"additional_special_tokens": ["<sep>"]})
model.resize_token_embeddings(len(tokenizer))

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Embedding(32101, 512)

In [None]:
# Tokenize the dataset
def tokenize_function(example):
    model_inputs = tokenizer(
        example["source_text"],
        max_length=512,
        padding="max_length",
        truncation=True
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example["target_text"],
            max_length=256,
            padding="max_length",
            truncation=True
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(tokenize_function, batched=True)
validation_dataset = validation_dataset.map(tokenize_function, batched=True)

# Create a data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Map:   0%|          | 0/18891 [00:00<?, ? examples/s]



Map:   0%|          | 0/2067 [00:00<?, ? examples/s]

In [None]:

os.environ["WANDB_MODE"] = "disabled"


In [None]:
# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs",
)



In [None]:
# Initialize the Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.6603,0.64537
2,0.6198,0.616589
3,0.6089,0.609972


TrainOutput(global_step=7086, training_loss=0.6925475943939786, metrics={'train_runtime': 3650.7933, 'train_samples_per_second': 15.523, 'train_steps_per_second': 1.941, 'total_flos': 7670225914822656.0, 'train_loss': 0.6925475943939786, 'epoch': 3.0})

In [None]:
# Save the fine-tuned model and tokenizer
output_dir = "./fine_tuned_t5_question_generator"

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


('./fine_tuned_t5_question_generator/tokenizer_config.json',
 './fine_tuned_t5_question_generator/special_tokens_map.json',
 './fine_tuned_t5_question_generator/spiece.model',
 './fine_tuned_t5_question_generator/added_tokens.json')

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the fine-tuned model and tokenizer
model_dir = "./fine_tuned_t5_question_generator"

tokenizer = T5Tokenizer.from_pretrained(model_dir)
model = T5ForConditionalGeneration.from_pretrained(model_dir)

# Ensure the additional tokens (e.g., <sep>) are still recognized
tokenizer.add_special_tokens({"additional_special_tokens": ["<sep>"]})
model.resize_token_embeddings(len(tokenizer))


Embedding(32101, 512)

In [None]:
def generate_questions(context):
    input_ids = tokenizer.encode(f"context: {context}", return_tensors="pt", max_length=512, truncation=True)
    output_ids = model.generate(input_ids, max_length=256, num_beams=5, num_return_sequences=1)
    questions = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return questions



In [None]:
# Example context
context = """
Albert Einstein was a theoretical physicist who developed the theory of relativity, one of the two pillars of modern physics
(alongside quantum mechanics). His work is also known for its influence on the philosophy of science.
"""

# Generate questions
generated_questions = generate_questions(context)

# Print the generated questions
print("Generated Questions:")
print(generated_questions)


Generated Questions:
What is Albert Einstein's theory of relativity? What is Albert Einstein's theory of relativity known for? What is Albert Einstein's theory of relativity known for? What is Albert Einstein's theory of relativity known for? What is Albert Einstein's theory of relativity known for?


In [None]:
# Example context
context = """
The Amazon rainforest, often referred to as the "lungs of the Earth," produces 20% of the world's oxygen. It is home to millions of species
 of plants and animals, many of which are not found anywhere else on the planet. Deforestation in the Amazon has raised concerns about
 climate change and biodiversity loss.

"""

# Generate questions
generated_questions = generate_questions(context)

# Print the generated questions
print("Generated Questions:")
print(generated_questions)


In [None]:
# Test the model on a few examples
references = []
predictions = []

for example in validation_dataset:
    # Access the correct key names
    context = example["source_text"]  # Correct key is 'source_text'
    generated_questions = generate_questions(context)  # Your custom question generation function

    print("Context:", context)
    print("Generated Questions:", generated_questions)
    print("-" * 50)

    reference_questions = example["target_text"]  # Ground truth questions
    # Append the reference and prediction to lists
    references.append(reference_questions)  # Ground truth
    predictions.append(generated_questions)  # Generated questions


Context: context: Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.
Generated Questions: What was the name of the Super Bowl 50? What was the name of the Super Bowl 50? What was the name of the Super Bowl 50? What was the name of the Super Bowl 50? What was the name of the 