<a href="https://colab.research.google.com/github/madharapu-Reethika/2203A51141nlp-assignments/blob/main/transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
import os
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset
os.environ["WANDB_DISABLED"] = "true"


In [26]:

text_data = """Once upon a time, there was a little girl named Red Riding Hood. She loved to visit her grandmother, who lived in the woods..."""


In [27]:
class TextDataset(Dataset):
    def __init__(self, tokenizer, text, max_length=50):
        self.input_ids = []
        self.attn_masks = []

        encodings = tokenizer(text, truncation=True, padding="max_length", max_length=max_length)
        self.input_ids.append(torch.tensor(encodings['input_ids']))
        self.attn_masks.append(torch.tensor(encodings['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attn_masks[idx]
        }


In [28]:

model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)




In [29]:

dataset = TextDataset(tokenizer=tokenizer, text=text_data)


In [30]:
def compute_loss(model, inputs, return_outputs=False):
    """
    Computes the loss for the GPT-2 model.

    Args:
        model: The GPT-2 model.
        inputs: A dictionary containing the input tensors.
        return_outputs: Whether to return the model outputs along with the loss.

    Returns:
        The loss value, or a tuple of (loss, outputs) if return_outputs is True.
    """
    labels = inputs.get("input_ids").clone()
    labels[labels == tokenizer.pad_token_id] = -100
    outputs = model(**inputs, labels=labels)
    return (outputs.loss, outputs) if return_outputs else outputs.loss


In [31]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        return compute_loss(model, inputs, return_outputs)


In [32]:
def train_model(epochs):
    training_args = TrainingArguments(
        output_dir="./results",
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=1,
        save_steps=10_000,
        save_total_limit=2,
    )

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
    )
    trainer.train()


In [33]:

for epochs in [20, 60, 70]:
    print(f"\nTraining model for {epochs} epochs...")
    train_model(epochs)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



Training model for 20 epochs...


Step,Training Loss


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



Training model for 60 epochs...


Step,Training Loss


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



Training model for 70 epochs...


Step,Training Loss


In [34]:
def generate_text(seed_text, max_length=50):
    inputs = tokenizer.encode(seed_text, return_tensors="pt")
    outputs = model.generate(inputs, max_length=max_length, num_return_sequences=1)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [36]:

seed_text = "Once upon a time"
generated_text = generate_text(seed_text)
print("Generated Text:\n", generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text:
 Once upon a time, there was a little girl named Red Riding Hood. She loved to visit her grandmother, who lived in the woods... She loved to visit her grandmother, who lived in the woods... She loved to visit her grandmother, who lived


In [40]:
import optuna

In [47]:
def objective(trial):
    global inputs
    seed_text = "Once upon a time"
    inputs = tokenizer.encode(seed_text, return_tensors="pt")
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-4)
    epochs = trial.suggest_int('epochs', 10, 80)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    model.train()
    for epoch in range(epochs):

        outputs = model(input_ids=inputs["input_ids"], labels=inputs["input_ids"])
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return loss.item()