In [None]:
!pip install transformers



In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import random
import json
import time
import datetime
import os

In [None]:
import torch
# Seed torch's global random number generator with a fixed value so that torch
# operations drawing from it start from the same state on every fresh run.
torch.manual_seed(64)
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler

In [None]:
# Print the installed torch package metadata (version, location, dependencies).
!pip show torch

Name: torch
Version: 2.5.1+cu121
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3-Clause
Location: /usr/local/lib/python3.11/dist-packages
Requires: filelock, fsspec, jinja2, networkx, nvidia-cublas-cu12, nvidia-cuda-cupti-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-runtime-cu12, nvidia-cudnn-cu12, nvidia-cufft-cu12, nvidia-curand-cu12, nvidia-cusolver-cu12, nvidia-cusparse-cu12, nvidia-nccl-cu12, nvidia-nvtx-cu12, sympy, triton, typing-extensions
Required-by: accelerate, fastai, peft, sentence-transformers, timm, torchaudio, torchvision


In [None]:
import requests

# Direct-download link for the poem corpus hosted on Google Drive.
url = "https://drive.google.com/uc?id=11jaUwVcO78NT-NurlYPldyaOvNEwo8iV"

# Fetch the file; the timeout keeps the cell from hanging on a stalled connection.
response = requests.get(url, timeout=60)

# Fail loudly on an HTTP error instead of saving the error body as data.json.
response.raise_for_status()

# An HTML body means the server answered with a web page (for example a
# sign-in or confirmation screen) rather than the JSON file itself.
if "text/html" in response.headers.get("Content-Type", ""):
    raise RuntimeError(f"Expected a JSON download from {url}, but received an HTML page")

with open("data.json", "wb") as file:
    file.write(response.content)

# Only reached once the response was validated and written to disk.
print("File downloaded successfully!")


File downloaded successfully!


In [None]:
# Parse the downloaded corpus. The encoding is stated explicitly so the read
# does not depend on the platform's default locale.
with open("data.json", "r", encoding="utf-8") as f:
  data = json.load(f)
print(len(data))
# Preview the first five records (each one carries its text under the 'poem' key).
data[:5]

2852


[{'poem': "The line-storm clouds fly tattered and swift,\nThe road is forlorn all day,\nWhere a myriad snowy quartz stones lift,\nAnd the hoof-prints vanish away.\nThe roadside flowers, too wet for the bee,\nExpend their bloom in vain.\nCome over the hills and far with me,\nAnd be my love in the rain.\n\nThe birds have less to say for themselves\nIn the wood-world's torn despair\nThan now these numberless years the elves,\nAlthough they are no less there\nAll song of the woods is crushed like some\nWild, easily shattered rose.\nCome, be my love in the wet woods come,\nWhere the boughs rain when it blows.\n\nThere is the gale to urge behind\nAnd bruit our singing down,\nAnd the shallow waters aflutter with wind\nFrom which to gather your gown.\nWhat matter if we go clear to the west,\nAnd come not through dry-shod?\nFor wilding brooch shall wet your breast\nThe rain-fresh goldenrod.\n\nOh, never this whelming east wind swells\nBut it seems like the sea's return\nTo the ancient lands whe

In [None]:
# Report the GPU model, driver/CUDA versions and current memory use of this runtime.
!nvidia-smi

Sat Jan 25 14:39:00 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   56C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Use the first CUDA GPU when one is visible to torch; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [None]:
class PoemDataset(Dataset):
  """Poems tokenised up front and served as ``(input_ids, attention_mask)`` pairs.

  Each record's ``"poem"`` text is wrapped in the literal ``<BOS>`` / ``<EOS>``
  markers and handed to ``tokenizer`` with truncation enabled and padding set
  to ``"max_length"``. All encoding happens once, inside the constructor.

  Args:
    poems: iterable of mappings that each provide a ``"poem"`` entry.
    tokenizer: callable invoked with the wrapped text plus ``truncation``,
      ``max_length``, ``padding`` and ``return_tensors`` keyword arguments; its
      result must expose ``"input_ids"`` and ``"attention_mask"`` tensors.
    max_length: length forwarded to the tokenizer for truncation and padding.
    gpt2_type: accepted for interface compatibility; not used by this class.
  """

  def __init__(self, poems, tokenizer, max_length=768, gpt2_type="gpt2"):
    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []

    for record in poems:
      wrapped_text = "<BOS>" + record["poem"] + "<EOS>"
      encoded = tokenizer(
          wrapped_text,
          truncation=True,
          max_length=max_length,
          padding="max_length",
          return_tensors="pt",
      )

      # squeeze(0) removes the leading axis when it has size one, which is what
      # a single encoded text is expected to produce.
      self.input_ids.append(encoded["input_ids"].squeeze(0))
      self.attn_masks.append(encoded["attention_mask"].squeeze(0))

  def __len__(self):
    """Number of encoded poems."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the ``(input_ids, attention_mask)`` pair stored at position ``idx``."""
    ids = self.input_ids[idx]
    mask = self.attn_masks[idx]
    return ids, mask


# GPT-2 tokenizer with <BOS>, <EOS> and <PAD> registered as its special tokens.
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2',
    bos_token='<BOS>',
    eos_token='<EOS>',
    pad_token='<PAD>',
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
# Sequence length every poem is truncated/padded to, and the batch size used
# by both loaders.
max_length = 256
batch_size = 16
dataset = PoemDataset(data, tokenizer, max_length=max_length)

# 85% of the poems for training, the remainder for validation.
train_size = int(0.85*len(dataset))
val_size = len(dataset) - train_size

# A dedicated, freshly seeded generator makes the split identical every time
# this cell runs. Without it the split depends on how much of the global RNG
# has been consumed, so re-running the cell would move poems between the
# training and validation sets.
split_rng = torch.Generator().manual_seed(64)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=split_rng)

# Training batches are drawn in random order.
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)

# Validation batches are read in a fixed order.
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

In [None]:
# Sanity check on the first encoded poem: show its ids, its mask, and the text
# the ids decode back to (special tokens kept visible).
input_ids, attn_mask = dataset[0]
decoded_text = tokenizer.decode(input_ids.tolist(), skip_special_tokens=False)
for label, value in (
    ("Token IDs:", input_ids),
    ("Attention Mask:", attn_mask),
    ("Decoded Text:", decoded_text),
):
    print(label, value)

Token IDs: tensor([50257,   464,  1627,    12, 12135, 15114,  6129,   256, 10228,   290,
        14622,    11,   198,   464,  2975,   318,   329,    75,  1211,   477,
         1110,    11,   198,  8496,   257, 24862, 46742, 47969, 14966, 10303,
           11,   198,  1870,   262,  8169,  1659,    12, 17190, 39572,  1497,
           13,   198,   464, 39479, 12734,    11,  1165,  9583,   329,   262,
        20697,    11,   198, 16870,   437,   511, 29955,   287, 23469,    13,
          198, 16773,   625,   262, 18639,   290,  1290,   351,   502,    11,
          198,  1870,   307,   616,  1842,   287,   262,  6290,    13,   198,
          198,   464, 10087,   423,  1342,   284,   910,   329,  2405,   198,
          818,   262,  4898,    12,  6894,   338, 12445, 20234,   198,   817,
          272,   783,   777,  1271,  1203,   812,   262, 31959,    11,   198,
         7003,   484,   389,   645,  1342,   612,   198,  3237,  3496,   286,
          262, 16479,   318, 18577,   588,   617,   1

In [None]:
# Same device choice as earlier in the notebook, re-evaluated right before the
# model is built: first CUDA GPU if torch can see one, CPU otherwise.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device


device(type='cuda', index=0)

In [None]:
# Load the pretrained GPT-2 configuration and weights.
config = GPT2Config.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)

# The tokenizer carries extra special tokens, so grow the embedding matrix to
# the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# Point the model at the tokenizer's special tokens. Otherwise the model keeps
# the ids shipped with the pretrained checkpoint, so generate() would not treat
# the tokenizer's <EOS> as a stop token and would pick its own padding id.
for cfg in (model.config, model.generation_config):
    cfg.bos_token_id = tokenizer.bos_token_id
    cfg.eos_token_id = tokenizer.eos_token_id
    cfg.pad_token_id = tokenizer.pad_token_id

model = model.to(device)

# Training hyperparameters.
epochs = 4
warmup_steps = 100   # number of scheduler steps spent ramping the learning rate up
sample_every = 100   # intended interval (in batches) between text samples

# torch.optim.AdamW is used in place of the deprecated transformers.AdamW;
# weight_decay=0.0 keeps the zero-decay default that class had.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, eps=1e-8, weight_decay=0.0)


# Total training steps is the number of batches per epoch times the number of epochs
total_training_steps = len(train_dataloader)*epochs


# Linear warmup followed by linear decay of the learning rate.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_training_steps)


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [None]:
# Helper that turns a duration into a readable clock string
def format_time(elapsed):
    """Format ``elapsed`` seconds as the string form of a ``datetime.timedelta``.

    The value is rounded to the nearest whole second first, so 174.4 becomes
    ``"0:02:54"``.
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

# Training loop
# `epochs` and `sample_every` are the values defined together with the
# optimizer and scheduler, so the number of optimizer steps taken here matches
# the step budget the scheduler was built with.
for epoch_i in range(epochs):
    print(f"Beginning epoch {epoch_i + 1} of {epochs}")
    t0 = time.time()
    total_train_loss = 0
    model.train()  # Set model to training mode

    # Training step
    for step, batch in enumerate(train_dataloader):
        b_input_ids, b_masks = [x.to(device) for x in batch]  # Move to the target device

        # Padding positions (attention mask 0) get label -100 so that they are
        # left out of the language-modelling loss instead of being learned.
        b_labels = b_input_ids.masked_fill(b_masks == 0, -100)

        # Zero out gradients
        model.zero_grad()

        # Forward pass
        outputs = model(b_input_ids, labels=b_labels, attention_mask=b_masks)
        loss = outputs.loss
        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Backward pass
        loss.backward()

        # Optimize, then advance the learning-rate schedule by one step
        optimizer.step()
        scheduler.step()

        # Periodic logging and sampling (never on the very first batch)
        if step % sample_every == 0 and step != 0:
            elapsed = format_time(time.time() - t0)
            print(f"Batch {step} of {len(train_dataloader)}. Loss: {batch_loss}. Time: {elapsed}")

            # Sample text generation
            model.eval()
            with torch.no_grad():  # Disable gradient calculation during generation
                # Start from a lone <BOS> token. The training batch itself is
                # padded to a length beyond max_length, so it cannot serve as
                # the generation prompt.
                sample_prompt = torch.tensor([[tokenizer.bos_token_id]], device=device)
                sample_output = model.generate(
                    sample_prompt,
                    attention_mask=torch.ones_like(sample_prompt),
                    do_sample=True,
                    max_length=100,
                    top_p=0.95,
                    top_k=50,
                    num_return_sequences=1,
                    eos_token_id=tokenizer.eos_token_id,
                    pad_token_id=tokenizer.pad_token_id
                )
                print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
            model.train()

        # Drop references to per-batch tensors before the next batch is loaded
        del b_input_ids, b_masks, b_labels, outputs, loss

    # Average loss per epoch
    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)
    print(f"Average Training Loss: {avg_train_loss}. Epoch time: {training_time}")

# Persist the fine-tuned weights and the tokenizer side by side in one directory.
# NOTE(review): no Google Drive mount call is visible in this notebook; confirm
# Drive is mounted before relying on this path for persistent storage.
output_dir = "/content/drive/MyDrive/poem_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")


Beginning epoch 1 of 4
Average Training Loss: 3.9166069454268406. Epoch time: 0:02:54
Beginning epoch 2 of 4
Average Training Loss: 2.8244083774717232. Epoch time: 0:02:55
Beginning epoch 3 of 4
Average Training Loss: 2.5295165902689885. Epoch time: 0:02:55
Beginning epoch 4 of 4
Average Training Loss: 2.279672374850825. Epoch time: 0:02:55
Model and tokenizer saved to /content/drive/MyDrive/poem_model


In [None]:
# Validation loop
model.eval()
total_eval_loss = 0
for batch in val_dataloader:
    b_input_ids, b_masks = [x.to(device) for x in batch]

    # Padding positions (attention mask 0) get label -100 so the reported loss
    # covers real poem tokens only.
    b_labels = b_input_ids.masked_fill(b_masks == 0, -100)

    with torch.no_grad():  # No gradient computation
        outputs = model(b_input_ids, labels=b_labels, attention_mask=b_masks)
        loss = outputs.loss

    total_eval_loss += loss.item()

# Mean of the per-batch losses
avg_val_loss = total_eval_loss / len(val_dataloader)
print(f"Validation Loss: {avg_val_loss}")


Validation Loss: 3.026543105090106


In [None]:
# Drop the notebook's references to the trained model and tokenizer, then ask
# torch to clear its CUDA cache.
del model, tokenizer
torch.cuda.empty_cache()


In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Restore the tokenizer and the fine-tuned weights from the directory written
# after training, placing the model on the selected device straight away.
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)
model = GPT2LMHeadModel.from_pretrained(output_dir).to(device)

# Switch to evaluation mode; as the cell's last expression this also displays
# the module structure.
model.eval()


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50260, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50260, bias=False)
)

In [None]:
def generate_poem(prompt, max_length=200, temperature=1.0, top_p=0.95, top_k=50):
    """Sample one continuation of ``prompt`` from the notebook's ``model``.

    Args:
        prompt: text to continue; it is tokenised with the notebook's ``tokenizer``.
        max_length: value forwarded to ``generate`` as ``max_length``.
        temperature: sampling temperature forwarded to ``generate``.
        top_p: nucleus-sampling threshold forwarded to ``generate``.
        top_k: top-k cutoff forwarded to ``generate``.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        removed.
    """
    # Tokenize the prompt; calling the tokenizer (rather than encode) also
    # yields the attention mask that generate() asks for.
    encoded = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate the poem. The tokenizer's own <EOS>/<PAD> ids are passed
    # explicitly so sampling stops at the end-of-poem marker the model was
    # trained on, instead of relying on ids stored in the model config.
    outputs = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        do_sample=True,
        max_length=max_length,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id
    )

    # Decode and return the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text


In [None]:
# Read the opening words of the poem from the user.
prompt = input().strip()

# Training texts were built as "<BOS>" + poem with no space in between, so the
# prompt is attached to the marker the same way.
final_prompt = f"<BOS>{prompt}"
generated_poem = generate_poem(final_prompt)
print(generated_poem)


The sound of a broken heart


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 The sound of a broken heart
Where the soul has fled
When the soul is broken by sin

Or death by love
With sorrow, or sin,

I cannot hear
I cannot hear the voice of God

The voice of God sings to me
O have you stolen my soul

But I do not know you nor hear
I do not see
The power, the wealth of your love
Built my soul

Where the heart of youth
Is broken in faith, or love

Is broken in love, or death in hate
When my heart is broken by hate

Who can lift my hand in thanksgiving
Who can remember my love?
