In [1]:
!git clone https://github.com/onkarkawade/DeepLearning_ComputerVision.git

Cloning into 'DeepLearning_ComputerVision'...
remote: Enumerating objects: 61, done.[K
remote: Counting objects: 100% (61/61), done.[K
remote: Compressing objects: 100% (52/52), done.[K
remote: Total 61 (delta 9), reused 31 (delta 4), pack-reused 0 (from 0)[K
Receiving objects: 100% (61/61), 7.28 MiB | 9.58 MiB/s, done.
Resolving deltas: 100% (9/9), done.


In [2]:
cd /content/DeepLearning_ComputerVision/projects/Transformer

/content/DeepLearning_ComputerVision/projects/Transformer


In [3]:
import torch
from model import Transformer
from transformers import AutoTokenizer  # pip install transformers
from utils import (
    BATCH_SIZE,
    BLOCK_SIZE,
    DEVICE,
    DROPOUT,
    LEARNING_RATE,
    NUM_EMBED,
    NUM_HEAD,
    NUM_LAYER,
    MAX_ITER,
    EVAL_INTER,
    encode,
    decode,
    get_batch,
    save_model_to_chekpoint,
    estimate_loss,
    load_model_from_checkpoint
)


In [20]:
# Raw training corpus: a UTF-8 text file, path relative to the project folder.
# ("data/english.txt" was the corpus used previously.)
path_do_data = "data/Sample_project_report.txt"
# Read through a context manager so the file handle is closed as soon as the
# text is in memory instead of lingering until garbage collection.
with open(path_do_data, encoding="utf-8") as corpus_file:
    data_raw = corpus_file.read()

# Reuse the pretrained "bert-base-uncased" tokenizer rather than building a
# vocabulary from scratch. Its vocabulary size is passed to the model when it
# is constructed.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
vocab_size = tokenizer.vocab_size



In [21]:
# Tokenize the whole corpus once, then split it sequentially:
# the leading 90% is used for training, the trailing 10% for validation.
data = encode(text_seq=data_raw, tokenizer=tokenizer)
n = int(len(data) * 0.9)  # index at which the validation part starts
train_data, val_data = data[:n], data[n:]



Token indices sequence length is longer than the specified maximum sequence length for this model (28377 > 512). Running this sequence through the model will result in indexing errors


In [22]:
# Build a fresh (untrained) Transformer from the hyperparameters imported
# from utils plus the tokenizer's vocabulary size.
model_kwargs = dict(
    vocab_size=vocab_size,
    num_embed=NUM_EMBED,
    block_size=BLOCK_SIZE,
    num_heads=NUM_HEAD,
    num_layers=NUM_LAYER,
    dropout=DROPOUT,
)
model = Transformer(**model_kwargs)
# Trailing expression: show the module tree as the cell output.
model

Transformer(
  (token_embedding_table): Embedding(30522, 768)
  (position_embedding_table): Embedding(64, 768)
  (blocks): Sequential(
    (0): TransformerBlock(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-5): 6 x AttentionHead(
            (key): Linear(in_features=768, out_features=128, bias=False)
            (query): Linear(in_features=768, out_features=128, bias=False)
            (value): Linear(in_features=768, out_features=128, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): ReLU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((768,), eps=1e-05

In [23]:
# Move the model onto DEVICE (configured in utils); `m` is the handle the
# training and sampling cells use.
m = model.to(DEVICE)
# Count every parameter element and report the total in millions.
num_params = sum(p.numel() for p in m.parameters())
print("Model with {:.2f}M parameters".format(num_params / 1e6))

Model with 89.48M parameters


In [24]:
# AdamW owns the model's parameters: after each backward pass it reads their
# gradients and updates the weights, with LEARNING_RATE as the step size.
optimizer = torch.optim.AdamW(params=m.parameters(), lr=LEARNING_RATE)


In [25]:
# Training loop: MAX_ITER optimisation steps on random batches drawn from
# train_data, with a train/val loss report every EVAL_INTER steps.
for step in range(MAX_ITER):

    # Every EVAL_INTER steps (and on the very last step) estimate the loss on
    # both splits so over-/under-fitting is visible in the log.
    if step % EVAL_INTER == 0 or step == MAX_ITER - 1:
        loss_train = estimate_loss(
            data=train_data, model=m, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        loss_val = estimate_loss(
            data=val_data, model=m, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        print("step {:10} | train loss {:6.4f} | val loss {:6.4f}".format(step, loss_train, loss_val))
        # estimate_loss lives in utils and the mode it leaves the model in is
        # not visible here; re-assert training mode so dropout is active for
        # the optimisation steps that follow.
        m.train()

    # Sample a batch of inputs (xb) and targets (yb).
    xb, yb = get_batch(data=train_data, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
    # Call the module itself rather than .forward(): nn.Module.__call__ runs
    # registered hooks, which a direct forward() call silently skips.
    logits, loss = m(xb, yb)
    # Drop gradients left over from the previous step (set_to_none frees them
    # instead of filling them with zeros).
    optimizer.zero_grad(set_to_none=True)
    # Back-propagate: compute gradients of the loss w.r.t. the parameters.
    loss.backward()
    # Apply the parameter update.
    optimizer.step()

# Persist the final weights; `step` still holds the last loop index here.
# NOTE(review): the recorded run shows validation loss rising after step 500
# while training loss keeps falling, so this final checkpoint is overfit —
# consider saving the checkpoint with the best validation loss instead.
save_model_to_chekpoint(model=m, path_to_checkpoint="checkpoints", epoch=step)



step          0 | train loss 10.7509 | val loss 10.7534
step        500 | train loss 0.2771 | val loss 7.7868
step       1000 | train loss 0.1336 | val loss 8.8329
step       1500 | train loss 0.1146 | val loss 9.0112
step       2000 | train loss 0.1169 | val loss 9.6346
step       2500 | train loss 0.1143 | val loss 9.2942
step       3000 | train loss 0.1047 | val loss 9.4503
step       3500 | train loss 0.1047 | val loss 9.8033
step       4000 | train loss 0.1009 | val loss 9.5543
step       4500 | train loss 0.1061 | val loss 9.6153
step       4999 | train loss 0.0958 | val loss 9.9655
Successfully saved the model to checkpoints/checkpoint_epoch-4999_21.05.2025_06:38:03.pt


In [28]:
# Generate a sample continuation from a minimal context: one sequence holding
# the single token id 0 (decoded as "[PAD]" by this tokenizer).
context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)

# Inference only: eval() disables dropout so sampling is not perturbed by it,
# and no_grad() avoids building an autograd graph for the generated tokens.
m.eval()
with torch.no_grad():
    generated = m.generate(idx=context, max_new_tokens=50, block_size=BLOCK_SIZE)

# generate() returns a batch; decode its only sequence back to text.
print(decode(enc_sec=generated[0], tokenizer=tokenizer))


[PAD]. upon the foundation laid by r - cnn initially employed selective search ( uijlings, et al., 2013 ) as a region proposal technique to generate approximately 2000 region proposals per image. these proposals were then fed into a convolutional


In [29]:
# Ask a question: feed a prompt to the trained model and print what it
# generates after it.

# Inference only: make sure the model is on DEVICE and that dropout is off.
model.to(DEVICE)
model.eval()

# Define your prompt
prompt = "What is this project about?"

# Tokenize the prompt into a tensor of token ids on the model's device.
encoded_prompt = tokenizer.encode(prompt, return_tensors="pt").to(DEVICE)

# No gradients are needed while sampling.
with torch.no_grad():
    output_ids = model.generate(idx=encoded_prompt, max_new_tokens=50, block_size=BLOCK_SIZE)
# output_ids is a batch; decode its only sequence (prompt + generated tokens).
answer = decode(enc_sec=output_ids[0], tokenizer=tokenizer)
print("Answer:", answer)


Answer: [CLS] who is the author for this project? [SEP] of the accuracy and quality of deep learning - based object detection. table 4. 1 : r - cnn series speed comparison ( kim, et al., 2020 ) 34 table 4. 2 : performance matrix of different scaled versions of yolo on coco


In [38]:
import torch
from transformers import AutoTokenizer
from model import Transformer
from utils import DEVICE, BLOCK_SIZE, decode,NUM_EMBED,NUM_HEAD,NUM_LAYER,DROPOUT

# Reload the trained model from its checkpoint and answer a prompt.

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Define your prompt
prompt = "What is this project about?"

# Tokenize the prompt into a tensor of token ids on the target device.
encoded_prompt = tokenizer.encode(prompt, return_tensors="pt").to(DEVICE)

# Rebuild the architecture with the same hyperparameters used for training.
# The vocabulary size is read from the tokenizer created above, so this cell
# does not depend on a `vocab_size` variable left behind by an earlier cell.
model = Transformer(
    vocab_size=tokenizer.vocab_size,
    num_embed=NUM_EMBED,
    block_size=BLOCK_SIZE,
    num_heads=NUM_HEAD,
    num_layers=NUM_LAYER,
    dropout=DROPOUT,
)

# Checkpoint written by the training run; update this path after retraining.
checkpoint_path = "checkpoints/checkpoint_epoch-4999_21.05.2025_06:38:03.pt"
# map_location remaps the stored tensors onto DEVICE, so a checkpoint saved on
# a GPU runtime also loads on a CPU-only one.
model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))
model.to(DEVICE)
# Inference mode: disables dropout.
model.eval()

# Generate answer
with torch.no_grad():
    output_ids = model.generate(idx=encoded_prompt, max_new_tokens=50, block_size=BLOCK_SIZE)

# Decode and print answer (prompt tokens followed by the generated tokens)
answer = decode(enc_sec=output_ids[0], tokenizer=tokenizer)
print("Answer:", answer)



Answer: [CLS] what is the capital of france? [SEP] are opportunities for further enhancement. future research could focus on leveraging specific helmet characteristics for improved detection against complex backgrounds and exploring combinations with additional vision algorithms for heightened precision. ( nath, et al., 2020 ) in his paper “ deep learning for
