<a href="https://colab.research.google.com/github/ntirupathirao18/python_codes/blob/main/Untitled20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install ultralytics pillow

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import urllib.request
import zipfile
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
import cv2
from ultralytics import YOLO
from torch import nn
from torchvision import transforms
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
from torch.nn import functional as F
import math

In [None]:

from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
import requests

# Remote locations of the two dataset archives: the training images and the
# matching localization/transcription ground truth.
# NOTE(review): neither URL is referenced by the code visible in this notebook;
# the archives are read from local paths under /content instead — confirm
# whether a download step is still intended.
dataset_url = "https://rrc.cvc.uab.es/downloads/ch4_training_images.zip"
gt_url = "https://rrc.cvc.uab.es/downloads/ch4_training_localization_transcription_gt.zip"

def download_and_extract(zipurl, target_dir):
    """Download the zip archive at ``zipurl`` and unpack it into ``target_dir``.

    The whole archive is buffered in memory while it is unpacked; nothing is
    written to disk apart from the extracted members.

    Args:
        zipurl: URL of the zip archive; anything ``urlopen`` accepts.
        target_dir: Directory to extract into. Created if it does not exist.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        zipfile.BadZipFile: If the downloaded payload is not a zip archive.
    """
    os.makedirs(target_dir, exist_ok=True)
    # The response must actually be opened before it is read; without this
    # ``with`` the name ``zipresp`` is never bound and the call fails.
    with urlopen(zipurl) as zipresp:
        payload = BytesIO(zipresp.read())
    with ZipFile(payload) as zfile:
        zfile.extractall(target_dir)


def extract_zipfiles(zip_file, path):
    """Unpack ``zip_file`` into ``path/<archive name without .zip>``.

    The sub-directory named after the archive is created and the members are
    extracted *into it*, so an archive whose files sit at its root ends up
    in its own folder. If every member already lives under a top-level folder
    with that same name, the archive is extracted into ``path`` directly so
    the folder is not nested twice.

    Args:
        zip_file: Path to a local zip archive.
        path: Parent directory that receives the per-archive sub-directory.

    Raises:
        FileNotFoundError: If ``zip_file`` does not exist.
        zipfile.BadZipFile: If ``zip_file`` is not a valid zip archive.
    """
    archive_name = os.path.basename(zip_file).replace('.zip', '')
    target_dir = os.path.join(path, archive_name)
    os.makedirs(target_dir, exist_ok=True)
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        members = zip_ref.namelist()
        prefix = archive_name + '/'
        # Archives that already wrap their content in "<archive_name>/" must
        # go into the parent, otherwise the data lands one level too deep.
        already_nested = bool(members) and all(m.startswith(prefix) for m in members)
        zip_ref.extractall(path if already_nested else target_dir)

# Root folder that holds the two dataset archives and receives their contents.
base_dir = '/content/icdar2015_dataset'

# Unpack the image archive first, then the ground-truth archive; both are
# expected to sit directly inside base_dir and are extracted relative to it.
for archive_name in (
    'ch4_training_images.zip',
    'ch4_training_localization_transcription_gt.zip',
):
    extract_zipfiles(f'/content/icdar2015_dataset/{archive_name}', '/content/icdar2015_dataset')


# Custom Dataset class
class ICDAR2015Dataset(Dataset):
    """Image / annotation pairs read from an extracted dataset directory.

    Images are the ``*.jpg`` files in ``<root_dir>/ch4_training_images``; the
    annotation for ``NAME.jpg`` is read from
    ``<root_dir>/ch4_training_localization_transcription_gt/gt_NAME.txt``.
    Each annotation line holds eight comma-separated integer coordinates
    followed by the transcription (which may itself contain commas).

    Args:
        root_dir: Directory containing the two sub-directories named above.
        transform: Optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.image_dir = os.path.join(root_dir, "ch4_training_images")
        self.gt_dir = os.path.join(root_dir, "ch4_training_localization_transcription_gt")
        # Sorted so that an index always refers to the same image.
        self.image_files = sorted(f for f in os.listdir(self.image_dir) if f.endswith('.jpg'))

    def __len__(self):
        """Return the number of ``.jpg`` images found in the image directory."""
        return len(self.image_files)

    @staticmethod
    def _read_annotations(gt_path):
        """Parse one ground-truth file into ``(bboxes, texts)``.

        Blank lines are skipped instead of crashing the integer conversion.
        The file is decoded as ``utf-8-sig`` so a leading BOM does not end
        up glued to the first coordinate.

        Returns:
            A pair ``(bboxes, texts)``: a list of 8-integer lists and a list
            of transcriptions with surrounding double quotes stripped.
        """
        bboxes = []
        texts = []
        with open(gt_path, 'r', encoding='utf-8-sig') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                parts = line.split(',')
                bboxes.append([int(value) for value in parts[:8]])
                # Everything after the 8th comma is text; re-join it because
                # the transcription may contain commas of its own.
                texts.append(','.join(parts[8:]).strip('"'))
        return bboxes, texts

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict.

        Keys:
            ``image``: the RGB image, after ``transform`` when one was given.
            ``bboxes``: float32 tensor of shape ``(num_boxes, 8)``; the shape
                is ``(0, 8)`` when the image has no annotations.
            ``texts``: list of transcription strings, aligned with ``bboxes``.

        Raises:
            IndexError: If ``idx`` is outside the list of images.
        """
        img_name = self.image_files[idx]
        img_path = os.path.join(self.image_dir, img_name)
        gt_path = os.path.join(self.gt_dir, f"gt_{img_name[:-4]}.txt")

        image = Image.open(img_path).convert('RGB')
        bboxes, texts = self._read_annotations(gt_path)

        if self.transform:
            image = self.transform(image)

        return {
            'image': image,
            # Explicit reshape keeps the (N, 8) layout even when N == 0.
            'bboxes': torch.tensor(bboxes, dtype=torch.float32).reshape(len(bboxes), 8),
            'texts': texts,
        }

# Preprocessing applied to every image: resize to 640x640, convert to a
# tensor, then normalise each channel with the mean/std given below.
transform = transforms.Compose([
    transforms.Resize((640, 640)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Dataset rooted at base_dir, wrapped in a shuffling loader (batches of 4,
# 4 worker processes).
# NOTE(review): samples carry a variable number of boxes/texts; confirm the
# default batch collation can handle that before relying on batch_size > 1.
dataset = ICDAR2015Dataset(base_dir, transform=transform)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=4)

# Vocabulary: every distinct character that occurs in any transcription,
# numbered in sorted order, with the inverse mapping kept alongside.
char_set = set()
for sample in dataset:
    char_set.update(*sample['texts'])
alphabet = sorted(char_set)
char_to_idx = {ch: position for position, ch in enumerate(alphabet)}
idx_to_char = dict(enumerate(alphabet))
num_chars = len(alphabet)

ValueError: num_samples should be a positive integer value, but got num_samples=0

In [None]:
class TransformerOCR(nn.Module):
    """Transformer-encoder head that turns a feature-map crop into character scores.

    The crop is flattened into a token sequence, given positional encodings,
    passed through a stack of encoder layers, averaged over the sequence and
    projected to ``num_chars`` outputs.

    Args:
        input_dim: Channel count of the incoming features (the model width).
        num_chars: Number of output scores per input.
        num_heads: Attention heads per encoder layer.
        num_encoder_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden size of each layer's feed-forward part.
        dropout: Dropout probability for the positional encoding and layers.
    """

    def __init__(self, input_dim, num_chars, num_heads=2, num_encoder_layers=2, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.pos_encoder = PositionalEncoding(input_dim, dropout)
        layer = nn.TransformerEncoderLayer(
            d_model=input_dim,
            nhead=num_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.transformer_encoder = nn.TransformerEncoder(layer, num_layers=num_encoder_layers)
        self.fc_out = nn.Linear(input_dim, num_chars)
        self.input_dim = input_dim

    def forward(self, x):
        """Score ``x`` and return a ``(batch, num_chars)`` tensor.

        Assumes ``x`` is laid out as (batch, input_dim, height, width) —
        TODO confirm against the caller.
        """
        tokens = x.flatten(2)             # merge everything after dim 1
        tokens = tokens.permute(2, 0, 1)  # (seq_len, batch, input_dim)
        encoded = self.transformer_encoder(self.pos_encoder(tokens))
        pooled = encoded.mean(dim=0)      # average over the sequence axis
        return self.fc_out(pooled)

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence, then apply dropout.

    Even feature columns hold ``sin(pos * w_k)`` and odd columns
    ``cos(pos * w_k)`` with ``w_k = 10000 ** (-2k / d_model)``. The table is
    stored as the non-trainable buffer ``pe`` of shape
    ``(max_len, 1, d_model)``.

    Args:
        d_model: Feature size of the sequence elements (odd values allowed).
        dropout: Dropout probability applied after the addition.
        max_len: Longest sequence the precomputed table supports.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one odd column fewer than even
        # columns, so the cosine block must be trimmed to fit.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so it broadcasts over batch.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:seq_len])`` for ``x`` of shape (seq_len, batch, d_model).

        Raises:
            ValueError: If the sequence is longer than ``max_len``.
        """
        seq_len = x.size(0)
        if seq_len > self.pe.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.pe.size(0)}"
            )
        x = x + self.pe[:seq_len, :]
        return self.dropout(x)


# Modified OCRModel
class OCRModel(nn.Module):
    """YOLO detector combined with a transformer head that scores characters per box.

    The detector proposes boxes; for each box of the first result the
    matching area of the last feature map is cropped and passed to a
    ``TransformerOCR`` head, whose output is squashed with a sigmoid.

    Args:
        num_chars: Number of character scores produced per detected box.
    """

    def __init__(self, num_chars):
        super().__init__()
        self.yolo = YOLO('yolov8n.pt')

        # Probe the network once to learn the channel count of the feature
        # map the recogniser will consume.
        # NOTE(review): assumes element [1] of the raw model output is the
        # list of feature maps — confirm for the installed ultralytics version.
        dummy_input = torch.randn(1, 3, 640, 640)
        with torch.no_grad():
            features = self.yolo.model(dummy_input, augment=False)[1]
        input_dim = features[-1].shape[1]

        self.char_recognizer = TransformerOCR(input_dim, num_chars)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Detect boxes in ``x`` and score characters for each box.

        Args:
            x: Image batch tensor; assumed to be (batch, 3, height, width) —
                TODO confirm against callers.

        Returns:
            ``(yolo_results, char_results)``: the detector's results and a
            list with one sigmoid-activated score tensor per box of
            ``yolo_results[0]``.
        """
        yolo_results = self.yolo(x)
        features = self.yolo.model(x, augment=False)[1]
        last_feature_map = features[-1]

        # Derive the pixel-to-cell ratio from the tensors themselves instead
        # of hard-coding 8: the ratio depends on which feature map is used,
        # and a wrong value yields out-of-range (empty) crops.
        feat_h, feat_w = last_feature_map.shape[-2:]
        stride_y = x.shape[-2] / feat_h
        stride_x = x.shape[-1] / feat_w

        char_results = []
        # presumably box coordinates are in the pixel space of ``x``; verify.
        for box in yolo_results[0].boxes.xyxy:
            x1, y1, x2, y2 = box.tolist()
            # Clamp to the map and force at least one cell per axis so the
            # recogniser never receives an empty sequence.
            col_start = min(max(int(x1 / stride_x), 0), feat_w - 1)
            row_start = min(max(int(y1 / stride_y), 0), feat_h - 1)
            col_stop = min(max(int(x2 / stride_x), col_start + 1), feat_w)
            row_stop = min(max(int(y2 / stride_y), row_start + 1), feat_h)
            region = last_feature_map[:, :, row_start:row_stop, col_start:col_stop]
            char_logits = self.char_recognizer(region)
            char_probs = self.sigmoid(char_logits)
            char_results.append(char_probs)

        return yolo_results, char_results

# Initialize model
# NOTE(review): this rebinds num_chars to a fixed 15, replacing the value an
# earlier cell derives from the dataset's character set; any char_to_idx
# index of 15 or more would then fall outside gt_chars below — confirm size.
num_chars = 15
model = OCRModel(num_chars)

# Loss and optimizer
# yolo_loss = model.yolo.loss
# NOTE(review): OCRModel.forward already applies a sigmoid to the recogniser
# output, while BCEWithLogitsLoss is documented to expect raw logits —
# confirm whether applying both is intended.
char_criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Training loop
num_epochs = 10
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    for batch in dataloader:
        images = batch['image'].to(device)
        # gt_bboxes is moved to the device but not read again in this loop.
        gt_bboxes = batch['bboxes'].to(device)
        gt_texts = batch['texts']

        # Forward pass
        yolo_results, char_results = model(images)
        # NOTE(review): assumes the first result object exposes box_loss and
        # cls_loss attributes — verify against the ultralytics results API.
        yolo_loss_value = yolo_results[0].box_loss + yolo_results[0].cls_loss


        # YOLO loss
        # yolo_loss_value = yolo_loss(yolo_results)

        # Character recognition loss
        # Each prediction is compared with a multi-hot target that marks
        # which characters occur in the paired ground-truth text.
        # NOTE(review): char_results has one entry per box detected in the
        # first result, whereas gt_texts comes from the batch — confirm the
        # zip pairs each prediction with the intended text.
        char_loss = 0
        for pred, gt_text in zip(char_results, gt_texts):
            gt_chars = torch.zeros(num_chars)
            for char in gt_text:
                gt_chars[char_to_idx[char]] = 1
            char_loss += char_criterion(pred, gt_chars.to(device))

        # Total loss
        total_loss = yolo_loss_value + char_loss

        # Backward pass and optimization
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

    # Reports the loss of the last batch processed in this epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss.item()}")


In [None]:
yolo

YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C2f(
        (cv1): Conv(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(48, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_s

In [None]:
# Load the pretrained detector and inspect what element [0] of its raw
# output looks like for a random 640x640 input.
yolo = YOLO('yolov8n.pt')

dummy_input = torch.randn(1, 3, 640, 640)
with torch.no_grad():
    raw_output = yolo.model(dummy_input, augment=False)
features = raw_output[0]
len(features)
# Print the shape of every entry along the first axis.
for indfeat in features:
    print(indfeat.shape)

torch.Size([84, 8400])


In [None]:
from torchsummary import summary
from torchvision import models

# summary(model, (3, 640, 640))
# vgg = models.vgg16()
# summary(vgg, (3, 224, 224))
# model.yolo
!pip -q install torchviz torchview
from torchviz import make_dot
from torchview import draw_graph



# Trace a fresh 15-output OCRModel on a 1x3x640x640 input and render the
# resulting graph, with nested modules expanded.
model_graph = draw_graph(
    OCRModel(15),
    input_size=(1, 3, 640, 640),
    expand_nested=True,
)
model_graph.visual_graph


In [None]:

# Inference function
def detect_and_recognize(image_path, model, char_threshold=0.5):
    """Run ``model`` on one image file and return its detections.

    The image is read with OpenCV, converted from BGR to RGB, scaled to
    [0, 1] and sent to the module-level ``device`` as a single-image batch.

    Args:
        image_path: Path of the image file to analyse.
        model: Callable returning ``(yolo_results, char_results)`` for an
            image batch, e.g. an ``OCRModel``.
        char_threshold: Minimum score for a character to count as present.

    Returns:
        A list of dicts, one per detected box, with keys ``bbox``
        (``[x1, y1, x2, y2]``), ``confidence`` (that box's own score) and
        ``text`` (characters above the threshold, joined in index order via
        the module-level ``idx_to_char``).

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    img = cv2.imread(image_path)
    # imread signals failure by returning None rather than raising.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_tensor = torch.from_numpy(img).permute(2, 0, 1).float().unsqueeze(0) / 255.0
    img_tensor = img_tensor.to(device)

    with torch.no_grad():
        yolo_results, char_results = model(img_tensor)

    boxes = yolo_results[0].boxes
    detections = []
    # Iterate boxes and confidences in lockstep so each detection reports
    # its own confidence instead of always the first box's.
    for box, conf, char_probs in zip(boxes.xyxy, boxes.conf, char_results):
        x1, y1, x2, y2 = box.tolist()
        # Scores may arrive with a leading batch axis; take the single
        # image's row so nonzero() yields character indices only, not
        # (row, column) pairs.
        scores = char_probs.reshape(-1, char_probs.shape[-1])[0]
        chars = (scores > char_threshold).nonzero().flatten().tolist()
        text = ''.join(idx_to_char[c] for c in chars)
        detections.append({
            'bbox': [x1, y1, x2, y2],
            'confidence': conf.item(),
            'text': text
        })

    return detections

# Run detection + recognition on one image (the path is a placeholder).
image_path = 'path/to/test/image.jpg'
detections = detect_and_recognize(image_path, model)

# Draw each box and its recognised text, in green, on a fresh copy of the image.
img = cv2.imread(image_path)
for det in detections:
    left, top, right, bottom = (int(value) for value in det['bbox'])
    top_left = (left, top)
    bottom_right = (right, bottom)
    cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 2)
    # The label sits 10 px above the box's top-left corner.
    label_origin = (left, top - 10)
    cv2.putText(img, det['text'], label_origin,
                cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)

# Show the annotated image and wait for a key press before closing the window.
cv2.imshow('Detection Results', img)
cv2.waitKey(0)
cv2.destroyAllWindows()