In [1]:
import torch 
import torch.nn as nn 
import torch.nn.functional as F 
import numpy as np 
from matplotlib import pyplot as plt 
import torchvision.transforms as transforms 
from PIL import Image 
from scipy.ndimage import label

import os

In [None]:
# Define a lightweight U-Net model 
class SimpleUNet(nn.Module):
    def __init__(self, in_channels=3, out_channels=1): 
        super(SimpleUNet, self).__init__()

        def conv_block(in_ch, out_ch): 
            return nn.sequential(
                nn.Conv2d(in_ch, out_ch, 3, padding=1), 
                nn.ReLU(inplace=True), 
                nn.Conv2d(out_ch, out_ch, 3, padding=1), 
                nn.ReLU(inplace=True)
            )

        # Encoder 
        self.enc1 = conv_block(in_channels, 32)
        self.enc2 = conv_block(32, 64)

        # Bottleneck -> Base 
        self.bottleneck = conv_block(64, 128),

        # Decoder (Climbing up)
        self.upconv2 = nn.ConvTranspose2d(128, 64, 2, stride=2)
        self.dec2 = conv_block(128, 64)
        self.upconv1 = nn.ConvTranspose2d(64, 32, 2, stride=2)
        self.dec1 = conv_block(64, 32)

        # output 
        self.final = nn.Conv2d(32, out_channels, 1) # out_channels eq num of classes 


    def forward(self, x):
        # Encoder
        enc1 = self.enc1(x)
        pool1 = F.max_pool2d(enc1, 2)
        enc2 = self.enc2(pool1)
        pool2 = F.max_pool2d(enc2, 2)

        # Bottleneck BaseLayer 
        bottleneck = self.bottleneck(pool2)

        # Decoder with skip connections 
        up2 = self.upconv2(bottleneck)
        dec2 = self.dec2(torch.cat([up2, enc2], dim=1))

        up1 = self.upconv1(dec2)
        dec1 = self.dec1(torch.cat([up1, enc1], dim=1))

        # output 
        out = self.final(dec1)
        return torch.sigmoid(out)

    

In [10]:
def segment_characters_deep_learning(image_path, device='cpu'):
    # Load and preprocess image 
    try: 
        image = Image.open(image_path).convert('RGB')
    except Exception as e: 
        printf(f"Error loading image: {e}")
        return 

    # Resize image to a fixed size for model input 
    # Data augmentation 
    transform = transforms.Compose([
        transforms.Resize((256, 256)), 
        transforms.ToTensor(), 
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    img_tensor = transform(image).unsqueeze(0).to(device)

    # Initialize model
    model = SimpleUNet().to(device)
    model.eval()  

    # model.load_state_dict(torch.load('unet_weight-s.pth'))

    # Perform inference 
    with torch.no_grad():
        pred_mask = model(img_tensor)

    # Convert original image to numpy 
    img_np = img_tensor.squeeze(0).cpu().permute(1, 2, 0).numpy()
    img_np = (img_np * np.array([0.299, 0.224, 0.225]) + np.array([0.485, 0.456, 0.406])).clip(0, 1)

    # Contour detection on predict mask 
    labeled, num_features = label(pred_mask)
    if num_features == 0:
        print("No Characters detected")
        return 
   
    # Extract and filter character bounding boxes 
    characters = []
    min_area = 100   # Minimum area
    max_aspect = 2.0  # Maximum width / height 

    for i in range(1, num_features + 1):
        contour_mask = (labeled == i).astype(np.uint8)
        area = contour_mask.sum()

        if area < min_area:
            continue 

        # The goal is to find the coordinate of the characters
        y, x = np.where(contour_mask)
        x_min, x_max = x.min(), x.max()
        y_min, y_max = y.min(), y.max()
        width = x_max - x_min + 1 
        height = y_max - y_min + 1 
        aspect_ratio = width / height 

        if aspect_ration > max_aspect:
            num_chars = int(np.ceil(aspect_ratio))
            char_width = width // num_chars 
            for j in range(num_chars):
                x_start = x_min + j * char_width 
                x_end  = min(x_start + char_width, x_max)
                characters.append((x_start, x_end, y_min, y_max))
        else: 
            characters.append((x_min, x_max, y_min, y_max)) 

    # Sort characters by x - coordinate 
    characters.sort(key=lambda x: x[0])
    
    # Visualize 
    plt.figure(figsize=(15, 5))
    
    plt.subplot(131)
    plt.imshow(img_np)
    plt.title('Original Image')
    plt.axis('off')
    
    plt.subplot(132)
    plt.imshow(pred_mask, cmap='gray')
    plt.title('Predict Mask')
    plt.axis('off')

    # Draw bounding boxes 
    result_img = img_np.copy()
    for i, (x_min, x_max, y_min, y_max) in enumerate(characters):
        result_img[y_min:y_max+1, x_min:x_min+2] = [0, 1, 0]
        result_img[y_min:y_max+1, x_max-1:x_max+1] = [0, 1, 0]
        result_img[y_min:y_max+2, x_min:x_max+1] = [0, 1, 0]
        result_img[y_max-1:y_max+1, x_min:x_max+1] = [0, 1, 0]


    plt.subplot(133)
    plt.imshow(result_img)
    plt.title('Segmented Characters')
    plt.axis('off')


    # Save individual characters 
    for i, (x_min, x_max, y_min, y_max) in enumerate(characters):
        char_img = img_np[y_min:y_max+1, x_min:x_max+1]
        outpath = f'char_{i}'
        Image.fromarray((char_img * 255).astype(np.uint8)).save(output_path)
        print('Saved character {i} as:', {output_path})
                
    


In [13]:
dirname = "datasets"
image_path = os.path.join(dirname, "sample_text.JPG")

device = 'cuda' if torch.cuda.is_available() else 'cpu'

segment_characters_deep_learning(image_path, device)

AttributeError: module 'torch.nn' has no attribute 'sequential'