In [41]:
!pip install opencv-python
!pip install numpy



In [59]:
import torch
from torch.utils.data import DataLoader,Dataset, random_split
from torch import optim
import torchvision.transforms as transforms
from torchvision.io import read_image
import torch.nn as nn
import torch.nn.functional as F

from PIL import Image
import cv2
import numpy as np
import matplotlib as plt
import math
import random # sampling captcha text
import os # used for path and image storage
from captcha.image import ImageCaptcha  # Module that will generate all captcha images# pip install captcha
import time


In [35]:
# Prefer the first CUDA GPU when one is visible to PyTorch; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [21]:
def remove_noise(image_path, save_path):
    """Denoise one image and write the result to ``save_path``.

    Pipeline: grayscale -> inverted Otsu threshold -> 3x3 morphological
    opening -> 3x3 Gaussian blur -> 3x3 median blur.

    Args:
        image_path: Path of the source image (any format Pillow can open).
        save_path: Destination path. Its parent directory is created when it
            does not exist yet.

    Returns:
        None. The denoised image is written to disk.
    """
    # The context manager releases the file handle as soon as the pixels have
    # been copied into the numpy array.
    with Image.open(image_path) as image:
        np_image = np.array(image.convert('L'))

    # Inverted binary threshold; Otsu picks the cut-off, so the 0 passed as the
    # threshold value is ignored.
    _, binary_image = cv2.threshold(np_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Morphological opening removes small specks
    kernel = np.ones((3, 3), np.uint8)
    opening = cv2.morphologyEx(binary_image, cv2.MORPH_OPEN, kernel, iterations=1)  # Increase iterations for more noise removal

    # Gaussian blur, then median blur to further reduce noise, particularly small circles
    denoised_image = cv2.GaussianBlur(opening, (3, 3), 0)
    denoised_image = cv2.medianBlur(denoised_image, 3)

    # Saving into a directory that does not exist would fail, so create it first
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    Image.fromarray(denoised_image).save(save_path)


In [22]:
# trying to resize the image
def resize_image(image, new_width, new_height):
    """Return ``image`` rescaled to ``new_width`` x ``new_height`` with ``cv2.resize``."""
    target_size = (new_width, new_height)  # passed to cv2.resize in (width, height) order
    return cv2.resize(image, target_size)

def save_contours_as_images(image_path, output_directory, image_id):
    """Crop up to four character-sized contours from an image and save them.

    The file stem (name without directory and extension) is used as the label.
    The characters before the first ``_`` are taken as the per-character
    labels, and the part before the first ``--`` is used as the prefix of the
    output file names. Each crop is centred on a black canvas, resized to
    100x100 and written as ``{prefix}_{char}--{id}.png``.

    Args:
        image_path: Path of the image to segment. Both ``/`` and ``\\`` are
            accepted as directory separators.
        output_directory: Directory that receives the crops (created if
            missing).
        image_id: Integer id used for the first saved crop; subsequent crops
            use consecutive ids.

    Returns:
        None. The crops are written to disk.

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    # Load the image in grayscale
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Threshold the image to obtain a binary image and find its outer contours
    _, binary_image = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(binary_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Bounding boxes ordered left to right (computed once per contour)
    boxes = sorted((cv2.boundingRect(contour) for contour in contours), key=lambda box: box[0])

    # Create output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    # File stem up to the first dot. Normalising the separator first makes the
    # parsing independent of whether the path was built with '/' or '\'.
    label = os.path.basename(image_path.replace("\\", "/")).split('.')[0]
    image_name = label.split("--")[0]
    char_labels = list(label.split("_")[0])

    # Never save more crops than there are character labels (and at most four)
    max_chars = min(4, len(char_labels))
    padding = 10
    counter = 0  # keep track of how many characters have been saved

    for x, y, w, h in boxes:
        if counter >= max_chars:
            break

        # Skip contours that are too small (possibly noise)
        if w <= 5 or h <= 5:
            continue

        # Canvas is the box plus padding on each side, capped at the image size
        canvas_w = min(image.shape[1], w + 2 * padding)
        canvas_h = min(image.shape[0], h + 2 * padding)
        padded_image = np.zeros((canvas_h, canvas_w), dtype=np.uint8)

        # Place the character in the centre of the canvas
        x_offset = (canvas_w - w) // 2
        y_offset = (canvas_h - h) // 2
        padded_image[y_offset:y_offset+h, x_offset:x_offset+w] = image[y:y+h, x:x+w]

        # Resize the padded image
        resized_image = resize_image(padded_image, 100, 100)

        # Save the resized image as a separate image
        character_filename = os.path.join(output_directory, f'{image_name}_{char_labels[counter]}--{image_id}.png')
        cv2.imwrite(character_filename, resized_image)
        counter += 1
        image_id += 1

### Passing images to the filter (new image generation)

In [28]:
# Generate denoised images
folder = 'four_cap_36'
output_folder = 'denoised_images'

# Make sure the destination exists before anything is saved into it
os.makedirs(output_folder, exist_ok=True)

# Get list of all files in the folder
file_list = os.listdir(folder)

# Iterate through every file in the folder; `i` is the file's position in the listing
for i, filename in enumerate(file_list):
    # Skip anything that is not an image (you can add more image extensions if needed)
    if not filename.lower().endswith(('.png', '.jpg', '.jpeg')):
        continue

    # Construct the full path to the image
    image_path = os.path.join(folder, filename)

    # original image -> denoised image (now named after its label)
    # Two source images may share a label; the listing index `i` keeps the
    # output file names distinct.
    label = filename.split('-')[0]
    save_filename = f'{label}_{i}--denoised.png'
    save_path = os.path.join(output_folder, save_filename)

    # Call the remove_noise function
    remove_noise(image_path, save_path)


### Cropping characters (new image generation)

In [None]:
# Input folder (denoised images) and output folder (per-character crops)
folder_path = 'denoised_images'
output_directory = 'cropped_characters'

# Every entry of the input folder, in listing order
file_list = os.listdir(folder_path)

# Id handed to the first crop of the next image
image_id = 0

# Walk over every file in the folder
for i, filename in enumerate(file_list):
    # Skip anything that is not an image (more extensions can be added here)
    is_image = filename.lower().endswith(('.png', '.jpg', '.jpeg'))
    if not is_image:
        continue

    # Segment this image into character crops
    image_path = os.path.join(folder_path, filename)
    save_contours_as_images(image_path, output_directory, image_id)

    # Reserve four ids per processed image
    image_id += 4

### Custom Class for characters

In [52]:
class CroppedCharacterDataset(Dataset):
    """Dataset of single-character crops stored as ``.png`` files in one directory.

    File names are expected to follow
    ``{ParentImageName}_{image_num}_{char_label}--{id}.png``; the character
    label is parsed from the file name.

    Args:
        root_dir: Directory that contains the ``.png`` crops.
        transform: Optional callable applied to the grayscale PIL image.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # index -> file mapping (and therefore any seeded split) reproducible.
        self.image_paths = [
            os.path.join(root_dir, img)
            for img in sorted(os.listdir(root_dir))
            if img.endswith('.png')
        ]

    def __len__(self):
        """Number of ``.png`` files found in ``root_dir``."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the crop at position ``idx``.

        ``image`` is the grayscale image (after ``transform`` when one was
        given); ``label`` is the character label as a string.

        Raises:
            ValueError: If the file name has no ``--`` after the last ``_``.
        """
        img_path = self.image_paths[idx]
        # convert() produces an independent copy, so the file can be closed here
        with Image.open(img_path) as img:
            image = img.convert('L')  # Convert image to grayscale

        # format: {ParentImageName}_{image_num}_{char_label}--{id}.png
        filename = os.path.splitext(os.path.basename(img_path))[0]
        label, separator, _ = filename.split('_')[-1].partition('--')
        if not separator:
            raise ValueError(f"Unexpected crop file name: {img_path}")
        label = label.strip()  # Remove any leading/trailing whitespace

        if self.transform:
            image = self.transform(image)

        return image, label

In [53]:
# Directory holding the per-character crops
root_dir = "cropped_characters"

# Transform pipeline applied to every image (currently only tensor conversion)
transform = transforms.Compose([transforms.ToTensor()])

# Dataset over all crops in root_dir
cropped_chars_dataset = CroppedCharacterDataset(root_dir=root_dir, transform=transform)

In [65]:
# Define the sizes of each split (60% train / 20% dev / remainder test)
total_size = len(cropped_chars_dataset)
print(total_size) # total number of images in dataset
train_size = int(0.6 * total_size)
dev_size = int(0.2 * total_size)
test_size = total_size - train_size - dev_size

# Split dataset into train, validation, and test sets.
# A seeded generator makes the split identical on every run.
split_generator = torch.Generator().manual_seed(42)
train_data, dev_data, test_data = random_split(
    cropped_chars_dataset,
    [train_size, dev_size, test_size],
    generator=split_generator,
)

# Create data loaders
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
dev_loader = DataLoader(dev_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

print(f"Train: {len(train_loader.dataset)} examples into {len(train_loader)} batches")
print(f"Test: {len(test_loader.dataset)} examples into {len(test_loader)} batches")
print(f"Dev: {len(dev_loader.dataset)} examples into {len(dev_loader)} batches")

7683
Train: 4609 examples into 145 batches
Test: 1538 examples into 49 batches
Test: 1536 examples into 48 batches


## Important functions for next steps

Credits: Mark Liffiton; CS387 Spring 2024 Assignment 5

### `test_model(model, dataloader)`

This function calculates accuracy statistics for a given model on given data.

**Arguments**
 * `model`: A PyTorch model.  It should have already been moved to `device` (and you should ensure that is the case when calling this function).
 * `dataloader`: A PyTorch DataLoader.

**Return value**
 * A tuple of two values:
    1. Overall accuracy of the given model on the given data.
    2. A dictionary mapping class names for the given dataset to accuracies for the model's predictions of examples within each class.

To call it and print the accuracies it returns, you can use code like this:
```python
acc, class_acc = test_model(model, testloader)
print(f"Accuracy: {acc:.4f}   Class accuracies: {class_acc}")
```

In [76]:
def test_model(model, dataloader):
    classes = ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'] # here are all the possible classes present in the data (sorted).
    class_correct = {clsname: 0 for clsname in classes}
    class_total = {clsname: 0 for clsname in classes}
    total_correct = 0
    total = 0

    counter = 0
    for data in dataloader:
        if counter >= 1:
            break

        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = [x for x in data]

        print(inputs)
        print(labels)
        counter += 1

    # with torch.no_grad():
    #     for data in dataloader:
    #         # get the inputs; data is a list of [inputs, labels]
    #         inputs, labels = [x.to(device) for x in data]

    #         # forward
    #         outputs = model(inputs)

    #         # get predictions from multiple class outputs
    #         _, predicted = torch.max(outputs, 1)
    #         # find and count the correct predictions
    #         corrects = (predicted == labels).squeeze()
    #         total_correct += corrects.sum()
    #         total += outputs.shape[0]
 
    #         # count correct predictions within each clsas
    #         for label, correct in zip(labels, corrects):
    #             clsname = classes[label]
    #             class_correct[clsname] += correct.item()
    #             class_total[clsname] += 1
 
    # # compute overall accuracies
    # accuracy = (total_correct / total).item()
    # class_accuracies = {clsname: class_correct[clsname]/class_total[clsname]
    #                     for clsname in classes}
 
    # return accuracy, class_accuracies

## `StatReporter` Class

Credits: Mark Liffiton; CS387 Spring 2024 Assignment 5

This class will help report training statistics as training is running.  This cell creates a single global object named `stats` that you should use in any training run.

Basically, use this in `train_model()` as follows:
1. Call `stats.start()` right when each training run begins,
2. After each iteration of training, call `stats.iteration(epoch, i, loss)` with the current epoch number, current iteration number, and current loss value.
3. Call `stats.end()` right when a training run concludes.

In [58]:
class StatReporter:
    """Prints throttled progress lines while a training run is in progress.

    Usage: call ``start()`` when a run begins, ``iteration(...)`` after every
    training iteration, and ``end()`` when the run concludes.
    """

    def start(self):
        """Reset the timers and the recorded loss for a new training run."""
        self.start_time = time.time()
        self.elapsed = 0
        self.target = 0
        self.loss = None

    def iteration(self, epoch, i, loss):
        """Record one finished iteration and print a progress line when due.

        Args:
            epoch: Zero-based epoch number (printed one-based).
            i: Zero-based iteration number within the epoch (printed one-based).
            loss: Current loss; a Python number or a single-element tensor.
        """
        # Keep a plain float so the reporter does not hold on to a tensor
        self.loss = float(loss)

        now = time.time()
        iteration_time = now - self.start_time
        self.elapsed += iteration_time
        self.start_time = now
        # A line is printed once the accumulated time passes the target, which
        # then moves 10 seconds further out.
        if self.elapsed > self.target:
            print(f"Epoch {epoch+1:2d}, iteration {i+1:3d}:  Loss = {self.loss:.3f}  Iteration time = {iteration_time:0.3f}")
            self.target += 10

    def end(self):
        """Print the summary line for the run."""
        # No iteration may have been recorded (e.g. zero epochs or empty data)
        final_loss = "n/a" if self.loss is None else f"{self.loss:0.3f}"
        print(f"Training complete.  Elapsed time: {self.elapsed:.2f} seconds  Final loss: {final_loss}")

stats = StatReporter()  # one StatReporter object to use throughout

## `train_model(model, dataloader, epochs)`

This function should train the given model using the given data for the given number of epochs.

**Arguments**
 * `model`: A PyTorch model.  You can assume here that it has already been moved to `device` (and you should ensure that is the case when calling this function).
 * `dataloader`: A PyTorch DataLoader.
 * `epochs`: The number of full epochs to train.

**Return value**
 * None.

Use `torch.nn.CrossEntropyLoss()` for the loss function and `optim.Adam` with its default hyperparameters for the optimization algorithm.

Use the `stats` object to print training statistics before, during, and after the training run (see above for instructions).

Don't forget to move all tensors to `device` so they are placed on the GPU if one is present.

In [77]:
def train_model(model, dataloader, epochs):
    """Train ``model`` on ``dataloader`` for ``epochs`` full epochs.

    Uses ``nn.CrossEntropyLoss`` and ``optim.Adam`` with default
    hyperparameters, and reports progress through the global ``stats`` object.

    Args:
        model: A PyTorch model. Batches are moved to the device of the model's
            parameters.
        dataloader: Iterable of ``(inputs, labels)`` batches. ``labels`` may be
            a tensor of class indices or a sequence of class-name strings.
        epochs: The number of full epochs to train.

    Returns:
        None.

    Raises:
        KeyError: If a string label is not one of the known class names.
    """
    # Class names in index order; used to turn string labels into class indices
    classes = ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    class_index = {clsname: idx for idx, clsname in enumerate(classes)}

    # Put the data wherever the model lives so the two can never mismatch
    model_device = next(model.parameters()).device

    # Loss function
    criterion = nn.CrossEntropyLoss()
    # Optimizer
    optimizer = optim.Adam(model.parameters())

    model.train()

    # Starting stats object
    stats.start()

    # Loop over the dataset for the given number of epochs
    for epoch in range(epochs):
        # Iterate over the batches of data; i is the iteration number within the epoch
        for i, (inputs, labels) in enumerate(dataloader):
            # String labels are collated into a sequence of str, which has no
            # .to(); convert them into a tensor of class indices first.
            if not torch.is_tensor(labels):
                labels = torch.tensor([class_index[name] for name in labels], dtype=torch.long)
            inputs, labels = inputs.to(model_device), labels.to(model_device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Get outputs from forward propagation
            outputs = model(inputs)

            # Compute the loss
            loss = criterion(outputs, labels)

            # Backward propagation
            loss.backward()

            # Perform a single optimization step (parameter update)
            optimizer.step()

            # Report progress after each iteration
            stats.iteration(epoch, i, loss.item())

    # Ending stats object (once, after the last epoch)
    stats.end()

# Model

In [62]:
class CNNModel(nn.Module):
    """Small CNN classifier for single-channel 100x100 character images.

    Two 3x3 convolutions (32 and 64 channels), each followed by ReLU and 2x2
    max pooling, then two fully connected layers.

    Args:
        num_classes: Number of output classes (default 35).
    """

    def __init__(self, num_classes=35):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        # Two 2x2 poolings shrink 100x100 inputs to 25x25, with 64 channels
        self.fc1 = nn.Linear(64 * 25 * 25, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return the class logits, shape ``(batch, num_classes)``, for a batch of images."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten everything except the batch dimension. Unlike view(-1, ...),
        # a wrongly sized input then fails loudly in fc1 instead of silently
        # turning into a different batch size.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

## Training

In [81]:
def test_model(model, dataloader):
    classes = ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'] # here are all the possible classes present in the data (sorted).
    class_correct = {clsname: 0 for clsname in classes}
    class_total = {clsname: 0 for clsname in classes}
    total_correct = 0
    total = 0

    counter = 0
    for data in dataloader:
        if counter >= 1:
            break

        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = [x for x in data]

        print(len(inputs))
        print(labels)
        counter += 1

    # with torch.no_grad():
    #     for data in dataloader:
    #         # get the inputs; data is a list of [inputs, labels]
    #         inputs, labels = [x.to(device) for x in data]

    #         # forward
    #         outputs = model(inputs)

    #         # get predictions from multiple class outputs
    #         _, predicted = torch.max(outputs, 1)
    #         # find and count the correct predictions
    #         corrects = (predicted == labels).squeeze()
    #         total_correct += corrects.sum()
    #         total += outputs.shape[0]
 
    #         # count correct predictions within each clsas
    #         for label, correct in zip(labels, corrects):
    #             clsname = classes[label]
    #             class_correct[clsname] += correct.item()
    #             class_total[clsname] += 1
 
    # # compute overall accuracies
    # accuracy = (total_correct / total).item()
    # class_accuracies = {clsname: class_correct[clsname]/class_total[clsname]
    #                     for clsname in classes}
 
    # return accuracy, class_accuracies

In [82]:
# Build the CNN with freshly initialised (untrained) weights and move it to
# the selected device (GPU when available)
model = CNNModel().to(device)

# Run test_model on the training data
test_model(model, train_loader)

# Check accuracy without training
# overall_accuracy, class_accuracy = test_model(model, test_loader)
# print(f'Accuracy before training: {overall_accuracy}')
# 
# # Train the model for 5 epochs
# train_model(model, train_loader, 5)
# 
# # Check accuracy after training
# accuracy_after_training, class_accuracies_after_training = test_model(model, test_loader)
# print(f'Accuracy after training: {accuracy_after_training}')

32
('P', 'Q', '9', 'J', 'Z', 'G', 'X', '9', 'D', 'L', 'W', 'O', 'Z', 'F', 'C', 'P', '5', 'G', 'J', 'B', 'V', 'W', 'O', '2', 'D', '9', 'O', 'X', 'F', 'Z', 'F', '2')
