# CAPTCHA-Recognition-System (Train on Colab)

#### Step 1 : Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive so the dataset archive can be copied
# from Drive and the trained model can be saved back to Drive later on.
from google.colab import drive
drive.mount('/content/drive')

#### Step 2 : Copy dataset.zip from the data folder of the GitHub repo to a folder in your Drive. Then copy the path of the dataset.zip file from the Colab Files pane and paste it below. ***Add Path Here.***

In [None]:
# Copy the archive from Drive to the Colab disk and unzip it using shell commands.
# Replace the first path with the Drive path of your archive. The file must be
# named dataset.zip, because the unzip step below reads /content/dataset.zip.
!cp "/content/drive/MyDrive/path/to/your/file.zip" "/content/"
!unzip "/content/dataset.zip" -d "/content/"

#### Import all the dependencies (Colab has it all in-built, no need to pip install anything.)

In [3]:
import pandas as pd
import numpy as np
import os
import cv2
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

#### Step 3 : Pre-process the dataset images to binary and remove noise. The unzipped images are processed and overwritten in place, since they are stored on the Colab disk for faster processing.

In [None]:
# Cut-off used when converting the grayscale CAPTCHA images to binary
thresh_value = 220

# Walk the dataset folder (subfolders included) and clean every PNG in place.
for folder, _, names in os.walk("/content/dataset"):
    for name in names:
        # Only PNG files are treated as CAPTCHA images; skip everything else
        if not name.lower().endswith('.png'):
            continue

        image_path = os.path.join(folder, name)
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

        # cv2.imread signals an unreadable file by returning None
        if img is None:
            print(f"Could not read {image_path}")
            continue

        # Median blur, then binary threshold on the smoothed image
        _, thresholded = cv2.threshold(
            cv2.medianBlur(img, 3), thresh_value, 255, cv2.THRESH_BINARY
        )

        # Second median blur removes the noise left on the binary image
        final = cv2.medianBlur(thresholded, 3)

        # Overwrite the original file with the cleaned image
        cv2.imwrite(image_path, final)
        print(f"Processed and replaced: {image_path}")

print("Processing complete!")

#### Step 4 : Defining the Dataset

In [5]:
class CaptchaDataset(Dataset):
    """In-memory dataset pairing CAPTCHA image arrays with their digit labels.

    Args:
        images: Indexable, sized collection of image arrays (e.g. a NumPy array).
        labels: Indexable, sized collection of per-image digit label sequences.

    Raises:
        ValueError: If ``images`` and ``labels`` do not have the same length.
    """

    def __init__(self, images, labels):
        # Fail early instead of raising an IndexError (or silently ignoring
        # surplus labels) in the middle of an epoch.
        if len(images) != len(labels):
            raise ValueError(
                f"images and labels must have the same length, "
                f"got {len(images)} and {len(labels)}"
            )
        self.images = images
        self.labels = labels

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a (float32 image tensor, int64 label tensor) pair."""
        # torch.tensor copies the data and converts it to the requested dtype,
        # replacing the legacy torch.FloatTensor / torch.LongTensor constructors.
        image = torch.tensor(self.images[idx], dtype=torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return image, label

#### Step 5 : Load the normalised images and per-digit labels into NumPy arrays. The data is split into Train/Val/Test according to the image paths listed in the .csv file.

In [6]:
def load_data(csv_path, test_images_present):
    """Load CAPTCHA images and their digit labels, split by folder prefix.

    The CSV must provide an ``image_path`` column (paths beginning with
    ``train-images/``, ``validation-images/`` or ``test-images/``) and a
    ``solution`` column. Each image is read as grayscale, scaled to the
    0-1 range and given a leading channel axis. Each solution is converted
    to a list of six digits, left-padded with zeros.

    Args:
        csv_path: Path of the labels CSV file.
        test_images_present: When False, rows under ``test-images/`` are
            skipped entirely, so the returned test arrays are empty.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test)`` of NumPy
        arrays. Each image entry has shape (1, height, width).

    Raises:
        FileNotFoundError: If a listed image cannot be read from disk.
    """
    df = pd.read_csv(csv_path)

    # One (images, labels) pair of lists per split, keyed by path prefix
    buckets = {
        'train-images/': ([], []),
        'validation-images/': ([], []),
        'test-images/': ([], []),
    }

    for img_path, solution in zip(df['image_path'], df['solution']):
        prefix = next((p for p in buckets if img_path.startswith(p)), None)

        # Rows outside the three known splits are never used, so skip them
        if prefix is None:
            continue

        # Skip test rows when the images are absent. Appending anything here
        # would pair the labels with an image left over from an earlier row.
        if prefix == 'test-images/' and not test_images_present:
            continue

        # The split folder appears twice in the resolved path:
        # /content/dataset/<split>/<split>/<file>
        # NOTE(review): presumably this mirrors how dataset.zip unpacks - confirm.
        pos = img_path.index("/") + 1
        full_img_path = os.path.join('/content/dataset/' + img_path[:pos], img_path)
        img = cv2.imread(full_img_path, cv2.IMREAD_GRAYSCALE)

        # cv2.imread returns None instead of raising when a file is unreadable
        if img is None:
            raise FileNotFoundError(f"Could not read image: {full_img_path}")

        # Normalize to 0-1 and add the channel dimension -> (1, height, width)
        img = np.expand_dims(img / 255.0, axis=0)

        # zfill restores leading zeros lost if the solution was parsed as a number
        label = str(solution).zfill(6)
        digit_labels = [int(digit) for digit in label[-6:]]  # last 6 digits

        images, labels = buckets[prefix]
        images.append(img)
        labels.append(digit_labels)

    # Convert to numpy arrays
    X_train = np.array(buckets['train-images/'][0])
    y_train = np.array(buckets['train-images/'][1])
    X_val = np.array(buckets['validation-images/'][0])
    y_val = np.array(buckets['validation-images/'][1])
    X_test = np.array(buckets['test-images/'][0])
    y_test = np.array(buckets['test-images/'][1])

    print("X_train shape:", X_train.shape)
    print("y_train shape:", y_train.shape)
    print("X_val shape:", X_val.shape)
    print("y_val shape:", y_val.shape)

    return X_train, y_train, X_val, y_val, X_test, y_test

#### Step 6 : Defining the CNN Model Architecture. Six output layers are defined, one for each digit.

In [None]:
class CaptchaCNN(nn.Module):
    """CNN with a shared trunk and six 10-way output layers, one per digit."""

    def __init__(self):
        super().__init__()

        # Three identical stages: 3x3 conv (padding 1) -> ReLU -> 2x2 max-pool
        stages = []
        for in_channels, out_channels in ((1, 32), (32, 64), (64, 64)):
            stages.append(nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
            stages.append(nn.ReLU())
            stages.append(nn.MaxPool2d(2))
        self.conv_layers = nn.Sequential(*stages)

        # For a 200x50 input, three 2x2 poolings give 200/8=25 and 50/8=6,
        # with 64 channels coming out of the last convolution.
        flattened_features = 64 * 25 * 6

        # Shared fully connected trunk feeding all digit heads
        self.fc_layers = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flattened_features, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
        )

        # One output layer per digit position: digit1 ... digit6
        for position in range(1, 7):
            setattr(self, f"digit{position}", nn.Linear(512, 10))

    def forward(self, x):
        """Return a list of six logit tensors, one per digit position."""
        features = self.fc_layers(self.conv_layers(x))

        heads = (
            self.digit1,
            self.digit2,
            self.digit3,
            self.digit4,
            self.digit5,
            self.digit6,
        )
        return [head(features) for head in heads]

#### Step 7 : Train the Model: Loss Function, Optimizer and Backpropagation. At the end of each epoch, the Train Loss, Val Loss and Val Accuracy are reported.

In [8]:
def train_model(model, train_loader, val_loader, num_epochs, learning_rate):
    """Train a multi-output digit classifier and report metrics every epoch.

    The model must return one logits tensor per digit position. The loss for
    a batch is the sum of the cross-entropy losses of all positions. After
    each epoch the average train loss, the average validation loss (both per
    batch) and the per-digit validation accuracy are printed. A metric is
    reported as NaN when its loader yields no batches.

    Args:
        model: Module whose forward pass returns a sequence of logits tensors.
        train_loader: Iterable of ``(images, labels)`` training batches, where
            ``labels[:, i]`` is the target for output ``i``.
        val_loader: Iterable of ``(images, labels)`` validation batches.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The same ``model`` instance, trained in place.
    """
    # Assign GPU or CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Define loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), learning_rate)

    nan = float('nan')

    for epoch in range(num_epochs):
        # ---- Training ----
        model.train()
        train_loss = 0.0
        train_batches = 0

        for images, labels in train_loader:
            images = images.to(device)
            # Move the whole label batch once instead of once per digit
            labels = labels.to(device)

            outputs = model(images)

            # Total loss is the sum of the per-digit losses
            loss = sum(
                criterion(output, labels[:, i]) for i, output in enumerate(outputs)
            )

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_batches += 1

        # ---- Validation ----
        model.eval()
        val_loss = 0.0
        val_batches = 0
        correct = 0
        total_digits = 0

        with torch.no_grad():
            for images, labels in val_loader:
                images = images.to(device)
                labels = labels.to(device)

                outputs = model(images)

                for i, output in enumerate(outputs):
                    target = labels[:, i]
                    val_loss += criterion(output, target).item()

                    _, predicted = torch.max(output, 1)
                    correct += (predicted == target).sum().item()
                    total_digits += target.size(0)

                val_batches += 1

        # Batches are counted while iterating, so empty loaders and loaders
        # without __len__ are both handled; empty ones report NaN.
        avg_train_loss = train_loss / train_batches if train_batches else nan
        avg_val_loss = val_loss / val_batches if val_batches else nan
        val_accuracy = correct / total_digits * 100 if total_digits else nan

        print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, '
              f'Val Loss: {avg_val_loss:.4f}, '
              f'Val Accuracy: {val_accuracy:.2f}%')

    return model

#### Step 8 : Evaluate the Model by calculating the accuracy for complete CAPTCHAs and for individual digits.

In [9]:
def evaluate_model(model, test_loader):
    """Compute whole-CAPTCHA and per-digit accuracy of a multi-output model.

    Args:
        model: Module whose forward pass returns one logits tensor per digit
            position.
        test_loader: Iterable of ``(images, labels)`` batches, where
            ``labels`` has one column per digit position.

    Returns:
        Tuple ``(captcha_accuracy, digit_accuracy)`` in percent. A CAPTCHA
        counts as correct only when every digit matches. Both values are NaN
        when the loader yields no samples.
    """
    # Assign GPU or CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()

    predicted_batches = []
    actual_batches = []

    with torch.no_grad():
        for images, labels in test_loader:
            outputs = model(images.to(device))

            # Arg-max per output, stacked to shape (batch, num_digits)
            batch_predictions = torch.stack(
                [output.argmax(dim=1) for output in outputs], dim=1
            )
            predicted_batches.append(batch_predictions.cpu().numpy())
            actual_batches.append(labels.cpu().numpy())

    num_samples = sum(len(batch) for batch in predicted_batches)

    if num_samples == 0:
        # Nothing to evaluate: report NaN rather than dividing by zero
        captcha_accuracy = float('nan')
        digit_accuracy = float('nan')
    else:
        all_predicted = np.concatenate(predicted_batches)
        all_actual = np.concatenate(actual_batches)
        matches = all_predicted == all_actual

        # A CAPTCHA is correct only if all of its digits are correct
        correct_captchas = int(matches.all(axis=1).sum())
        captcha_accuracy = correct_captchas / num_samples * 100

        # Accuracy over every individual digit
        digit_accuracy = matches.sum() / matches.size * 100

    print(f' Model Captcha Accuracy: {captcha_accuracy:.2f}%')
    print(f' Model Digit Accuracy: {digit_accuracy:.2f}%')

    return captcha_accuracy, digit_accuracy

#### Step 9 : Run all the functions and save the model in your Drive.

In [None]:
# ---- Configuration ----

# Set False if no test images in dataset.zip
test_images_in_zip = False

# Number of training epochs and optimizer learning rate
num_epochs = 300
learning_rate = 0.0001

# Drive folder in which the trained model is saved
custom_path = r'/content/drive/MyDrive/CAPTCHA_models'

# ---- Data ----

# Fix the random seed for reproducibility
torch.manual_seed(42)

# Load the Train/Val/Test arrays listed in the labels CSV
X_train, y_train, X_val, y_val, X_test, y_test = load_data(
    '/content/dataset/captcha_data.csv',
    test_images_in_zip,
)

# Wrap the arrays in datasets
train_dataset = CaptchaDataset(X_train, y_train)
val_dataset = CaptchaDataset(X_val, y_val)
test_dataset = CaptchaDataset(X_test, y_test)

# Batch the data; only the training set is shuffled
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# ---- Train and evaluate ----

model = CaptchaCNN()
trained_model = train_model(model, train_loader, val_loader, num_epochs, learning_rate)

# Report CAPTCHA and digit accuracy on the validation set
evaluate_model(trained_model, val_loader)

# ---- Save ----

# Create the Drive folder if needed and store the weights of the final epoch
os.makedirs(custom_path, exist_ok=True)
torch.save(trained_model.state_dict(), os.path.join(custom_path,'captcha_model_best.pth'))
print("Last Model saved successfully!")