# EE 467 Lab 2: Breaking CAPTCHAs with PyTorch


In [11]:
# ========================================
# Breaking CAPTCHAs with PyTorch
# ========================================

# 0️⃣ Install required libraries
# Use CPU or GPU version of PyTorch
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install matplotlib scikit-learn "opencv-python>4" imutils

# 0️⃣.1 Extract CAPTCHA images
!tar -xf captcha-images.tar.xz

# ========================================
# 1️⃣ Imports
# ========================================
import os, pickle
import numpy as np
import cv2
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from imutils import paths
from lab_2_helpers import *

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Looking in indexes: https://download.pytorch.org/whl/cpu
Using device: cpu


In [24]:

# ========================================
# 2️⃣ Load CAPTCHA images
# ========================================
CAPTCHA_IMAGE_FOLDER = "./captcha-images"

captcha_image_paths = list(paths.list_images(CAPTCHA_IMAGE_FOLDER))

def extract_captcha_text(path):
    return os.path.splitext(os.path.basename(path))[0]

captcha_texts = [extract_captcha_text(p) for p in captcha_image_paths]

def load_transform_image(path):
    img = cv2.imread(path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    padded = cv2.copyMakeBorder(gray, 8, 8, 8, 8, cv2.BORDER_REPLICATE)
    return padded

captcha_images = [load_transform_image(p) for p in captcha_image_paths]

In [25]:
# ========================================
# 3️⃣ Train / Test split
# ========================================
TVT_SPLIT_SEED = 31528476

imgs_tv, imgs_test, texts_tv, texts_test = train_test_split(
    captcha_images, captcha_texts, test_size=0.2, random_state=TVT_SPLIT_SEED
)

In [26]:

# ========================================
# 4️⃣ Character extraction
# ========================================
def extract_chars(image):
    bw = cv2.threshold(image, 0, 255,
                       cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
    contours = cv2.findContours(
        bw, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )[0]

    regions = []
    for c in contours:
        x, y, w, h = cv2.boundingRect(c)
        if w / h > 1.25:
            hw = w // 2
            regions += [(x, y, hw, h), (x+hw, y, hw, h)]
        else:
            regions.append((x, y, w, h))

    if len(regions) != 4:
        return None

    regions.sort(key=lambda r: r[0])
    chars = [
        image[y-2:y+h+2, x-2:x+w+2]
        for x, y, w, h in regions
    ]
    return chars

CHAR_IMAGE_FOLDER = f"./char-images-{TVT_SPLIT_SEED}"
os.makedirs(CHAR_IMAGE_FOLDER, exist_ok=True)
char_counts = {}

def save_chars(chars, text):
    for img, ch in zip(chars, text):
        folder = os.path.join(CHAR_IMAGE_FOLDER, ch)
        os.makedirs(folder, exist_ok=True)
        count = char_counts.get(ch, 1)
        cv2.imwrite(os.path.join(folder, f"{count}.png"), img)
        char_counts[ch] = count + 1

for img, text in zip(imgs_tv, texts_tv):
    chars = extract_chars(img)
    if chars:
        save_chars(chars, text)

In [27]:
# ========================================
# 5️⃣ Feature and label encoding
# ========================================
def make_feature(img):
    img = resize_to_fit(img, 20, 20)
    return img[..., None]

X, y = [], []
for path in paths.list_images(CHAR_IMAGE_FOLDER):
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE) # Corrected flag here
    X.append(make_feature(img))
    y.append(os.path.basename(os.path.dirname(path)))

X = np.array(X, dtype="float32") / 255.0

lb = LabelBinarizer()
y_onehot = lb.fit_transform(y)
n_classes = len(lb.classes_)

with open("labels.pkl", "wb") as f:
    pickle.dump(lb, f)

X_train, X_val, y_train, y_val = train_test_split(
    X, np.argmax(y_onehot, axis=1), test_size=0.25, random_state=955996
)

In [28]:
# ========================================
# 6️⃣ PyTorch Dataset
# ========================================
class CharDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X).permute(0, 3, 1, 2)
        self.y = torch.tensor(y)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        return self.X[i], self.y[i]

train_loader = DataLoader(CharDataset(X_train, y_train), batch_size=32, shuffle=True)
val_loader = DataLoader(CharDataset(X_val, y_val), batch_size=32)

In [29]:

# ========================================
# 7️⃣ CNN Model
# ========================================
class CaptchaCNN(nn.Module):
    def __init__(self, n_classes):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 20, 5, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(20, 50, 5, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.fc = nn.Sequential(
            nn.Linear(50*5*5, 500),
            nn.ReLU(),
            nn.Linear(500, n_classes)
        )

    def forward(self, x):
        x = self.conv(x)
        x = x.reshape(x.size(0), -1)
        return self.fc(x)

model = CaptchaCNN(n_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [30]:

# ========================================
# 8️⃣ Training
# ========================================
EPOCHS = 10

for epoch in range(EPOCHS):
    model.train()
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()

    model.eval()
    correct = total = 0
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            preds = model(x).argmax(1)
            correct += (preds == y).sum().item()
            total += y.size(0)
    print(f"Epoch {epoch+1}/{EPOCHS} | Val Acc: {correct/total:.4f}")

torch.save(model.state_dict(), "captcha-model-pytorch.pth")


Epoch 1/10 | Val Acc: 0.8709
Epoch 2/10 | Val Acc: 0.9461
Epoch 3/10 | Val Acc: 0.9719
Epoch 4/10 | Val Acc: 0.9809
Epoch 5/10 | Val Acc: 0.9877
Epoch 6/10 | Val Acc: 0.9854
Epoch 7/10 | Val Acc: 0.9843
Epoch 8/10 | Val Acc: 0.9854
Epoch 9/10 | Val Acc: 0.9854
Epoch 10/10 | Val Acc: 0.9854


In [31]:

# ========================================
# 9️⃣ End-to-end evaluation on CAPTCHA images
# ========================================
model.eval()
features_test = []
failed = []

for i, img in enumerate(imgs_test):
    chars = extract_chars(img)
    if chars:
        features_test.extend([make_feature(c) for c in chars])
    else:
        failed.append(i)
        features_test.extend(np.zeros((4,20,20,1)))

X_test = torch.tensor(features_test, dtype=torch.float32).permute(0,3,1,2).to(device)
with torch.no_grad():
    preds = model(X_test).argmax(1).cpu().numpy()

pred_chars = lb.classes_[preds]
pred_texts = ["".join(c) for c in group_every(pred_chars, 4)]

for i in failed:
    pred_texts[i] = "-"

accuracy = sum(p==t for p,t in zip(pred_texts, texts_test)) / len(texts_test)
print("Test Accuracy:", accuracy)


Test Accuracy: 0.9605263157894737


In [32]:
import pickle

with open("labels.pkl", "rb") as f:
    lb_loaded = pickle.load(f)

print("LabelBinarizer classes:", lb_loaded.classes_)

# Also inspect a few predicted vs actual texts to see the mismatch clearly
print("\nSample predicted texts vs actual texts:")
for i in range(min(5, len(pred_texts))):
    print(f"Predicted: {pred_texts[i]}, Actual: {texts_test[i]}")


LabelBinarizer classes: ['2' '3' '4' '5' '6' '7' '8' '9' 'A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'J' 'K'
 'L' 'M' 'N' 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' 'X' 'Y' 'Z']

Sample predicted texts vs actual texts:
Predicted: 2QSL, Actual: 2QSL
Predicted: 2SKC, Actual: 2SKC
Predicted: 8AWH, Actual: 8AWH
Predicted: 4GUC, Actual: 4GUC
Predicted: B5PF, Actual: B5PF
