# Breaking a CAPTCHA with Deep Learning
This notebook demonstrates how to break simple 5-character CAPTCHA images using a Convolutional Neural Network (CNN).

We:
- Generate synthetic CAPTCHA images
- Train a CNN model to recognize characters
- Predict CAPTCHA text from real or synthetic inputs

---





##Generate CAPTCHA Training and Validation Datasets

In [5]:
import os
import random
import string
from PIL import Image, ImageDraw, ImageFont
import glob


TRAIN_DIR = 'train_captcha_images/'
VAL_DIR = 'val_captcha_images/'
NUM_TRAIN_IMAGES = 40000
NUM_VAL_IMAGES = 10000
CHARS = string.ascii_uppercase + string.digits
IMAGE_SIZE = (100, 40)
FONT_SIZE = 30


os.makedirs(TRAIN_DIR, exist_ok=True)
os.makedirs(VAL_DIR, exist_ok=True)


fonts = []
with open("fonts.txt") as f:
    content = f.readlines()
acceptable_fonts = set(x.strip() for x in content)


for filename in glob.iglob('/usr/share/fonts/truetype/**/*.ttf', recursive=True):
    if filename in acceptable_fonts:
        try:
            fonts.append(ImageFont.truetype(filename, FONT_SIZE))
        except:
            continue

print(f"Loaded {len(fonts)} fonts.")


def generate_images(output_dir, num_images):
    words = set()
    while len(words) < num_images:
        word = ''.join(random.choices(CHARS, k=5))
        if word in words:
            continue
        words.add(word)
        img = Image.new("L", IMAGE_SIZE, color=255)
        draw = ImageDraw.Draw(img)
        font = random.choice(fonts)
        draw.text((10, 5), word, font=font, fill=0)
        img.save(os.path.join(output_dir, f"{word}.png"))
    print(f"Generated {num_images} images in {output_dir}")


generate_images(TRAIN_DIR, NUM_TRAIN_IMAGES)
generate_images(VAL_DIR, NUM_VAL_IMAGES)


Loaded 16 fonts.
Generated 40000 images in train_captcha_images/
Generated 10000 images in val_captcha_images/


### Load CAPTCHA Images and Encode Labels

In [6]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from PIL import Image
import os
import string

IMG_WIDTH, IMG_HEIGHT = 100, 40
NUM_CLASSES = 36
CAPTCHA_LENGTH = 5
CHARS = string.digits + string.ascii_uppercase
char_to_num = {c: i for i, c in enumerate(CHARS)}
num_to_char = {i: c for i, c in enumerate(CHARS)}

def load_data(data_dir):
    images = []
    labels = []
    for filename in os.listdir(data_dir):
        if filename.endswith('.png'):
            img_path = os.path.join(data_dir, filename)
            img = Image.open(img_path).convert('L').resize((IMG_WIDTH, IMG_HEIGHT))
            img = np.array(img) / 255.0
            images.append(img)

            label_str = filename[:-4]
            label_encoded = [char_to_num[c] for c in label_str]
            labels.append(label_encoded)
    images = np.array(images).reshape(-1, IMG_HEIGHT, IMG_WIDTH, 1)
    labels = np.array(labels)
    return images, labels


### Load and Inspect Training & Validation Data

In [7]:
print("Loading training data...")
X_train, y_train = load_data(TRAIN_DIR)
print(f"Training data shape: {X_train.shape}, Labels shape: {y_train.shape}")

print("Loading validation data...")
X_val, y_val = load_data(VAL_DIR)
print(f"Validation data shape: {X_val.shape}, Labels shape: {y_val.shape}")


Loading training data...
Training data shape: (40000, 40, 100, 1), Labels shape: (40000, 5)
Loading validation data...
Validation data shape: (10000, 40, 100, 1), Labels shape: (10000, 5)


### One-Hot Encode CAPTCHA Labels

In [8]:
def one_hot_encode_labels(labels):
    encoded = np.zeros((labels.shape[0], CAPTCHA_LENGTH, NUM_CLASSES))
    for i in range(labels.shape[0]):
        for j in range(CAPTCHA_LENGTH):
            encoded[i, j, labels[i, j]] = 1
    return encoded

y_train_ohe = one_hot_encode_labels(y_train)
y_val_ohe = one_hot_encode_labels(y_val)

print("One-hot encoded labels shape:", y_train_ohe.shape)


One-hot encoded labels shape: (40000, 5, 36)


### Build CNN Model for Multi-Character CAPTCHA Recognition

In [9]:
def build_model():
    input_layer = layers.Input(shape=(IMG_HEIGHT, IMG_WIDTH, 1))

    x = layers.Conv2D(32, (3,3), activation='relu', padding='same')(input_layer)
    x = layers.MaxPooling2D((2,2))(x)

    x = layers.Conv2D(64, (3,3), activation='relu', padding='same')(x)
    x = layers.MaxPooling2D((2,2))(x)

    x = layers.Conv2D(128, (3,3), activation='relu', padding='same')(x)
    x = layers.MaxPooling2D((2,2))(x)

    x = layers.Flatten()(x)
    x = layers.Dropout(0.5)(x)

    output_layers = []
    for i in range(CAPTCHA_LENGTH):
        output_layers.append(layers.Dense(NUM_CLASSES, activation='softmax', name=f'char_{i}')(x))

    model = models.Model(inputs=input_layer, outputs=output_layers)
    return model

model = build_model()
model.summary()


### Compile the Model and Prepare Multi-Output Labels

In [10]:
model.compile(
    optimizer='adam',
    loss=['categorical_crossentropy'] * CAPTCHA_LENGTH,
    metrics=['accuracy'] * CAPTCHA_LENGTH
)


y_train_list = [y_train_ohe[:, i, :] for i in range(CAPTCHA_LENGTH)]
y_val_list = [y_val_ohe[:, i, :] for i in range(CAPTCHA_LENGTH)]


### Train the CAPTCHA Recognition Model

In [12]:
history = model.fit(
    X_train, y_train_list,
    validation_data=(X_val, y_val_list),
    batch_size=64,
    epochs=10
)


Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 16ms/step - char_0_accuracy: 0.9999 - char_0_loss: 0.0015 - char_1_accuracy: 0.9957 - char_1_loss: 0.0173 - char_2_accuracy: 0.9678 - char_2_loss: 0.1063 - char_3_accuracy: 0.9397 - char_3_loss: 0.2094 - char_4_accuracy: 0.8946 - char_4_loss: 0.4049 - loss: 0.7395 - val_char_0_accuracy: 1.0000 - val_char_0_loss: 1.2998e-04 - val_char_1_accuracy: 0.9967 - val_char_1_loss: 0.0127 - val_char_2_accuracy: 0.9634 - val_char_2_loss: 0.1472 - val_char_3_accuracy: 0.9416 - val_char_3_loss: 0.2200 - val_char_4_accuracy: 0.8970 - val_char_4_loss: 0.4551 - val_loss: 0.8331
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - char_0_accuracy: 0.9998 - char_0_loss: 0.0014 - char_1_accuracy: 0.9962 - char_1_loss: 0.0161 - char_2_accuracy: 0.9700 - char_2_loss: 0.1063 - char_3_accuracy: 0.9421 - char_3_loss: 0.1949 - char_4_accuracy: 0.8977 - char_4_loss: 0.3876 - loss: 0.7063 - val_char_0

### Save the model

In [14]:
model.save('captcha_model.h5')




## Testing

In [22]:
import numpy as np

def decode_predictions(preds):
    """
    preds: list of 5 arrays, each shape (num_samples, num_classes)
    Returns: list of decoded strings
    """
    decoded = []
    for i in range(len(preds[0])):
        word = ''
        for j in range(CAPTCHA_LENGTH):
            pred_char = np.argmax(preds[j][i])
            word += num_to_char[pred_char]
        decoded.append(word)
    return decoded


In [23]:
def predict_and_show(model, images, true_labels=None, num_to_show=10):
    preds = model.predict(images)
    decoded_preds = decode_predictions(preds)

    for i in range(min(num_to_show, len(decoded_preds))):
        print(f"Prediction: {decoded_preds[i]}", end='')
        if true_labels is not None:
            true_word = ''.join([num_to_char[c] for c in true_labels[i]])
            print(f" - True: {true_word}")
        else:
            print()


In [24]:
predict_and_show(model, X_val, y_val, num_to_show=20)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
Prediction: X6HY8 - True: X6HY8
Prediction: ST3TH - True: ST3TH
Prediction: P10XT - True: P10XT
Prediction: XMN5B - True: XMN5B
Prediction: E7Y61 - True: E7Y61
Prediction: W55VI - True: W5YVM
Prediction: IYIUI - True: IY3IU
Prediction: ZWFYU - True: ZWFYU
Prediction: Y1RZZ - True: Y1RZZ
Prediction: 28VDF - True: 28VDF
Prediction: XMXQG - True: XMXQG
Prediction: UJQIQ - True: UJQIQ
Prediction: 28KV0 - True: 28KV0
Prediction: ZNS5Z - True: ZNS5Z
Prediction: 10TZ8 - True: 10TZ8
Prediction: WUPP8 - True: WUPY8
Prediction: L8ZH5 - True: L8ZH5
Prediction: HHLJB - True: HHLJB
Prediction: 7WEI6 - True: 7WEI6
Prediction: 8ZC6D - True: 8ZC6D


In [25]:
from PIL import Image

def generate_single_captcha_image(word, font):
    img = Image.new("L", IMAGE_SIZE, color=255)
    draw = ImageDraw.Draw(img)
    draw.text((10, 5), word, font=font, fill=0)
    img = img.resize((IMG_WIDTH, IMG_HEIGHT))
    img = np.array(img) / 255.0
    img = img.reshape(1, IMG_HEIGHT, IMG_WIDTH, 1)
    return img


import random

random_word = ''.join(random.choices(CHARS, k=5))
random_font = random.choice(fonts)
print("Generated word:", random_word)

test_img = generate_single_captcha_image(random_word, random_font)
predict_and_show(model, test_img, true_labels=[ [char_to_num[c] for c in random_word] ], num_to_show=1)


Generated word: IOQE7
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 210ms/step
Prediction: IOQE7 - True: IOQE7
