# Image Captioning - Data Preprocessing

In [8]:
import re
import string
from pathlib import Path
from collections import defaultdict, Counter

In [None]:

# Path to captions
captions_file = Path("../data/Flickr8k_text/Flickr8k.token.txt")

# Load all lines
with open(captions_file, 'r') as f:
    lines = f.readlines()

# Preview a few lines
for line in lines[:5]:
    print(line.strip())

1000268201_693b08cb0e.jpg#0	A child in a pink dress is climbing up a set of stairs in an entry way .
1000268201_693b08cb0e.jpg#1	A girl going into a wooden building .
1000268201_693b08cb0e.jpg#2	A little girl climbing into a wooden playhouse .
1000268201_693b08cb0e.jpg#3	A little girl climbing the stairs to her playhouse .
1000268201_693b08cb0e.jpg#4	A little girl in a pink dress going into a wooden cabin .


In [None]:
# Initialize a dictionary with lists
image_captions = defaultdict(list)

# Parse each line
for line in lines:
    line = line.strip()
    if '\t' in line:
        img_id_with_index, caption = line.split('\t')
        img_id = img_id_with_index.split('#')[0]
        image_captions[img_id].append(caption)

# Preview example
example_key = list(image_captions.keys())[0]
print(f"Image: {example_key}")
print("Captions:")
for cap in image_captions[example_key]:
    print(f"- {cap}")

Image: 1000268201_693b08cb0e.jpg
Captions:
- A child in a pink dress is climbing up a set of stairs in an entry way .
- A girl going into a wooden building .
- A little girl climbing into a wooden playhouse .
- A little girl climbing the stairs to her playhouse .
- A little girl in a pink dress going into a wooden cabin .


In [4]:
def clean_caption(caption):
    # Convert to lowercase
    caption = caption.lower()
    
    # Remove punctuation
    caption = caption.translate(str.maketrans('', '', string.punctuation))
    
    # Remove numbers (optional)
    caption = re.sub(r'\d+', '', caption)
    
    # Remove extra whitespace
    caption = caption.strip()
    
    # Add special tokens
    caption = f"<start> {caption} <end>"
    
    return caption

In [7]:
# Apply cleaning to all captions
for img_id in image_captions:
    cleaned = [clean_caption(c) for c in image_captions[img_id]]
    image_captions[img_id] = cleaned

# Preview cleaned captions
print(f"Cleaned captions for {example_key}:")
for cap in image_captions[example_key]:
    print(f"- {cap}")

Cleaned captions for 1000268201_693b08cb0e.jpg:
- <start> a child in a pink dress is climbing up a set of stairs in an entry way <end>
- <start> a girl going into a wooden building <end>
- <start> a little girl climbing into a wooden playhouse <end>
- <start> a little girl climbing the stairs to her playhouse <end>
- <start> a little girl in a pink dress going into a wooden cabin <end>


In [9]:
# Flatten all captions into a single list of words
all_captions = []
for captions in image_captions.values():
    for cap in captions:
        all_captions.extend(cap.split())

# Count word frequencies
word_freq = Counter(all_captions)

# Minimum word frequency threshold
min_word_freq = 5

# Filter out rare words
words = [w for w in word_freq if word_freq[w] >= min_word_freq]

# Special tokens
special_tokens = ['<pad>', '<start>', '<end>', '<unk>']

# Final vocabulary
vocab = special_tokens + sorted(words)

# Build mappings
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}

print(f"Total words in vocab (after filtering): {len(vocab)}")
print(f"Sample mapping: 'girl' → {word2idx.get('girl')}")

Total words in vocab (after filtering): 2990
Sample mapping: 'girl' → 1055


In [10]:
# Convert a caption into a list of word indices
def caption_to_indices(caption, word2idx):
    return [word2idx.get(word, word2idx['<unk>']) for word in caption.split()]

# Store caption sequences in a new dictionary
image_caption_seqs = {}

for img_id, captions in image_captions.items():
    image_caption_seqs[img_id] = [caption_to_indices(cap, word2idx) for cap in captions]

# Preview example
print(f"Indexed captions for {example_key}:")
for seq in image_caption_seqs[example_key]:
    print(seq)

Indexed captions for 1000268201_693b08cb0e.jpg:
[5, 6, 496, 1278, 6, 1875, 768, 1311, 530, 2821, 6, 2220, 1695, 2468, 1278, 54, 3, 2902, 4]
[5, 6, 1055, 1076, 1309, 6, 2957, 354, 4]
[5, 6, 1479, 1055, 530, 1309, 6, 2957, 1906, 4]
[5, 6, 1479, 1055, 530, 2657, 2468, 2699, 1198, 1906, 4]
[5, 6, 1479, 1055, 1278, 6, 1875, 768, 1076, 1309, 6, 2957, 3, 4]
