In [1]:
# Relative path of the captions file that caption_preprocessing() opens and parses.
CAPTIONS_PATH = "Flickr8k/captions.txt"

In [2]:
def caption_preprocessing(captions_path=None):
    """Read the captions file and group cleaned captions by image name.

    The first line of the file is treated as a header and skipped. Every other
    non-blank line must look like ``<image name>,<caption>``; only the first
    comma separates the two fields, so a caption may itself contain commas.
    Each caption is lower-cased and has leading/trailing spaces and periods
    removed.

    Args:
        captions_path: Path of the captions file. When omitted, the
            module-level ``CAPTIONS_PATH`` is used.

    Returns:
        dict mapping image name -> list of cleaned captions, in file order.

    Raises:
        ValueError: If a non-blank line contains no comma.
    """
    if captions_path is None:
        captions_path = CAPTIONS_PATH

    items = {}
    # An explicit encoding keeps the result independent of the platform default.
    with open(captions_path, "r", encoding="utf-8") as f:
        next(f, None)  # Skip the first line which is a header
        for line_number, line in enumerate(f, start=2):
            line = line.rstrip("\n")
            if not line.strip():
                # Blank (or whitespace-only) lines carry no record.
                continue
            img_path, separator, caption = line.partition(',')
            if not separator:
                raise ValueError(
                    f"{captions_path}:{line_number}: expected '<image>,<caption>', got {line!r}"
                )
            caption = caption.lower().strip(" .")
            items.setdefault(img_path, []).append(caption)
    return items

In [3]:
# Parse the captions file into a dict: image name -> list of cleaned captions.
items = caption_preprocessing()

In [4]:
# Flatten the per-image caption lists into a single list, preserving the
# image order of `items` and the caption order within each image.
all_captions = [
    caption
    for image_captions in items.values()
    for caption in image_captions
]

In [5]:
# Pass 1: count how often each space-separated token occurs and record the
# token count of the longest caption.
word_freq = {}
max_len = 0
for caption in all_captions:
    tokens = caption.split(' ')
    if len(tokens) > max_len:
        max_len = len(tokens)
    for word in tokens:
        if word in word_freq:
            word_freq[word] += 1
        else:
            word_freq[word] = 1

# Pass 2: build the two lookup tables. Special tokens take the first ids, in
# list order; regular words follow in first-seen order.
min_freq = 5
special_tokens = ["<PAD>", "<START>", "<END>", "<UNK>"]
word_to_index = {}
index_to_word = {}
for idx, token in enumerate(special_tokens):
    word_to_index[token] = idx
    index_to_word[idx] = token

idx = len(special_tokens)  # next free id after the special tokens
for word, freq in word_freq.items():
    # Words seen fewer than min_freq times are left out of the vocabulary.
    if freq < min_freq or word in special_tokens:
        continue
    word_to_index[word] = idx
    index_to_word[idx] = word
    idx += 1

In [6]:
def make_sequence(items, vocab=None, max_length=None):
    """Encode every caption as a fixed-length list of token ids.

    Each caption is split on single spaces, prefixed with the ``<START>`` id,
    terminated with the ``<END>`` id and right-padded with the ``<PAD>`` id so
    that every sequence has exactly ``max_length`` entries. Words missing from
    the vocabulary are replaced by the ``<UNK>`` id.

    Because ``<START>`` and ``<END>`` occupy two of the ``max_length`` slots, a
    caption with ``max_length - 1`` or more words loses its trailing words.

    Args:
        items: dict mapping image name -> list of caption strings.
        vocab: dict mapping word -> token id. Defaults to the module-level
            ``word_to_index``.
        max_length: Length of every produced sequence. Defaults to the
            module-level ``max_len``.

    Returns:
        dict mapping image name -> list of token-id lists, one per caption.

    Raises:
        KeyError: If an out-of-vocabulary word is met and ``vocab`` has no
            ``"<UNK>"`` entry.
    """
    if vocab is None:
        vocab = word_to_index
    if max_length is None:
        max_length = max_len

    # Look the special ids up instead of hard-coding them; the fallbacks are
    # the ids the special tokens get from their position in the token list.
    pad_id = vocab.get("<PAD>", 0)
    start_id = vocab.get("<START>", 1)
    end_id = vocab.get("<END>", 2)

    sequences = {}
    for image, captions in items.items():
        encoded = []
        for caption in captions:
            sequence = [start_id]
            for word in caption.split(' '):
                sequence.append(vocab[word] if word in vocab else vocab["<UNK>"])
            # Reserve the last slot for <END>; a no-op for short captions.
            sequence = sequence[:max_length - 1]
            sequence.append(end_id)
            # A non-positive repeat count yields an empty list, so sequences
            # that are already full are left untouched.
            sequence.extend([pad_id] * (max_length - len(sequence)))
            encoded.append(sequence)
        sequences[image] = encoded
    return sequences

In [7]:
# Encode all captions: image name -> list of fixed-length token-id sequences.
all_sequences = make_sequence(items)

In [8]:
import os
import urllib.request
import zipfile
import ssl

# NOTE(review): this swaps the factory for the default HTTPS context with the
# unverified one, so standard-library HTTPS clients that rely on the default
# context stop verifying TLS certificates for the rest of the process.
# Prefer fixing the local CA bundle and removing this line.
ssl._create_default_https_context = ssl._create_unverified_context

def download_glove_embeddings(embeddings_dir='embeddings'):
    """Download GloVe embeddings automatically.

    Fetches the ``glove.6B.zip`` archive and extracts only
    ``glove.6B.300d.txt`` from it. Nothing is downloaded when that file is
    already present in ``embeddings_dir``.

    Args:
        embeddings_dir: Directory that holds (or will hold) the extracted
            embeddings file. Created if missing.

    Returns:
        Path of ``glove.6B.300d.txt`` on success, or ``None`` if the download
        or extraction failed (the error is printed, not raised).
    """
    import tempfile  # only this function needs it

    os.makedirs(embeddings_dir, exist_ok=True)

    glove_file = os.path.join(embeddings_dir, 'glove.6B.300d.txt')

    if os.path.exists(glove_file):
        print("✅ GloVe embeddings already exist!")
        return glove_file

    print("📥 Downloading GloVe embeddings...")

    zip_url = "http://nlp.stanford.edu/data/glove.6B.zip"
    zip_path = None
    succeeded = False

    try:
        # Download into a unique temporary file so that no existing file in
        # the working directory is overwritten or deleted.
        fd, zip_path = tempfile.mkstemp(suffix=".zip", dir=embeddings_dir)
        os.close(fd)

        urllib.request.urlretrieve(zip_url, zip_path)
        print("✅ Download completed!")

        # Extract specific file
        print("📂 Extracting embeddings...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extract('glove.6B.300d.txt', embeddings_dir)

        succeeded = True
        print(f"🎉 GloVe embeddings ready: {glove_file}")
        return glove_file

    except Exception as e:
        print(f"❌ Download failed: {e}")
        print("🔗 Manual download: http://nlp.stanford.edu/data/glove.6B.zip")
        return None

    finally:
        # Always drop the archive. After a failure also drop a partially
        # extracted embeddings file: it did not exist before this call, and a
        # truncated copy would be mistaken for a complete one on the next run.
        stale_paths = [zip_path] if succeeded else [zip_path, glove_file]
        for stale_path in stale_paths:
            if stale_path is None:
                continue
            try:
                os.remove(stale_path)
            except OSError:
                # Best-effort cleanup; a missing file is the common case.
                pass

# Usage: glove_path is the local path of glove.6B.300d.txt, or None when the
# download/extraction failed.
glove_path = download_glove_embeddings()

✅ GloVe embeddings already exist!


In [9]:
import numpy as np

# download_glove_embeddings() returns None on failure; stop here with a clear
# message instead of failing inside open().
if glove_path is None:
    raise RuntimeError("GloVe embeddings are unavailable: glove_path is None")

# Map each word in the GloVe file to its vector (float32 numpy array).
embeddings_index = {}
# `with` closes the file even when a line fails to parse.
with open(glove_path, encoding="utf-8") as file:
    for line in file:
        values = line.split()
        if not values:
            continue  # tolerate blank lines
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [10]:
# One row per vocabulary entry; rows start as zeros and are overwritten only
# for words that have a GloVe vector.
embedding_dim = 300
embedding_matrix = np.zeros((len(word_to_index), embedding_dim))

for word, idx in word_to_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: keep the all-zero row.
        continue
    embedding_matrix[idx] = embedding_vector

print("Embeddings matrix shape:", embedding_matrix.shape)

Embeddings matrix shape: (3040, 300)


In [11]:
import os
from pickle import dump

# Create the output directory if it does not exist yet
os.makedirs("Processed Data", exist_ok=True)

# Persist the embedding matrix and both word mappings, one pickle file each
for pkl_path, payload in (
    ("Processed Data/embedding_matrix.pkl", embedding_matrix),
    ("Processed Data/word_to_index.pkl", word_to_index),
    ("Processed Data/index_to_word.pkl", index_to_word),
):
    with open(pkl_path, "wb") as f:
        dump(payload, f)


In [15]:
from PIL import Image

# images:   image name -> PIL image read from Flickr8k/Images
# captions: image name -> list of encoded caption sequences
images = {}
captions = {}

for image_path in all_sequences:
    # Image.open is lazy and keeps the file open until the pixels are read.
    # Load them now and let the context manager release the handle, so one
    # file descriptor is not held per image for the rest of the session.
    with Image.open(os.path.join("Flickr8k/Images", image_path)) as img:
        img.load()
    images[image_path] = img
    # Keys of all_sequences are unique, so each image is assigned exactly once;
    # no try/except fallback is needed.
    captions[image_path] = all_sequences[image_path]

len(images), len(captions)


(8091, 8091)

In [None]:
# Persist the encoded captions first, then the image dictionary.
for pkl_path, payload in (
    ("Processed Data/encoded_captions.pkl", captions),
    ("Processed Data/images.pkl", images),
):
    with open(pkl_path, "wb") as f:
        dump(payload, f)

In [None]:
import torch
from torchvision import models, transforms
from PIL import Image
import pickle
import os

# Load ResNet50 with ImageNet weights and drop the final fully connected layer.
# `pretrained=True` is deprecated in newer torchvision releases; the weights
# enum below selects the same checkpoint. Fall back for older installs.
if hasattr(models, "ResNet50_Weights"):
    resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
else:
    resnet = models.resnet50(pretrained=True)
modules = list(resnet.children())[:-1]  # drop fc
resnet = torch.nn.Sequential(*modules)
resnet.eval()

# Image transform
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load images.pkl
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# this notebook wrote itself.
with open('Processed Data/images.pkl', 'rb') as f:
    images = pickle.load(f)

features = {}
# Inference only: disable autograd once for the whole loop.
with torch.no_grad():
    for img_name, img in images.items():
        # Normalize uses 3-channel statistics, so force RGB first; grayscale,
        # palette or RGBA images would otherwise fail.
        batch = transform(img.convert("RGB")).unsqueeze(0)  # (1, 3, 224, 224)
        feat = resnet(batch).squeeze().numpy()  # (2048,)
        features[img_name] = feat

# Save the features
with open('Processed Data/image_features_resnet50.pkl', 'wb') as f:
    pickle.dump(features, f)