In [12]:
# Relative path of the captions file that caption_preprocessing() opens.
CAPTIONS_PATH = "Flickr8k/captions.txt"

In [13]:
def caption_preprocessing(captions_path=None):
    """Load the captions file and group normalised captions by image.

    The first line of the file is treated as a header and skipped. Every other
    non-blank line is split at its first comma into an image name and a
    caption; the caption is lower-cased and stripped of leading/trailing
    spaces and full stops.

    Args:
        captions_path: Path of the captions file. When omitted, the
            module-level ``CAPTIONS_PATH`` is used.

    Returns:
        dict mapping each image name to the list of its cleaned captions, in
        the order they appear in the file.

    Raises:
        ValueError: If a non-blank data line contains no comma.
    """
    if captions_path is None:
        captions_path = CAPTIONS_PATH

    # Explicit encoding so the result does not depend on the platform default.
    # The `with` block closes the file; no separate close() is needed.
    with open(captions_path, "r", encoding="utf-8") as f:
        lines = f.read().split("\n")

    items = {}
    # start=2: the header is line 1, so the first data line is line 2.
    for line_no, line in enumerate(lines[1:], start=2):
        if not line.strip():
            # Blank or whitespace-only line (e.g. the trailing newline).
            continue
        # Split on the first comma only: captions may contain commas.
        img_path, separator, caption = line.partition(",")
        if not separator:
            raise ValueError(
                f"Malformed line {line_no} in {captions_path!r}: "
                f"expected 'image,caption', got {line!r}"
            )
        caption = caption.lower().strip(" .")
        items.setdefault(img_path, []).append(caption)
    return items

In [14]:
# Image name -> list of cleaned caption strings, as returned by caption_preprocessing().
items = caption_preprocessing()

In [2]:
# Flatten the per-image caption lists into one list, keeping image order
# and, within each image, caption order.
all_captions = [
    caption
    for image_captions in items.values()
    for caption in image_captions
]

In [3]:
# Fixed length (in tokens) of every encoded caption.
max_len = 18
# Words seen fewer than min_freq times are left out of the vocabulary.
min_freq = 3

# Count how often each word occurs across all captions.
# split() with no argument splits on any run of whitespace, so doubled spaces
# or an empty caption never yield an empty-string "word" (split(' ') would,
# and that empty string could then enter the vocabulary).
word_freq = {}
for caption in all_captions:
    for word in caption.split():
        word_freq[word] = word_freq.get(word, 0) + 1

# Special tokens occupy the first indices, in exactly this order.
special_tokens = ["<PAD>", "<START>", "<END>", "<UNK>"]
word_to_index = {token: idx for idx, token in enumerate(special_tokens)}
index_to_word = {idx: token for idx, token in enumerate(special_tokens)}

# Regular words get consecutive indices right after the special tokens.
idx = len(special_tokens)
for word, freq in word_freq.items():
    if freq >= min_freq and word not in word_to_index:
        word_to_index[word] = idx
        index_to_word[idx] = word
        idx += 1

In [17]:
def make_sequence(items, vocab=None, max_length=None):
    """Encode every caption as a fixed-length list of vocabulary indices.

    Each sequence is laid out as ``<START>``, the caption's word indices
    (words missing from the vocabulary map to ``<UNK>``), ``<END>``, then
    ``<PAD>`` up to ``max_length``. Captions that are too long are truncated
    so that ``<END>`` is still the final token.

    Args:
        items: dict mapping image name -> list of caption strings.
        vocab: word -> index mapping that contains the four special tokens.
            When omitted, the module-level ``word_to_index`` is used.
        max_length: Length of every returned sequence. When omitted, the
            module-level ``max_len`` is used.

    Returns:
        dict mapping image name -> list of sequences (lists of ints), one per
        caption, each ``max_length`` long.
    """
    if vocab is None:
        vocab = word_to_index
    if max_length is None:
        max_length = max_len

    # Look the special ids up instead of hard-coding 1 / 2 / 0.
    start_id = vocab["<START>"]
    end_id = vocab["<END>"]
    pad_id = vocab["<PAD>"]
    unk_id = vocab["<UNK>"]

    sequences = {}
    for image, captions in items.items():
        encoded = []
        for caption in captions:
            sequence = [start_id]
            # split() ignores repeated whitespace, so no empty-string tokens.
            sequence.extend(vocab.get(word, unk_id) for word in caption.split())
            # Keep at most max_length - 1 tokens so <END> always fits.
            sequence = sequence[:max_length - 1]
            sequence.append(end_id)
            # Right-pad short captions (no-op when already full length).
            sequence.extend([pad_id] * (max_length - len(sequence)))
            encoded.append(sequence)
        sequences[image] = encoded
    return sequences

In [18]:
# Image name -> list of encoded caption sequences, as returned by make_sequence().
all_sequences = make_sequence(items)

In [19]:
import os
import urllib.request
import zipfile
import ssl

# NOTE: TLS certificate verification is deliberately left at Python's default.
# Overriding ssl._create_default_https_context with the unverified variant
# switches certificate checks off for every HTTPS request made by this kernel,
# not only for the download below.


def _remove_quietly(path):
    """Best-effort delete of ``path``; a missing or locked file is ignored."""
    try:
        os.remove(path)
    except OSError:
        # Cleanup only: failing to delete must not mask the real outcome.
        pass


def download_glove_embeddings(embeddings_dir='embeddings'):
    """Download GloVe embeddings automatically.

    If ``glove.6B.300d.txt`` already exists inside ``embeddings_dir`` nothing
    is downloaded. Otherwise the ``glove.6B.zip`` archive is fetched into the
    working directory, that single member is extracted, and the archive is
    deleted again.

    Args:
        embeddings_dir: Directory that holds (or will hold) the extracted
            embeddings file. Created if missing.

    Returns:
        Path of the embeddings file, or ``None`` if downloading or extracting
        failed (the error is printed together with a manual download link).
    """
    os.makedirs(embeddings_dir, exist_ok=True)

    glove_file = os.path.join(embeddings_dir, 'glove.6B.300d.txt')

    if os.path.exists(glove_file):
        print("✅ GloVe embeddings already exist!")
        return glove_file

    print("📥 Downloading GloVe embeddings...")

    # Download zip file
    zip_url = "http://nlp.stanford.edu/data/glove.6B.zip"
    zip_path = "glove.6B.zip"

    extracted = False
    try:
        urllib.request.urlretrieve(zip_url, zip_path)
        print("✅ Download completed!")

        # Extract specific file
        print("📂 Extracting embeddings...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extract('glove.6B.300d.txt', embeddings_dir)
        extracted = True

        print(f"🎉 GloVe embeddings ready: {glove_file}")
        return glove_file

    except Exception as e:
        print(f"❌ Download failed: {e}")
        print("🔗 Manual download: http://nlp.stanford.edu/data/glove.6B.zip")
        return None

    finally:
        # Always drop the archive, whether complete or partial.
        _remove_quietly(zip_path)
        if not extracted:
            # A half-written embeddings file would otherwise be accepted as
            # "already exist" on the next run.
            _remove_quietly(glove_file)

# Fetch the GloVe file if it is not on disk yet.
# glove_path is None when download_glove_embeddings() reports a failure.
glove_path = download_glove_embeddings()

✅ GloVe embeddings already exist!


In [20]:
import numpy as np

# Parse the GloVe text file into a word -> vector lookup.
if glove_path is None:
    # Fail with a clear message instead of the TypeError open(None) would give.
    raise FileNotFoundError(
        "GloVe embeddings are not available: glove_path is None "
        "(the download step did not produce a file)."
    )

embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open(glove_path, encoding="utf-8") as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Skip blank lines rather than failing on values[0].
            continue
        # First field is the word, the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [21]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# vocabulary index is i. Words without a GloVe vector keep an all-zero row.
embedding_dim = 300
vocab_size = len(word_to_index)
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for token, row in word_to_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

print("Embeddings matrix shape:", embedding_matrix.shape)

Embeddings matrix shape: (4160, 300)


In [22]:
import os
from pickle import dump

# Create the output directory if it does not exist yet.
os.makedirs("Processed Data", exist_ok=True)

# Each entry pairs a target pickle file with the object stored in it:
# the embedding matrix, both word mappings, and the encoded captions.
artifacts = (
    ("Processed Data/embedding_matrix.pkl", embedding_matrix),
    ("Processed Data/word_to_index.pkl", word_to_index),
    ("Processed Data/index_to_word.pkl", index_to_word),
    ("Processed Data/encoded_captions.pkl", all_sequences),
)

for pickle_path, payload in artifacts:
    with open(pickle_path, "wb") as f:
        dump(payload, f)
