In [None]:
#Embedding queries and captions:

#Process captions/queries by lowercasing the text, removing punctuation, and tokenizing words based on white space. Refer to the “bag of words” exercise notebook for efficient code for stripping punctuation out of a string.

#Take our vocabulary to be all words across all captions in the COCO dataset. Treating each caption as its own “document”, compute the inverse document frequency for each word in the vocabulary. Efficiency is important here!

#Make a function that can embed any caption / query text (using GloVe-200 embeddings weighted by IDFs of words across captions)

#An individual word not in the GloVe or IDF vocabulary should yield an embedding vector of just zeros.

In [15]:
from cogworks_data.language import get_data_path
import numpy as np
from pathlib import Path
import json
from collections import Counter

# Read the COCO caption annotations (a single JSON document) into memory.
filename = get_data_path("captions_train2014.json")
coco_data = json.loads(Path(filename).read_text())

In [10]:
import re, string
# Character class matching any single ASCII punctuation mark; used to strip
# punctuation from captions and queries before tokenizing.
punc_regex = re.compile("[" + re.escape(string.punctuation) + "]")
from gensim.models import KeyedVectors
# GloVe 200-dimensional word vectors, stored in word2vec text format.
filename = "glove.6B.200d.txt.w2v"

# NOTE: loading these vectors is slow -- keep this in mind when designing
# your capstone project.
glove = KeyedVectors.load_word2vec_format(
    get_data_path(filename),
    binary=False,
)
    


Downloading file 'glove.6B.200d.txt.w2v' from 'https://github.com/rsokl/cog_data/releases/download/language-files/glove.6B.200d.txt.w2v' to '/Users/andrew/Library/Caches/cog_data'.


In [2]:
def preprocess_captions(captions):
    """Compute inverse document frequencies (IDFs) over a set of captions.

    Each caption is treated as its own document. A caption is cleaned by
    stripping punctuation (via the module-level ``punc_regex``), lowercasing,
    and splitting on whitespace.

    Parameters
    ----------
    captions : Sequence[str]
        The caption strings. ``len(captions)`` is used as the document count.

    Returns
    -------
    Dict[str, float]
        Maps every word seen in any caption to ``log10(N / n_t)``, where ``N``
        is the number of captions and ``n_t`` is the number of captions that
        contain the word. An empty input yields an empty dict.
    """
    doc_counts = Counter()
    for caption in captions:
        tokens = punc_regex.sub('', caption).lower().split()
        # A set counts each word at most once per caption: IDF needs the
        # number of documents containing the word, not its total occurrences.
        doc_counts.update(set(tokens))

    num_docs = len(captions)
    return {word: np.log10(num_docs / count) for word, count in doc_counts.items()}


In [None]:
# Pull the raw caption string out of every COCO annotation record, then
# derive the per-word IDF weights from the full caption collection.
all_captions = [annotation["caption"] for annotation in coco_data["annotations"]]
idf_weights = preprocess_captions(all_captions)

In [72]:
def embed_caption(caption, dim=200):
    """Embed a caption or query as a unit-length, IDF-weighted sum of GloVe vectors.

    The text is cleaned the same way the IDF vocabulary was built: punctuation
    is stripped with the module-level ``punc_regex``, the text is lowercased,
    and it is split on whitespace. Each token contributes
    ``idf_weights[token] * glove[token]``; a token missing from either the
    GloVe or the IDF vocabulary contributes a zero vector.

    Parameters
    ----------
    caption : str
        The caption or query text to embed.
    dim : int, optional
        Dimensionality of the embedding; must match the vectors in ``glove``.

    Returns
    -------
    numpy.ndarray, shape-(dim,)
        The normalized embedding. If the text has no tokens, or no token
        contributes a non-zero vector, an all-zeros vector is returned.
    """
    tokens = punc_regex.sub('', caption).lower().split()

    caption_embedding = np.zeros(dim)
    for word in tokens:
        idf = idf_weights.get(word)
        # A word must be known to BOTH vocabularies to contribute; otherwise
        # it counts as a zero vector (i.e. it is simply skipped).
        if idf is None or word not in glove:
            continue
        caption_embedding += idf * glove[word]

    norm = np.linalg.norm(caption_embedding)
    if norm == 0:
        # Nothing contributed: dividing by zero would produce NaNs.
        return caption_embedding
    return caption_embedding / norm

# One embedding per COCO caption, in the same order as `all_captions`.
all_embeddings = list(map(embed_caption, all_captions))

In [1]:
#Make a function that can embed any caption / query text (using GloVe-200 embeddings weighted by IDFs of words across captions)

    
