*Dataset Preprocessing*

In [18]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
# Fetch the NLTK resources the notebook relies on: the 'punkt' tokenizer data
# (word_tokenize is imported above) and the 'stopwords' corpus.
# NOTE(review): newer NLTK releases reportedly look up 'punkt_tab' rather than
# 'punkt' for word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\klkum\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\klkum\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
file_path = 'data/COCO_overlaping_dataset.txt'

# Every usable line contributes its first three comma-separated fields
# (keywords, descriptions, target_keywords). Lines with fewer than three
# fields are skipped, and anything after the third field is discarded.
# NOTE(review): a plain split(',') also cuts any field that itself contains a
# comma — confirm the file never has such fields.
with open(file_path, 'r') as f:
    field_lists = (line.strip().split(',') for line in f)
    data = [tuple(fields[:3]) for fields in field_lists if len(fields) >= 3]

df = pd.DataFrame(data, columns=['keywords', 'descriptions', 'target_keywords'])
df.head()

Unnamed: 0,keywords,descriptions,target_keywords
0,laptop carton comicbook,a laptop that has stickers on its cover is sit...,{'laptop'}
1,snorkel ski tennis ball,two frames of a woman in the air on a tennis c...,{'tennis'}
2,sorrel hog barrel,a brown horse eating from a hallowed out metal...,{'barrel'}
3,ballplayer baseball footballhelmet,a man throwing a baseball from a mound on a field,{'baseball'}
4,ballplayer baseball football helmet,a boys baseball game with a batter catcher and...,{'baseball'}


In [20]:
# Sanity check: report how many rows were parsed into df.
print(f"Total number of records: {len(df)}")

Total number of records: 70797


In [21]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return the English stopword set, asking NLTK for it only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Clean a description and return it as a space-joined string of tokens.

    Steps, in order:
      1. replace every character that is not an ASCII letter with a space,
      2. lowercase the result,
      3. tokenize with ``word_tokenize``,
      4. drop English stopwords.

    Args:
        text: the raw description string.

    Returns:
        The remaining tokens joined by single spaces (an empty string when
        nothing survives the filtering).
    """
    text = re.sub(r"[^a-zA-Z]", " ", text)
    text = text.lower()
    # The stopword list used to be rebuilt inside the filter, i.e. once per
    # token; a cached set gives the same membership answers with one load and
    # constant-time lookups.
    stop_words = _english_stopwords()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return " ".join(tokens)

# Clean each description to a stopword-free string, then split that string
# back into a list of tokens.
cleaned_descriptions = df['descriptions'].apply(preprocess_text)
df['tokenized_descriptions'] = cleaned_descriptions.apply(word_tokenize)
print("Descriptions cleaned successfully!")
df[['descriptions', 'tokenized_descriptions']].head()

Descriptions cleaned successfully!


Unnamed: 0,descriptions,tokenized_descriptions
0,a laptop that has stickers on its cover is sit...,"[laptop, stickers, cover, sitting, table]"
1,two frames of a woman in the air on a tennis c...,"[two, frames, woman, air, tennis, court]"
2,a brown horse eating from a hallowed out metal...,"[brown, horse, eating, hallowed, metal, barrel]"
3,a man throwing a baseball from a mound on a field,"[man, throwing, baseball, mound, field]"
4,a boys baseball game with a batter catcher and...,"[boys, baseball, game, batter, catcher, umpire]"


In [22]:
# df['primary_keyword'] = df['keywords'].apply(lambda x: x.split()[0] if x else '')
# df['primary_keyword'] = df['primary_keyword'].apply(preprocess_text)

# print('Keywords processed successfully!')
# df[['keywords', 'primary_keyword']].head()

In [23]:
from sklearn.model_selection import train_test_split

# Both splitting stages share the same hold-out fraction and seed.
split_options = {'test_size': 0.2, 'random_state': 42}

# Stage 1 holds out 20% of all rows as the test set; stage 2 holds out 20% of
# the remaining rows as the validation set.
train_val, test = train_test_split(df, **split_options)
train, val = train_test_split(train_val, **split_options)

print(f"Training set size: {len(train)}")
print(f"Validation set size: {len(val)}")
print(f"Test set size: {len(test)}")

Training set size: 45309
Validation set size: 11328
Test set size: 14160


In [24]:
# Tokenize the raw description text of every split.
# NOTE(review): this replaces the 'tokenized_descriptions' column that was
# built from preprocess_text output earlier, so the stopword filtering done
# there is not reflected in these tokens — confirm that is intended.
for split_frame in (train, val, test):
    split_frame['tokenized_descriptions'] = split_frame['descriptions'].apply(word_tokenize)

In [25]:
from collections import Counter

# Flatten the token lists of the training split only; val/test text does not
# contribute to the vocabulary.
all_tokens = []
for description_tokens in train['tokenized_descriptions']:
    all_tokens.extend(description_tokens)

token_freq = Counter(all_tokens)

# Ids start at 1 and follow descending frequency; id 0 is reserved for padding.
ranked_words = [word for word, _count in token_freq.most_common()]
vocab = dict(zip(ranked_words, range(1, len(ranked_words) + 1)))
vocab['<PAD>'] = 0
print(f"Vocabulary size: {len(vocab)}")

Vocabulary size: 9128


In [26]:
def tokens_to_sequence(tokens, vocab, unk_index=None):
    """Map a list of tokens to their integer ids.

    Args:
        tokens: iterable of token strings.
        vocab: mapping from token to integer id.
        unk_index: optional id to emit for tokens missing from ``vocab``.
            With the default ``None`` such tokens are dropped, which shortens
            the sequence and shifts the position of every later token.

    Returns:
        A list of integer ids, in token order.
    """
    if unk_index is None:
        # Historical behavior: out-of-vocabulary tokens are silently skipped.
        return [vocab[token] for token in tokens if token in vocab]
    # Opt-in behavior: keep one id per input token so length is preserved.
    return [vocab.get(token, unk_index) for token in tokens]

# Convert the token lists of every split to id sequences using the shared
# vocabulary.
for split_frame in (train, val, test):
    split_frame['sequences'] = split_frame['tokenized_descriptions'].apply(
        tokens_to_sequence, args=(vocab,)
    )

train[['tokenized_descriptions', 'sequences']].head()

Unnamed: 0,tokenized_descriptions,sequences
55564,"[a, well, cooked, pizza, on, a, plate, not, ye...","[1, 691, 402, 20, 2, 1, 14, 676, 2280, 247, 15..."
19778,"[a, bear, sitting, on, a, bench, in, a, park]","[1, 42, 11, 2, 1, 63, 7, 1, 154]"
15434,"[an, old, model, truck, riding, past, a, park,...","[10, 159, 1241, 62, 128, 531, 1, 154, 136, 4, ..."
61716,"[four, girls, smile, while, posing, with, a, p...","[303, 469, 1448, 70, 340, 4, 1, 20]"
56965,"[work, truck, backed, up, to, the, business, o...","[473, 62, 2281, 37, 9, 5, 737, 315]"


In [27]:
def pad_sequence(sequence, max_length, pad_token=0):
    """
    Return ``sequence`` adjusted towards ``max_length`` items.

    Sequences shorter than ``max_length`` get ``pad_token`` appended on the
    right; all others are cut with ``sequence[:max_length]``.
    """
    missing = max_length - len(sequence)
    if missing <= 0:
        # Already long enough: keep only the leading max_length items.
        return sequence[:max_length]
    return sequence + [pad_token] * missing

# The longest training sequence sets the common length for all splits, so
# longer val/test sequences are truncated to it by pad_sequence.
max_length = max(map(len, train['sequences']))
print(f"Maximum sequence length: {max_length}")

for split_frame in (train, val, test):
    split_frame['padded_sequences'] = split_frame['sequences'].apply(
        pad_sequence, args=(max_length,)
    )

train[['sequences', 'padded_sequences']].head()

Maximum sequence length: 41


Unnamed: 0,sequences,padded_sequences
55564,"[1, 691, 402, 20, 2, 1, 14, 676, 2280, 247, 15...","[1, 691, 402, 20, 2, 1, 14, 676, 2280, 247, 15..."
19778,"[1, 42, 11, 2, 1, 63, 7, 1, 154]","[1, 42, 11, 2, 1, 63, 7, 1, 154, 0, 0, 0, 0, 0..."
15434,"[10, 159, 1241, 62, 128, 531, 1, 154, 136, 4, ...","[10, 159, 1241, 62, 128, 531, 1, 154, 136, 4, ..."
61716,"[303, 469, 1448, 70, 340, 4, 1, 20]","[303, 469, 1448, 70, 340, 4, 1, 20, 0, 0, 0, 0..."
56965,"[473, 62, 2281, 37, 9, 5, 737, 315]","[473, 62, 2281, 37, 9, 5, 737, 315, 0, 0, 0, 0..."


In [None]:
import json

# Write only the padded id sequences of each split, one CSV per split.
# NOTE(review): each cell holds a Python list, so the CSV stores its string
# form — confirm that downstream readers parse it back into a list.
for split_frame, csv_path in (
    (train, 'data/train_padded.csv'),
    (val, 'data/val_padded.csv'),
    (test, 'data/test_padded.csv'),
):
    split_frame[['padded_sequences']].to_csv(csv_path, index=False)

# The vocabulary is written to the working directory, not to data/.
with open('vocabulary.json', 'w') as vocab_file:
    json.dump(vocab, vocab_file)

print("Padded sequences and vocabulary saved successfully!")

Padded sequences and vocabulary saved successfully!
