# 🤗 Multimodal generation - Part 1: sentence dataset 🤗

In [1]:
cd ..

/Users/simonbrandeis/dev/multimodal-code-exercise


In [2]:
import os
import torch
import transformers
from transformers import AutoTokenizer

## Parameters definition

In [3]:
# Where the generated dataset files are written to / read from.
DATA_DIR = "./data/"

# Identifier of the pretrained language model whose tokenizer is used.
PRETRAINED_LM_NAME = "distilbert-base-uncased"

# Upper bound on sentence length, counted in tokens.
SENTENCE_MAX_TOKEN_LEN = 32

# Sentence templates used to build the dataset.
# Every <WORD> placeholder is replaced by the name of an ImageNet class.
PATTERNS = (
    "i saw a <WORD>.",
    "people love <WORD>s!",
    "what do you think of <WORD>s?",
    "a <WORD> in a field.",
    "<WORD>s on a roof.",
    "<WORD>s are great.",
    "i hate <WORD>s.",
    "the whole thing is about <WORD>s",
    "<WORD>s <WORD>s <WORD>s <WORD>s",
    "<WORD>s are more kind of like a philosophical concept.",
)

## Tokenizer instantiation

In [4]:
# Load the tokenizer that goes with the pretrained language model named by
# PRETRAINED_LM_NAME.
lm_tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_LM_NAME)

# Id of the padding token, used later to pad the tokenized sentences.
pad_token_id = lm_tokenizer.pad_token_id

# Not every pretrained tokenizer defines a padding token. Fail here with a
# clear message rather than letting `None` flow into the padding step and the
# tensor conversion further down.
if pad_token_id is None:
    raise ValueError(
        "Tokenizer for {!r} has no padding token; pick a model whose tokenizer "
        "defines one or set `lm_tokenizer.pad_token` explicitly.".format(PRETRAINED_LM_NAME)
    )

## Generation and tokenization of the sentences

In [5]:
# Same steps as the prepare_data.py script, reproduced here.
from text2img.data.generate import generate_dataset, tokenize_dataset

# Build the sentences from PATTERNS (the <WORD> placeholders get filled in).
sentences = generate_dataset(
    patterns=PATTERNS,
    lm_tokenizer=lm_tokenizer,
)

# Tokenize the generated sentences; the call yields the tokenized sentences,
# their labels and the corresponding texts, in that order.
tokenized_sentences, labels, texts = tokenize_dataset(
    dataset=sentences,
    lm_tokenizer=lm_tokenizer,
)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/simonbrandeis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
INFO:text2img.data.generate:Building a list of simple words associated to ImageNet classes
100%|██████████| 1000/1000 [00:00<00:00, 8309.17it/s]
INFO:text2img.data.generate:Building a list of sentences associated to each selected ImageNet class
100%|██████████| 598/598 [00:00<00:00, 223786.03it/s]
100%|██████████| 598/598 [00:01<00:00, 441.47it/s]


## Padding / clipping of the tokenized sentences

In [6]:
from text2img.data.utils import pad_or_clip

# Apply pad_or_clip to every tokenized sentence, passing the target length
# SENTENCE_MAX_TOKEN_LEN and the tokenizer's padding id.
padded_tokens = [
    pad_or_clip(token_ids, SENTENCE_MAX_TOKEN_LEN, pad_token_id)
    for token_ids in tokenized_sentences
]

## Save the dataset to DATA_DIR

In [7]:
# Create the output directory if needed: neither torch.save nor open() creates
# missing parent directories, so a fresh checkout without ./data/ would fail.
os.makedirs(DATA_DIR, exist_ok=True)

# Convert the labels and the padded token ids to tensors before serializing.
labels_tensor = torch.tensor(labels)
tokens_tensor = torch.tensor(padded_tokens)

torch.save(labels_tensor, os.path.join(DATA_DIR, 'labels_tensor.bin'))
torch.save(tokens_tensor, os.path.join(DATA_DIR, 'tokens_tensor.bin'))

# Write the texts one per line. The encoding is pinned so the file does not
# depend on the platform's default locale encoding.
with open(os.path.join(DATA_DIR, 'input_texts.txt'), 'w', encoding='utf-8') as f:
    for item in texts:
        f.write("%s\n" % item)

# Discussion on how to improve the dataset generation

Our dataset of sentences lacks diversity: all samples are generated from the same few patterns. In addition, some sentences produced by this method might not make sense at all. Take for instance the pattern `<WORD>s on a roof` and the word `shark`. One way to remedy both issues is to sample sentences containing the word of interest from the Internet, for instance from... Wait for it... Wikipedia.

As pointed out in the code exercise description, ImageNet classes are very diverse and sometimes very specific. Dropping the classes that are not in the tokenizer vocabulary is a first step, but some "undesired" classes still pass through (e.g. poodle and husky among the dog breeds). We could go further and remove those classes manually.