In [5]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

sample1 = 'We are learning AI'
sample2 = 'AI is a CS topic'
data = [sample1, sample2]

# create a function to yeild list of tokens
tokenizer = get_tokenizer('basic_english')
def yield_tokens(examples):
    for text in examples:
        yield tokenizer(text)

# create vocabulary
vocab_size=8
vocab = build_vocab_from_iterator(yield_tokens(data),
                                  max_tokens=vocab_size,
                                  specials=["<unk>",
                                            "<pad>"])
vocab.set_default_index(vocab["<unk>"])

In [6]:
vocab.get_stoi()

{'<unk>': 0,
 '<pad>': 1,
 'ai': 2,
 'a': 3,
 'is': 6,
 'are': 4,
 'learning': 7,
 'cs': 5}

In [7]:
tokens = tokenizer(sample1)
print(tokens)

sample1_tokens = [vocab[token] for token in tokens]
print(sample1_tokens)

['we', 'are', 'learning', 'ai']
[0, 4, 7, 2]


In [8]:
tokens = tokenizer(sample2)
print(tokens)

sample2_tokens = [vocab[token] for token in tokens]
print(sample2_tokens)

['ai', 'is', 'a', 'cs', 'topic']
[2, 6, 3, 5, 0]


In [9]:
import torch

def vectorize(text, vocab, sequence_length):
    tokens = tokenizer(text)
    tokens = [vocab[token] for token in tokens]

    num_pads = sequence_length - len(tokens) 
    tokens = tokens + [vocab["<pad>"]] * num_pads

    return torch.tensor(tokens, dtype=torch.long)

# Vectorize the samples
sequence_length = 5
vectorized_sample1 = vectorize(sample1,
                               vocab,
                               sequence_length)
vectorized_sample2 = vectorize(sample2,
                               vocab,
                               sequence_length)
print("Vectorized Sample 1: ", vectorized_sample1)
print("Vectorized Sample 2: ", vectorized_sample2)

Vectorized Sample 1:  tensor([0, 4, 7, 2, 1])
Vectorized Sample 2:  tensor([2, 6, 3, 5, 0])


In [12]:
import torch

def vectorize(text, vocab, sequence_length):
    tokens = tokenizer(text)
    tokens = [vocab[token] for token in tokens]

    num_pads = sequence_length - len(tokens) 
    tokens = tokens + [vocab["<pad>"]] * num_pads

    return torch.tensor(tokens, dtype=torch.long)

# Vectorize the samples
sample3 = 'AI topic in CS is difficult'

vectorized_sample3 = vectorize(sample3,
                               vocab,
                               sequence_length)
print("Vectorized Sample 3: ", vectorized_sample3)

Vectorized Sample 3:  tensor([2, 0, 0, 5, 6, 0])


In [15]:
import torch.nn as nn
embed_dim = 4
embedding = nn.Embedding(vocab_size, embed_dim)

embedding

Embedding(8, 4)