In [None]:
# Uncomment to install gensim into the running kernel's environment:
# %pip install gensim

In [1]:
import gensim.downloader as api
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader API.
pretrained_name = 'word2vec-google-news-300'
model = api.load(pretrained_name)

In [2]:
# The loaded model is used directly as the word -> vector lookup table.
word_vectors = model

# Example: accessing the vector for the word 'computer'
computer_vector = word_vectors['computer']
print(computer_vector)

[ 1.07421875e-01 -2.01171875e-01  1.23046875e-01  2.11914062e-01
 -9.13085938e-02  2.16796875e-01 -1.31835938e-01  8.30078125e-02
  2.02148438e-01  4.78515625e-02  3.66210938e-02 -2.45361328e-02
  2.39257812e-02 -1.60156250e-01 -2.61230469e-02  9.71679688e-02
 -6.34765625e-02  1.84570312e-01  1.70898438e-01 -1.63085938e-01
 -1.09375000e-01  1.49414062e-01 -4.65393066e-04  9.61914062e-02
  1.68945312e-01  2.60925293e-03  8.93554688e-02  6.49414062e-02
  3.56445312e-02 -6.93359375e-02 -1.46484375e-01 -1.21093750e-01
 -2.27539062e-01  2.45361328e-02 -1.24511719e-01 -3.18359375e-01
 -2.20703125e-01  1.30859375e-01  3.66210938e-02 -3.63769531e-02
 -1.13281250e-01  1.95312500e-01  9.76562500e-02  1.26953125e-01
  6.59179688e-02  6.93359375e-02  1.02539062e-02  1.75781250e-01
 -1.68945312e-01  1.21307373e-03 -2.98828125e-01 -1.15234375e-01
  5.66406250e-02 -1.77734375e-01 -2.08984375e-01  1.76757812e-01
  2.38037109e-02 -2.57812500e-01 -4.46777344e-02  1.88476562e-01
  5.51757812e-02  5.02929

In [3]:
# Inspect the dimensionality of a single word vector.
cat_vector = word_vectors['cat']
print(cat_vector.shape)

(300,)


#### Similar Words

King + Woman - Man = ?

In [4]:
# Vector arithmetic with most_similar: king + woman - man,
# then list the words closest to the resulting vector.
analogy_matches = word_vectors.most_similar(
    positive=['king', 'woman'],
    negative=['man'],
)
print(analogy_matches)

[('queen', 0.7118192911148071), ('monarch', 0.6189674735069275), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321243286133), ('kings', 0.5236844420433044), ('Queen_Consort', 0.5235945582389832), ('queens', 0.5181134343147278), ('sultan', 0.5098593235015869), ('monarchy', 0.5087411403656006)]


In [5]:
# Similarity score for several word pairs; the last pair is unrelated on
# purpose, as a contrast to the related pairs before it.
word_pairs = [
    ('woman', 'man'),
    ('king', 'queen'),
    ('uncle', 'aunt'),
    ('boy', 'girl'),
    ('nephew', 'niece'),
    ('paper', 'water'),
]
for first_word, second_word in word_pairs:
    print(word_vectors.similarity(first_word, second_word))

0.76640123
0.6510956
0.7643474
0.8543272
0.7594367
0.11408084


#### Most similar words

In [6]:
# Five nearest neighbours of 'tower' in the embedding space.
tower_neighbors = word_vectors.most_similar('tower', topn=5)
print(tower_neighbors)

[('towers', 0.8531749844551086), ('skyscraper', 0.6417425870895386), ('Tower', 0.639177143573761), ('spire', 0.5946877598762512), ('responded_Understood_Atlasjet', 0.5931612849235535)]


### Word Embeddings

In [7]:
import torch
import torch.nn as nn

`torch.nn.Embedding()` -> The embedding layer is essentially a look-up operation that retrieves rows from the layer's weight matrix via a token ID.

In [9]:
# Fix the RNG so the randomly initialised weights are reproducible.
torch.manual_seed(123)

# Toy setup: 6 token IDs, each mapped to a 3-dimensional vector.
vocab_size, output_dim = 6, 3
embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=output_dim)

In [10]:
# The weight matrix holds one row per token ID (vocab_size x output_dim).
embedding_weights = embedding_layer.weight
print(embedding_weights)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [11]:
# Looking up token ID 3 returns row 3 of the weight matrix.
single_token_id = torch.tensor([3])
print(embedding_layer(single_token_id))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [12]:
# Suppose the words of a sequence have already been converted to token IDs.
# Passing the whole ID tensor to the layer returns one embedding row per ID,
# in the same order as the IDs.
input_ids = torch.tensor([2, 3, 5, 1])
sequence_embeddings = embedding_layer(input_ids)
print(sequence_embeddings)


tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


### Positional Embeddings (Encoding Word Positions)

In [4]:
import torch
import torch.nn as nn
import tiktoken

from torch.utils.data import Dataset, DataLoader

In [8]:
# Load the whole training text into memory as a single string.
with open('the-verdict.txt', mode='r', encoding='utf-8') as text_file:
    raw_text = text_file.read()

num_characters = len(raw_text)
print('Total number of characters: ', num_characters)

Total number of characters:  20479


#### Dataset & Dataloader

In [5]:
class GPTDatasetV1(Dataset):
    """Sliding-window next-token dataset built from a single text.

    The text is tokenized once, then cut into windows of ``max_length``
    token IDs whose start offsets advance by ``stride``. Each sample pairs
    an input window with the same window shifted one token to the right.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        """Tokenize ``text`` and precompute every (input, target) window.

        Args:
            text: Raw string to tokenize.
            tokenizer: Object whose ``encode(text, allowed_special=...)``
                returns a sequence of token IDs.
            max_length: Number of token IDs per window.
            stride: Offset between the starts of consecutive windows.
        """
        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        # Stop early enough that the shifted target window stays full length.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of precomputed windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]

In [6]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                        stride=128, shuffle=True, drop_last=True,
                        num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from text.

    Args:
        txt: Raw text to tokenize with tiktoken's 'gpt2' encoding.
        batch_size: Number of windows per batch.
        max_length: Number of token IDs per window.
        stride: Offset between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    gpt2_tokenizer = tiktoken.get_encoding('gpt2')
    windowed_dataset = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    loader_options = {
        'batch_size': batch_size,
        'shuffle': shuffle,
        'drop_last': drop_last,
        'num_workers': num_workers,
    }
    return DataLoader(windowed_dataset, **loader_options)

#### Creating positional embeddings (workflow)

In [2]:
# Token-embedding table for the real tokenizer: one 256-dimensional row per
# token ID, sized for the GPT-2 BPE vocabulary (50257 entries).
# NOTE: this rebinds vocab_size / output_dim from the toy example above.
vocab_size = 50257
output_dim = 256

# Seed before creating the layer so the randomly initialised weights are the
# same on every Restart & Run All, matching the toy embedding example.
torch.manual_seed(123)
token_embedding_layer = nn.Embedding(vocab_size, output_dim)

In [9]:
# Windows of 4 tokens with stride == max_length (no overlap between inputs),
# grouped 8 per batch and kept in text order (shuffle=False).
max_length = 4
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)

# Pull the first batch of (inputs, targets).
data_iter = iter(dataloader)
first_batch = next(data_iter)
inputs, targets = first_batch

In [16]:
# Show the first batch: token IDs and shapes of the inputs and of the
# targets (the inputs shifted by one token).
input_shape, target_shape = inputs.shape, targets.shape

print('Input Tensor: \n', inputs)
print('Input Shape: ', input_shape)

print('\nTarget Tensor: \n', targets)
print('Target Shape: ', target_shape)

Input Tensor: 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Input Shape:  torch.Size([8, 4])

Target Tensor: 
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])
Target Shape:  torch.Size([8, 4])


In [18]:
# Look up an embedding vector for every token ID in the batch.
token_embeddings = token_embedding_layer(inputs)

token_embedding_shape = token_embeddings.shape
print('Token Embedding Shape: ', token_embedding_shape)

Token Embedding Shape:  torch.Size([8, 4, 256])


In [19]:
# Positional embedding table: one learnable row per position in the window.
# The sequence length is 4, so the table needs exactly 4 positions.
context_length = max_length
pos_embedding_layer = nn.Embedding(
    num_embeddings=context_length,
    embedding_dim=output_dim,
)

In [26]:
# With absolute positional encoding, the positions themselves
# (0 .. max_length - 1) are passed as a vector of IDs to pos_embedding_layer.
position_ids = torch.arange(max_length)
print(position_ids)

tensor([0, 1, 2, 3])


In [27]:
# One embedding row per position index 0 .. max_length - 1.
position_indices = torch.arange(max_length)
pos_embeddings = pos_embedding_layer(position_indices)

print('Positional Embedding Shape: ', pos_embeddings.shape)

Positional Embedding Shape:  torch.Size([4, 256])


#### Obtain general token embedding representation

Embedding = Token embeddings + Positional Embeddings

In [35]:
# Broadcasting adds the (4, 256) positional table to each of the 8 sequences
# in the (8, 4, 256) token-embedding batch.
inputs_embeddings = torch.add(token_embeddings, pos_embeddings)
inputs_embeddings.shape

torch.Size([8, 4, 256])

In [29]:
# Check for broadcasting: the implicit sum and the sum with an explicit
# leading batch axis should have the same shape.
implicit_broadcast_sum = token_embeddings + pos_embeddings
explicit_broadcast_sum = token_embeddings + pos_embeddings.unsqueeze(0)
(implicit_broadcast_sum.shape, explicit_broadcast_sum.shape)

(torch.Size([8, 4, 256]), torch.Size([8, 4, 256]))

In [33]:
# Check for broadcasting: both ways of adding the positional embeddings
# should give element-wise identical results.
implicit_sum = token_embeddings + pos_embeddings
explicit_sum = token_embeddings + pos_embeddings.unsqueeze(0)
(implicit_sum == explicit_sum).all()

tensor(True)