In [2]:
import torch
from torch import nn

In [3]:
class Tokenizer:
    """Character-level tokenizer over a fixed alphabet.

    The alphabet is, in id order: the '<pad>' token (id 0), the digits
    '0'-'9', the lowercase letters 'a'-'z', then '.' and ' '.
    `dictionary` maps each symbol to its integer id and
    `reverse_dictionary` maps each id back to its symbol.
    """

    def __init__(self):
        self.dictionary = {}
        self.reverse_dictionary = {}

        # Registration order fixes the ids, so the padding token must come first.
        alphabet = ['<pad>']
        alphabet += [str(digit) for digit in range(10)]
        alphabet += [chr(code) for code in range(ord('a'), ord('a') + 26)]
        alphabet += ['.', ' ']

        for symbol in alphabet:
            self.__add_to_dict(symbol)

    def __add_to_dict(self, character):
        """Give `character` the next free id unless it is already registered."""
        if character in self.dictionary:
            return
        token = len(self.dictionary)
        self.dictionary[character] = token
        self.reverse_dictionary[token] = character

    def tokenize(self, text):
        """Return the list of ids for the characters of `text`.

        Raises KeyError for any character outside the alphabet.
        """
        lookup = self.dictionary
        return [lookup[symbol] for symbol in text]

    def character_to_token(self, character):
        """Return the id of a single symbol (KeyError if unknown)."""
        return self.dictionary[character]

    def token_to_character(self, token):
        """Return the symbol stored under `token` (KeyError if unknown)."""
        return self.reverse_dictionary[token]

    def size(self):
        """Return the number of symbols in the vocabulary."""
        return len(self.dictionary)

In [4]:
# Toy corpus: thirteen short lowercase sentences joined into one string,
# with ". " as the separator (so the last sentence has no trailing period).
training_data = ". ".join((
    "cats rule the world",
    "dogs are the best",
    "elephants have long trunks",
    "monkeys like bananas",
    "pandas eat bamboo",
    "tigers are dangerous",
    "zebras have stripes",
    "lions are the kings of the savannah",
    "giraffes have long necks",
    "hippos are big and scary",
    "rhinos have horns",
    "penguins live in the arctic",
    "polar bears are white",
))

In [5]:
# Display the joined corpus to check the ". " sentence separators.
training_data

'cats rule the world. dogs are the best. elephants have long trunks. monkeys like bananas. pandas eat bamboo. tigers are dangerous. zebras have stripes. lions are the kings of the savannah. giraffes have long necks. hippos are big and scary. rhinos have horns. penguins live in the arctic. polar bears are white'

In [6]:
# Encode the whole corpus as a flat list of integer ids, one per character.
token_indices = Tokenizer().tokenize(training_data)
# Show the last ten ids as a quick sanity check.
token_indices[-10:]

[38, 11, 28, 15, 38, 33, 18, 19, 30, 15]

In [7]:
# Length of every sliding window cut from the token stream.
MAX_SEQ_LEN = 5


def create_seq(token_indices: list[int]):
    """Cut `token_indices` into all overlapping windows of MAX_SEQ_LEN items.

    Consecutive windows are shifted by one position. An input shorter than
    MAX_SEQ_LEN produces an empty list.
    """
    window_count = len(token_indices) - MAX_SEQ_LEN + 1
    return [
        token_indices[start:start + MAX_SEQ_LEN]
        for start in range(window_count)
    ]
# All sliding windows over the encoded corpus; batchify() later uses the
# last id of each window as the target and the rest as the input.
train_seq = create_seq(token_indices)  
# Preview the first ten windows.
train_seq[:10]

[[13, 11, 30, 29, 38],
 [11, 30, 29, 38, 28],
 [30, 29, 38, 28, 31],
 [29, 38, 28, 31, 22],
 [38, 28, 31, 22, 15],
 [28, 31, 22, 15, 38],
 [31, 22, 15, 38, 30],
 [22, 15, 38, 30, 18],
 [15, 38, 30, 18, 15],
 [38, 30, 18, 15, 38]]

In [8]:

#X = torch.tensor(train_seq)[:,:-1]
#Y = torch.tensor(train_seq)[:,-1]


In [9]:
# Default number of sequences per mini-batch.
MIN_BATCH_SIZE = 2


def batchify(token_seq, batch_size=None):
    """Yield fixed-size mini-batches of (input, target) tensors.

    Args:
        token_seq: list of equal-length token-id sequences.
        batch_size: sequences per batch; defaults to MIN_BATCH_SIZE.

    Yields:
        (i, X, Y) where `i` is the index in `token_seq` of the first sequence
        of the batch, `X` is a LongTensor holding every token of each
        sequence except the last, and `Y` is a LongTensor holding the last
        token of each sequence (the prediction target).

    Only full batches are produced: trailing sequences that do not fill a
    whole batch are dropped.
    """
    if batch_size is None:
        batch_size = MIN_BATCH_SIZE
    for i in range(0, len(token_seq) - batch_size + 1, batch_size):
        # Build the tensor once and slice it; X and Y cover disjoint columns.
        batch = torch.LongTensor(token_seq[i:i + batch_size])
        X = batch[:, :-1]  # every token but the last is model input
        Y = batch[:, -1]   # last token is the target
        yield i, X, Y

#for i,X,Y in batchify(train_seq):
    #print(i,"---")
    #display(X,Y)

In [10]:

# vocab_size = Tokenizer().size()
# X_hot = F.one_hot(X, vocab_size).sum(axis=1).type(torch.FloatTensor)
# Y_hot = F.one_hot(Y, vocab_size).type(torch.FloatTensor)

In [11]:
import torch
from torch import nn
import lightning as L
import torch.nn.functional as F

# Number of distinct tokens; Word2Vec reads this module-level name to size
# its embedding table and its output layer.
vocab_size = Tokenizer().size()
# Passed to Word2Vec as `embedding_dim` when the model is built.
NUM_HIDDEN = 10

# Hand-written embedding layer
class Embedding(nn.Module):
    """Trainable lookup table that returns the sum of the looked-up rows.

    `W` is a (vocab_size, embedding_dim) parameter initialised to zeros.
    """

    def __init__(self,vocab_size, embedding_dim):
        super().__init__()
        initial_weights = torch.zeros(vocab_size, embedding_dim)
        self.W = nn.Parameter(initial_weights)

    def forward(self, X): # X is a batch
        """Look up the rows of `W` indexed by `X` and sum them over dimension 1."""
        looked_up = self.W[X]
        return torch.sum(looked_up, dim=1)


class Word2Vec(nn.Module):
    """Predict a token from the summed embeddings of its context tokens.

    The context ids are embedded, the embeddings are summed over the
    sequence dimension, and a linear layer maps the sum to one score per
    vocabulary entry.

    Args:
        embedding_dim: width of each embedding vector.
        padding_idx: id passed to nn.Embedding as `padding_idx`.
        num_tokens: vocabulary size. When omitted, the module-level
            `vocab_size` is used, which keeps existing two-argument calls
            working.
    """

    def __init__(self, embedding_dim, padding_idx=0, num_tokens=None):
        super().__init__()
        if num_tokens is None:
            # Fallback to the notebook-level global so older call sites still work.
            num_tokens = vocab_size
        # max_norm=1 is forwarded to nn.Embedding unchanged.
        self.embedding = nn.Embedding(num_tokens, embedding_dim, padding_idx=padding_idx,  max_norm=1)
        self.output = nn.Linear(embedding_dim, num_tokens)

    def forward(self, X: torch.Tensor):
        """Return one row of vocabulary scores per row of token ids in `X`."""
        em = self.embedding(X)
        # Collapse the sequence dimension so each example becomes one vector.
        return self.output(em.sum(dim=1))

    def configure_optimizers(self):
        """Return an SGD optimizer (lr=1e-3) over this module's parameters.

        NOTE(review): the name matches the Lightning hook, but this class is
        a plain nn.Module and nothing in this file calls the method; the
        training cell builds its own Adam optimizer instead.
        """
        optimizer = torch.optim.SGD(self.parameters(), lr=1e-3)
        return optimizer

In [12]:
#torch.empty(3, dtype=torch.long).random_(5)
    

In [13]:
# em = Embedding(3,5)
# em(batch)
# batch = torch.LongTensor([[0,1],[0,2]])
# W[batch].sum(axis=1)

In [14]:
# Build the model with NUM_HIDDEN-wide embeddings; the id of '<pad>' is
# looked up from the tokenizer and passed as the embedding padding index.
w2v_model = Word2Vec(NUM_HIDDEN, Tokenizer().character_to_token("<pad>"))
#w2v_model(X_hot)

In [15]:
#w2v_model.embedding(torch.tensor([[1,2,3,4]]))

In [16]:

# Adam over every parameter of the model built above.
optimizer = torch.optim.Adam(w2v_model.parameters(), lr=1e-3)


def train(num_epochs=100):
    """Train `w2v_model` on `train_seq` with the module-level `optimizer`.

    Each epoch iterates over the mini-batches from `batchify(train_seq)`,
    computes the cross-entropy between the model's scores and the target
    ids, and takes one optimizer step per batch.

    Args:
        num_epochs: number of passes over the training sequences.

    Raises:
        ValueError: if `batchify(train_seq)` yields no batch, so there is
            no loss to report.

    The value printed after each epoch is the loss of the last mini-batch
    of that epoch, not an average over the epoch.
    """
    # The criterion is the same for every step, so build it once.
    loss_fn = nn.CrossEntropyLoss(reduction="mean")

    for epoch in range(num_epochs):
        loss = None
        for _, X, Y in batchify(train_seq):
            y_hat = w2v_model(X)
            loss = loss_fn(y_hat, Y)
            loss.backward()
            optimizer.step()
            # Clear gradients so the next batch starts from zero.
            optimizer.zero_grad()

        if loss is None:
            raise ValueError(
                "batchify(train_seq) produced no batches; "
                "train_seq has too few sequences for one full batch"
            )
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')


train()

Epoch [1/100], Loss: 3.3688549995422363
Epoch [2/100], Loss: 3.1380667686462402
Epoch [3/100], Loss: 2.9523777961730957
Epoch [4/100], Loss: 2.8204145431518555
Epoch [5/100], Loss: 2.724544048309326
Epoch [6/100], Loss: 2.6485695838928223
Epoch [7/100], Loss: 2.5839121341705322
Epoch [8/100], Loss: 2.5264928340911865
Epoch [9/100], Loss: 2.4742612838745117
Epoch [10/100], Loss: 2.4260783195495605
Epoch [11/100], Loss: 2.381256103515625
Epoch [12/100], Loss: 2.3393702507019043
Epoch [13/100], Loss: 2.300474166870117
Epoch [14/100], Loss: 2.2640490531921387
Epoch [15/100], Loss: 2.229979991912842
Epoch [16/100], Loss: 2.1981897354125977
Epoch [17/100], Loss: 2.169308662414551
Epoch [18/100], Loss: 2.147266387939453
Epoch [19/100], Loss: 2.12632155418396
Epoch [20/100], Loss: 2.106503963470459
Epoch [21/100], Loss: 2.08780837059021
Epoch [22/100], Loss: 2.0702104568481445
Epoch [23/100], Loss: 2.0536792278289795
Epoch [24/100], Loss: 2.0381767749786377
Epoch [25/100], Loss: 2.023666381835

In [27]:
# import io
# from torchtext.vocab import build_vocab_from_iterator
# def yield_tokens(file_path):
#     with io.open(file_path, encoding = 'utf-8') as f:
#         for line in f:
#             yield line.strip().split()
# train_file="/Users/mak/Downloads/wikitext-2/wiki.train.tokens"
# vocab = build_vocab_from_iterator(yield_tokens(train_file), specials=["<unk>"])
# vocab.set_default_index(vocab['<unk>'])

In [65]:
#vocab.lookup_tokens(range(10))