In [142]:
import fasttext
from tokenizers import Tokenizer
import torch
import numpy as np
from tqdm import tqdm

In [2]:
tokenizer = Tokenizer.from_file("tokenizers/bpe_tokenizer/bpe_tokenizer.json")

In [51]:
tokenizer.enable_padding(direction="right", pad_id=0, pad_token='<pad>', length=256)
tokenizer.enable_truncation(max_length=256)

In [4]:
model = fasttext.load_model('embedding/fasttext/bpe_model.bin')



In [143]:
def create_embeddings(fasttext_model, bpe_tokenizer, dim=32):
    vocab_size = bpe_tokenizer.get_vocab_size()
    vocab = bpe_tokenizer.get_vocab()
    
    # Initilize random embeddings
    embeddings = np.random.uniform(-0.25, 0.25, (vocab_size, dim))
    
    # Load pretrained vectors
    for word, ids in tqdm(vocab.items()):
        embeddings[ids] = fasttext_model[word]
    
    # Padding token embedding set to 0
    embeddings[vocab['<pad>']] = np.zeros((dim,))
    
    return embeddings

In [144]:
embeddings = create_embeddings(model, tokenizer, dim=32)




  0%|                                                                                                                             | 0/18407 [00:00<?, ?it/s][A[A[A


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18407/18407 [00:00<00:00, 143855.57it/s][A[A[A


In [146]:
embeddings = torch.tensor(embeddings)

In [147]:
embeddings

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 9.3706e-02,  1.0837e+00, -3.0781e-01,  ..., -9.1932e-01,
         -7.0177e-03, -1.6006e+00],
        [-2.4031e-03,  9.2957e-05, -6.0015e-04,  ..., -3.0015e-03,
         -3.6742e-03,  7.4832e-03],
        ...,
        [-1.1193e-01,  8.7496e-01,  9.6823e-01,  ..., -2.9757e-01,
         -4.1534e-01, -2.7781e-01],
        [ 8.1030e-02,  1.3913e+00,  2.9736e-01,  ..., -3.9460e-01,
          2.1698e-01, -1.4945e-01],
        [ 8.4393e-02,  1.3341e+00,  6.4531e-01,  ..., -8.9766e-01,
         -2.2877e-01, -4.3331e-02]], dtype=torch.float64)

In [148]:
torch.save(embeddings, 'embedding/tensor/embeddings.pt')