# Tokens and Embeddings

In this notebook we look at how text is turned into tokens and then into embeddings. We use the `transformers` library to load the pre-trained BERT tokenizer and model, tokenize a sample sentence with the tokenizer, and inspect the resulting tokens, their token IDs, and the embeddings the model produces for them.

In [None]:
# get the necessary libraries
from transformers import BertTokenizer, BertModel
import torch

In [None]:
# Fetch the tokenizer that is paired with the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)

In [5]:
# Split the sample sentence into sub-word tokens, then map each token to its
# vocabulary ID.
# Alternative sample to try: "Once upon a time,"
text = "We need to stop anthropomorphizing ChatGPT."

tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Show the tokens first and the matching IDs second, one list per line
print(tokens)
print(token_ids)

['we', 'need', 'to', 'stop', 'ant', '##hr', '##op', '##omo', '##rp', '##hi', '##zing', 'chat', '##gp', '##t', '.']
[2057, 2342, 2000, 2644, 14405, 8093, 7361, 19506, 14536, 4048, 6774, 11834, 21600, 2102, 1012]


In [6]:
# Fetch the pre-trained BERT weights for the 'bert-base-uncased' checkpoint
model = BertModel.from_pretrained(
    'bert-base-uncased',
)

In [7]:
# Turn the flat list of token IDs into a tensor and add a leading batch
# dimension of size 1, giving shape (1, number_of_tokens).
# NOTE(review): token_ids comes from tokenize() + convert_tokens_to_ids(), and
# the printed token list contains no '[CLS]' / '[SEP]' markers. BERT is normally
# fed sequences wrapped in those special tokens - confirm whether omitting them
# is intended for this demo.
input_ids = torch.tensor(token_ids).unsqueeze(0)
print(input_ids)

tensor([[ 2057,  2342,  2000,  2644, 14405,  8093,  7361, 19506, 14536,  4048,
          6774, 11834, 21600,  2102,  1012]])


In [8]:
# Run a forward pass with gradient tracking switched off, since the result is
# only inspected here and never back-propagated through.
with torch.no_grad():
    outputs = model(input_ids=input_ids)

# Keep the final layer's hidden states: one vector per input token
embeddings = outputs.last_hidden_state

print(embeddings)

tensor([[[ 6.8755e-02,  5.2630e-02,  1.6248e-01,  ..., -6.8136e-01,
           2.4855e-02,  2.9209e-01],
         [ 1.3168e-01,  5.4850e-02,  2.7825e-01,  ..., -6.8561e-01,
          -2.7146e-01,  3.2189e-04],
         [-1.3295e-01, -2.3009e-01, -6.8416e-01,  ..., -4.9208e-01,
          -2.5459e-01, -2.0726e-02],
         ...,
         [ 2.1965e-01,  1.2949e-01,  4.0878e-01,  ..., -6.9590e-01,
          -5.6488e-02,  1.1724e-01],
         [-6.5462e-02, -8.0915e-02,  1.3040e-01,  ..., -4.7972e-01,
           2.1645e-02,  2.8639e-01],
         [-2.0452e-02, -4.0751e-01,  5.9630e-02,  ..., -4.4704e-01,
           2.8000e-01,  1.0796e-01]]])
