In [1]:
import os
import numpy as np
from transformers import BertTokenizer, BertModel
import torch

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# The bare name on the last line makes the cell display the chosen device.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [3]:
# Load the pretrained uncased BERT tokenizer and encoder, then move the encoder
# to the selected device.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model = model.to(device)
# Inference only: eval() puts the module in evaluation mode; as the last
# expression of the cell it also displays the architecture.
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [4]:
def extract_token_embeddings(sentence):
    """
    Run one sentence through BERT and return a vector for every token.

    Args:
      sentence: text to encode (str)

    Returns:
      tokens: list of tokens (str), including the special tokens added by the
              tokenizer (e.g. '[CLS]' and '[SEP]')
      embeddings: np.array shape (seq_len, hidden_size); row i belongs to tokens[i]

    Input longer than the tokenizer's maximum length is truncated instead of
    being passed to the model as-is, matching what bert_sentence_embedding does.
    """
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():  # inference only, no autograd bookkeeping
        outputs = model(**inputs)
    # Index 0 selects the single sentence in the batch.
    emb = outputs.last_hidden_state[0].cpu().numpy()   # (seq_len, 768)
    # Plain Python ints are the documented input for convert_ids_to_tokens.
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
    return tokens, emb


In [5]:
# Sanity check on one sentence: show the tokens and the embedding matrix shape.
s = "The bank will not issue the loan today."
toks, emb = extract_token_embeddings(s)
for label, value in (("Tokens:", toks), ("Emb shape:", emb.shape)):  # seq_len, 768
    print(label, value)

Tokens: ['[CLS]', 'the', 'bank', 'will', 'not', 'issue', 'the', 'loan', 'today', '.', '[SEP]']
Emb shape: (11, 768)


In [7]:
# Persist the example token embeddings (`emb`) as a .npy file in the working directory.
example_path = "bert_example_token_emb.npy"
np.save(example_path, emb)
print("Saved example embeddings.")

Saved example embeddings.


In [12]:
def get_word_embedding(word, pool="first"):
    """
    Contextual BERT vector for `word`, read from the fixed carrier sentence
    "This is about {word}.".

    Args:
      word: the word (or short phrase) to embed (str)
      pool: how to combine the word's sub-word tokens.
            "first" (default) returns the vector of the first piece, which is
            what this function has always returned; "mean" averages all pieces.

    Returns:
      np.array shape (hidden_size,)

    Raises:
      ValueError: if `pool` is not "first"/"mean", or `word` yields no tokens.
    """
    if pool not in ("first", "mean"):
        raise ValueError(f"pool must be 'first' or 'mean', got {pool!r}")

    # BERT might tokenize as "queen" -> "que", "##en"
    pieces = tokenizer.tokenize(word)
    if not pieces:
        raise ValueError(f"{word!r} produced no tokens")

    prefix = "This is about"
    sent = f"{prefix} {word}."
    toks, emb = extract_token_embeddings(sent)

    # Search only after [CLS] and the carrier-sentence tokens. A plain
    # toks.index(word) would return the carrier's own token when `word` is
    # "this", "is" or "about".
    start = 1 + len(tokenizer.tokenize(prefix))
    idx = toks.index(pieces[0], start)

    if pool == "mean":
        return emb[idx:idx + len(pieces)].mean(axis=0)
    return emb[idx]   # (768,)


In [16]:
from sklearn.metrics.pairwise import cosine_similarity

def cosine(a, b):
    """Cosine similarity between two vectors, returned as a plain Python float.

    Each input is reshaped to a single row because `cosine_similarity` works on
    2-D inputs; the only entry of the resulting matrix is the score.
    """
    row_a = a.reshape(1, -1)
    row_b = b.reshape(1, -1)
    sim_matrix = cosine_similarity(row_a, row_b)
    return float(sim_matrix[0][0])


In [19]:
# Generic royalty / gender pairs. Kept under its own name: it used to be
# assigned to `pairs` and was then overwritten by the list below without ever
# being used.
royalty_pairs = [
    ("man", "woman"),
    ("king", "queen"),
    ("prince", "princess"),
    ("lord", "lady"),
    ("queen", "princess"),
    ("king", "crown"),
    ("lady", "throne"),
    ("man", "throne"),
]

# Word pairs scored by the similarity loop. Order matters: results are printed
# in this order.
pairs = [
    # Titles, places and themes
    ("lord", "lady"),
    ("king", "queen"),
    ("north", "winterfell"),
    ("wall", "night"),
    ("throne", "crown"),
    ("king", "throne"),
    ("lord", "command"),
    ("army", "battle"),
    ("winter", "north"),
    # Object / place pairs
    ("dragon", "fire"),
    ("sword", "steel"),
    ("castle", "tower"),
    ("river", "bridge"),
    ("ship", "sea"),
    ("horse", "rider"),
    ("snow", "ice"),
    # Oppositional semantic pairs (good for similarity tests)
    ("life", "death"),
    ("love", "war"),
    ("night", "day"),
    ("north", "south"),
    ("king", "traitor"),
    ("friend", "enemy"),

    # Character / context pairs; the trailing notes are the author's
    # expectations, not measured results.
    ("jon", "arya"),  # Stark siblings → high similarity expected
    ("cersei", "jaime"),  # same contexts
    ("tyrion", "cersei"),  # medium similarity expected
    ("dragon", "queen"),  # Daenerys context
    # The last two repeat earlier pairs with the words swapped.
    ("winterfell", "north"),  # location close
    ("throne", "king"),  # context similar
]


In [20]:
# Score every pair. Several words occur in more than one pair, so each distinct
# word is embedded once and its vector reused instead of running BERT again.
results = []
word_vectors = {}  # word -> embedding vector

for w1, w2 in pairs:
    for word in (w1, w2):
        if word not in word_vectors:
            word_vectors[word] = get_word_embedding(word)
    e1, e2 = word_vectors[w1], word_vectors[w2]
    sim = cosine(e1, e2)
    results.append((w1, w2, sim))

# One aligned line per pair, in the order the pairs were listed.
for w1, w2, sim in results:
    print(f"{w1:10s} {w2:10s} similarity = {sim:.4f}")


lord       lady       similarity = 0.6531
king       queen      similarity = 0.7833
north      winterfell similarity = 0.2271
wall       night      similarity = 0.5079
throne     crown      similarity = 0.7881
king       throne     similarity = 0.6896
lord       command    similarity = 0.5258
army       battle     similarity = 0.6168
winter     north      similarity = 0.5231
dragon     fire       similarity = 0.5017
sword      steel      similarity = 0.6684
castle     tower      similarity = 0.7727
river      bridge     similarity = 0.6397
ship       sea        similarity = 0.7093
horse      rider      similarity = 0.7625
snow       ice        similarity = 0.7257
life       death      similarity = 0.6159
love       war        similarity = 0.6142
night      day        similarity = 0.6718
north      south      similarity = 0.7647
king       traitor    similarity = 0.5271
friend     enemy      similarity = 0.7200
jon        arya       similarity = 0.6060
cersei     jaime      similarity =

# BERT Sentence Classifier

In [25]:
# Toy two-class corpus: five finance sentences and five nature sentences.
finance_sentences = [
    "The bank approved the loan",
    "The stock market crashed today",
    "I need to deposit money",
    "The investment portfolio grew quickly",
    "She took a mortgage from the bank",
]

nature_sentences = [
    "The river flows through the forest",
    "Leaves were falling from the trees",
    "The mountain is covered with snow",
    "Birds are singing near the lake",
    "The storm destroyed the valley",
]

# Inputs are the raw sentences (finance first, then nature); labels follow the
# same order: 1=finance, 0=nature.
X = [*finance_sentences, *nature_sentences]
y = [1 for _ in finance_sentences] + [0 for _ in nature_sentences]

In [22]:
import torch
from transformers import BertTokenizer, BertModel

# `device`, `tokenizer` and `model` are already created by the setup cells at
# the top of this notebook. Loading the same "bert-base-uncased" checkpoint a
# second time only costs download/load time and memory, so the existing
# objects are reused here.
# Make sure the encoder is in evaluation mode before it is used for features;
# as the last expression of the cell this also displays the architecture.
model.eval()


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [23]:
def bert_sentence_embedding(sentence):
    """
    Encode a sentence as one vector: BERT's final hidden state at the first
    token position (where the tokenizer puts '[CLS]').

    Returns:
      np.array with one entry per hidden unit
    """
    encoded = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():  # inference only
        hidden = model(**encoded).last_hidden_state

    # First item in the batch, first token position.
    return hidden[0, 0].cpu().numpy()


In [26]:
# Feature matrix for the classifier: one sentence embedding per row, in the
# same order as X.
sentence_vectors = [bert_sentence_embedding(sentence) for sentence in X]
X_bert = np.array(sentence_vectors)


In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 30% of the sentences for evaluation. With a dataset this small an
# unstratified split can easily put most of one class on one side, so the split
# is stratified on the labels to keep both classes in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X_bert, y, test_size=0.3, random_state=42, stratify=y
)

# Logistic regression on top of the frozen BERT sentence vectors.
clf_bert = LogisticRegression(max_iter=2000)
clf_bert.fit(X_train, y_train)
pred = clf_bert.predict(X_test)
acc_bert = accuracy_score(y_test, pred)

# NOTE: the held-out set is only a handful of sentences, so this number is a
# smoke test of the pipeline rather than a meaningful accuracy estimate.
print("BERT classifier accuracy:", acc_bert)


BERT classifier accuracy: 1.0
