In [27]:
!pip install -U pip transformers



In [28]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

In [29]:
checkpoint = 'facebook/nllb-200-distilled-600M'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [30]:
print(f"{len(tokenizer.vocab)}\n")

tokenizer.vocab

256204



{'▁исән': 204220,
 'gaga': 33141,
 '▁18:3': 67535,
 '▁төлен': 244172,
 '▁thotë': 38349,
 '▁murume': 90029,
 '▁ასაკ': 149132,
 '▁lặng': 187781,
 '▁பழைய': 235533,
 '▁geutanyong': 210884,
 '▁allt': 20832,
 'baj': 163477,
 'zər': 123488,
 '▁тирән': 211504,
 '▁درم': 222981,
 'ေႃ': 126558,
 'xelo': 65240,
 '▁домой': 147294,
 '▁ಕೀ': 247423,
 '▁kennis': 81050,
 'প্ৰ': 69058,
 '▁egenskaper': 180669,
 '▁kuki': 74243,
 '▁inshita': 105594,
 '▁xogo': 124915,
 '엘': 250700,
 '▁watimo': 200673,
 'ገው': 53973,
 '▁викори': 62086,
 '▁Hugh': 100111,
 '▁Аза': 127414,
 'ɔni': 94878,
 '绝': 251775,
 '▁engo': 189522,
 '▁갇': 191211,
 '▁טובה': 98557,
 '▁죽지': 243308,
 '▁ब्रा': 59947,
 '▁biron': 181650,
 '▁veido': 48622,
 'autorité': 230163,
 'لاح': 16986,
 'evid': 199567,
 'ヘン': 170605,
 '▁kindlaks': 79815,
 'tato': 222693,
 '▁मोठे': 195370,
 'kald': 109485,
 '▁njezin': 214432,
 '▁finanzjarju': 140965,
 '▁740': 240108,
 'Po': 30590,
 '▁yoldan': 242854,
 '▁ક્યાં': 71925,
 '▁jedina': 198913,
 '▁mulanguteri': 223580,

In [31]:
thai_char_min = 0x0E00
thai_char_max = 0x0E7F

thai_tokens = [
    token for token in tokenizer.vocab.keys()
    if any(thai_char_min <= ord(char) <= thai_char_max for char in token)
]

thai_token_count = len(thai_tokens)
sample_size = 20
thai_tokens_sample = thai_tokens[:sample_size]


print(f"{thai_token_count}\n")
for token in thai_tokens_sample:
  print(token)


1712

ทิ
ประ
มนุษย์
▁หย
ษ
▁เม
เราจะ
ท
ตัดสิน
หมายความว่า
กิน
▁ตาม
สา
ใหม่
ิว
หาก
▁ก็
รวม
ว่าฉัน
▁เกี่ยว


In [32]:
import tensorflow as tf
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np
import math

In [33]:
sentence = 'Work hard, play harder'

In [34]:
cleaned_sentence = sentence.replace(',', '')
cleaned_sentence

'Work hard play harder'

In [35]:
words = cleaned_sentence.split()
words

['Work', 'hard', 'play', 'harder']

In [36]:
sorted_words = sorted(words)
sorted_words

['Work', 'hard', 'harder', 'play']

In [37]:
dc = {word: index for index, word in enumerate(sorted_words)}
dc

{'Work': 0, 'hard': 1, 'harder': 2, 'play': 3}

In [38]:
sentence_int = tf.constant(
    [dc[s] for s in sentence.replace(',', '').split()],
    dtype=tf.int32
)

In [39]:
print(sentence)
print(sentence_int)

Work hard, play harder
tf.Tensor([0 1 3 2], shape=(4,), dtype=int32)


In [40]:
# สร้าง embedding layer
tf.random.set_seed(123)
vocab_size = 50_000
embedding_dim = 2

embed = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)

In [41]:
embedded_sentence = embed(sentence_int)

In [42]:
embedded_sentence

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-0.03433397, -0.02959937],
       [ 0.03081535, -0.02796173],
       [ 0.02732171,  0.00029918],
       [ 0.03476748,  0.01495   ]], dtype=float32)>

In [43]:
tf.random.set_seed(123)
vocab_size = 50_000
embedding_dim = 2

dummy_input = tf.constant([0, 1, 2], dtype=tf.int32)

# Case 1 Default initializer (RandomUniform(-0.05, 0.05))
embed_default = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
_ = embed_default(dummy_input) # เรียกใช้งาน layer เพื่อสร้าง weights
weights_default = embed_default.get_weights()[0].flatten()
weights_default.shape

(100000,)

In [44]:
# Case 2 GlorotUniform initializer
tf.random.set_seed(123)
embed_glorot = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.GlorotUniform()
)
_ = embed_glorot(dummy_input) # เรียกใช้งาน layer เพื่อสร้าง weights
weights_glorot = embed_glorot.get_weights()[0].flatten()
weights_glorot.shape

(100000,)

In [45]:
fig = make_subplots(rows=1, cols=1)

fig.add_trace(go.Histogram(x=weights_default, nbinsx=50, name="Default Uniform [-0.05, 0.05]", opacity=0.6))
fig.add_trace(go.Histogram(x=weights_glorot, nbinsx=50, name="Glorot Uniform", opacity=0.6))

fig.update_layout(
    title_text='Embedding Layer Initialization Comparison',
    xaxis_title_text='Weight values',
    yaxis_title_text='Frequency',
    barmode='overlay',
    legend_orientation="h",
    legend_yanchor="bottom",
    legend_y=1.02,
    legend_xanchor="right",
    legend_x=1
)

fig.show()

print("Default initializer range ", weights_default.min(), weights_default.max())
print("Glorot initializer range ", weights_glorot.min(), weights_glorot.max())

Default initializer range  -0.049999822 0.04999907
Glorot initializer range  -0.010954206 0.010954159


In [46]:
def glorot_uniform_limits(fan_in, fan_out):
    limit = math.sqrt(6.0 / (fan_in + fan_out))
    a, b = -limit, limit
    return a, b

# ตัวอย่าง Embedding layer (vocab_size=50000, embedding_dim=2)
fan_in = 50000
fan_out = 2

a, b = glorot_uniform_limits(fan_in, fan_out)
print("Glorot Uniform a =", a)
print("Glorot Uniform b =", b)

Glorot Uniform a = -0.010954232067652772
Glorot Uniform b = 0.010954232067652772


In [47]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [49]:
token_embedding_layer = model.model.encoder.embed_tokens
token_embedding_layer.weight.shape

torch.Size([256206, 1024])

In [50]:
long_sentence = "In the vast realm of natural language processing, understanding the nuances of how models handle sequential data is crucial. Positional encoding plays a vital role in providing this essential information to the model, allowing it to differentiate between words at different positions in a sentence, which is fundamental for tasks like translation, summarization, and text generation."

In [51]:
tokens = tokenizer(long_sentence, return_tensors="pt")

print(tokens['input_ids'][0])

tensor([256047,    717,    349,  14430,  12284, 248070,    452,  25307,  65445,
        157278, 248079, 133930,    349,    713,  75831,    452,  11657, 141057,
         47274, 116914, 124785,   6067,    248, 182071, 248075,  12013,  58409,
         12025, 246156,   3054,    705,      9, 104781,  76065,    108, 174693,
          3423, 140515,  18781,    202,    349,  14916, 248079,  82935,     87,
           796,    202,  53054,    502,  25914,  51744,    230,  30158, 199073,
           108,      9, 109267, 248079,   9089,    248,  75529,    351, 226047,
          6399, 200356, 248079,   2493, 109207, 181953, 248079,    540,  35883,
        120531, 248075,      2])


In [52]:
len(tokens['input_ids'][0])

75

In [53]:
token_embedding_layer(tokens['input_ids'][0][0]).shape

torch.Size([1024])

In [54]:
token_embeddings = token_embedding_layer(tokens['input_ids'][0])

print("Token Embedding Matrix shape", token_embeddings.shape)
token_embeddings

Token Embedding Matrix shape torch.Size([75, 1024])


tensor([[-5.0000e+00, -1.2725e+00, -9.3604e-01,  ..., -1.8297e+01,
         -9.1328e+00, -1.0672e+01],
        [ 2.6416e-01,  2.6831e-01,  2.0117e-01,  ...,  3.2715e+00,
         -3.2402e+00,  3.1738e+00],
        [ 4.3579e-01, -2.3352e-01,  2.6825e-02,  ...,  5.4648e+00,
          2.7129e+00,  5.5430e+00],
        ...,
        [ 8.5859e+00, -4.5391e+00, -4.7314e-01,  ..., -7.9529e-02,
          7.4844e+00, -7.5156e+00],
        [-2.4863e+00, -2.7515e-01,  5.6114e-03,  ...,  1.0180e+01,
         -7.2422e+00, -4.8047e+00],
        [-7.8320e-01, -9.0527e-01, -9.4482e-01,  ...,  3.1078e+01,
         -8.1494e-01, -8.7354e-01]], grad_fn=<MulBackward0>)

In [55]:
import plotly.express as px

token_embeddings_np = token_embeddings.detach().numpy()

fig = px.imshow(
    token_embeddings_np,
    color_continuous_scale="RdBu",
    labels=dict(x="Embedding Dimension", y="Token Index", color="Value"),
    title="Token Embedding Heatmap"
)

fig.update_xaxes(side="top")
fig.update_layout(height=500, width=900)
fig.show()

In [56]:
d = embedded_sentence.shape[-1]
d

2

In [57]:
d_q, d_k, d_v = 2, 2, 4

d_q, d_k, d_v

(2, 2, 4)

In [58]:
tf.random.set_seed(123)
W_query = tf.Variable(tf.random.uniform((d, d_q)), trainable=True)
W_key   = tf.Variable(tf.random.uniform((d, d_k)), trainable=True)
W_value = tf.Variable(tf.random.uniform((d, d_v)), trainable=True)

In [59]:
print(W_query.shape, W_key.shape, W_value.shape)

(2, 2) (2, 2) (2, 4)


In [60]:
W_query

<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[0.12615311, 0.5727513 ],
       [0.2993133 , 0.5461836 ]], dtype=float32)>

In [61]:
W_key

<tf.Variable 'Variable:0' shape=(2, 2) dtype=float32, numpy=
array([[0.88968754, 0.12354946],
       [0.7718717 , 0.6850728 ]], dtype=float32)>

In [62]:
W_value

<tf.Variable 'Variable:0' shape=(2, 4) dtype=float32, numpy=
array([[0.48962688, 0.5857923 , 0.36451697, 0.6550509 ],
       [0.9075084 , 0.37557673, 0.6882372 , 0.25384045]], dtype=float32)>

In [63]:
embedded_sentence

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-0.03433397, -0.02959937],
       [ 0.03081535, -0.02796173],
       [ 0.02732171,  0.00029918],
       [ 0.03476748,  0.01495   ]], dtype=float32)>

In [64]:
queries = tf.matmul(embedded_sentence, W_query)
keys    = tf.matmul(embedded_sentence, W_key)
values  = tf.matmul(embedded_sentence, W_value)

In [65]:
print("Queries shape", queries.shape)
queries

Queries shape (4, 2)


<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-0.01319082, -0.03583151],
       [-0.00448187,  0.00237729],
       [ 0.00353627,  0.01581195],
       [ 0.00886076,  0.02807857]], dtype=float32)>

In [66]:
print("Keys shape", keys.shape)
keys

Keys shape (4, 2)


<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[-0.05339342, -0.02451967],
       [ 0.00583317, -0.0153486 ],
       [ 0.02453871,  0.00358054],
       [ 0.04247168,  0.01453734]], dtype=float32)>

In [67]:
print("Values shape", values.shape)
values

Values shape (4, 4)


<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[-0.04367251, -0.03122941, -0.0328867 , -0.03000401],
       [-0.01028748,  0.00754962, -0.00801159,  0.01308781],
       [ 0.01364895,  0.01611721,  0.01016513,  0.01797305],
       [ 0.03059035,  0.0259814 ,  0.02296248,  0.02656939]],
      dtype=float32)>

In [68]:
omega = tf.matmul(queries, keys, transpose_b=True)

print("Omega shape", omega.shape)
print("Omega (Unnormalized attention weights)")
print(omega)

Omega shape (4, 4)
Omega (Unnormalized attention weights)
tf.Tensor(
[[ 1.5828796e-03  4.7301932e-04 -4.5198190e-04 -1.0811314e-03]
 [ 1.8101171e-04 -6.2631589e-05 -1.0146721e-04 -1.5579286e-04]
 [-5.7651708e-04 -2.2206367e-04  1.4339075e-04  3.8005493e-04]
 [-1.1615834e-03 -3.7928042e-04  3.1796811e-04  7.8451919e-04]], shape=(4, 4), dtype=float32)


In [69]:
d_k = tf.cast(d_k, tf.float32)

scaled_omega = omega / tf.sqrt(d_k)

attention_weights = tf.nn.softmax(scaled_omega, axis=-1)

print("Attention Weights")
print(attention_weights)

Attention Weights
tf.Tensor(
[[0.25025678 0.25006047 0.24989694 0.2497858 ]
 [0.25003812 0.24999505 0.24998818 0.2499786 ]
 [0.24991027 0.2499729  0.2500375  0.25007936]
 [0.24981408 0.2499523  0.25007558 0.25015807]], shape=(4, 4), dtype=float32)


In [70]:
row_sums = tf.reduce_sum(attention_weights, axis=-1)

print("Sum of each row in attention_weights")
row_sums

Sum of each row in attention_weights


<tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.99999994, 0.99999994, 1.        , 1.        ], dtype=float32)>

In [71]:
context_vector = tf.matmul(attention_weights, values)

print("Context Vector shape", context_vector.shape)
print(context_vector)

Context Vector shape (4, 4)
tf.Tensor(
[[-0.00244997  0.00458992 -0.00195756  0.0068921 ]
 [-0.0024326   0.00460273 -0.00194449  0.00690457]
 [-0.00242304  0.00460997 -0.00193729  0.00691168]
 [-0.0024157   0.00461548 -0.00193177  0.00691707]], shape=(4, 4), dtype=float32)


In [72]:
class SelfAttention(tf.keras.layers.Layer):
    def __init__(self, d_in, d_out_kq, d_out_v):
        super().__init__()
        self.d_out_kq = d_out_kq

        self.W_query = tf.Variable(
            tf.random.uniform((d_in, d_out_kq)), trainable=True
        )
        self.W_key = tf.Variable(
            tf.random.uniform((d_in, d_out_kq)), trainable=True
        )
        self.W_value = tf.Variable(
            tf.random.uniform((d_in, d_out_v)), trainable=True
        )

    def call(self, x):
        keys = tf.matmul(x, self.W_key)      # [T, d_out_kq]
        queries = tf.matmul(x, self.W_query) # [T, d_out_kq]
        values = tf.matmul(x, self.W_value)  # [T, d_out_v]

        # Attention scores: QKᵀ
        attn_scores = tf.matmul(queries, keys, transpose_b=True)  # [T, T]

        # Softmax (scaled by sqrt(d_k))
        attn_weights = tf.nn.softmax(
            attn_scores / tf.math.sqrt(tf.cast(self.d_out_kq, tf.float32)), axis=-1
        )  # [T, T]

        # Weighted sum
        context_vec = tf.matmul(attn_weights, values)  # [T, d_out_v]
        return context_vec

In [73]:
tf.random.set_seed(123)

d_in, d_out_kq, d_out_v = 2, 2, 4

sa = SelfAttention(d_in, d_out_kq, d_out_v)

out = sa(embedded_sentence)

print(out.shape)  # (T, d_out_v)
print(out.numpy())

(4, 4)
[[-0.00244997  0.00458992 -0.00195756  0.0068921 ]
 [-0.0024326   0.00460273 -0.00194449  0.00690457]
 [-0.00242304  0.00460997 -0.00193729  0.00691168]
 [-0.0024157   0.00461548 -0.00193177  0.00691707]]


In [74]:
class MultiHeadAttentionWrapper(tf.keras.layers.Layer):
    def __init__(self, d_in, d_out_kq, d_out_v, num_heads):
        super().__init__()
        self.heads = [
            SelfAttention(d_in, d_out_kq, d_out_v)
            for _ in range(num_heads)
        ]

    def call(self, x):
        # รันทุก head แล้ว concat ตามแกนสุดท้าย
        head_outputs = [head(x) for head in self.heads]   # list of [T, d_out_v]
        return tf.concat(head_outputs, axis=-1)           # [T, num_heads * d_out_v]

In [75]:
tf.random.set_seed(123)

d_in, d_out_kq, d_out_v = 2, 2, 1

sa = SelfAttention(d_in, d_out_kq, d_out_v)

# ถ้า embedded_sentence.shape = [T, d_in] เช่น [6, 3]
out = sa(embedded_sentence)

print(out.shape)   # (T, d_out_v) -> (6, 1)
print(out.numpy())

(4, 1)
[[0.00095717]
 [0.00097093]
 [0.00097859]
 [0.00098444]]


In [76]:
tf.random.set_seed(123)

block_size = embedded_sentence.shape[0]   # [T, d_in] → T = sequence length

mha = MultiHeadAttentionWrapper(
    d_in, d_out_kq, d_out_v, num_heads=3
)

# run MHA
context_vecs = mha(embedded_sentence)   # [T, num_heads * d_out_v]

print(context_vecs)
print("context_vecs.shape:", context_vecs.shape)

tf.Tensor(
[[ 0.00095717 -0.00110177  0.00103195]
 [ 0.00097093 -0.00107346  0.00107719]
 [ 0.00097859 -0.00106488  0.00107855]
 [ 0.00098444 -0.0010565   0.00108565]], shape=(4, 3), dtype=float32)
context_vecs.shape: (4, 3)
