In [2]:
import torch
import torch.nn as nn
import math

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("SeaLLMs/SeaLLM3-7B-chat")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader


df = pd.read_json("data/train.jsonl", lines=True)
df.head()

In [None]:
sample_df = df.sample(20000, random_state=42)
sample_df.reset_index(drop=True, inplace=True)

In [None]:
sample_df

Unnamed: 0,en_segment,th_segment,review_star,correct
0,When this arrived I decided to read them right...,เมื่อสิ่งนี้มาถึงฉันตัดสินใจที่จะอ่านพวกเขาทัน...,5,0
1,I was looking for a replacement of the old sho...,ฉันกำลังมองหาการเปลี่ยนหัวฝักบัวเก่าที่มาพร้อม...,5,0
2,I was looking for something more sturdy and st...,ฉันกำลังมองหาบางสิ่งที่แข็งแกร่งและแข็งแกร่งกว...,1,0
3,I thought this book was supposed to be about t...,ฉันคิดว่าหนังสือเล่มนี้ควรเกี่ยวกับคริสตจักรยุ...,1,0
4,I was expecting a book for the &#34;normal&#34...,ฉันคาดหวังว่าหนังสือสำหรับผู้ใช้ &quot;ปกติ&qu...,1,0
...,...,...,...,...
19995,I purchased my Hoover SteamVac Carpet Cleaner ...,ฉันซื้อ Hoover SteamVac Carpet Cleaner ประมาณห...,4,0
19996,I wish they made more of these without the vel...,ฉันหวังว่าพวกเขาจะทำสิ่งเหล่านี้ได้มากขึ้นโดยไ...,3,0
19997,It is a great album with awesome music and I l...,มันเป็นอัลบั้มที่ยอดเยี่ยมพร้อมกับเพลงที่ยอดเย...,1,0
19998,I purchased the Nikon P100 with a UV filter. W...,ฉันซื้อ Nikon P100 พร้อมฟิลเตอร์ UV เมื่อใช้มั...,1,1


In [None]:
tokenizer.all_special_tokens

['<|im_end|>', '<|endoftext|>', '<|im_start|>']

In [None]:
tokenizer.eos_token_id

151645

In [None]:
tokenizer("<|endoftext|> <|im_start|>")

{'input_ids': [151643, 220, 151644], 'attention_mask': [1, 1, 1]}

In [None]:
tokenizer("สวัสดี")

{'input_ids': [143126, 23271, 125136, 28319], 'attention_mask': [1, 1, 1, 1]}

In [None]:
class CustomDataset(Dataset):
    def __init__(self, df):
        self.df = df
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        src = self.df["en_segment"][idx]
        tgt = self.df["th_segment"][idx]

        src_token = tokenizer(src, truncation=True, padding="max_length", max_length=128)["input_ids"]
        tgt_token = tokenizer(tgt, truncation=True, padding="max_length", max_length=128)["input_ids"]

        return torch.tensor(src_token), torch.tensor(tgt_token)

dataset = CustomDataset(sample_df)


batch_size = 8
dataLoader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model != num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
    
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_score = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            attn_score = attn_score.masked_fill(mask == 0, -1e9)
        
        attn_probs = torch.softmax(attn_score, dim=-1)

        output = torch.matmul(attn_probs, V)

        return output

    def split_heads(self, x):
        batch_size, seq_lenght, d_model = x.size()
        return x.view(batch_size, seq_lenght, self.num_heads, self.d_k).transpose(1, 2)
    
    def combine_heads(self, x):
        batch_size, _, seq_lenght, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_lenght, self.d_model)
    
    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output

In [None]:
d_model = 512
num_heads = 8
batch_size = 16
seq_length = 32

# Create an instance of MultiHeadAttention
multihead_attn = MultiHeadAttention(d_model, num_heads)

# Generate random input tensors
Q = torch.randn(batch_size, seq_length, d_model)
K = torch.randn(batch_size, seq_length, d_model)
V = torch.randn(batch_size, seq_length, d_model)

# # Call the forward method and print the output
output = multihead_attn(Q, K, V)
print(output.shape)

torch.Size([16, 32, 512])


In [None]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        output = self.fc2(x)
        return output

In [None]:
d_model = 512  
d_ff = 2048 
batch_size = 4 
seq_len = 10

# Create an instance of the class
model = PositionWiseFeedForward(d_model, d_ff)

# Generate some random input data
random_input = torch.randn(batch_size, seq_len, d_model) 

# Pass the data through the model
output = model(random_input)

# Print the shapes of the input and output tensors 
print("Input Shape:", random_input.shape)
print("Output Shape:", output.shape)

Input Shape: torch.Size([4, 10, 512])
Output Shape: torch.Size([4, 10, 512])


In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_lenght):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_lenght, d_model) 
        position = torch.arange(0, max_seq_lenght, dtype=torch.float).unsqueeze(1)
        dev_term = torch.exp(torch.arange(0, d_model, 2).float() * (math.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * dev_term)
        pe[:, 1::2] = torch.cos(position * dev_term)

        self.register_buffer('pe', pe.unsqueeze(0))
    
    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)

In [None]:
d_model = 512
max_seq_lenght = 10
batch_size = 32

model = PositionalEncoding(d_model, max_seq_lenght)

random_input = torch.randn(batch_size, max_seq_lenght, d_model)

output = model(random_input)
output.shape

torch.Size([32, 10, 512])

In [None]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, mask):
        att_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(att_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

In [None]:
d_model = 512
num_heads = 8
d_ff = 2048
dropout = 0.1
batch_size = 1

seq_length = 10

model = EncoderLayer(d_model, num_heads, d_ff, dropout)

random_input = torch.randn(batch_size, seq_length, d_model)
mask = torch.ones(batch_size, seq_length, seq_length)

output = model(random_input, mask)
output.shape

torch.Size([1, 10, 512])

In [None]:
class DecderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x
        

In [None]:
d_model = 512
num_heads = 8
d_ff = 2048
dropout = 0.1
batch_size = 1

seq_length = 10

model = DecderLayer(d_model, num_heads, d_ff, dropout)

x = torch.randn(batch_size, seq_length, d_model)
enc_model = EncoderLayer(d_model, num_heads, d_ff, dropout)
src_mask = torch.ones(batch_size, seq_length, seq_length)
enc_output = enc_model(x, mask)
tgt_mask = torch.ones(batch_size, seq_length, seq_length)

output = model(x, enc_output, src_mask, tgt_mask)
print(output.shape)

torch.Size([1, 10, 512])


In [None]:
class Transformer(nn.Module):
    def __init__(self, 
                 src_vocab_size, 
                 tgt_vocab_size, 
                 d_model, 
                 num_heads, 
                 num_layers, 
                 d_ff,
                 max_seq_lenght,
                 dropout):
        super(Transformer, self).__init__()
        self.encoder_embed = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embed = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_lenght)

        self.encoder_layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)
        ])
        
        self.decoder_layers = nn.ModuleList([
            DecderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)
        ])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
    
    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2).to(src.device)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3).to(tgt.device)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length, device=tgt.device), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embeded = self.dropout(self.positional_encoding(self.encoder_embed(src)))
        tgt_embeded = self.dropout(self.positional_encoding(self.decoder_embed(tgt)))

        enc_output = src_embeded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)
        

        dec_output = tgt_embeded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)
        
        output = self.fc(dec_output)
        return output

In [None]:
src_vocab_size = tokenizer.vocab_size+1000
tgt_vocab_size = tokenizer.vocab_size+1000
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 128
dropout = 0.1

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

# # Generate random sample data
# src_data = torch.randint(1, src_vocab_size, (16, max_seq_length))  # (batch_size, seq_length)
# tgt_data = torch.randint(1, tgt_vocab_size, (16, max_seq_length))  # (batch_size, seq_length)

In [None]:
from torch import optim

loss_fn = nn.CrossEntropyLoss(ignore_index=0)
opt = optim.Adam(transformer.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9)

device = "cuda" if torch.cuda.is_available() else "cpu"
# device = "cpu"

transformer = transformer.to(device)
# src_data = src_data.to(device)
# tgt_data = tgt_data.to(device)

print("model is running on", device)

model is running on cuda


In [None]:
from nltk.translate.bleu_score import sentence_bleu
import numpy as np

def eval(model=None, loader=None):
    y_true = ["สวัสดีครับชมรมคนชอบหมี"]
    y_pred = ["สวัสดีครับชมรมคนชอบของสะสม"]
    
    y_true = [tokenizer.batch_decode(tokenizer(i, truncation=True, padding="max_length", max_length=128)["input_ids"]) for i in y_true]
    y_pred = [tokenizer.batch_decode(tokenizer(i, truncation=True, padding="max_length", max_length=128)["input_ids"]) for i in y_pred]
    
    y_pred = np.array(y_pred).squeeze()
    
    
    bleu_score = sentence_bleu(y_true, y_pred)
    
    return bleu_score

In [None]:
eval()

TypeError: Fraction.__new__() got an unexpected keyword argument '_normalize'

In [5]:
from math import sqrt, log, exp
from collections import Counter

def get_ngrams(text, order):
    """
    Given a string `text` and an integer `order`, returns a Counter object containing
    the frequency counts of all ngrams of size `order` in the string.
    """
    ngrams = Counter()

    words = text.split()
    for i in range(len(words)- order+1):
      ngram = " ". join(words[i: i + order])
      ngrams[ngram] += 1

    return ngrams

def calculate_bleu(hypothesis, references):
    
    bleu=0
    p1=0
    p2=0
    p3=0
    p4=0
    bp=1

    # 1. Find the closest reference to the hypothesis
    closest_size=100000
    closest_ref=[]

    for ref in references:
      ref_size = len(ref)
      if abs(len(hypothesis) - ref_size) < closest_size:
        closest_size = abs(len(hypothesis) - ref_size)
        closest_ref = ref
        pass

    # 2. Calculating pn
    pns=[]
    for order in range(1,5):
      # calculate intersection and union of n-grams
      # hint: use the get_ngrams function you implemented
      # calculate pn for each order
        hyp_ngrams = get_ngrams(hypothesis, order)
        hyp_count = Counter(hyp_ngrams)
        closest_ref_ngrams = get_ngrams(closest_ref, order)
        closest_ref_count = Counter(closest_ref_ngrams)
        intersection_count = dict(hyp_count & closest_ref_count)
        intersection_size = sum(intersection_count.values())
        hyp_size = max(len(hyp_ngrams), 1)
        p_n = intersection_size / hyp_size
        pns.append(p_n)
        pass

    # 3. Calculating the brevity penalty
    bp=1
    c=len(hypothesis)
    r=min(abs(len(ref) - c) for ref in references)
    if c > r:
      bp = 1.0
    else:
      bp = exp(1 - r / c)

    # 4. Calculating the BLEU score
    weights = [0.25] * 4
    bleu=bp * exp(sum(w * log(p_n) for w, p_n in zip(weights, pns)))    
    
    # Assigning values to p1, p2, p3, p4!
    p1, p2, p3, p4 = pns

    
    # Do not change the variable name
    return bleu, p1, p2, p3, p4, bp

In [12]:
hypothesis="Abandon all hope , ye who enter here"
references=["All hope abandon , ye who enter here", "All hope abandon , ye who enter in !", "Leave every hope, ye that enter", "Leave all hope , ye that enter"]

In [13]:
bleu, p1, p2, p3, p4, bp=calculate_bleu(hypothesis, references)
print("BLEU: %.3f" % bleu)

BLEU: 0.541


In [None]:
from tqdm import tqdm

for epoch in range(2):
    transformer.train()
    loss_sum = 0

    for src, tgt in tqdm(dataLoader):
        src = src.to(device)
        tgt = tgt.to(device)
        
        opt.zero_grad()
        output = transformer(src, tgt[:, 1:])
        loss = loss_fn(output.contiguous().view(-1, tgt_vocab_size), tgt[:, 1:].contiguous().view(-1))
        loss.backward()
        opt.step()
        loss_sum += loss.item()
    
    print(f"Epoch: {epoch+1:02d}, Loss: {loss_sum / len(dataLoader):.5f}")

  0%|          | 0/2500 [00:00<?, ?it/s]

100%|██████████| 2500/2500 [05:56<00:00,  7.01it/s]


Epoch: 01, Loss: 0.22751


100%|██████████| 2500/2500 [05:57<00:00,  6.99it/s]

Epoch: 02, Loss: 0.01937





In [None]:
transformer.eval()

Transformer(
  (encoder_embed): Embedding(152643, 512)
  (decoder_embed): Embedding(152643, 512)
  (positional_encoding): PositionalEncoding()
  (encoder_layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (self_attn): MultiHeadAttention(
        (W_q): Linear(in_features=512, out_features=512, bias=True)
        (W_k): Linear(in_features=512, out_features=512, bias=True)
        (W_v): Linear(in_features=512, out_features=512, bias=True)
        (W_o): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): PositionWiseFeedForward(
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (relu): ReLU()
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (decoder_layers): ModuleList(
    (0-5): 6 x DecderLayer(
      

In [None]:
def predict(model, src_raw, max_lenght=128, sos_token=151644, eos_token=151643):
    model.eval()
    model = model.to("cpu")
    src = tokenizer(src_raw, truncation=True, padding="max_length", max_length=128)["input_ids"]
    src = torch.tensor([src])
    src_mask, _ = model.generate_mask(src, src)
    
    with torch.no_grad():
        src_embeded = model.encoder_embed(src)
        src_embeded = model.positional_encoding(src_embeded)
        enc_output = src_embeded
        for enc_layer in model.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)
        
    batch_size = src.size(0)
    tgt_tokens = torch.ones(batch_size, 1).long().to(src.device)
    
    for _ in range(max_lenght-1):
        _, tgt_mask = model.generate_mask(src, tgt_tokens)
        tgt_embeded = model.decoder_embed(tgt_tokens)
        tgt_embeded = model.positional_encoding(tgt_embeded)
        dec_output = tgt_embeded
        
        for dec_layer in model.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)
    
        output = model.fc(dec_output[:, -1, :])
        
        _, next_token = torch.max(output, dim=1)
        
        tgt_tokens = torch.cat([tgt_tokens, next_token.unsqueeze(1)], dim=1)
        
        if(next_token == eos_token).all():
            break
    
    return tgt_tokens
    

In [None]:
x = predict(transformer, "Best value in a hard to obtain item. The item was new with the box.")

In [None]:
x

tensor([[  1, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220,
         220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220,
         220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220,
         220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220,
         220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220,
         220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220,
         220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220,
         220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220,
         220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220, 220,
         220, 220]])

In [None]:
tokenizer.decode(x.tolist()[0])

'"                                                                                                                               '

In [10]:
import models

In [11]:
models.train()

training.....
