In [120]:
import torch
import torch.nn.functional as F
import os
import time
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
import sentencepiece as spm
from Transformer import Transformer
from TranslationDataset import TranslationDataset, create_train_val_dataloaders
from Optimizer import CustomOptim
from itertools import islice
import json
import csv
from torchtext.data.metrics import bleu_score
from utils import set_seed, ensure_directory_exists, save_checkpoint, load_state_dicts

In [121]:
# Fix the RNG seed so that runs of this notebook are repeatable.
set_seed(2630)

# Name (without extension) and directory of the JSON experiment config.
CONFIG_FILE = "ex_config-4"
CONFIG_PATH = "config"

# Open and load the JSON file into a dictionary
config_path = os.path.join(CONFIG_PATH, f"{CONFIG_FILE}.json")
with open(config_path, 'r') as file:
    config = json.load(file)

# VARIABLES FROM CONFIG FILE THAT CONTROL EXPERIMENT RUN
# Every value is read with config.get(), so a key missing from the JSON file
# falls back to the default written next to it.
pytorch_cuda_config = config.get('pytorch_cuda', 'max_split_size_mb:128')

# --- paths ---
corpus_path_config = config.get('corpus_path', '/corpus/df_encoded.pkl')
bpe_model_path_config = config.get('bpe_model_path', '/bpe/bpe_model.model')
results_path_config = config.get('results', 'results')

# --- data loading ---
batch_size_config = config.get('batch_size', 16)
dataset_value_split_config = config.get('dataset_value_split', 0.1)

# --- optimizer / learning-rate schedule ---
lr_config = config.get('lr', 1e-4)
beta1_config = config.get('beta1', 0.9)
beta2_config = config.get('beta2', 0.98)
eps_config = config.get('eps', 1e-9)
warmup_steps_config = config.get('warmup_steps', 4000)
lr_factor_config = config.get('lr_factor', 1)

# --- training loop / checkpointing ---
num_epochs_config = config.get('num_epochs', 10)
total_training_steps_config = config.get('total_training_steps', 100000)
model_save_path_config = config.get('model_save_path', '/models')
save_interval_in_minutes_config = config.get('save_interval_in_minutes', 10)
average_model_weight_num_config = config.get('average_model_weight_num', 5)

# --- decoding ---
# These four values are read exactly once. An earlier revision read them a
# second time further down with the copy-pasted default 'max_split_size_mb:128',
# which replaced the numeric defaults below with a string whenever a key was
# missing from the config file.
beam_size_config = config.get('beam_size', 4)
len_penalty_alpha_config = config.get('len_penalty_alpha', 0.6)
max_len_a_config = config.get('max_len_a', 1)
max_len_b_config = config.get('max_len_b', 50)

# --- model dimensions ---
# NOTE(review): this key is 'd_model_config' while every other key has no
# '_config' suffix. It looks like a typo for 'd_model', but it is left as-is
# here because changing it could alter the model width used with existing
# checkpoints -- confirm against the config schema.
d_model_config = config.get('d_model_config', 512)

d_dec_ff_inner_config = config.get('d_dec_ff_inner', 2048)
t_dec_heads_config = config.get('t_dec_heads', 8)
t_dec_layer_num_config = config.get('t_dec_layer_num', 6)

d_enc_ff_inner_config = config.get('d_enc_ff_inner', 2048)
t_enc_heads_config = config.get('t_enc_heads', 8)
t_enc_layer_num_config = config.get('t_enc_layer_num', 6)

d_query_key_head_config = config.get('d_query_key_head', 64)
d_value_head_config = config.get('d_value_head', 64)

# --- regularisation ---
t_dropout_config = config.get('t_dropout', 0.1)
# Normalise the flag to a real bool: only a value equal to 1 (1, 1.0 or True)
# enables it; any other value disables it.
t_dot_product_config = config.get('t_dot_product', True) == 1
label_smoothing_config = config.get('label_smoothing', 0.1)

In [122]:
# Load the SentencePiece model from the path given in the experiment config.
print(f"Loading BPE model from: {bpe_model_path_config} ...")
sp = spm.SentencePieceProcessor()
sp.load(bpe_model_path_config)

# Vocabulary lookups derived from the loaded tokenizer:
#   sb_vocab_size - number of pieces in the model
#   sb_vocab_list - piece string for every id, in id order
#   sb_vocab_dict - reverse mapping from piece string to id
sb_vocab_size = sp.get_piece_size()
sb_vocab_list = list(map(sp.id_to_piece, range(sb_vocab_size)))
sb_vocab_dict = {piece: piece_id for piece_id, piece in enumerate(sb_vocab_list)}

Loading BPE model from: ../bpe/bpe_model.model ...


In [123]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
# `device` is a torch.device object, not a string: comparing it with 'cuda'
# is never true, so the device *type* is checked instead. This makes sure the
# cached GPU memory is actually released before the model is built.
if device.type == 'cuda':
    torch.cuda.empty_cache()

# Build the Transformer from the tokenizer's vocabulary and the configured
# hyperparameters, then move it to the selected device.
print("Initializing model ...")
transformer_kwargs = dict(
    # vocabulary
    n_vocab_len=sb_vocab_size,
    i_vocab_padding=sb_vocab_dict['<mask>'],  # id of the '<mask>' piece is passed as the padding index
    # shared width / placement
    d_model=d_model_config,
    device=device,
    # decoder stack
    d_dec_ff_inner=d_dec_ff_inner_config,
    t_dec_heads=t_dec_heads_config,
    t_dec_layer_num=t_dec_layer_num_config,
    # encoder stack
    d_enc_ff_inner=d_enc_ff_inner_config,
    t_enc_heads=t_enc_heads_config,
    t_enc_layer_num=t_enc_layer_num_config,
    # attention head sizes
    d_query_key_head=d_query_key_head_config,
    d_value_head=d_value_head_config,
    # regularisation / attention variant
    t_dropout=t_dropout_config,
    t_dot_product=t_dot_product_config,
)
model = Transformer(**transformer_kwargs).to(device)

Using device: cuda
Initializing model ...


In [124]:
# The end-of-epoch checkpoint is named after the experiment config, so derive
# the file name from CONFIG_FILE instead of repeating the literal: otherwise
# switching CONFIG_FILE would silently keep loading the old experiment's weights.
checkpoint_path = os.path.join('models', f"{CONFIG_FILE}_model_epoch_end.pth")
# map_location remaps the stored tensors onto the device selected earlier, so a
# checkpoint written on a GPU can also be opened on a CPU-only machine.
# NOTE: torch.load unpickles the file -- only open checkpoints you trust.
model_dict = torch.load(checkpoint_path, map_location=device)

In [125]:
# Copy the parameters stored under the checkpoint's 'model_state_dict' entry
# into the freshly built model. Being the last expression of the cell, the
# result of the call is displayed as the cell output.
model.load_state_dict(model_dict['model_state_dict'])

<All keys matched successfully>

In [126]:
# Put the model into evaluation mode before running inference; the returned
# module is displayed as the cell output.
model.eval()

Transformer(
  (vocab_embedding): Embedding(37000, 512, padding_idx=5)
  (positional_encoding): PositionalEncoding()
  (encoder): Encoder(
    (encoder_layer_stack): ModuleList(
      (0-1): 2 x EncoderLayer(
        (attention_sublayer): MHAttentionSublayer(
          (multi_headed_attention): MHAttention(
            (key_proj): Linear(in_features=512, out_features=512, bias=False)
            (query_proj): Linear(in_features=512, out_features=512, bias=False)
            (value_proj): Linear(in_features=512, out_features=512, bias=False)
            (scaled_dot_product_attention): ScaledDotProductAttention()
            (concat_proj): Linear(in_features=512, out_features=512, bias=False)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (normalization): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
        (feed_forward_sublayer): FeedForwardSublayer(
          (linear_proj): FeedForwardUnit(
            (d_model_to_d_inner): Linear(in_featur

In [127]:
import torch
import torch.nn.functional as F

def beam_search(model, sp, input_sentence, beam_width=4, max_length=50):
    """
    Beam search for inference using a Transformer model with SentencePiece tokenizer.

    The source sentence is tokenized, wrapped in "<s>" ... "</s>" and fed to the
    model together with the partial target sequence of every live beam. Each
    beam's score is the sum of the log-probabilities of its generated tokens
    (no length normalisation is applied). The search stops once every kept
    beam ends with "</s>" or after ``max_length`` decoding steps.

    Args:
        model: Transformer model, called as ``model(source_ids, target_ids)``
            with two ``(1, seq_len)`` long tensors; the result is indexed as
            ``(batch, seq_len, vocab_size)`` scores. The caller is expected to
            have put it in eval mode.
        sp: SentencePiece tokenizer (``encode``, ``decode``, ``piece_to_id``).
        input_sentence: Sentence to translate/generate.
        beam_width: Number of beams to keep (must be >= 1).
        max_length: Maximum number of decoding steps.
    Returns:
        Best decoded sentence (``sp.decode`` of the highest-scoring beam).
    Raises:
        ValueError: if ``beam_width`` is smaller than 1.
    """
    if beam_width < 1:
        raise ValueError(f"beam_width must be >= 1, got {beam_width}")

    # Run on whatever device the model's parameters live on.
    device = next(model.parameters()).device

    start_token = sp.piece_to_id("<s>")
    end_token = sp.piece_to_id("</s>")

    # Encoder input: the tokenized sentence wrapped in start / end tokens.
    source_ids = [start_token, *sp.encode(input_sentence), end_token]
    input_ids = torch.tensor([source_ids], dtype=torch.long, device=device)

    # Every beam is a (token tensor, cumulative log-probability) pair; the
    # search starts from a single beam holding only the start token.
    beams = [(torch.tensor([start_token], dtype=torch.long, device=device), 0.0)]

    # Pure inference: no autograd bookkeeping is needed anywhere in the search.
    with torch.no_grad():
        for _ in range(max_length):
            candidates = []

            for seq, score in beams:
                # A finished beam is carried over unchanged so that it keeps
                # competing with the beams that are still growing.
                if seq[-1].item() == end_token:
                    candidates.append((seq, score))
                    continue

                output = model(input_ids, seq.unsqueeze(0))  # (1, seq_len, vocab_size)

                # log_softmax instead of log(softmax(.)): the latter underflows
                # to -inf for very unlikely tokens.
                log_probs = F.log_softmax(output[0, -1, :], dim=-1)

                # topk raises when asked for more entries than the vocabulary holds.
                k = min(beam_width, log_probs.size(-1))
                topk_log_probs, topk_indices = torch.topk(log_probs, k)

                # Extend the beam with each of its k best next tokens.
                for token_log_prob, token in zip(topk_log_probs.tolist(), topk_indices):
                    extended = torch.cat([seq, token.unsqueeze(0)], dim=0)
                    candidates.append((extended, score + token_log_prob))

            # Keep only the best `beam_width` hypotheses for the next step.
            beams = sorted(candidates, key=lambda beam: beam[1], reverse=True)[:beam_width]

            # Stop early once every surviving beam has produced the end token.
            if all(seq[-1].item() == end_token for seq, _ in beams):
                break

    # Return best sequence
    best_seq = beams[0][0].cpu().tolist()
    return sp.decode(best_seq)

In [131]:
# Quick qualitative check: run one English sentence through beam search
# (default beam width and maximum length) and print the source next to the
# decoded output.
input_sentence = "Today is a beautiful day to train a Transformer"
print("Input: ", input_sentence)
output_sentence = beam_search(model, sp, input_sentence)
print("Generated Output:", output_sentence)

Input:  Today is a beautiful day to train a Transformer
Generated Output: Heute ist ein schönen Tag, um einen Transform zu machen.
