In [None]:
!pip install wandb
import wandb
wandb.login(key="580e769ee2f34eafdded556ce52aaf31c265ad3b")



[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mma23m011[0m ([33mma23m011-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Without Attention

In [None]:
import torch  #  for building and training deep learning models
import torch.nn as nn  # Imports the neural network module
import torch.optim as optim  # Imports optimization algorithms like SGD, Adam, etc.
import wandb  # Imports Weights & Biases for experiment tracking (like loss curves, metrics, etc.)
from torch.utils.data import Dataset, DataLoader  # Used for creating custom datasets and loading them in batches
from sklearn.model_selection import train_test_split  # Used to split the data into training and testing sets
import pandas as pd  # Imports pandas library for handling tabular data (like Excel or CSV files)


#  Dataset
class TransliterationDataset(Dataset):
    """Character-level dataset of (source, target) string pairs.

    Each item is returned as a pair of 1-D id tensors. Both strings are
    wrapped in ``<sos>`` ... ``<eos>`` and characters missing from the
    relevant vocabulary are mapped to that vocabulary's ``<unk>`` id.
    """

    def __init__(self, data, src_vocab, tgt_vocab):
        # data: indexable collection of (src, tgt) pairs
        self.data = data
        # token -> id mappings for each side
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        """Number of (src, tgt) pairs."""
        return len(self.data)

    @staticmethod
    def _encode(text, vocab):
        """Turn ``text`` into ``[<sos>, ids..., <eos>]`` using ``vocab``."""
        ids = [vocab['<sos>']]
        ids.extend(vocab.get(ch, vocab['<unk>']) for ch in text)
        ids.append(vocab['<eos>'])
        return ids

    def __getitem__(self, idx):
        """Return ``(src_ids, tgt_ids)`` tensors for the pair at ``idx``."""
        source_text, target_text = self.data[idx]
        source_ids = self._encode(source_text, self.src_vocab)
        target_ids = self._encode(target_text, self.tgt_vocab)
        return torch.tensor(source_ids), torch.tensor(target_ids)
def collate_fn(batch):
    """Collate (src, tgt) tensor pairs into two right-padded batch tensors.

    Sequences are padded with 0 up to the longest sequence on their own
    side, and the results are laid out batch-first.
    """
    def pad_to_longest(sequences):
        return nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0)

    # Transpose the list of pairs into one group per side
    sources, targets = zip(*batch)
    return pad_to_longest(sources), pad_to_longest(targets)



# Function to build vocabulary from data
def build_vocab(data):
    """Build a character -> id vocabulary from an iterable of words.

    Ids 0-3 are reserved for ``<pad>``, ``<sos>``, ``<eos>`` and ``<unk>``;
    every new character gets the next free id in order of first appearance.
    Entries of ``data`` that are not ``str`` are skipped.
    """
    vocab = dict(zip(('<pad>', '<sos>', '<eos>', '<unk>'), range(4)))

    for entry in data:
        if not isinstance(entry, str):
            continue  # ignore non-string entries
        for symbol in entry:
            # The next free id always equals the current vocabulary size
            vocab.setdefault(symbol, len(vocab))

    return vocab


class Seq2Seq(nn.Module):
    """Encoder-decoder RNN without attention.

    ``config`` is read by key and must provide ``embedding_dim``,
    ``hidden_size``, ``encoder_layers``, ``decoder_layers``, ``dropout`` and
    ``cell_type`` (one of ``'RNN'``, ``'GRU'``, ``'LSTM'``). The encoder's
    final state initialises the decoder; when the two stacks differ in depth
    the state is zero-padded or truncated to the decoder's depth.
    """

    def __init__(self, config, src_vocab_size, tgt_vocab_size):
        super().__init__()
        self.config = config

        emb_dim = config['embedding_dim']
        hidden_size = config['hidden_size']

        # Separate embedding tables per side; id 0 is the padding index
        self.embedding_src = nn.Embedding(src_vocab_size, emb_dim, padding_idx=0)
        self.embedding_tgt = nn.Embedding(tgt_vocab_size, emb_dim, padding_idx=0)

        # Same recurrent cell family for both encoder and decoder
        cell_cls = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}[config['cell_type']]
        self.encoder = cell_cls(
            emb_dim,
            hidden_size,
            config['encoder_layers'],
            dropout=config['dropout'],
            batch_first=True,
        )
        self.decoder = cell_cls(
            emb_dim,
            hidden_size,
            config['decoder_layers'],
            dropout=config['dropout'],
            batch_first=True,
        )

        # Projects decoder outputs to scores over the target vocabulary
        self.fc_out = nn.Linear(hidden_size, tgt_vocab_size)

    @staticmethod
    def _fit_layers(state, required_layers):
        """Resize ``state`` along dim 0 to exactly ``required_layers`` entries.

        Too few layers: zero layers are appended. Otherwise the last
        ``required_layers`` layers are kept.
        """
        available = state.size(0)
        if available >= required_layers:
            return state[-required_layers:]
        filler = torch.zeros(
            required_layers - available,
            state.size(1),
            state.size(2),
            device=state.device,
            dtype=state.dtype,
        )
        return torch.cat([state, filler], dim=0)

    def forward(self, src, tgt):
        """Encode ``src``, decode ``tgt`` from the encoder state, return vocab scores."""
        decoder_depth = self.config['decoder_layers']

        _, encoder_state = self.encoder(self.embedding_src(src))

        if isinstance(encoder_state, tuple):
            # LSTM: both the hidden and the cell state need resizing
            decoder_state = tuple(
                self._fit_layers(part, decoder_depth) for part in encoder_state
            )
        else:
            # RNN / GRU: a single hidden-state tensor
            decoder_state = self._fit_layers(encoder_state, decoder_depth)

        decoder_output, _ = self.decoder(self.embedding_tgt(tgt), decoder_state)
        return self.fc_out(decoder_output)



# Function to calculate token-level accuracy
def calculate_accuracy(output, target, pad_idx):
    """Token-level accuracy over the non-padding positions of ``target``.

    Args:
        output: score tensor; the prediction is the argmax over dim 2.
        target: tensor of gold token ids, compared position-wise with the
            predictions.
        pad_idx: id marking padding; those positions are excluded.

    Returns:
        Fraction of non-padding positions predicted correctly, or ``0.0``
        when the batch contains no non-padding position (this previously
        raised ``ZeroDivisionError``).
    """
    preds = output.argmax(2)
    mask = target != pad_idx  # True where a real token is expected
    n_tokens = mask.sum().item()
    if n_tokens == 0:
        return 0.0
    n_correct = ((preds == target) & mask).sum().item()
    return n_correct / n_tokens

# Function to calculate word-level (sequence-level) accuracy
def compute_word_accuracy(output, target, tgt_index_to_token, pad_idx):
    """Sequence-level (exact match) accuracy for a batch.

    For every row, predicted ids (argmax over dim 2 of ``output``) and gold
    ids are mapped to tokens with ``tgt_index_to_token``. Ids equal to
    ``pad_idx`` are dropped, and each token list is cut at its first
    ``'<eos>'``. A row counts as correct when the two resulting lists are
    equal.

    Args:
        output: score tensor; predictions are taken along dim 2.
        target: tensor of gold token ids, one row per sequence.
        tgt_index_to_token: mapping from token id to token string.
        pad_idx: padding id to ignore on both sides.

    Returns:
        Fraction of rows matched exactly; ``0.0`` for an empty batch.
    """
    # Convert to plain Python lists once instead of calling .item() per
    # token, which costs a separate host transfer for every element.
    pred_rows = output.argmax(dim=2).tolist()
    tgt_rows = target.tolist()

    def decode(ids):
        tokens = [tgt_index_to_token[i] for i in ids if i != pad_idx]
        if '<eos>' in tokens:
            tokens = tokens[:tokens.index('<eos>')]
        return tokens

    total = 0
    correct = 0
    for pred_ids, tgt_ids in zip(pred_rows, tgt_rows):
        if decode(pred_ids) == decode(tgt_ids):
            correct += 1
        total += 1

    return correct / total if total > 0 else 0.0



# Function to train the model
def train(model, dataloader, optimizer, criterion, tgt_pad_idx, tgt_index_to_token):
    """Run one training epoch and return per-batch averaged metrics.

    The decoder is fed ``tgt[:, :-1]`` and scored against ``tgt[:, 1:]``.
    Batches are moved to the module-level ``device``.

    Returns:
        ``(mean_loss, mean_token_accuracy, mean_word_accuracy)``, each
        averaged over the number of batches in ``dataloader``.
    """
    model.train()
    loss_sum = 0
    token_acc_sum = 0
    word_acc_sum = 0

    for src_batch, tgt_batch in dataloader:
        src_batch = src_batch.to(device)
        tgt_batch = tgt_batch.to(device)

        decoder_input = tgt_batch[:, :-1]  # everything except the last position
        expected = tgt_batch[:, 1:]        # same sequence shifted left by one

        optimizer.zero_grad()
        logits = model(src_batch, decoder_input)
        batch_loss = criterion(
            logits.reshape(-1, logits.shape[-1]),
            expected.reshape(-1),
        )

        token_acc = calculate_accuracy(logits, expected, tgt_pad_idx)
        word_acc = compute_word_accuracy(logits, expected, tgt_index_to_token, tgt_pad_idx)

        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        token_acc_sum += token_acc
        word_acc_sum += word_acc

    n_batches = len(dataloader)
    return loss_sum / n_batches, token_acc_sum / n_batches, word_acc_sum / n_batches

# Function to evaluate the model
def evaluate(model, dataloader, criterion, tgt_pad_idx, tgt_index_to_token):
    """Score ``model`` on ``dataloader`` without updating it.

    Uses the same input/target shift as training: the decoder receives
    ``tgt[:, :-1]`` and is compared with ``tgt[:, 1:]``. Batches are moved
    to the module-level ``device`` and gradients are disabled.

    Returns:
        ``(mean_loss, mean_token_accuracy, mean_word_accuracy)``, each
        averaged over the number of batches in ``dataloader``.
    """
    model.eval()
    loss_sum = 0
    token_acc_sum = 0
    word_acc_sum = 0

    with torch.no_grad():
        for src_batch, tgt_batch in dataloader:
            src_batch = src_batch.to(device)
            tgt_batch = tgt_batch.to(device)
            expected = tgt_batch[:, 1:]

            logits = model(src_batch, tgt_batch[:, :-1])
            batch_loss = criterion(
                logits.reshape(-1, logits.shape[-1]),
                expected.reshape(-1),
            )

            loss_sum += batch_loss.item()
            token_acc_sum += calculate_accuracy(logits, expected, tgt_pad_idx)
            word_acc_sum += compute_word_accuracy(
                logits, expected, tgt_index_to_token, tgt_pad_idx
            )

    n_batches = len(dataloader)
    return loss_sum / n_batches, token_acc_sum / n_batches, word_acc_sum / n_batches

def sweep_train():  # Function to train with W&B sweep
    """Run one W&B sweep trial: load data, train a ``Seq2Seq``, log metrics.

    Hyperparameters come from ``wandb.config`` (``epochs`` plus everything
    ``Seq2Seq`` reads). The model is placed on the module-level ``device``.
    Per-epoch train/dev loss, token accuracy and word accuracy are printed
    and logged to W&B (accuracies as percentages).
    """
    wandb.init()  # Start the run; the sweep agent supplies the config
    config = wandb.config

    columns = ["tgt", "src", "freq"]

    # Load train dataset
    train_df = pd.read_csv("/kaggle/input/dakshina/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.train.tsv", sep="\t", header=None, names=columns)
    # Repeat rows based on frequency
    train_df = train_df.loc[train_df.index.repeat(train_df['freq'])].reset_index(drop=True)
    # Load dev dataset
    dev_df = pd.read_csv("/kaggle/input/dakshina/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.dev.tsv", sep="\t", header=None, names=columns)

    # Cast the text columns of BOTH splits to str. read_csv turns entries
    # such as "nan"/"null"/"NA" into float NaN, and a float reaching the
    # dataset's per-character loop raises TypeError. Previously only the
    # training split was cast.
    for frame in (train_df, dev_df):
        frame['src'] = frame['src'].astype(str)
        frame['tgt'] = frame['tgt'].astype(str)

    # Build vocabularies from the training split only
    src_vocab = build_vocab(train_df['src'])
    tgt_vocab = build_vocab(train_df['tgt'])

    print(src_vocab)  # Print source vocab
    print(tgt_vocab)  # Print target vocab

    # Index to token mapping, used by the word-accuracy metric
    tgt_index_to_token = {v: k for k, v in tgt_vocab.items()}
    pad_idx = tgt_vocab['<pad>']

    # Prepare training and dev data as (src, tgt) pairs
    train_data = list(zip(train_df['src'], train_df['tgt']))
    dev_data = list(zip(dev_df['src'], dev_df['tgt']))

    # Create datasets
    train_dataset = TransliterationDataset(train_data, src_vocab, tgt_vocab)
    dev_dataset = TransliterationDataset(dev_data, src_vocab, tgt_vocab)

    # Create dataloaders
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

    # Initialize model
    model = Seq2Seq(config, len(src_vocab), len(tgt_vocab)).to(device)

    # Adam with its default settings
    optimizer = optim.Adam(model.parameters())

    # Padding positions do not contribute to the loss
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

    # Training loop
    for epoch in range(config['epochs']):
        train_loss, train_acc, train_word_acc = train(model, train_loader, optimizer, criterion, pad_idx, tgt_index_to_token)
        val_loss, val_acc, val_word_acc = evaluate(model, dev_loader, criterion, pad_idx, tgt_index_to_token)

        # Print metrics
        print(f"Epoch {epoch + 1}")
        print(f"{'train_loss:':20} {train_loss:.4f}")
        print(f"{'val_loss:':20} {val_loss:.4f}")
        print(f"{'train_accuracy:':20} {train_acc * 100:.2f}%")
        print(f"{'val_accuracy:':20} {val_acc * 100:.2f}%")
        print(f"{'train_word_accuracy:':20} {train_word_acc * 100:.2f}%")
        print(f"{'val_word_accuracy:':20} {val_word_acc * 100:.2f}%")

        # Log to wandb
        wandb.log({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "val_loss": val_loss,
            "train_accuracy": train_acc * 100,
            "val_accuracy": val_acc * 100,
            "train_word_accuracy": train_word_acc * 100,
            "val_word_accuracy": val_word_acc * 100
        })



# Prefer CUDA when a GPU is visible; otherwise run on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Random-search sweep over the no-attention model's hyperparameters.
# Every entry under 'parameters' is a discrete list of candidate values.
sweep_config = {
    'method': 'random',
    'name': 'DakshinaSweepForPred',
    # The sweep is ranked by validation token accuracy
    'metric': dict(name='val_accuracy', goal='maximize'),
    'parameters': {
        param: {'values': candidates}
        for param, candidates in (
            ('embedding_dim', [32, 64, 128]),
            ('hidden_size', [64, 128]),
            ('encoder_layers', [1, 2, 3]),
            ('decoder_layers', [1, 2, 3]),
            ('cell_type', ['RNN', 'GRU', 'LSTM']),
            ('dropout', [0.2, 0.3]),
            ('epochs', [5, 10, 13, 15, 17, 20]),
        )
    },
}

# Register the sweep under the DL_A3 project
sweep_id = wandb.sweep(sweep_config, project="DL_A3")

# Execute a single sampled configuration with sweep_train
wandb.agent(sweep_id, function=sweep_train, count=1)


# Without Attention: Predict with beam search

In [None]:
# Define function to predict using beam search
def predict_with_beam_search(model, src_seq, src_vocab, tgt_vocab, beam_width=3, max_len=30):
    """Transliterate one source string with beam-search decoding.

    Args:
        model: a model exposing ``config``, ``embedding_src``, ``encoder``,
            ``embedding_tgt``, ``decoder`` and ``fc_out``.
        src_seq: source string; encoded per character with ``src_vocab``
            (unknown characters map to ``<unk>``) and wrapped in
            ``<sos>``/``<eos>``.
        src_vocab: source token -> id mapping.
        tgt_vocab: target token -> id mapping.
        beam_width: number of hypotheses kept per step (clamped to the size
            of the output distribution).
        max_len: maximum number of decoding steps.

    Returns:
        The highest-scoring hypothesis (sum of token log-probabilities) as a
        string, with the leading ``<sos>`` and any ``<pad>``/``<eos>``
        tokens removed.
    """
    model.eval()  # Set model to evaluation mode

    sos_token = tgt_vocab['<sos>']
    eos_token = tgt_vocab['<eos>']
    pad_token = tgt_vocab['<pad>']
    tgt_index_to_token = {v: k for k, v in tgt_vocab.items()}

    # Use the device the model actually lives on rather than a notebook global
    run_device = model.embedding_src.weight.device

    # The decoder depth is stored under 'decoder_layers' (the key Seq2Seq
    # reads). The previous lookup of 'decoder_num_layers' never matched and
    # silently fell back to 3, breaking every decoder that is not 3 deep.
    decoder_layers = model.config['decoder_layers']

    def fit_layers(state):
        """Zero-pad or truncate ``state`` (dim 0) to the decoder depth,
        mirroring what ``Seq2Seq.forward`` does during training."""
        have = state.size(0)
        if have < decoder_layers:
            filler = torch.zeros(
                decoder_layers - have, state.size(1), state.size(2),
                device=state.device, dtype=state.dtype,
            )
            return torch.cat([state, filler], dim=0)
        return state[-decoder_layers:].contiguous()

    with torch.no_grad():  # Disable gradient calculation
        src_ids = (
            [src_vocab['<sos>']]
            + [src_vocab.get(ch, src_vocab['<unk>']) for ch in src_seq]
            + [src_vocab['<eos>']]
        )
        src_tensor = torch.tensor(src_ids, device=run_device).unsqueeze(0)  # add batch dim

        _, hidden = model.encoder(model.embedding_src(src_tensor))

        if isinstance(hidden, tuple):  # LSTM: (h, c)
            h, c = hidden
            h, c = fit_layers(h), fit_layers(c)
        else:  # RNN / GRU: no cell state
            h, c = fit_layers(hidden), None

        # Each beam is (token ids so far, cumulative log-prob, h, c)
        beams = [(torch.tensor([sos_token], device=run_device), 0.0, h, c)]

        for _ in range(max_len):
            # Nothing left to expand once every hypothesis has emitted <eos>
            if all(beam[0][-1].item() == eos_token for beam in beams):
                break

            new_beams = []
            for seq, score, h, c in beams:
                if seq[-1].item() == eos_token:
                    new_beams.append((seq, score, h, c))  # finished: carry over
                    continue

                # Feed only the last token; the state carries the history
                embedded = model.embedding_tgt(seq[-1].unsqueeze(0).unsqueeze(0))

                if c is not None:
                    output, (h_new, c_new) = model.decoder(embedded, (h, c))
                else:
                    output, h_new = model.decoder(embedded, h)
                    c_new = None

                logits = model.fc_out(output.squeeze(1))
                log_probs = torch.log_softmax(logits, dim=-1).squeeze(0)

                # topk raises if k exceeds the number of classes
                k = min(beam_width, log_probs.size(-1))
                topk_log_probs, topk_indices = torch.topk(log_probs, k)

                for log_prob, idx in zip(topk_log_probs, topk_indices):
                    new_seq = torch.cat([seq, idx.unsqueeze(0)])
                    new_beams.append((new_seq, score + log_prob.item(), h_new, c_new))

            # Keep the best beam_width hypotheses
            beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]

        best_seq = beams[0][0]

        # Drop the leading <sos>, then skip <pad> and <eos>
        return ''.join(
            tgt_index_to_token[token_id]
            for token_id in best_seq[1:].tolist()
            if token_id not in (pad_token, eos_token)
        )


In [None]:
import csv

# def predict_and_show(model, dataset, src_vocab, tgt_vocab, idx_to_tgt, num_samples=10):
#     model.eval()
#     dataloader = DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

#     sos_idx = tgt_vocab['<sos>']
#     eos_idx = tgt_vocab['<eos>']
#     pad_idx = tgt_vocab['<pad>']

#     for i, (src_tensor, tgt_tensor) in enumerate(dataloader):
#         if i >= num_samples:
#             break

#         src_tensor = src_tensor.to(device)
#         tgt_tensor = tgt_tensor.to(device)

#         with torch.no_grad():
#             output = model(src_tensor, tgt_tensor[:, :-1])

#         # Get predicted token indices
#         pred_tokens = output.argmax(dim=2)[0].tolist()

#         # Convert indices to tokens, removing padding and after <eos>
#         input_tokens = [k for k,v in src_vocab.items() if v in src_tensor[0].tolist()]
#         actual_tokens = [idx_to_tgt[idx.item()] for idx in tgt_tensor[0] if idx.item() not in [sos_idx, eos_idx, pad_idx]]
#         predicted_tokens = []
#         for idx in pred_tokens:
#             if idx == eos_idx:
#                 break
#             if idx not in [sos_idx, pad_idx]:
#                 predicted_tokens.append(idx_to_tgt.get(idx, '?'))

#         # Print the result
#         input_text = ''.join([k for k,v in src_vocab.items() if v in src_tensor[0].tolist() and v not in [sos_idx, eos_idx, pad_idx]])
#         print("Input text:     ", input_text)
#         print("Actual text:    ", ''.join(actual_tokens))
#         print("Predicted text: ", ''.join(predicted_tokens))
#         print("-" * 30)


def sweep_train_pred():
    """Train a ``Seq2Seq`` for one W&B run, then decode the test set.

    Hyperparameters come from ``wandb.config`` (``epochs``, ``beam_width``
    and everything ``Seq2Seq`` reads). After every epoch the train, dev and
    test metrics are printed and logged to W&B. Once training ends, test
    inputs are decoded with ``predict_with_beam_search`` and the
    (input, actual, prediction) rows are written to
    ``beam_search_predictions.csv``.
    """
    wandb.init()  # Initialize a new Weights and Biases (wandb) run
    config = wandb.config  # Access configuration parameters from wandb

    columns = ["tgt", "src", "freq"]

    # Load training dataset and assign column names
    train_df = pd.read_csv("/kaggle/input/dakshina/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.train.tsv", sep="\t", header=None, names=columns)

    # Repeat rows in training data according to frequency
    train_df = train_df.loc[train_df.index.repeat(train_df['freq'])].reset_index(drop=True)

    # Load validation/dev dataset
    dev_df = pd.read_csv("/kaggle/input/dakshina/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.dev.tsv", sep="\t", header=None, names=columns)

    # Load test dataset
    test_df = pd.read_csv("/kaggle/input/dakshina/dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.test.tsv", sep="\t", header=None, names=columns)

    # Cast the text columns of ALL splits to str. read_csv turns entries such
    # as "nan"/"null"/"NA" into float NaN, and a float reaching the
    # per-character encoding loops raises TypeError. Previously only the
    # training split was cast.
    for frame in (train_df, dev_df, test_df):
        frame['src'] = frame['src'].astype(str)
        frame['tgt'] = frame['tgt'].astype(str)

    # Build vocabularies from the training split only
    src_vocab = build_vocab(train_df['src'])
    tgt_vocab = build_vocab(train_df['tgt'])

    # Print source vocabulary
    print(src_vocab)

    # Print target vocabulary
    print(tgt_vocab)

    # Map target vocabulary index to token (used by the word-accuracy metric)
    tgt_index_to_token = {v: k for k, v in tgt_vocab.items()}
    pad_idx = tgt_vocab['<pad>']

    # (src, tgt) pairs for each split
    train_data = list(zip(train_df['src'], train_df['tgt']))
    dev_data = list(zip(dev_df['src'], dev_df['tgt']))
    test_data = list(zip(test_df['src'], test_df['tgt']))

    # Dataset objects
    train_dataset = TransliterationDataset(train_data, src_vocab, tgt_vocab)
    dev_dataset = TransliterationDataset(dev_data, src_vocab, tgt_vocab)
    test_dataset = TransliterationDataset(test_data, src_vocab, tgt_vocab)

    # DataLoaders; only the training split is shuffled
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
    dev_loader = DataLoader(dev_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

    # Instantiate Seq2Seq model and move it to device (GPU/CPU)
    model = Seq2Seq(config, len(src_vocab), len(tgt_vocab)).to(device)

    # Define Adam optimizer
    optimizer = optim.Adam(model.parameters())

    # Define cross-entropy loss, ignoring padding token
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

    # Loop over epochs for training
    for epoch in range(config['epochs']):
        train_loss, train_acc, train_word_acc = train(model, train_loader, optimizer, criterion, pad_idx, tgt_index_to_token)
        val_loss, val_acc, val_word_acc = evaluate(model, dev_loader, criterion, pad_idx, tgt_index_to_token)
        test_loss, test_acc, test_word_acc = evaluate(model, test_loader, criterion, pad_idx, tgt_index_to_token)

        # Print metrics for this epoch
        print(f"Epoch {epoch + 1}")
        print(f"{'train_loss:':20} {train_loss:.4f}")
        print(f"{'val_loss:':20} {val_loss:.4f}")
        print(f"{'test_loss:':20} {test_loss:.4f}")
        print(f"{'train_accuracy:':20} {train_acc * 100:.2f}%")
        print(f"{'val_accuracy:':20} {val_acc * 100:.2f}%")
        print(f"{'test_accuracy:':20} {test_acc * 100:.2f}%")
        print(f"{'train_word_accuracy:':20} {train_word_acc * 100:.2f}%")
        print(f"{'val_word_accuracy:':20} {val_word_acc * 100:.2f}%")
        print(f"{'test_word_accuracy:':20} {test_word_acc * 100:.2f}%")

        # Log metrics to Weights & Biases
        wandb.log({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "val_loss": val_loss,
            "test_loss": test_loss,
            "train_accuracy": train_acc * 100,
            "val_accuracy": val_acc * 100,
            "test_accuracy": test_acc * 100,
            "train_word_accuracy": train_word_acc * 100,
            "val_word_accuracy": val_word_acc * 100,
            "test_word_accuracy": test_word_acc * 100
        })

    # Beam width for decoding; fall back to the decoder's default of 3 when
    # the sweep does not define it (passing None would break top-k selection)
    beam_width = config.get('beam_width') or 3
    print('beam_width', beam_width)

    # Initialize list to store prediction results
    results = []

    # Decode at most the first 9229 test pairs.
    # NOTE(review): 9229 looks like it was chosen to match the test-set size — confirm.
    for sample_src, actual_tgt in test_data[:9229]:
        # predict_with_beam_search already returns a plain string with
        # <pad>/<eos> removed, so no further filtering is needed here
        pred_str = predict_with_beam_search(model, sample_src, src_vocab, tgt_vocab, beam_width=beam_width)

        # Print input, actual, and predicted strings
        print(f"Input:      {sample_src}")
        print(f"Actual:     {actual_tgt}")
        print(f"Prediction: {pred_str}")
        print("-" * 30)

        # Add the result to results list
        results.append({
            "Input": sample_src,
            "Actual": actual_tgt,
            "Prediction": pred_str
        })

    # Define output CSV path
    output_csv_path = "beam_search_predictions.csv"

    # Save all results to a CSV file (utf-8-sig writes a BOM at the start)
    with open(output_csv_path, mode='w', encoding='utf-8-sig', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=["Input", "Actual", "Prediction"])
        writer.writeheader()
        writer.writerows(results)

    # Print confirmation message
    print(f"Saved predictions to {output_csv_path}")

# Prefer CUDA when a GPU is visible; otherwise run on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Single-point "sweep": every hyperparameter has exactly one candidate value,
# so the agent trains one fixed configuration and then runs beam-search
# prediction with it.
sweep_config = {
    'method': 'random',
    'name': 'DakshinaSweepForPred_Best_without_attn_nothing',
    # Metric W&B uses to rank runs in this sweep
    'metric': dict(name='val_accuracy', goal='maximize'),
    'parameters': {
        param: {'values': [chosen]}
        for param, chosen in (
            ('embedding_dim', 256),
            ('hidden_size', 128),
            ('encoder_layers', 2),
            ('decoder_layers', 3),
            ('cell_type', 'LSTM'),
            ('dropout', 0.3),
            ('epochs', 1),
            ('beam_width', 3),
        )
    },
}

# Register the sweep under the DL_A3 project and keep its ID
sweep_id = wandb.sweep(sweep_config, project="DL_A3")

# Execute exactly one run of sweep_train_pred for this sweep
wandb.agent(sweep_id, function=sweep_train_pred, count=1)


Create sweep with ID: 7fce309f
Sweep URL: https://wandb.ai/ma23m011-iit-madras/DL_A3/sweeps/7fce309f


[34m[1mwandb[0m: Agent Starting Run: 0wxhv9b3 with config:
[34m[1mwandb[0m: 	beam_width: 3
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	decoder_layers: 3
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 256
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 1
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


{'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3, 'a': 4, 'n': 5, 'g': 6, 'k': 7, 'i': 8, 't': 9, 'o': 10, 'e': 11, 'r': 12, 's': 13, 'h': 14, 'y': 15, 'w': 16, 'u': 17, 'l': 18, 'd': 19, 'j': 20, 'b': 21, 'm': 22, 'c': 23, 'q': 24, 'z': 25, 'p': 26, 'x': 27, 'v': 28, 'f': 29}
{'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3, 'অ': 4, 'ং': 5, 'ক': 6, 'ি': 7, 'ত': 8, 'ে': 9, 'র': 10, 'শ': 11, 'ই': 12, 'ও': 13, 'গ': 14, 'ু': 15, 'ল': 16, 'ো': 17, '্': 18, 'হ': 19, 'ণ': 20, 'া': 21, 'ী': 22, 'দ': 23, 'ন': 24, 'ট': 25, 'ব': 26, 'ষ': 27, 'ম': 28, 'স': 29, 'খ': 30, 'য': 31, 'ড': 32, 'ৎ': 33, 'ধ': 34, 'ঠ': 35, 'জ': 36, 'প': 37, 'ূ': 38, 'চ': 39, 'ছ': 40, 'ভ': 41, 'ঘ': 42, 'ঙ': 43, 'ৈ': 44, 'ঞ': 45, '়': 46, 'ঃ': 47, 'এ': 48, 'থ': 49, 'ৃ': 50, 'ৌ': 51, 'ফ': 52, 'ঝ': 53, 'আ': 54, 'উ': 55, 'ঁ': 56, 'ঈ': 57, 'ঊ': 58, 'ঋ': 59, 'ঐ': 60, 'ঔ': 61, 'ঢ': 62, '২': 63}
Epoch 1
train_loss:          1.4595
val_loss:            0.7188
test_loss:           0.7149
train_accuracy:      58.07%
val_accuracy:       

0,1
epoch,▁
test_accuracy,▁
test_loss,▁
test_word_accuracy,▁
train_accuracy,▁
train_loss,▁
train_word_accuracy,▁
val_accuracy,▁
val_loss,▁
val_word_accuracy,▁

0,1
epoch,1.0
test_accuracy,76.95625
test_loss,0.71495
test_word_accuracy,16.62702
train_accuracy,58.07133
train_loss,1.4595
train_word_accuracy,6.58488
val_accuracy,77.33357
val_loss,0.71883
val_word_accuracy,17.24242
