

## Installation


In [2]:
pip install wandb numpy pandas matplotlib torch torchvision keras

Note: you may need to restart the kernel to use updated packages.


## Libraries

In [3]:
import torch
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import wandb
from torch.utils.data import TensorDataset, DataLoader

# Set the CUDA allocator option before the first CUDA query below.
# NOTE(review): the allocator reads this variable when it is initialised, so the
# safest place is before anything touches CUDA (ideally before importing torch).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Pick the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


## Dataset Loader

In [None]:
# preprocessing all train,test,validation dataset files
def load_data(train_path, val_path, test_path):
  """Read the three tab-separated split files into lists of string pairs.

  Each kept line must have exactly three tab-separated columns after the
  line itself is stripped; any other line is skipped. Only the first two
  columns are returned (each stripped); the third is ignored.

  Returns:
    A list ``[train, val, test]`` where each element is a list of
    ``(first_column, second_column)`` tuples in file order.
  """
  def read_pairs(path):
    # Collect the (col0, col1) pairs of one file, dropping malformed rows.
    pairs = []
    with open(path, encoding="utf-8") as handle:
      for raw_line in handle:
        fields = raw_line.strip().split("\t")
        if len(fields) == 3:
          pairs.append((fields[0].strip(), fields[1].strip()))
    return pairs

  return [read_pairs(path) for path in (train_path, val_path, test_path)]

## Build Vocabulary

In [None]:
# Building vocabulary from training data
def buildVocabulary(data):
  """Collect the input and target character vocabularies from ``data``.

  Args:
    data: iterable of ``(input_text, target_text)`` string pairs.

  Returns:
    ``(input_characters, target_characters)`` as lists. Index 0 of both is
    the padding token ``' '``; the target list additionally reserves
    ``'\\t'`` (start) at index 1 and ``'\\n'`` (end) at index 2. The
    remaining characters follow in sorted order.
  """
  input_characters = set()   # sets keep each character once
  target_characters = set()
  for source_text, target_text in data:
    input_characters.update(source_text)
    target_characters.update(target_text)

  # The reserved tokens are prepended below. If they also occur in the data they
  # must not be listed twice: a duplicate would move the padding token off
  # index 0 and make the largest index exceed the number of distinct tokens,
  # while the rest of the notebook uses 0 as padding_idx / ignore_index.
  input_characters.discard(' ')
  target_characters.difference_update((' ', '\t', '\n'))

  input_characters = [' '] + sorted(input_characters)       # padding token is ' '
  target_characters = [' ', '\t', '\n'] + sorted(target_characters)   # start token: '\t'; end token:'\n'
  return input_characters, target_characters

In [None]:
# creating dictionaries to get indices for any input, target character
def generateTokenIndices(input_characters, target_characters):
  """Map every character to its position in the corresponding vocabulary list.

  Returns:
    ``(input_token_index, target_token_index)``: two dicts from character
    to integer index. If a character appears more than once in a list, the
    last position wins.
  """
  def index_by_char(characters):
    # Later occurrences overwrite earlier ones, as with dict() on pairs.
    return {char: position for position, char in enumerate(characters)}

  return index_by_char(input_characters), index_by_char(target_characters)

In [None]:
def generateEmbeddings(data, input_token_index, target_token_index, max_encoder_seq_length, max_decoder_seq_length):
    """Encode string pairs as fixed-width integer index arrays.

    Despite the name, this returns token *indices*, not embedding vectors.

    Args:
        data: sequence of ``(input_text, target_text)`` string pairs.
        input_token_index: dict mapping input characters to indices; must contain ``" "``.
        target_token_index: dict mapping target characters to indices; must
            contain ``" "``. ``"\\t"`` and ``"\\n"`` are looked up as start/end tokens.
        max_encoder_seq_length: width of the encoder array.
        max_decoder_seq_length: width of both decoder arrays.

    Returns:
        ``(encoder_input_data, decoder_input_data, decoder_target_data)`` as
        int64 arrays with ``len(data)`` rows. ``decoder_target_data`` is
        ``decoder_input_data`` shifted left by one step. Characters missing
        from a vocabulary are encoded as the padding index. Sequences longer
        than the given width are truncated instead of raising IndexError
        (the widths are typically derived from the training split only).
    """
    num_examples = len(data)
    encoder_pad = input_token_index[" "]
    decoder_pad = target_token_index[" "]

    # Pre-fill with the padding index so only real tokens need writing.
    encoder_input_data = np.full((num_examples, max_encoder_seq_length), encoder_pad, dtype="int64")
    decoder_input_data = np.full((num_examples, max_decoder_seq_length), decoder_pad, dtype="int64")
    decoder_target_data = np.full((num_examples, max_decoder_seq_length), decoder_pad, dtype="int64")

    for row, (input_text, target_text) in enumerate(data):
        target_text = "\t" + target_text + "\n"  # Adding start and end tokens

        # Encoder sequence, clipped to the array width.
        encoder_ids = [input_token_index.get(char, encoder_pad)
                       for char in input_text[:max_encoder_seq_length]]
        if encoder_ids:
            encoder_input_data[row, :len(encoder_ids)] = encoder_ids

        # Decoder input is the full sequence; the target is the same sequence
        # shifted one step ahead. Both are clipped to the array width.
        decoder_ids = [target_token_index.get(char, decoder_pad) for char in target_text]
        input_ids = decoder_ids[:max_decoder_seq_length]
        target_ids = decoder_ids[1:max_decoder_seq_length + 1]
        if input_ids:
            decoder_input_data[row, :len(input_ids)] = input_ids
        if target_ids:
            decoder_target_data[row, :len(target_ids)] = target_ids

    return encoder_input_data, decoder_input_data, decoder_target_data

In [8]:
# train_path = "/kaggle/input/assignment3-telugu-dakshinadataset/te.translit.sampled.train.tsv"
# val_path = "/kaggle/input/assignment3-telugu-dakshinadataset/te.translit.sampled.dev.tsv"
# test_path = "/kaggle/input/assignment3-telugu-dakshinadataset/te.translit.sampled.test.tsv"
# train_data, val_data, test_data = load_data(train_path, val_path, test_path)
# input_characters, target_characters = buildVocabulary(train_data)
# input_token_index, target_token_index = generateTokenIndices(input_characters, target_characters)
# max_encoder_seq_length = max([len(input) for input, _ in train_data])
# max_decoder_seq_length = max([len(target) for _, target in train_data]) + 2

# train_encoder_input_data, train_decoder_input_data, train_decoder_target_data = generateEmbeddings(train_data, input_token_index, target_token_index, max_encoder_seq_length, max_decoder_seq_length)
# val_encoder_input_data, val_decoder_input_data, val_decoder_target_data = generateEmbeddings(val_data, input_token_index, target_token_index, max_encoder_seq_length, max_decoder_seq_length)
# test_encoder_input_data, test_decoder_input_data, test_decoder_target_data = generateEmbeddings(test_data, input_token_index, target_token_index, max_encoder_seq_length, max_decoder_seq_length)

In [None]:
import torch.nn as nn
import torch

class Seq2SeqModel(nn.Module):
    """Encoder-decoder sequence model over token indices.

    The encoder's final recurrent state is flattened, passed through a linear
    layer and reshaped into the decoder's initial state, so encoder and decoder
    may use different numbers of layers. The decoder consumes the full
    ``decoder_input`` sequence in one call (teacher forcing).

    ``config`` must expose ``embedding_dim``, ``hidden_size``,
    ``encoder_layers``, ``decoder_layers``, ``dropout`` and ``cell_type``
    (one of ``'RNN'``, ``'GRU'``, ``'LSTM'``). Index 0 is the padding index of
    both embedding tables.
    """

    def __init__(self, config, input_vocab_size, target_vocab_size):
        super().__init__()
        self.config = config
        embedding_dim = config.embedding_dim
        hidden_size = config.hidden_size

        # Token embeddings; row 0 is the padding entry.
        self.embedding_encoder = nn.Embedding(input_vocab_size, embedding_dim, padding_idx=0)
        self.embedding_decoder = nn.Embedding(target_vocab_size, embedding_dim, padding_idx=0)

        # Recurrent layer class selected by the configured cell type.
        recurrent_classes = {'RNN': nn.RNN, 'GRU': nn.GRU, 'LSTM': nn.LSTM}
        recurrent_cls = recurrent_classes[config.cell_type]

        def build_stack(num_layers):
            # Inter-layer dropout is only enabled for stacks deeper than one layer.
            return recurrent_cls(
                input_size=embedding_dim,
                hidden_size=hidden_size,
                num_layers=num_layers,
                batch_first=True,
                dropout=config.dropout if num_layers > 1 else 0,
                bidirectional=False,
            )

        self.encoder = build_stack(config.encoder_layers)
        self.decoder = build_stack(config.decoder_layers)

        # Linear maps from the flattened encoder state to the flattened decoder state.
        encoder_state_width = hidden_size * config.encoder_layers
        decoder_state_width = hidden_size * config.decoder_layers
        self.h_proj = nn.Linear(encoder_state_width, decoder_state_width)
        # Only LSTMs carry a cell state that needs its own projection.
        if config.cell_type == 'LSTM':
            self.c_proj = nn.Linear(encoder_state_width, decoder_state_width)
        else:
            self.c_proj = None

        # Maps each decoder output step to vocabulary logits.
        self.fc_out = nn.Linear(hidden_size, target_vocab_size)

    def _project_state(self, state, projection):
        """Reshape a ``(layers, batch, hidden)`` encoder state for the decoder."""
        batch_size = state.size(1)
        flattened = state.permute(1, 0, 2).contiguous().view(batch_size, -1)  # (batch, layers*hidden)
        projected = projection(flattened)                                     # (batch, dec_layers*hidden)
        per_layer = projected.view(batch_size, self.config.decoder_layers, self.config.hidden_size)
        return per_layer.permute(1, 0, 2).contiguous()                        # (dec_layers, batch, hidden)

    def _transform_hidden(self, h_enc):
        """Project the encoder hidden state into the decoder's initial hidden state."""
        return self._project_state(h_enc, self.h_proj)

    def _transform_cell(self, c_enc):
        """Project the encoder cell state; ``None`` is passed through unchanged."""
        if c_enc is None:
            return None
        return self._project_state(c_enc, self.c_proj)

    def forward(self, encoder_input, decoder_input):
        """Return logits of shape ``(batch, decoder_steps, target_vocab_size)``."""
        encoder_embedded = self.embedding_encoder(encoder_input)
        decoder_embedded = self.embedding_decoder(decoder_input)

        _, encoder_state = self.encoder(encoder_embedded)
        if self.config.cell_type == 'LSTM':
            # LSTM state is an (h, c) pair; each part has its own projection.
            hidden, cell = encoder_state
            decoder_state = (self._transform_hidden(hidden), self._transform_cell(cell))
        else:
            decoder_state = self._transform_hidden(encoder_state)

        decoder_outputs, _ = self.decoder(decoder_embedded, decoder_state)
        return self.fc_out(decoder_outputs)



# ==================== TRAINING ====================
def train_model(model, train_data, val_data, config, target_vocab_size, device):
    """Train ``model`` with teacher forcing and log per-epoch metrics to wandb.

    Args:
        model: module called as ``model(encoder_batch, decoder_input_batch)``
            that returns logits whose last dimension is ``target_vocab_size``.
        train_data: ``(encoder_input, decoder_input, decoder_target)`` index arrays.
        val_data: the same triple for the validation split.
        config: object exposing ``lr`` and ``epochs``.
        target_vocab_size: number of target classes, used to flatten the logits.
        device: torch device the model and batches are moved to.

    Logged to wandb once per epoch, in two calls:
        ``epoch`` and ``train_loss``, then ``val_loss`` and ``val_accuracy``.
        Both losses are the *sum* of the per-batch mean losses, so they scale
        with the number of batches. ``val_accuracy`` is token-level accuracy
        over non-padding target positions, and 0.0 when there are none.
    """
    batch_size = 32

    model.to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=0)   # target index 0 is padding and is excluded from the loss
    optimizer = optim.Adam(model.parameters(), lr=config.lr)

    train_enc, train_dec, train_tgt = (torch.as_tensor(a, dtype=torch.long) for a in train_data)
    val_enc, val_dec, val_tgt = (torch.as_tensor(a, dtype=torch.long) for a in val_data)

    # Both loaders are loop-invariant, so build them once rather than per epoch.
    train_loader = DataLoader(TensorDataset(train_enc, train_dec, train_tgt),
                              batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(TensorDataset(val_enc, val_dec, val_tgt), batch_size=batch_size)

    for epoch in range(config.epochs):
        model.train()
        total_loss = 0

        for encoder_batch, decoder_input_batch, decoder_target_batch in train_loader:
            optimizer.zero_grad()
            output = model(encoder_batch.to(device), decoder_input_batch.to(device))
            loss = criterion(output.view(-1, target_vocab_size), decoder_target_batch.to(device).view(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        wandb.log({
            "epoch": epoch + 1,
            "train_loss": total_loss,
        })

        model.eval()
        with torch.no_grad():
            correct_tokens = 0
            total_tokens = 0
            val_loss = 0

            for val_enc_batch, val_dec_batch, val_tgt_batch in val_loader:
                val_enc_batch = val_enc_batch.to(device)
                val_dec_batch = val_dec_batch.to(device)
                val_tgt_batch = val_tgt_batch.to(device)

                output = model(val_enc_batch, val_dec_batch)
                loss = criterion(output.view(-1, target_vocab_size), val_tgt_batch.view(-1))
                val_loss += loss.item()

                predictions = output.argmax(dim=-1)
                mask = val_tgt_batch != 0  # ignore padding
                correct_tokens += (predictions == val_tgt_batch).masked_select(mask).sum().item()
                total_tokens += mask.sum().item()

            # An empty or all-padding validation split must not crash the run.
            val_accuracy = correct_tokens / total_tokens if total_tokens else 0.0

            wandb.log({"val_loss": val_loss, "val_accuracy": val_accuracy})

In [9]:
# ==================== SWEEP CONFIG ====================
# Bayesian hyper-parameter search that maximises the logged 'val_accuracy'.
# Entries under 'parameters' with 'values' are searched over; 'value' is fixed.
sweep_config = dict(
    name='sweep_final1',
    method='bayes',
    metric=dict(name='val_accuracy', goal='maximize'),
    parameters=dict(
        embedding_dim=dict(values=[16, 32, 64]),
        hidden_size=dict(values=[32, 64, 128]),
        encoder_layers=dict(values=[1, 2]),
        decoder_layers=dict(values=[1, 2]),
        cell_type=dict(values=['RNN', 'GRU', 'LSTM']),
        dropout=dict(values=[0.2, 0.3]),
        lr=dict(values=[0.001]),
        epochs=dict(value=10),
    ),
)

Replace the train, validation, and test dataset paths in the cell below, where marked, with your own.

In [None]:
def sweep_train(
    train_path="/kaggle/input/assignment3-telugu-dakshinadataset/te.translit.sampled.train.tsv",
    val_path="/kaggle/input/assignment3-telugu-dakshinadataset/te.translit.sampled.dev.tsv",
    test_path="/kaggle/input/assignment3-telugu-dakshinadataset/te.translit.sampled.test.tsv",
):
    """Run one wandb sweep trial: load data, build the model, train, and log.

    Callable with no arguments, which is how it is passed to ``wandb.agent``.
    The dataset locations can be overridden through the keyword arguments
    instead of editing the function body.

    Args:
        train_path: TSV file used for training and for building the vocabularies.
        val_path: TSV file used for the per-epoch validation metrics.
        test_path: TSV file for the test split. It is loaded with the other
            splits but not used during training.
    """
    wandb.init()
    config = wandb.config
    wandb.run.name = f"{config.cell_type}_enc_{config.encoder_layers}_dec_{config.decoder_layers}_{config.epochs}_emb_{config.embedding_dim}_hs_{config.hidden_size}"

    # load input-output string pairs from each split
    train_data, val_data, _test_data = load_data(train_path, val_path, test_path)

    # vocabularies and character-to-index mappings come from the training split only
    input_characters, target_characters = buildVocabulary(train_data)
    input_token_index, target_token_index = generateTokenIndices(input_characters, target_characters)

    # array widths: longest training input, and longest training target plus
    # two positions for the start and end tokens
    max_encoder_seq_length = max(len(source_text) for source_text, _ in train_data)
    max_decoder_seq_length = max(len(target_text) for _, target_text in train_data) + 2

    # Only the splits used for training are encoded; the encoded test arrays
    # were never read, so that work is skipped.
    train_arrays = generateEmbeddings(train_data, input_token_index, target_token_index, max_encoder_seq_length, max_decoder_seq_length)
    val_arrays = generateEmbeddings(val_data, input_token_index, target_token_index, max_encoder_seq_length, max_decoder_seq_length)

    model = Seq2SeqModel(config, len(input_token_index), len(target_token_index))

    # train the model and log metrics to wandb
    train_model(model, train_arrays, val_arrays, config, len(target_token_index), device)

Provide your own wandb API key, project, and entity details in the cell below.

In [None]:
# Do not paste the API key into the notebook: read it from the WANDB_API_KEY
# environment variable. When it is unset, key=None is passed and wandb uses
# its own credential lookup.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# Register the sweep, then run trials of sweep_train in this process.
sweep_id = wandb.sweep(sweep_config, project='DA6401_Assignment3', entity="nikhithaa-iit-madras")
wandb.agent(sweep_id, function=sweep_train, count=50)   # remove count if you dont want to limit number of runs

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnikhithaa[0m ([33mnikhithaa-iit-madras[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Create sweep with ID: i6pogcjt
Sweep URL: https://wandb.ai/nikhithaa-iit-madras/DA6401_Assignment3/sweeps/i6pogcjt


[34m[1mwandb[0m: Agent Starting Run: yb5efjel with config:
[34m[1mwandb[0m: 	cell_type: LSTM
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▅▄▃▂▂▂▁▁▁
val_accuracy,▁▄▅▆▇▇▇███
val_loss,█▅▄▃▂▂▂▁▁▁

0,1
epoch,10.0
train_loss,1231.54235
val_accuracy,0.81161
val_loss,102.90616


[34m[1mwandb[0m: Agent Starting Run: 4mkqoq24 with config:
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	encoder_layers: 2
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▃▂▂▁▁▁▁▁▁
val_accuracy,▁▅▇▇▇█████
val_loss,█▄▂▂▂▁▁▁▁▁

0,1
epoch,10.0
train_loss,563.46399
val_accuracy,0.8779
val_loss,62.30132


[34m[1mwandb[0m: Agent Starting Run: 9wxlw2ka with config:
[34m[1mwandb[0m: 	cell_type: RNN
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout: 0.2
[34m[1mwandb[0m: 	embedding_dim: 64
[34m[1mwandb[0m: 	encoder_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 64
[34m[1mwandb[0m: 	lr: 0.001


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▄▃▂▂▂▁▁▁▁
val_accuracy,▁▄▆▇▇█▇▇█▇
val_loss,█▅▃▃▂▂▂▁▁▁

0,1
epoch,10.0
train_loss,3503.59712
val_accuracy,0.38551
val_loss,348.67116


[34m[1mwandb[0m: Agent Starting Run: 3i1wsj9a with config:
[34m[1mwandb[0m: 	cell_type: GRU
[34m[1mwandb[0m: 	decoder_layers: 2
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	embedding_dim: 32
[34m[1mwandb[0m: 	encoder_layers: 1
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	hidden_size: 32
[34m[1mwandb[0m: 	lr: 0.001
