In [None]:
# Import required libraries
import os
import numpy as np
from tensorflow.keras import layers, models, preprocessing
from tensorflow.keras.optimizers import Adam
import wandb
from wandb.integration.keras import WandbMetricsLogger, WandbModelCheckpoint


In [None]:
# Adding and Verifying GPU 
import torch
import tensorflow as tf

# Check PyTorch device availability (query CUDA once and reuse the answer)
cuda_available = torch.cuda.is_available()
print("PyTorch CUDA available:", cuda_available)
torch_device = torch.device("cuda" if cuda_available else "cpu")
print(f"PyTorch using device: {torch_device}")

# Check TensorFlow device availability.
# tf.test.is_gpu_available() is deprecated; the physical-device list that is
# printed anyway already tells us whether TensorFlow can see a GPU.
tf_gpu_devices = tf.config.list_physical_devices('GPU')
print("\nTensorFlow GPU devices:", tf_gpu_devices)
print(f"TensorFlow using device: {'GPU' if tf_gpu_devices else 'CPU'}")

# Set random seeds for reproducibility
def set_seeds(seed=42):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy,
            TensorFlow and PyTorch (plus all CUDA devices when CUDA is
            available).
    """
    # Imported locally: none of the notebook's import cells bring in `random`,
    # so without this the first statement below raises NameError.
    import random

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Seed all RNGs once, using the function's default seed (42).
set_seeds()

In [None]:
# Data Preparation
class DataLoader:
    """Read tab-separated lexicon files and encode them for a seq2seq model.

    Column 0 of every row is used as the target (decoder) text and column 1
    as the source (encoder) text. Text is tokenized character by character
    and zero-padded on the right.
    """

    def __init__(self, data_path):
        """Store the directory that holds the lexicon files.

        Args:
            data_path: Directory that ``load_dataset`` resolves file names
                against.
        """
        self.data_path = data_path
        # Both tokenizers stay None until preprocess_data() fits them.
        self.input_tokenizer = None
        self.target_tokenizer = None

    def load_dataset(self, filename):
        """Load a TSV file into a list of column lists.

        Args:
            filename: File name relative to ``self.data_path``.

        Returns:
            list[list[str]]: One entry per usable line, split on tabs. Lines
            that do not have at least two columns after surrounding
            whitespace is stripped are skipped.
        """
        rows = []
        with open(os.path.join(self.data_path, filename), encoding='utf-8') as f:
            for line in f:
                fields = line.strip().split('\t')
                # Check the column count *after* stripping: a line such as
                # "abc\t\n" contains a tab but collapses to a single column,
                # which would later fail on row[1].
                if len(fields) >= 2:
                    rows.append(fields)
        return rows

    def preprocess_data(self, train_file, val_file):
        """Fit both tokenizers and build the padded training arrays.

        Args:
            train_file: TSV file providing the training pairs.
            val_file: TSV file whose source column is only used to extend the
                input tokenizer's vocabulary.

        Returns:
            tuple: ``((train_enc, train_dec_in, train_dec_out), target_tokenizer)``.
            ``train_dec_in`` holds the targets prefixed with a tab,
            ``train_dec_out`` the targets suffixed with a newline and given an
            extra trailing axis.

        Raises:
            ValueError: If the training file yields no usable rows.
        """
        train_data = self.load_dataset(train_file)
        val_data = self.load_dataset(val_file)
        if not train_data:
            # max() over an empty sequence would fail further down with a
            # message that does not mention the file.
            raise ValueError(f"No usable rows found in training file: {train_file}")

        # Prepare texts
        train_source = [x[1] for x in train_data]
        train_target = [x[0] for x in train_data]

        # '\t' marks the start of a target, '\n' marks its end.
        decoder_in_texts = ['\t' + t for t in train_target]
        decoder_out_texts = [t + '\n' for t in train_target]

        # Create tokenizers
        self.input_tokenizer = self._create_tokenizer(train_source + [x[1] for x in val_data])
        self.target_tokenizer = self._create_tokenizer(decoder_in_texts + decoder_out_texts)

        # Convert to sequences; +1 leaves room for the start/end marker.
        max_source_len = max(len(s) for s in train_source)
        max_target_len = max(len(t) for t in train_target) + 1

        train_enc = self._text_to_sequence(train_source, self.input_tokenizer, max_source_len)
        train_dec_in = self._text_to_sequence(decoder_in_texts, self.target_tokenizer, max_target_len)
        train_dec_out = np.expand_dims(
            self._text_to_sequence(decoder_out_texts, self.target_tokenizer, max_target_len),
            -1
        )

        return (train_enc, train_dec_in, train_dec_out), self.target_tokenizer

    def _create_tokenizer(self, texts):
        """Create a case-sensitive character-level tokenizer fitted on ``texts``."""
        tokenizer = preprocessing.text.Tokenizer(char_level=True, lower=False)
        tokenizer.fit_on_texts(texts)
        return tokenizer

    def _text_to_sequence(self, texts, tokenizer, max_len):
        """Convert texts to integer sequences post-padded to ``max_len``."""
        seq = tokenizer.texts_to_sequences(texts)
        return preprocessing.sequence.pad_sequences(seq, padding='post', maxlen=max_len)

In [None]:
# Model Building
def build_seq2seq_model(input_vocab_size, target_vocab_size, config):
    """Build the encoder-decoder training model.

    Args:
        input_vocab_size: Number of distinct encoder token ids (including the
            padding id 0).
        target_vocab_size: Number of distinct decoder token ids (including the
            padding id 0); also the width of the softmax output.
        config: Object exposing ``embedding_dim`` plus the attributes read by
            ``build_encoder`` and ``build_decoder``.

    Returns:
        A Keras model mapping ``[encoder_input, decoder_input]`` to a softmax
        over the target vocabulary for each decoder position.
    """
    # Input layers
    encoder_input = layers.Input(shape=(None,))
    decoder_input = layers.Input(shape=(None,))

    # Source and target use different tokenizers, so each side needs an
    # embedding table sized for its own vocabulary. A single table sized by
    # input_vocab_size cannot represent decoder ids beyond that range.
    encoder_embedding = layers.Embedding(
        input_vocab_size, config.embedding_dim, mask_zero=True,
        name='encoder_embedding')
    decoder_embedding = layers.Embedding(
        target_vocab_size, config.embedding_dim, mask_zero=True,
        name='decoder_embedding')

    # Encoder: only its final states are needed to initialise the decoder.
    _, encoder_states = build_encoder(encoder_input, encoder_embedding, config)

    # Decoder
    decoder_output = build_decoder(
        decoder_input, decoder_embedding, encoder_states, config)

    # Output layer
    output = layers.Dense(target_vocab_size, activation='softmax')(decoder_output)

    return models.Model([encoder_input, decoder_input], output)

def build_encoder(inputs, embedding, config):
    """Build the (optionally stacked) recurrent encoder.

    Args:
        inputs: Encoder input tensor of token ids.
        embedding: Callable embedding layer applied to ``inputs``.
        config: Object exposing ``rnn_type`` ('lstm', 'gru', anything else
            falls back to SimpleRNN), ``enc_layers``, ``hidden_dim`` and
            ``dropout``.

    Returns:
        tuple: ``(encoder_output, encoder_states)`` where ``encoder_output``
        is the output tensor of the last layer and ``encoder_states`` is the
        list of that layer's final state tensors.

    Raises:
        ValueError: If ``config.enc_layers`` is smaller than 1.
    """
    if config.enc_layers < 1:
        raise ValueError(f"enc_layers must be >= 1, got {config.enc_layers}")

    rnn_type = config.rnn_type.lower()
    rnn_classes = {'lstm': layers.LSTM, 'gru': layers.GRU}
    layer = rnn_classes.get(rnn_type, layers.SimpleRNN)

    x = embedding(inputs)
    states = []
    for i in range(config.enc_layers):
        # Every layer except the last must emit the full sequence so the next
        # layer has something to consume.
        is_last = (i == config.enc_layers - 1)
        # return_state=True makes the layer return [output, *states]. Unpack
        # it so the next layer receives only the output tensor, not the list.
        x, *states = layer(
            config.hidden_dim,
            return_sequences=not is_last,
            return_state=True,
            dropout=config.dropout,
            name=f'enc_{rnn_type}_{i}'
        )(x)

    return x, states

def build_decoder(inputs, embedding, initial_state, config):
    """Build the (optionally stacked) recurrent decoder.

    Args:
        inputs: Decoder input tensor of token ids.
        embedding: Callable embedding layer applied to ``inputs``.
        initial_state: State tensor(s) used to initialise every decoder layer.
        config: Object exposing ``rnn_type`` ('lstm', 'gru', anything else
            falls back to SimpleRNN), ``dec_layers``, ``hidden_dim`` and
            ``dropout``.

    Returns:
        The output sequence tensor of the last decoder layer.

    Raises:
        ValueError: If ``config.dec_layers`` is smaller than 1.
    """
    if config.dec_layers < 1:
        raise ValueError(f"dec_layers must be >= 1, got {config.dec_layers}")

    rnn_type = config.rnn_type.lower()
    rnn_classes = {'lstm': layers.LSTM, 'gru': layers.GRU}
    layer = rnn_classes.get(rnn_type, layers.SimpleRNN)

    x = embedding(inputs)
    for i in range(config.dec_layers):
        outputs = layer(
            config.hidden_dim,
            return_sequences=True,
            return_state=True,
            dropout=config.dropout,
            name=f'dec_{rnn_type}_{i}'
        )(x, initial_state=initial_state)
        # return_state=True yields [sequence, *states]. Only the sequence is
        # forwarded, so the next layer never receives a list together with an
        # explicit initial_state.
        x = outputs[0]

    return x


In [None]:
# Training with Weights & Biases
def train():
    """Train one model for a single W&B sweep trial.

    Hyperparameters are read from the run's config: ``batch_size`` here, and
    ``embedding_dim``, ``rnn_type``, ``enc_layers``, ``dec_layers``,
    ``hidden_dim`` and ``dropout`` inside the model-building helpers. Ten
    percent of the training arrays are held out for validation.
    """
    # The context manager closes the run even when data loading or training
    # raises, so a failed trial does not leave a dangling run behind.
    with wandb.init() as run:
        config = run.config

        # Load data
        loader = DataLoader("/kaggle/input/dakshinadataset/dakshina_dataset_v1.0/mr/lexicons")
        (train_enc, train_dec_in, train_dec_out), target_tokenizer = loader.preprocess_data(
            "mr.translit.sampled.train.tsv",
            "mr.translit.sampled.dev.tsv"
        )

        # Build model; +1 accounts for the padding id 0, which is not part of
        # word_index.
        model = build_seq2seq_model(
            len(loader.input_tokenizer.word_index) + 1,
            len(target_tokenizer.word_index) + 1,
            config
        )

        model.compile(
            optimizer=Adam(),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        model.fit(
            [train_enc, train_dec_in],
            train_dec_out,
            batch_size=config.batch_size,
            epochs=10,
            validation_split=0.1,
            callbacks=[
                WandbMetricsLogger(),
                # Explicit ".keras" suffix: recent Keras versions reject a
                # whole-model checkpoint path without it. This matches the
                # format used by best_model.save() later in the notebook.
                WandbModelCheckpoint("models/model.keras"),
            ],
            verbose=2
        )


In [None]:
# Configure and Run Sweep
sweep_config = {
    'method': 'bayes',
    # WandbMetricsLogger logs epoch-level metrics under an "epoch/" prefix, so
    # the sweep must optimise that exact key. A bare 'val_accuracy' is never
    # logged and the Bayesian search would have nothing to optimise.
    'metric': {'name': 'epoch/val_accuracy', 'goal': 'maximize'},
    'parameters': {
        'rnn_type': {'values': ['rnn', 'lstm', 'gru']},
        'embedding_dim': {'values': [64, 128, 256]},
        'hidden_dim': {'values': [128, 256, 512]},
        'enc_layers': {'values': [1, 2]},
        'dec_layers': {'values': [1, 2]},
        'dropout': {'values': [0.1, 0.2, 0.3]},
        'batch_size': {'values': [32, 64]},
        # NOTE(review): train() in this notebook never reads beam_width, so
        # this dimension does not influence the optimised metric.
        'beam_width': {'values': [1, 3, 5]}
    }
}

# Take the API key from the environment instead of pasting it into the
# notebook. When WANDB_API_KEY is unset, key=None is wandb.login's default.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
sweep_id = wandb.sweep(sweep_config, project="marathi-transliteration")
wandb.agent(sweep_id, function=train, count=15)

In [None]:
# import os
# import numpy as np
# from tensorflow.keras import layers, models, preprocessing
# from tensorflow.keras.optimizers import Adam
# import wandb
# from wandb.integration.keras import WandbMetricsLogger, WandbModelCheckpoint

# class TransliterationSystem:
#     def __init__(self, data_dir):
#         self.data_dir = data_dir
#         self.input_tokenizer = None
#         self.target_tokenizer = None
        
#     def _load_dataset(self, filename):
#         """Load TSV file into list of pairs"""
#         with open(os.path.join(self.data_dir, filename), encoding='utf-8') as f:
#             return [line.strip().split('\t') for line in f if '\t' in line]
    
#     def prepare_data(self):
#         """Load and preprocess training/validation data"""
#         train_data = self._load_dataset("mr.translit.sampled.train.tsv")
#         val_data = self._load_dataset("mr.translit.sampled.dev.tsv")
        
#         # Process text pairs
#         train_source = [x[1] for x in train_data]
#         train_target = [x[0] for x in train_data]
        
#         # Create tokenizers
#         self.input_tokenizer = self._create_tokenizer(train_source + [x[1] for x in val_data])
#         self.target_tokenizer = self._create_tokenizer(
#             ['\t' + t for t in train_target] + 
#             [t + '\n' for t in train_target] +
#             ['\t' + t for t in [x[0] for x in val_data]] + 
#             [t + '\n' for t in [x[0] for x in val_data]]
#         )
        
#         # Prepare sequences
#         max_source_len = max(len(s) for s in train_source)
#         max_target_len = max(len(t) for t in train_target) + 1  # +1 for start/end tokens
        
#         train_enc = self._texts_to_padded_sequences(train_source, self.input_tokenizer, max_source_len)
#         train_dec_in = self._texts_to_padded_sequences(['\t' + t for t in train_target], self.target_tokenizer, max_target_len)
#         train_dec_out = np.expand_dims(
#             self._texts_to_padded_sequences([t + '\n' for t in train_target], self.target_tokenizer, max_target_len),
#             -1
#         )
        
#         return (train_enc, train_dec_in, train_dec_out), self.input_tokenizer, self.target_tokenizer
    
#     def _create_tokenizer(self, texts):
#         """Create character-level tokenizer"""
#         tokenizer = preprocessing.text.Tokenizer(char_level=True, lower=False)
#         tokenizer.fit_on_texts(texts)
#         return tokenizer
    
#     def _texts_to_padded_sequences(self, texts, tokenizer, max_len):
#         """Convert texts to padded sequences"""
#         seq = tokenizer.texts_to_sequences(texts)
#         return preprocessing.sequence.pad_sequences(seq, padding='post', maxlen=max_len)

# class Seq2SeqModelBuilder:
#     """Builds sequence-to-sequence models with different RNN types"""
    
#     RNN_TYPES = {
#         'rnn': layers.SimpleRNN,
#         'lstm': layers.LSTM,
#         'gru': layers.GRU
#     }
    
#     def __init__(self, input_vocab_size, target_vocab_size):
#         self.input_vocab_size = input_vocab_size
#         self.target_vocab_size = target_vocab_size
    
#     def build_model(self, rnn_type='lstm', embedding_dim=256, hidden_dim=512, 
#                    enc_layers=1, dec_layers=1, dropout=0.2):
#         """Build end-to-end seq2seq model"""
        
#         # Input layers
#         encoder_input = layers.Input(shape=(None,))
#         decoder_input = layers.Input(shape=(None,))
        
#         # Shared embedding
#         embedding = layers.Embedding(self.input_vocab_size, embedding_dim, mask_zero=True)
        
#         # Encoder
#         enc_output, enc_states = self._build_encoder(
#             encoder_input, embedding, rnn_type, hidden_dim, enc_layers, dropout)
        
#         # Decoder
#         decoder_output = self._build_decoder(
#             decoder_input, embedding, rnn_type, hidden_dim, dec_layers, dropout, enc_states)
        
#         # Final output
#         output = layers.Dense(self.target_vocab_size, activation='softmax')(decoder_output)
        
#         return models.Model([encoder_input, decoder_input], output)
    
#     def _build_encoder(self, inputs, embedding, rnn_type, hidden_dim, num_layers, dropout):
#         """Build encoder architecture"""
#         x = embedding(inputs)
#         rnn_class = self.RNN_TYPES[rnn_type.lower()]
        
#         for i in range(num_layers):
#             return_sequences = (i < num_layers - 1)
#             x = rnn_class(
#                 hidden_dim,
#                 return_sequences=return_sequences,
#                 return_state=True,
#                 dropout=dropout,
#                 name=f'enc_{rnn_type}_{i}'
#             )(x)
        
#         return x if num_layers == 1 else x[0], x[1:] if num_layers > 1 else x[1]
    
#     def _build_decoder(self, inputs, embedding, rnn_type, hidden_dim, num_layers, dropout, initial_state):
#         """Build decoder architecture"""
#         x = embedding(inputs)
#         rnn_class = self.RNN_TYPES[rnn_type.lower()]
        
#         for i in range(num_layers):
#             x = rnn_class(
#                 hidden_dim,
#                 return_sequences=True,
#                 return_state=True,
#                 dropout=dropout,
#                 name=f'dec_{rnn_type}_{i}'
#             )(x, initial_state=initial_state)
        
#         return x[0]

# class InferenceSystem:
#     """Handles model inference with beam search"""
    
#     def __init__(self, model, rnn_type, hidden_dim):
#         self.rnn_type = rnn_type.lower()
#         self._setup_inference_models(model, hidden_dim)
        
#     def _setup_inference_models(self, model, hidden_dim):
#         """Create encoder/decoder models for inference"""
#         encoder_input = model.input[0]
#         decoder_input = model.input[1]
#         embedding = model.get_layer('embedding')
        
#         # Encoder model
#         enc_output = embedding(encoder_input)
#         rnn_layer = next(l for l in model.layers if l.name.startswith(f'enc_{self.rnn_type}'))
        
#         if self.rnn_type == 'lstm':
#             _, state_h, state_c = rnn_layer(enc_output)
#             self.encoder_model = models.Model(encoder_input, [state_h, state_c])
#             self.state_size = 2
#         elif self.rnn_type == 'gru':
#             _, state_h = rnn_layer(enc_output)
#             self.encoder_model = models.Model(encoder_input, [state_h])
#             self.state_size = 1
#         else:  # Simple RNN
#             _, state_h = rnn_layer(enc_output)
#             self.encoder_model = models.Model(encoder_input, [state_h])
#             self.state_size = 1
        
#         # Decoder model
#         decoder_states_input = [
#             layers.Input(shape=(hidden_dim,)) for _ in range(self.state_size)
#         ]
#         decoder_emb = embedding(decoder_input)
        
#         decoder_rnn = next(l for l in model.layers if l.name.startswith(f'dec_{self.rnn_type}'))
#         decoder_output = decoder_rnn(decoder_emb, initial_state=decoder_states_input)
        
#         dense_layer = model.get_layer('dense')
#         decoder_output = dense_layer(decoder_output[0])
        
#         self.decoder_model = models.Model(
#             [decoder_input] + decoder_states_input,
#             [decoder_output] + list(decoder_output[1:])
    
#     def beam_search_decode(self, input_seq, tokenizer, beam_width=3, max_len=30):
#         """Decode sequence using beam search"""
#         idx_to_char = {i: c for c, i in tokenizer.word_index.items()}
#         idx_to_char[0] = ''
        
#         start_token = tokenizer.word_index['\t']
#         end_token = tokenizer.word_index['\n']
        
#         states = self.encoder_model.predict(input_seq)
#         if self.state_size == 1:
#             states = [states]
        
#         beams = [([start_token], 0.0, states)]
        
#         for _ in range(max_len):
#             candidates = []
#             for seq, score, states in beams:
#                 if seq[-1] == end_token:
#                     candidates.append((seq, score, states))
#                     continue
                
#                 target_seq = np.array([[seq[-1]]])
#                 outputs = self.decoder_model.predict([target_seq] + states)
#                 probs = outputs[0][0, -1, :]
#                 top_tokens = np.argsort(probs)[-beam_width:]
                
#                 for token in top_tokens:
#                     new_score = score - np.log(probs[token] + 1e-9)
#                     candidate_seq = seq + [token]
#                     candidates.append((candidate_seq, new_score, outputs[1:]))
            
#             beams = sorted(candidates, key=lambda x: x[1])[:beam_width]
        
#         best_seq = beams[0][0]
#         return ''.join(idx_to_char.get(i, '') for i in best_seq[1:-1])

# def run_sweep():
#     """Configure and run hyperparameter sweep"""
#     sweep_config = {
#         'method': 'bayes',
#         'metric': {'name': 'val_accuracy', 'goal': 'maximize'},
#         'parameters': {
#             'rnn_type': {'values': ['rnn', 'lstm', 'gru']},
#             'embedding_dim': {'values': [64, 128, 256]},
#             'hidden_dim': {'values': [128, 256, 512]},
#             'enc_layers': {'values': [1, 2]},
#             'dec_layers': {'values': [1, 2]},
#             'dropout': {'values': [0.1, 0.2, 0.3]},
#             'batch_size': {'values': [32, 64]},
#             'beam_width': {'values': [1, 3, 5]}
#         }
#     }
    
#     def sweep_train():
#         with wandb.init() as run:
#             config = run.config
#             run.name = (f"{config.rnn_type}_e{config.embedding_dim}_h{config.hidden_dim}_"
#                        f"enc{config.enc_layers}_dec{config.dec_layers}_"
#                        f"drop{config.dropout}_beam{config.beam_width}")
            
#             # Initialize system
#             system = TransliterationSystem("/kaggle/input/dakshinadataset/dakshina_dataset_v1.0/mr/lexicons")
#             (train_enc, train_dec_in, train_dec_out), _, target_tokenizer = system.prepare_data()
            
#             # Build model
#             builder = Seq2SeqModelBuilder(
#                 len(system.input_tokenizer.word_index) + 1,
#                 len(target_tokenizer.word_index) + 1
#             )
#             model = builder.build_model(
#                 rnn_type=config.rnn_type,
#                 embedding_dim=config.embedding_dim,
#                 hidden_dim=config.hidden_dim,
#                 enc_layers=config.enc_layers,
#                 dec_layers=config.dec_layers,
#                 dropout=config.dropout
#             )
            
#             model.compile(
#                 optimizer=Adam(),
#                 loss='sparse_categorical_crossentropy',
#                 metrics=['accuracy']
#             )
            
#             model.fit(
#                 [train_enc, train_dec_in],
#                 train_dec_out,
#                 batch_size=config.batch_size,
#                 epochs=10,
#                 validation_split=0.1,
#                 callbacks=[WandbMetricsLogger(), WandbModelCheckpoint("models")],
#                 verbose=2
#             )
    
#     sweep_id = wandb.sweep(sweep_config, project="marathi-transliteration")
#     wandb.agent(sweep_id, function=sweep_train, count=15)

# if __name__ == "__main__":
#     wandb.login(key=os.environ["WANDB_API_KEY"])  # credential redacted: load the key from the environment and revoke the key that was committed here
#     run_sweep()

In [None]:
# Load Best Hyperparameters
# Hyperparameters of the best sweep run, plus the training settings
# (batch size, epoch count) used when re-training that model below.
best_config = dict(
    embedding_dim=256,
    hidden_dim=256,
    rnn_type='lstm',
    enc_layers=1,
    dec_layers=1,
    dropout=0.3,
    batch_size=64,
    epochs=10,
)

In [None]:
# Initialize Data Loader
# Fit the tokenizers and build the padded training arrays from the lexicon.
data_loader = DataLoader(
    data_path="/kaggle/input/dakshinadataset/dakshina_dataset_v1.0/mr/lexicons"
)
(train_enc, train_dec_in, train_dec_out), target_tokenizer = data_loader.preprocess_data(
    train_file="mr.translit.sampled.train.tsv",
    val_file="mr.translit.sampled.dev.tsv",
)

In [None]:
# Build and Train Best Model
# Seq2SeqModelBuilder only exists in a commented-out cell, so the live
# build_seq2seq_model helper is used instead. It reads hyperparameters as
# attributes, hence the namespace wrapper around the best_config dict. The
# extra batch_size/epochs entries are simply ignored by the builder.
from types import SimpleNamespace

best_model = build_seq2seq_model(
    len(data_loader.input_tokenizer.word_index) + 1,
    len(target_tokenizer.word_index) + 1,
    SimpleNamespace(**best_config)
)

best_model.compile(
    optimizer=Adam(),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)


In [None]:
# Start WandB run
# Open a dedicated W&B run, then train with 10% of the training arrays held
# out for validation; per-epoch metrics go to W&B via the callback.
wandb.init(name="best_model_training", project="marathi-transliteration")
best_model.fit(
    x=[train_enc, train_dec_in],
    y=train_dec_out,
    epochs=best_config['epochs'],
    batch_size=best_config['batch_size'],
    validation_split=0.1,
    verbose=2,
    callbacks=[WandbMetricsLogger()],
)

In [None]:
# Save model
# Persist the trained model, then close the training run opened above.
best_model.save(filepath="best_model.keras")
wandb.finish()

In [None]:
# Load Test Data
# Each row is [target, source, ...]: column 1 feeds the encoder and
# column 0 is the reference transliteration.
test_pairs = data_loader.load_dataset("mr.translit.sampled.test.tsv")
test_source = [pair[1] for pair in test_pairs]
test_target = [pair[0] for pair in test_pairs]

In [None]:
# Prepare Test Sequences
# Encode the test split with the tokenizers fitted on the training data and
# pad to the training array widths.
test_enc = data_loader._text_to_sequence(
    test_source, data_loader.input_tokenizer, train_enc.shape[1]
)
# Decoder input: reference prefixed with the start marker '\t'.
test_dec_in = data_loader._text_to_sequence(
    [f"\t{word}" for word in test_target], target_tokenizer, train_dec_in.shape[1]
)
# Decoder output: reference suffixed with the end marker '\n', plus a
# trailing axis.
test_dec_out = data_loader._text_to_sequence(
    [f"{word}\n" for word in test_target], target_tokenizer, train_dec_out.shape[1]
)[..., np.newaxis]

In [None]:
# Evaluate on Test Set
# Separate W&B run for the test-set evaluation.
wandb.init(name="best_model_testing", project="marathi-transliteration")
test_loss, test_acc = best_model.evaluate(
    x=[test_enc, test_dec_in],
    y=test_dec_out,
    verbose=0,
)
print("Test Accuracy: {:.4f}".format(test_acc))
wandb.log(dict(test_accuracy=test_acc))

In [None]:
# Decoding Utilities
def decode_sequence(seq, tokenizer):
    """Turn a sequence of token ids back into text.

    Id 0 is skipped, ids missing from the tokenizer contribute nothing, and
    decoding stops at the first id that maps to the newline character.

    Args:
        seq: Iterable of integer token ids.
        tokenizer: Object whose ``word_index`` maps characters to ids.

    Returns:
        str: The decoded characters joined together.
    """
    vocabulary = tokenizer.word_index
    lookup = dict(zip(vocabulary.values(), vocabulary.keys()))
    lookup[0] = ''

    pieces = []
    for token_id in seq:
        if token_id != 0:
            char = lookup.get(token_id, '')
            if char == '\n':
                break
            pieces.append(char)
    return ''.join(pieces)

In [None]:
# Generate Predictions
# NOTE(review): the decoder is fed test_dec_in, which is built from the
# reference targets, so these are teacher-forced per-position predictions
# rather than free-running transliterations. Confirm this is intended.
preds = best_model.predict([test_enc, test_dec_in])
# Pick the highest-scoring id along the last axis.
pred_indices = np.argmax(preds, axis=-1)
decoded_preds = [decode_sequence(seq, target_tokenizer) for seq in pred_indices]
# References are the raw target strings with any newline characters removed.
decoded_refs = [t.replace('\n', '') for t in test_target]

In [None]:
# Save Predictions
# Write one "input<TAB>prediction<TAB>reference" line per test example.
os.makedirs("predictions", exist_ok=True)
with open("predictions/test_predictions.txt", "w", encoding="utf-8") as f:
    f.writelines(
        f"{inp}\t{pred}\t{ref}\n"
        for inp, pred, ref in zip(test_source, decoded_preds, decoded_refs)
    )

In [None]:
# Visualize Samples
# Draw 10 distinct test examples; sample_indices is reused by the W&B table
# cell below.
sample_indices = np.random.choice(len(test_source), 10, replace=False)
print("Prediction Samples:")
for i, idx in enumerate(sample_indices):
    # One print per example; the trailing "\n" keeps the blank separator line.
    print(
        f"{i+1}. Input: {test_source[idx]}\n"
        f"   Predicted: {decoded_preds[idx]}\n"
        f"   Reference: {decoded_refs[idx]}\n"
    )

In [None]:
# WandB Prediction Table
# Log the sampled examples as a W&B table with an exact-match marker, then
# close the evaluation run.
wandb_table = wandb.Table(columns=["Input", "Prediction", "Reference", "Correct"])
for idx in sample_indices:
    prediction, reference = decoded_preds[idx], decoded_refs[idx]
    marker = "✅" if prediction == reference else "❌"
    wandb_table.add_data(test_source[idx], prediction, reference, marker)
wandb.log({"predictions": wandb_table})
wandb.finish()

In [None]:
# import os
# import numpy as np
# import tensorflow as tf
# from tensorflow.keras import layers, models, preprocessing
# from tensorflow.keras.optimizers import Adam
# import wandb
# from wandb.integration.keras import WandbMetricsLogger

# class AttentionLayer(layers.Layer):
#     """Bahdanau Attention Layer with weight export capability"""
#     def __init__(self, units, return_attention=False):
#         super().__init__()
#         self.W1 = layers.Dense(units)
#         self.W2 = layers.Dense(units)
#         self.V = layers.Dense(1)
#         self.return_attention = return_attention
#         self.units = units

#     def call(self, query, values):
#         # Add time axis for broadcasting
#         query_with_time = tf.expand_dims(query, 2)  # (batch, dec_len, 1, units)
#         values_with_time = tf.expand_dims(values, 1)  # (batch, 1, enc_len, units)
        
#         # Attention scores calculation
#         score = self.V(tf.nn.tanh(
#             self.W1(values_with_time) + self.W2(query_with_time)
#         ))
        
#         attention_weights = tf.nn.softmax(score, axis=2)
#         context_vector = tf.reduce_sum(attention_weights * values_with_time, axis=2)
        
#         if self.return_attention:
#             return context_vector, tf.squeeze(attention_weights, -1)
#         return context_vector

# class AttentionTransliterator:
#     def __init__(self, data_dir):
#         self.data_dir = data_dir
#         self.input_tokenizer = None
#         self.target_tokenizer = None
    
#     def load_dataset(self, filename):
#         """Load and parse dataset file"""
#         with open(os.path.join(self.data_dir, filename), 'r', encoding='utf-8') as f:
#             return [line.strip().split('\t') for line in f if '\t' in line]
    
#     def preprocess_data(self):
#         """Load and prepare training/validation/test data"""
#         train_pairs = self.load_dataset("mr.translit.sampled.train.tsv")
#         val_pairs = self.load_dataset("mr.translit.sampled.dev.tsv")
#         test_pairs = self.load_dataset("mr.translit.sampled.test.tsv")
        
#         # Extract texts
#         train_source = [x[1] for x in train_pairs]
#         train_target = [x[0] for x in train_pairs]
        
#         # Create tokenizers
#         self.input_tokenizer = self._create_tokenizer(train_source + [x[1] for x in val_pairs])
#         self.target_tokenizer = self._create_tokenizer(
#             ['\t' + t for t in train_target] + 
#             [t + '\n' for t in train_target]
#         )
        
#         # Prepare sequences
#         max_source_len = max(len(s) for s in train_source)
#         max_target_len = max(len(t) for t in train_target) + 1  # +1 for start/end tokens
        
#         # Training data
#         train_enc = self._text_to_sequence(train_source, self.input_tokenizer, max_source_len)
#         train_dec_in = self._text_to_sequence(['\t' + t for t in train_target], self.target_tokenizer, max_target_len)
#         train_dec_out = np.expand_dims(
#             self._text_to_sequence([t + '\n' for t in train_target], self.target_tokenizer, max_target_len),
#             -1
#         )
        
#         # Validation data
#         val_enc = self._text_to_sequence([x[1] for x in val_pairs], self.input_tokenizer, max_source_len)
#         val_dec_in = self._text_to_sequence(['\t' + x[0] for x in val_pairs], self.target_tokenizer, max_target_len)
#         val_dec_out = np.expand_dims(
#             self._text_to_sequence([x[0] + '\n' for x in val_pairs], self.target_tokenizer, max_target_len),
#             -1
#         )
        
#         # Test data
#         test_enc = self._text_to_sequence([x[1] for x in test_pairs], self.input_tokenizer, max_source_len)
#         test_dec_in = self._text_to_sequence(['\t' + x[0] for x in test_pairs], self.target_tokenizer, max_target_len)
#         test_dec_out = np.expand_dims(
#             self._text_to_sequence([x[0] + '\n' for x in test_pairs], self.target_tokenizer, max_target_len),
#             -1
#         )
        
#         return (train_enc, train_dec_in, train_dec_out), \
#                (val_enc, val_dec_in, val_dec_out), \
#                (test_enc, test_dec_in, test_dec_out)
    
#     def _create_tokenizer(self, texts):
#         """Create character-level tokenizer"""
#         tokenizer = preprocessing.text.Tokenizer(char_level=True, lower=False)
#         tokenizer.fit_on_texts(texts)
#         return tokenizer
    
#     def _text_to_sequence(self, texts, tokenizer, max_len):
#         """Convert texts to padded sequences"""
#         seq = tokenizer.texts_to_sequences(texts)
#         return preprocessing.sequence.pad_sequences(seq, padding='post', maxlen=max_len)

# class AttentionModel:
#     def __init__(self, input_vocab_size, target_vocab_size):
#         self.input_vocab_size = input_vocab_size
#         self.target_vocab_size = target_vocab_size
    
#     def build_model(self, embedding_dim=256, hidden_dim=512, dropout_rate=0.2):
#         """Build attention-based seq2seq model"""
#         # Encoder
#         encoder_inputs = layers.Input(shape=(None,))
#         enc_emb = layers.Embedding(self.input_vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
#         encoder_outputs, state_h, state_c = layers.LSTM(
#             hidden_dim, return_sequences=True, return_state=True, dropout=dropout_rate
#         )(enc_emb)
        
#         # Decoder
#         decoder_inputs = layers.Input(shape=(None,))
#         dec_emb = layers.Embedding(self.target_vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
#         decoder_outputs = layers.LSTM(
#             hidden_dim, return_sequences=True, return_state=True, dropout=dropout_rate
#         )(dec_emb, initial_state=[state_h, state_c])[0]
        
#         # Attention
#         context_vector, attention_weights = AttentionLayer(hidden_dim, return_attention=True)(
#             decoder_outputs, encoder_outputs
#         )
#         concat_output = layers.Concatenate()([decoder_outputs, context_vector])
        
#         # Output
#         outputs = layers.Dense(self.target_vocab_size, activation='softmax')(concat_output)
        
#         return models.Model([encoder_inputs, decoder_inputs], outputs)

# def run_sweep():
#     """Configure and execute hyperparameter sweep"""
#     sweep_config = {
#         'method': 'bayes',
#         'metric': {'name': 'val_accuracy', 'goal': 'maximize'},
#         'parameters': {
#             'embedding_dim': {'values': [128, 256]},
#             'hidden_dim': {'values': [128, 256, 512]},
#             'dropout_rate': {'values': [0.1, 0.2, 0.3]},
#             'batch_size': {'values': [32, 64]},
#             'learning_rate': {'min': 1e-4, 'max': 1e-3}
#         }
#     }
    
#     def train():
#         wandb.init()
#         config = wandb.config
        
#         # Initialize components
#         transliterator = AttentionTransliterator(
#             "/kaggle/input/dakshinadataset/dakshina_dataset_v1.0/mr/lexicons"
#         )
#         (train_enc, train_dec_in, train_dec_out), \
#         (val_enc, val_dec_in, val_dec_out), _ = transliterator.preprocess_data()
        
#         # Build model
#         model = AttentionModel(
#             len(transliterator.input_tokenizer.word_index) + 1,
#             len(transliterator.target_tokenizer.word_index) + 1
#         ).build_model(
#             embedding_dim=config.embedding_dim,
#             hidden_dim=config.hidden_dim,
#             dropout_rate=config.dropout_rate
#         )
        
#         model.compile(
#             optimizer=Adam(learning_rate=config.learning_rate),
#             loss='sparse_categorical_crossentropy',
#             metrics=['accuracy']
#         )
        
#         # Train
#         model.fit(
#             [train_enc, train_dec_in],
#             train_dec_out,
#             validation_data=([val_enc, val_dec_in], val_dec_out),
#             batch_size=config.batch_size,
#             epochs=10,
#             callbacks=[WandbMetricsLogger()],
#             verbose=2
#         )
        
#         # Save model if performance is good
#         val_acc = model.history.history['val_accuracy'][-1]
#         if val_acc > 0.85:  # Adjust threshold as needed
#             model.save(f"attention_model_{wandb.run.id}.keras")
    
#     sweep_id = wandb.sweep(sweep_config, project="marathi-transliteration-attention")
#     wandb.agent(sweep_id, function=train, count=15)

# def evaluate_best_model():
#     """Evaluate the best saved model on test set"""
#     wandb.init(project="marathi-transliteration-attention", name="best_model_evaluation")
    
#     # Initialize components
#     transliterator = AttentionTransliterator(
#         "/kaggle/input/dakshinadataset/dakshina_dataset_v1.0/mr/lexicons"
#     )
#     _, _, (test_enc, test_dec_in, test_dec_out) = transliterator.preprocess_data()
    
#     # Load best model (replace with actual best run ID)
#     best_model = models.load_model(
#         "attention_model_BEST_RUN_ID.keras",
#         custom_objects={'AttentionLayer': AttentionLayer}
#     )
    
#     # Evaluate
#     test_loss, test_acc = best_model.evaluate(
#         [test_enc, test_dec_in],
#         test_dec_out,
#         verbose=1
#     )
#     print(f"\nTest Accuracy: {test_acc:.4f}")
    
#     # Log results
#     wandb.log({
#         'test_accuracy': test_acc,
#         'test_loss': test_loss
#     })
    
#     # Generate predictions
#     preds = best_model.predict([test_enc, test_dec_in])
#     pred_indices = np.argmax(preds, axis=-1)
    
#     # Save predictions
#     os.makedirs("predictions", exist_ok=True)
#     with open("predictions/attention_predictions.txt", "w", encoding="utf-8") as f:
#         for i in range(len(test_enc)):
#             input_text = transliterator.input_tokenizer.sequences_to_texts([test_enc[i]])[0]
#             pred_text = transliterator.target_tokenizer.sequences_to_texts([pred_indices[i]])[0]
#             true_text = transliterator.target_tokenizer.sequences_to_texts([test_dec_out[i]])[0]
#             f.write(f"{input_text}\t{pred_text}\t{true_text}\n")
    
#     wandb.finish()

# if __name__ == "__main__":
#     wandb.login(key="your_api_key_here")  # Replace with your WandB key
    
#     # Run either the sweep or evaluation
#     run_sweep()  # Comment this out after sweep is done
#     # evaluate_best_model()  # Uncomment after selecting best model

In [None]:
class AttentionLayer(tf.keras.layers.Layer):
    """Additive attention of every decoder step over all encoder steps.

    Each (decoder step, encoder step) pair is scored with
    ``V(tanh(W1(values) + W2(query)))``; the scores are softmax-normalised
    over the encoder axis and used to take a weighted sum of ``values``.

    Args:
        units: Output width of the ``W1`` and ``W2`` scoring projections.
        return_attention: When true, ``call`` returns
            ``(context, attention_weights)`` instead of ``context`` alone.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``,
            ``trainable`` ...). Accepting them, together with
            ``get_config``, lets ``load_model`` rebuild the layer when it is
            passed through ``custom_objects``.
    """

    def __init__(self, units, return_attention=False, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        self.return_attention = return_attention

    def call(self, query, values):
        """Compute the context vectors (and optionally the attention weights).

        Args:
            query: Rank-3 tensor indexed as (batch, dec_len, features).
            values: Rank-3 tensor indexed as (batch, enc_len, features).

        Returns:
            ``context`` with shape (batch, dec_len, features of ``values``),
            or ``(context, weights)`` with ``weights`` of shape
            (batch, dec_len, enc_len) when ``return_attention`` is set.
        """
        # Insert singleton axes so the two tensors broadcast against each
        # other into a (batch, dec_len, enc_len, ...) grid.
        query = tf.expand_dims(query, 2)    # (batch, dec_len, 1, features)
        values = tf.expand_dims(values, 1)  # (batch, 1, enc_len, features)

        # Additive score for every decoder/encoder pair: (..., enc_len, 1)
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(query)))

        # Normalise over the encoder axis, then collapse that axis.
        attention_weights = tf.nn.softmax(score, axis=2)
        context = tf.reduce_sum(attention_weights * values, axis=2)

        if self.return_attention:
            return context, tf.squeeze(attention_weights, -1)
        return context

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "units": self.units,
            "return_attention": self.return_attention,
        })
        return config

def build_attention_model(vocab_size_input, vocab_size_target,
                          embedding_dim=256, hidden_dim=256, dropout_rate=0.0):
    """Build the encoder-decoder LSTM model with additive attention.

    Args:
        vocab_size_input: Size of the encoder embedding table.
        vocab_size_target: Size of the decoder embedding table and of the
            softmax output.
        embedding_dim: Width of both embedding layers.
        hidden_dim: Number of units of both LSTMs and of the attention
            scoring projections.
        dropout_rate: Input dropout applied inside both LSTMs. It is passed
            as the LSTM ``dropout`` argument rather than as separate layers
            so that the layer list of the model is the same for every value.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping
        ``[encoder_input, decoder_input]`` to per-step token probabilities.
        With the default arguments the architecture is identical to the
        previous fixed-size (256) version.
    """
    # Encoder
    encoder_input = tf.keras.Input(shape=(None,))
    enc_emb = tf.keras.layers.Embedding(
        vocab_size_input, embedding_dim, mask_zero=True
    )(encoder_input)
    encoder_output, state_h, state_c = tf.keras.layers.LSTM(
        hidden_dim, return_sequences=True, return_state=True, dropout=dropout_rate
    )(enc_emb)

    # Decoder, started from the final encoder states
    decoder_input = tf.keras.Input(shape=(None,))
    dec_emb = tf.keras.layers.Embedding(
        vocab_size_target, embedding_dim, mask_zero=True
    )(decoder_input)
    decoder_output = tf.keras.layers.LSTM(
        hidden_dim, return_sequences=True, return_state=True, dropout=dropout_rate
    )(dec_emb, initial_state=[state_h, state_c])[0]

    # Attention: the weights are not part of the training graph, but the
    # layer is created with return_attention=True so they can be read back
    # from it later.
    context, _ = AttentionLayer(hidden_dim, return_attention=True)(
        decoder_output, encoder_output
    )
    concat = tf.keras.layers.Concatenate()([decoder_output, context])

    # Output
    outputs = tf.keras.layers.Dense(vocab_size_target, activation='softmax')(concat)

    return tf.keras.Model([encoder_input, decoder_input], outputs)


In [None]:
# Prepare the training arrays and the target tokenizer
attention_data = TransliterationSystem(
    "/kaggle/input/dakshinadataset/dakshina_dataset_v1.0/mr/lexicons"
)
(train_enc, train_dec_in, train_dec_out), target_tokenizer = attention_data.prepare_data()

# Build the attention model; each vocabulary size is the number of indexed
# tokens plus one
attention_model = build_attention_model(
    vocab_size_input=1 + len(attention_data.input_tokenizer.word_index),
    vocab_size_target=1 + len(target_tokenizer.word_index),
)

attention_model.compile(
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=Adam(),
)

In [None]:
# Fit the attention model inside a WandB run; 10% of the training data is
# held out for validation and metrics are streamed by WandbMetricsLogger
wandb.init(name="attention_model", project="marathi-transliteration")
attention_model.fit(
    x=[train_enc, train_dec_in],
    y=train_dec_out,
    epochs=10,
    batch_size=64,
    validation_split=0.1,
    verbose=2,
    callbacks=[WandbMetricsLogger()],
)
wandb.finish()

In [None]:
def visualize_attention(model, input_seq, tokenizer, max_len=30):
    """Greedy-decode one input and collect the attention row of every step.

    The trained layers of ``model`` (as produced by
    ``build_attention_model``) are re-wired into a separate encoder model and
    a single-step decoder model. Layers are located by type instead of by
    position in ``model.layers``, the decoder receives the encoder sequence
    through its own input, and decoding starts from the encoder's final
    states, exactly as during training.

    Args:
        model: Trained attention model.
        input_seq: Encoder input holding a single sequence, i.e. an array of
            shape (1, enc_len).
        tokenizer: Target tokenizer whose ``word_index`` contains the start
            token ``'\\t'`` and the end token ``'\\n'``.
        max_len: Maximum number of decoding steps.

    Returns:
        NumPy array of shape (steps, enc_len); row ``t`` holds the attention
        weights used to predict output token ``t``.

    Raises:
        ValueError: If the expected layers cannot be found in ``model`` or the
            attention layer does not return its weights.
    """
    keras_layers = tf.keras.layers
    embeddings = [l for l in model.layers if isinstance(l, keras_layers.Embedding)]
    lstms = [l for l in model.layers if isinstance(l, keras_layers.LSTM)]
    attentions = [l for l in model.layers if isinstance(l, AttentionLayer)]
    denses = [l for l in model.layers if isinstance(l, keras_layers.Dense)]
    if len(embeddings) != 2 or len(lstms) != 2 or not attentions or not denses:
        raise ValueError(
            "model does not have the layer structure of build_attention_model"
        )

    # Within each type the encoder layer precedes the decoder layer because
    # the decoder consumes the encoder's states.
    decoder_embedding = embeddings[1]
    encoder_lstm, decoder_lstm = lstms
    attention_layer = attentions[0]
    output_dense = denses[-1]
    if not getattr(attention_layer, "return_attention", False):
        raise ValueError("attention layer was built with return_attention=False")
    hidden_dim = decoder_lstm.units

    # Encoder inference model: full output sequence plus final states.
    encoder_seq, encoder_h, encoder_c = encoder_lstm.output
    encoder_model = tf.keras.Model(
        model.input[0], [encoder_seq, encoder_h, encoder_c]
    )

    # Single-step decoder inference model with explicit inputs for everything
    # it needs, so its graph does not reach back into the encoder.
    token_input = tf.keras.Input(shape=(None,))
    encoder_seq_input = tf.keras.Input(shape=(None, hidden_dim))
    state_h_input = tf.keras.Input(shape=(hidden_dim,))
    state_c_input = tf.keras.Input(shape=(hidden_dim,))

    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_embedding(token_input),
        initial_state=[state_h_input, state_c_input],
    )
    context, attention = attention_layer(decoder_outputs, encoder_seq_input)
    token_probs = output_dense(
        keras_layers.Concatenate()([decoder_outputs, context])
    )
    decoder_model = tf.keras.Model(
        [token_input, encoder_seq_input, state_h_input, state_c_input],
        [token_probs, state_h, state_c, attention],
    )

    # Run inference
    enc_output, h, c = encoder_model.predict(input_seq, verbose=0)
    start_token = tokenizer.word_index['\t']
    end_token = tokenizer.word_index['\n']

    attention_weights = []
    target_seq = np.array([[start_token]])
    for _ in range(max_len):
        token_probs_step, h, c, attention_step = decoder_model.predict(
            [target_seq, enc_output, h, c], verbose=0
        )
        # attention_step is (1, 1, enc_len): keep only the encoder axis.
        attention_weights.append(attention_step[0, -1])

        sampled_token = int(np.argmax(token_probs_step[0, -1, :]))
        if sampled_token == end_token:
            break

        target_seq = np.array([[sampled_token]])

    return np.array(attention_weights)

In [None]:
# Collect the attention weights for one training sample
sample_idx = 0
input_seq = train_enc[sample_idx:sample_idx+1]
attention_weights = visualize_attention(attention_model, input_seq, target_tokenizer)

# Draw the transposed weights as a heatmap (decoder steps on the x axis)
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 8))
heatmap = ax.imshow(attention_weights.T, cmap='viridis')
ax.set_xlabel('Decoder Step')
ax.set_ylabel('Encoder Step')
ax.set_title('Attention Weights Visualization')
fig.colorbar(heatmap, ax=ax)
plt.show()

In [None]:
# Sweep configuration: Bayesian search that maximizes val_accuracy over the
# model sizes, dropout rate, batch size and learning-rate range listed below.

sweep_config = dict(
    method='bayes',
    metric=dict(name='val_accuracy', goal='maximize'),
    parameters=dict(
        embedding_dim=dict(values=[128, 256]),
        hidden_dim=dict(values=[128, 256, 512]),
        dropout_rate=dict(values=[0.0, 0.2, 0.3]),
        batch_size=dict(values=[32, 64]),
        learning_rate=dict(min=1e-4, max=1e-3),
    ),
)


In [None]:
#Training function

def attention_sweep_train():
    """Run one sweep trial with the hyperparameters found in ``wandb.config``.

    Builds and compiles an attention model, fits it for 10 epochs on the
    notebook-level training arrays with the validation arrays as
    ``validation_data``, and saves the model to
    ``attention_model_<run id>.keras`` when the last epoch's ``val_accuracy``
    is above 0.85.
    """
    wandb.init()
    hyperparams = wandb.config

    # Model sized by the sweep parameters
    input_vocab_size = len(data_loader.input_tokenizer.word_index) + 1
    target_vocab_size = len(target_tokenizer.word_index) + 1
    model = build_attention_model(
        vocab_size_input=input_vocab_size,
        vocab_size_target=target_vocab_size,
        embedding_dim=hyperparams.embedding_dim,
        hidden_dim=hyperparams.hidden_dim,
        dropout_rate=hyperparams.dropout_rate,
    )

    model.compile(
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
        optimizer=Adam(learning_rate=hyperparams.learning_rate),
    )

    history = model.fit(
        x=[train_enc, train_dec_in],
        y=train_dec_out,
        batch_size=hyperparams.batch_size,
        epochs=10,
        validation_data=([val_enc, val_dec_in], val_dec_out),
        verbose=2,
        callbacks=[WandbMetricsLogger()],
    )

    # Keep the weights only for runs that end above the accuracy threshold
    final_val_accuracy = history.history['val_accuracy'][-1]
    if final_val_accuracy > 0.85:
        model.save(f"attention_model_{wandb.run.id}.keras")


In [None]:
# Initialize sweep
# The API key is read from the WANDB_API_KEY environment variable instead of
# being written into the notebook; when the variable is unset, key=None lets
# wandb fall back to its own stored credentials or login prompt.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
sweep_id = wandb.sweep(sweep_config, project="marathi-transliteration-attention")


In [None]:
# Launch the agent: 15 trials, each executed by attention_sweep_train
wandb.agent(
    sweep_id,
    count=15,
    function=attention_sweep_train,
)

In [None]:
# Load the saved model of the chosen sweep run; replace [BEST_RUN_ID] in the
# file name with that run's id before executing this cell
best_model = tf.keras.models.load_model(
    "attention_model_[BEST_RUN_ID].keras",
    custom_objects=dict(AttentionLayer=AttentionLayer),
)

In [None]:
# Loss and accuracy of the best model on the test arrays
test_loss, test_acc = best_model.evaluate(
    x=[test_enc, test_dec_in],
    y=test_dec_out,
    verbose=1,
)
print(f"\nBest Model Test Accuracy: {test_acc:.4f}")

In [None]:
# Record the test metrics in a dedicated WandB run
wandb.init(name="best_model_eval", project="marathi-transliteration-attention")
wandb.log(dict(test_accuracy=test_acc, test_loss=test_loss))

In [None]:
def plot_attention_weights(model, input_seq, output_seq, input_tokenizer, target_tokenizer):
    """Plot the attention weights of ``model`` for one teacher-forced example.

    The weights are read directly from the model's attention layer by
    running ``input_seq`` and ``output_seq`` through a sub-model that shares
    the trained graph. This uses the same encoder states and decoder
    recurrence as training and needs neither hard-coded layer positions nor
    the hidden size.

    Args:
        model: Trained attention model containing an ``AttentionLayer`` built
            with ``return_attention=True``.
        input_seq: Encoder input with shape (1, enc_len).
        output_seq: Decoder input (teacher forcing) with shape (1, dec_len).
        input_tokenizer: Tokenizer whose ``index_word`` labels the x axis.
        target_tokenizer: Tokenizer whose ``index_word`` labels the y axis.

    Returns:
        The matplotlib figure containing the heatmap.

    Raises:
        ValueError: If ``model`` has no attention layer or that layer does not
            expose its weights.
    """
    attention_layer = next(
        (layer for layer in model.layers if isinstance(layer, AttentionLayer)),
        None,
    )
    if attention_layer is None:
        raise ValueError("model contains no AttentionLayer")

    attention_outputs = attention_layer.output
    if not isinstance(attention_outputs, (list, tuple)) or len(attention_outputs) != 2:
        raise ValueError("attention layer was built with return_attention=False")

    # Sub-model from the original inputs to the attention weights.
    attention_model = tf.keras.Model(model.inputs, attention_outputs[1])

    # (1, dec_len, enc_len) -> (dec_len, enc_len) for the single example.
    attention_plot = attention_model.predict([input_seq, output_seq], verbose=0)[0]

    # Plotting
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111)
    cax = ax.matshow(attention_plot, cmap='viridis')

    # Axis labels: one tick per token id; ids missing from the tokenizer
    # are shown as empty labels.
    input_tokens = [input_tokenizer.index_word.get(int(i), '') for i in input_seq[0]]
    output_tokens = [target_tokenizer.index_word.get(int(i), '') for i in output_seq[0]]

    ax.set_xticks(range(len(input_tokens)))
    ax.set_yticks(range(len(output_tokens)))
    ax.set_xticklabels(input_tokens, rotation=90)
    ax.set_yticklabels(output_tokens)

    plt.colorbar(cax)
    plt.title("Attention Weights Heatmap")
    plt.show()
    return fig


In [None]:
# Heatmap for a single test sample (change sample_idx to pick another one)
sample_idx = 0
plot_attention_weights(
    model=best_model,
    input_seq=test_enc[sample_idx:sample_idx+1],
    output_seq=test_dec_in[sample_idx:sample_idx+1],
    input_tokenizer=data_loader.input_tokenizer,
    target_tokenizer=target_tokenizer,
)

wandb.finish()

In [None]:
# Restore the saved attention model together with its custom layer
attention_model = load_model(
    'best_attention_model.keras',
    custom_objects=dict(BahdanauAttention=BahdanauAttention),
)

# Open the WandB run that receives the final evaluation
wandb.init(name='Attention_Final_Evaluation', project="Assignment_03")

# Test-set loss and accuracy
test_loss, test_acc = attention_model.evaluate(
    x=[test_encoder_input, test_decoder_input],
    y=test_target_output,
    verbose=2,
)
print(f"Test Accuracy (Attention Model): {test_acc:.4f}")
wandb.log(dict(test_accuracy=test_acc))

In [None]:
# Predict on the test inputs and keep the highest-scoring token id per step
os.makedirs("predictions_attention", exist_ok=True)
attention_preds = attention_model.predict(
    [test_encoder_input, test_decoder_input]
)
attention_pred_indices = np.argmax(attention_preds, axis=-1)

# Decoding utilities
def decode_sequence(seq):
    """Turn a sequence of token ids into text via ``index_to_char``.

    Ids equal to 0 are skipped, ids missing from ``index_to_char`` contribute
    an empty string, and decoding stops at the first newline token.
    """
    chars = []
    for token_id in seq:
        if token_id != 0:
            char = index_to_char.get(token_id, '')
            if char == '\n':
                break
            chars.append(char)
    return ''.join(chars)

decoded_attention_preds = list(map(decode_sequence, attention_pred_indices))
decoded_refs = [reference.replace('\n', '') for reference in test_deva_out]

# Write one tab-separated line per example: input, prediction, reference
with open("predictions_attention/test_predictions.txt", "w", encoding='utf-8') as f:
    f.writelines(
        f"{inp}\t{pred}\t{ref}\n"
        for inp, pred, ref in zip(test_lat, decoded_attention_preds, decoded_refs)
    )

In [None]:
# Read the vanilla model's predictions: second tab-separated column of every
# line that has at least two columns
vanilla_preds = []
with open("predictions_vanilla/test_predictions.txt", encoding="utf-8") as f:
    split_lines = (line.strip().split('\t') for line in f)
    vanilla_preds.extend(cols[1] for cols in split_lines if len(cols) >= 2)

In [None]:
# Show up to five examples the vanilla model got wrong and the attention
# model got right
print("Attention Model Improvements:")
improvement_count = 0
for i, (v_pred, a_pred, ref) in enumerate(zip(vanilla_preds, decoded_attention_preds, decoded_refs)):
    if v_pred == ref or a_pred != ref:
        continue
    print(f"Case {improvement_count+1}:")
    print(f"  Input: {test_lat[i]}")
    print(f"  Vanilla: {v_pred}")
    print(f"  Attention: {a_pred}")
    print(f"  Reference: {ref}\n")
    improvement_count += 1
    if improvement_count >= 5:
        break

In [None]:
# Locate the first occurrence of the target word in the test inputs
target_word = "ikbala"
found_idx = next(
    (position for position, candidate in enumerate(test_lat) if candidate == target_word),
    -1,
)

if found_idx != -1:
    print(f"✅ Found '{target_word}' at index {found_idx}")
else:
    print(f"❌ '{target_word}' not found in test set")

In [None]:
# Build inference models
# Splits the loaded attention_model into an encoder model and a decoder model
# that exposes the attention weights. Layers are picked by their position in
# attention_model.layers.
# NOTE(review): the positions 3/4/5/7/8 assume a particular layer ordering of
# the loaded model; the model's architecture is not defined in this notebook
# section — confirm against attention_model.summary().
encoder_inputs = attention_model.input[0]
# NOTE(review): if layer 4 is an LSTM created with return_state=True, .output
# is a list [sequence, h, c] rather than a single tensor — confirm.
encoder_outputs = attention_model.layers[4].output  # Encoder LSTM
encoder_model = Model(encoder_inputs, encoder_outputs)

decoder_inputs = attention_model.input[1]
decoder_lstm = attention_model.layers[5]
attention_layer = attention_model.layers[7]

# Decoder inference model
# The state width 256 is hard-coded; presumably it must equal the units of
# decoder_lstm — TODO confirm.
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Only the first return value of the LSTM call is kept here; any returned
# states are discarded.
decoder_outputs = decoder_lstm(
    attention_model.layers[3](decoder_inputs),
    initial_state=decoder_states_inputs
)[0]

# NOTE(review): encoder_outputs is a tensor of the original graph that depends
# on encoder_inputs, yet decoder_model below lists no input that provides it —
# verify that Keras accepts this graph.
context_vector, attention_weights = attention_layer(
    decoder_outputs, encoder_outputs
)

# decoder_model has exactly two outputs: layer 8 applied to the context vector
# and the attention weights.
# NOTE(review): layer 8 receives context_vector alone here; confirm that this
# matches the input the layer was trained on.
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [attention_model.layers[8](context_vector), attention_weights]
)

In [None]:
# Decode with attention tracking
def decode_with_attention(input_seq, max_len=30):
    """Greedily decode ``input_seq`` and record one attention row per step.

    Uses the notebook-level ``encoder_model``, ``decoder_model``,
    ``target_tokenizer`` and ``index_to_char``.

    Args:
        input_seq: Encoder input handed to ``encoder_model.predict``;
            presumably a batch holding a single sequence — TODO confirm.
        max_len: Maximum number of decoding steps.

    Returns:
        ``(decoded_text, attention_weights)``: the concatenated characters of
        all sampled token ids (the end token included when it is reached) and
        a NumPy array with one entry per decoding step.
    """
    # Encode input
    # NOTE(review): encoder_out is not used anywhere below.
    encoder_out = encoder_model.predict(input_seq)
    
    # Initialize decoder
    # Start decoding from the tab token, which serves as start-of-sequence.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_tokenizer.word_index['\t']
    
    decoded = []
    attention_weights = []
    
    # Initialize states
    # NOTE(review): decoding starts from all-zero states of width 256 rather
    # than from states produced by the encoder — confirm this is intended.
    states = [np.zeros((1, 256)), np.zeros((1, 256))]
    
    for _ in range(max_len):
        # Get output and attention
        output_tokens, attn = decoder_model.predict([target_seq] + states)
        
        # Sample token
        sampled_token = np.argmax(output_tokens[0, -1, :])
        decoded.append(sampled_token)
        attention_weights.append(attn[0][0])  # Get weights for this step
        
        # Exit condition
        # The newline token marks end-of-sequence.
        if sampled_token == target_tokenizer.word_index['\n']:
            break
            
        # Update states and target sequence
        # NOTE(review): output_tokens is the first of the two values returned
        # by decoder_model.predict, so [1] and [2] index into that array
        # instead of reading LSTM states; presumably decoder_model would have
        # to return its states for this update to work — verify.
        states = [output_tokens[1], output_tokens[2]]
        target_seq[0, 0] = sampled_token
    
    # Convert to text
    decoded_text = ''.join([index_to_char.get(idx, '') for idx in decoded])
    
    return decoded_text, np.array(attention_weights)

# Process "ikbala"
# found_idx comes from the lookup cell above; a one-row slice keeps the batch axis.
input_seq = test_encoder_input[found_idx:found_idx+1]
decoded_text, attention_weights = decode_with_attention(input_seq)


In [None]:
def plot_attention(input_text, output_text, attention_weights, save_path=None):
    """Draw an annotated heatmap of attention weights for one word pair.

    Args:
        input_text: Source word; its characters label the x axis.
        output_text: Predicted word; its characters label the y axis.
        attention_weights: 2-D array passed to ``sns.heatmap``, with rows for
            output characters and columns for input characters.
        save_path: Optional file path. When given, the figure is written
            there before it is shown, because saving through pyplot after
            ``plt.show()`` may no longer address this figure.

    Returns:
        The matplotlib figure that holds the heatmap.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        attention_weights,
        xticklabels=list(input_text),
        yticklabels=list(output_text),
        cmap="YlOrRd",
        linewidths=0.5,
        annot=True,
        fmt=".2f",
        cbar=False,
        ax=ax,
    )
    ax.set_title(f"Attention Weights: '{input_text}' → '{output_text}'")
    ax.set_xlabel("Input Characters")
    ax.set_ylabel("Output Characters")
    fig.tight_layout()
    if save_path is not None:
        fig.savefig(save_path)
    plt.show()
    return fig

# Summary of the selected example; the header names the word that is actually
# shown instead of a hard-coded one.
print(f"\nAttention Visualization for '{test_lat[found_idx]}':")
print(f"  Input: {test_lat[found_idx]}")
print(f"  Prediction: {decoded_text}")
print(f"  Reference: {decoded_refs[found_idx]}")


In [None]:
# Draw the attention heatmap for the word selected by found_idx
plot_attention(
    input_text=test_lat[found_idx],
    output_text=decoded_text,
    attention_weights=attention_weights
)

# Save visualization
# NOTE(review): plot_attention calls plt.show() before returning, so savefig
# here writes whatever pyplot's current figure is at that point — confirm the
# saved PNG is not blank.
# NOTE(review): the file name says "jevan" although target_word is "ikbala";
# a later cell logs this exact path, so the two must be renamed together.
os.makedirs("attention_visualizations", exist_ok=True)
plt.savefig(f"attention_visualizations/jevan_attention.png")
plt.close()


# Table of sample predictions to be logged to WandB
wandb_table = wandb.Table(columns=["Input", "Prediction", "Reference", "Correct"])

# Ten distinct, randomly chosen test examples
sample_indices = np.random.choice(len(test_lat), 10, replace=False)
for idx in sample_indices:
    input_seq = test_encoder_input[idx:idx+1]
    pred_text, _ = decode_with_attention(input_seq)
    reference = decoded_refs[idx]
    mark = "✅" if pred_text == reference else "❌"
    wandb_table.add_data(test_lat[idx], pred_text, reference, mark)

# Extra row for the word selected by found_idx
target_reference = decoded_refs[found_idx]
target_mark = "✅" if decoded_text == target_reference else "❌"
wandb_table.add_data(
    test_lat[found_idx],
    decoded_text,
    target_reference,
    target_mark,
)


In [None]:
# Log the prediction table, the saved heatmap image and the test accuracy,
# then close the run
attention_img = wandb.Image("attention_visualizations/jevan_attention.png")
wandb.log(dict(
    predictions=wandb_table,
    attention_heatmap=attention_img,
    test_accuracy=test_acc,
))

wandb.finish()