In [1]:
import time
import math
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
from xformers.ops import memory_efficient_attention  # Flash Attention
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
#https://www.kaggle.com/datasets/tboyle10/medicaltranscriptions



# Load dataset (Ensure the file is downloaded from Kaggle: https://www.kaggle.com/tboyle10/medicaltranscriptions)
df = pd.read_csv("mtsamples.csv")

# Select relevant columns (assuming 'description' as text and 'medical_specialty' as labels)
df = df[['transcription', 'medical_specialty']].dropna()

# Reduce the number of categories for a binary classification task
df['LABEL'] = df['medical_specialty'].apply(lambda x: 1 if x == ' Surgery' else 0)

# Splitting dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['transcription'].values, df['LABEL'].values, test_size=0.2, random_state=42
)

# Tokenization
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

class MedicalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Define dataset parameters
max_len = 512
batch_size = 256

train_dataset = MedicalDataset(train_texts, train_labels, tokenizer, max_len)
val_dataset = MedicalDataset(val_texts, val_labels, tokenizer, max_len)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)


In [4]:
df['LABEL'].value_counts()

LABEL
0    3878
1    1088
Name: count, dtype: int64

In [5]:
len(train_texts)

3972

In [6]:
len(val_texts)

994

In [7]:
next(iter(train_loader))

{'input_ids': tensor([[  101, 10507,  1024,  ...,  2150,  2062,   102],
         [  101,  3653, 25918,  ...,  1996,  5923,   102],
         [  101,  3653, 25918,  ...,  4013, 26745,   102],
         ...,
         [  101,  2381,  1997,  ...,  1011,  4097,   102],
         [  101, 25086, 11655,  ...,     0,     0,     0],
         [  101,  3653, 25918,  ...,  1996,  5776,   102]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1]]),
 'labels': tensor([0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
         0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [8]:
curr_max = 0
for i in train_loader:
    temp_max = i['input_ids'].max().detach().cpu().numpy()
    if temp_max > curr_max:
        curr_max = temp_max

In [9]:
for i in val_loader:
    temp_max = i['input_ids'].max().detach().cpu().numpy()
    if temp_max > curr_max:
        curr_max = temp_max

In [10]:
vocab_size = curr_max + 1

In [11]:
class PositionalEncoding(nn.Module):
    # Cool thread to visualize it: https://datascience.stackexchange.com/questions/51065/what-is-the-positional-encoding-in-the-transformer-model
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model, device=device)
        position = torch.arange(0, max_len, dtype=torch.float, device=device).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, device=device).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        if d_model % 2 == 1:
            pe[:, 1::2] = torch.cos(position * div_term[:pe[:, 1::2].shape[1]])
        else:
            pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  
        self.register_buffer('pe', pe)
    #Layer that adds the encoding.
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

In [12]:
class FlashAttentionLayer(nn.Module):
    # Quick flash attention implementation
    def __init__(self, embed_dim, nhead, dropout=0.1):
        super().__init__()
        assert embed_dim % nhead == 0, "Embedding dimension must be divisible by number of heads"
        self.nhead = nhead
        self.head_dim = embed_dim // nhead  # Get head dimensions
        
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.attn_proj_q = nn.Linear(embed_dim, embed_dim)
        self.attn_proj_k = nn.Linear(embed_dim, embed_dim)
        self.attn_proj_v = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.feedforward = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim),
            nn.ReLU(),
            nn.Linear(4 * embed_dim, embed_dim)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, L, C = x.shape  # Batch, Sequence Length, Embedding Dim
        # Project linearly for Q, K, V
        q = self.attn_proj_q(x).view(B, L, self.nhead, self.head_dim).transpose(1, 2)  # (B, nh, L, head_dim)
        k = self.attn_proj_k(x).view(B, L, self.nhead, self.head_dim).transpose(1, 2)
        v = self.attn_proj_v(x).view(B, L, self.nhead, self.head_dim).transpose(1, 2)

        # Apply Flash Attention
        attn_output = memory_efficient_attention(q, k, v)  # (B, nh, L, head_dim)

        attn_output = attn_output.transpose(1, 2).reshape(B, L, C)  # Reshape back to (B, L, C)
        attn_output = self.out_proj(attn_output)
        x = self.norm1(x + self.dropout(attn_output))

        ff_output = self.feedforward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x


In [13]:

#Important to note the many parameters here, that's what we're optimizing
class SmallFlashTransformer(nn.Module):
    def __init__(self, vocab_size, embed_dim, nhead, num_layers, dropout, num_classes=2):
        super().__init__()
        #Make embedding and encoding layer
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_encoder = PositionalEncoding(embed_dim)
        # Note: You can stack as many attention layers as you want
        self.layers = nn.ModuleList([
            FlashAttentionLayer(embed_dim, nhead, dropout) for _ in range(num_layers)
        ])
        # Linear head
        # Add extra layers for better feature extraction
        self.norm = nn.LayerNorm(embed_dim)
        self.fc1 = nn.Linear(embed_dim, embed_dim * 2)
        self.activation = nn.GELU()
        self.fc2 = nn.Linear(embed_dim * 2, num_classes)
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.GELU()
        # Note how init weights is applied
        self.apply(self._init_weights)
    # Note the model initialization and weights
    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            
            
    def forward(self, x, attention_mask=None):
        x = self.embedding(x)
        x = self.pos_encoder(x)
        for layer in self.layers:
            x = layer(x)
        
        if attention_mask is not None:
            attention_mask = attention_mask.float()
        # Note the attention mask that is used to only pay attention "backwards" through the text.
        #Not necessary for nucleic acids
        if attention_mask is not None:
            x = x * attention_mask.unsqueeze(-1)
            # Mean pooling with mask - Note the representation used - Mean token representation here:
            x = x.sum(dim=1) / attention_mask.sum(dim=1, keepdim=True).clamp(min=1e-9)
            # Last token representation version
            #x = x[:,-1,:] / attention_mask.sum(dim=1, keepdim=True).clamp(min=1e-9)
        else:
            x = x.mean(dim=1)
            #x = x[:,-1,:]
        # Note the Fully Connected head built on top of the attention layers
        x = self.norm(x)
        x = self.fc1(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [14]:

# You need to generate a training function, it's also good practice to do so.
def train_epoch(model, dataloader, optimizer, scheduler, criterion):
    model.train()
    total_loss = 0.0
    for batch in dataloader:
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(inputs, attention_mask)
        # Debugging output shapes
        loss = criterion(outputs, labels)
        loss.backward()
        # Note the gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        total_loss += loss.item() * inputs.size(0)
        scheduler.step()
    return total_loss / len(dataloader.dataset)

def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    correct = 0
    with torch.no_grad():
        for batch in dataloader:
            inputs = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(inputs)
            # Debugging output shapes
            loss = criterion(outputs, labels)
            total_loss += loss.item() * inputs.size(0)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
    avg_loss = total_loss / len(dataloader.dataset)
    accuracy = correct / len(dataloader.dataset)
    return avg_loss, accuracy





In [16]:


# For optuna, it's good to generate a function that runs all your suggestions through.
def objective(trial):
    #This is what we'll test, but it could be anything, HEHE!
    embed_dim = trial.suggest_categorical("embed_dim", [32, 64, 128])
    nhead = trial.suggest_categorical("nhead", [1, 2, 4])
    num_layers = trial.suggest_int("num_layers", 1, 2)
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    lr = trial.suggest_float("lr", 1e-4, 1e-2)
    batch_size = trial.suggest_categorical("batch_size", [128,256,512])
    # Make the transformer
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    model = SmallFlashTransformer(vocab_size, embed_dim, nhead, num_layers, dropout).to(device)
    #NOTE this: Weighted classes for CrossEntropy loss
    class_counts = df['LABEL'].value_counts().to_list()
    weights = [1.0 / count for count in class_counts]
    class_weights = torch.tensor(weights, dtype=torch.float).to(device)
    
    #Note the weight and label smoothing
    criterion = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=0.1)
    #Note the difference between Adam and Adam Weight Decay
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    #Note difference between StepLR and OneCycleLR
    #scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1) 
    scheduler = optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=lr,
            epochs=10,
            steps_per_epoch=len(train_loader),
            pct_start=0.1
        )
    # Generate data loaders
    # Set how many epocs you want
    num_epochs = 10
    for epoch in range(num_epochs):
        print(epoch)
        train_loss = train_epoch(model, train_loader, optimizer, scheduler,criterion)
        val_loss, val_acc = evaluate(model, val_loader, criterion)
        print(val_acc)
        trial.report(val_loss, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return val_loss
# Do you want to minimize or maximize the objective (val_loss?)
study = optuna.create_study(direction="minimize")
# How many trials? optuna has several optimization algoirthms included
study.optimize(objective, n_trials=20, timeout=600)
#https://optuna.readthedocs.io/en/stable/tutorial/10_key_features/003_efficient_optimization_algorithms.html Oh god, there's so many of them.
print("Best trial:")
trial = study.best_trial
print("  Validation Loss:", trial.value)
print("  Best hyperparameters:", trial.params)

[I 2025-03-11 13:45:26,047] A new study created in memory with name: no-name-d4ebe030-acfb-4a2b-9b14-27a7d93ab06e


0
0.23138832997987926
1
0.4798792756539235
2
0.682092555331992
3
0.6438631790744467
4
0.6680080482897385
5
0.6348088531187123
6
0.6519114688128773
7
0.635814889336016
8
0.635814889336016
9


[I 2025-03-11 13:47:59,079] Trial 0 finished with value: 0.601291082154817 and parameters: {'embed_dim': 128, 'nhead': 1, 'num_layers': 2, 'dropout': 0.4337938793442445, 'lr': 0.0038079254590203406, 'batch_size': 128}. Best is trial 0 with value: 0.601291082154817.


0.635814889336016
0
0.23138832997987926
1
0.23138832997987926
2
0.23138832997987926
3
0.727364185110664
4
0.635814889336016
5
0.6579476861167002
6
0.7142857142857143
7
0.676056338028169
8
0.693158953722334
9


[I 2025-03-11 13:50:06,015] Trial 1 finished with value: 0.577077158739869 and parameters: {'embed_dim': 64, 'nhead': 4, 'num_layers': 1, 'dropout': 0.17258195662547351, 'lr': 0.002490214165099875, 'batch_size': 512}. Best is trial 1 with value: 0.577077158739869.


0.693158953722334
0
0.23138832997987926
1
0.6851106639839034
2
0.7092555331991952
3
0.6780684104627767
4


[W 2025-03-11 13:51:03,841] Trial 2 failed with parameters: {'embed_dim': 64, 'nhead': 4, 'num_layers': 1, 'dropout': 0.4615023374789826, 'lr': 0.006268787743472726, 'batch_size': 256} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/nikolas/miniconda3/lib/python3.9/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_12506/1989297933.py", line 37, in objective
    train_loss = train_epoch(model, train_loader, optimizer, scheduler,criterion)
  File "/tmp/ipykernel_12506/774603689.py", line 5, in train_epoch
    for batch in dataloader:
  File "/home/nikolas/miniconda3/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 708, in __next__
    data = self._next_data()
  File "/home/nikolas/miniconda3/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 764, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration

In [25]:
model = SmallFlashTransformer(vocab_size, 512, 8, 8, 0.3).to(device)
print(model.parameters)
total_params = 0
for param in model.parameters():
    total_params += param.numel()

print(f"Total number of parameters: {total_params}")

<bound method Module.parameters of SmallFlashTransformer(
  (embedding): Embedding(29653, 512)
  (pos_encoder): PositionalEncoding()
  (layers): ModuleList(
    (0-7): 8 x FlashAttentionLayer(
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (attn_proj_q): Linear(in_features=512, out_features=512, bias=True)
      (attn_proj_k): Linear(in_features=512, out_features=512, bias=True)
      (attn_proj_v): Linear(in_features=512, out_features=512, bias=True)
      (out_proj): Linear(in_features=512, out_features=512, bias=True)
      (feedforward): Sequential(
        (0): Linear(in_features=512, out_features=2048, bias=True)
        (1): ReLU()
        (2): Linear(in_features=2048, out_features=512, bias=True)
      )
      (dropout): Dropout(p=0.3, inplace=False)
    )
  )
  (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (fc1): Linear(in_features=512, out_features=1024, bias=Tru

In [26]:
from muon import Muon

In [41]:

# You need to generate a training function, it's also good practice to do so.
def train_epoch(model, dataloader, optimizers, scheduler, criterion):
    model.train()
    total_loss = 0.0
    for batch in dataloader:
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        for opt in optimizers:
            opt.zero_grad()
        outputs = model(inputs, attention_mask)
        # Debugging output shapes
        loss = criterion(outputs, labels)
        loss.backward()
        # Note the gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        # in the training step
        for opt in optimizers:
            opt.step()
        total_loss += loss.item() * inputs.size(0)
        scheduler.step()
    return total_loss / len(dataloader.dataset)




In [45]:
help(Muon)

Help on class Muon in module muon:

class Muon(torch.optim.optimizer.Optimizer)
 |  Muon(params, lr=0.02, weight_decay=0.01, momentum=0.95, nesterov=True, ns_steps=5, rank=0, world_size=1)
 |  
 |  Muon - MomentUm Orthogonalized by Newton-schulz
 |  
 |  https://kellerjordan.github.io/posts/muon/
 |  
 |  Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
 |  processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
 |  matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
 |  the advantage that it can be stably run in bfloat16 on the GPU.
 |  
 |  - This optimizer should not be used for the embedding layer, the final fully connected layer,
 |  or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
 |  - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
 |  
 |  Arguments:
 |      lr: The lea

In [51]:
import torch.distributed as dist

# Initialize the process group with the 'nccl' backend and TCP-based discovery
dist.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:51000', world_size=1, rank=0)

In [53]:


# For optuna, it's good to generate a function that runs all your suggestions through.
def objective(trial):
    #This is what we'll test, but it could be anything, HEHE!
    embed_dim = trial.suggest_categorical("embed_dim", [64, 128, 256])
    nhead = trial.suggest_categorical("nhead", [1, 2, 4])
    num_layers = trial.suggest_int("num_layers", 1, 4)
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    lr = trial.suggest_float("lr", 1e-4, 1e-2)
    batch_size = trial.suggest_categorical("batch_size", [128,256,512])
    # Make the transformer
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    model = SmallFlashTransformer(vocab_size, embed_dim, nhead, num_layers, dropout).to(device)
    #NOTE this: Weighted classes for CrossEntropy loss
    class_counts = df['LABEL'].value_counts().to_list()
    weights = [1.0 / count for count in class_counts]
    class_weights = torch.tensor(weights, dtype=torch.float).to(device)
    
    #Note the weight and label smoothing
    criterion = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=0.1)
    #Note the difference between Adam and Adam Weight Decay
    # Find â‰¥2D parameters in the body of the network -- these should be optimized by Muon
    muon_params = [p for p in model.layers.parameters() if p.ndim >= 2]
    # Find everything else -- these should be optimized by AdamW
    adamw_params = ([p for p in model.layers.parameters() if p.ndim < 2]
                  + [*model.fc1.parameters(),*model.fc2.parameters(), *model.embedding.parameters()])
    # Create the optimizer
    optimizers = [Muon(muon_params, lr=0.02, momentum=0.95),
                  torch.optim.AdamW(adamw_params, lr=3e-4, betas=(0.90, 0.95), weight_decay=0.01)]

    #Note difference between StepLR and OneCycleLR
    #scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1) 
    scheduler = optim.lr_scheduler.OneCycleLR(
            optimizers[1],
            max_lr=lr,
            epochs=10,
            steps_per_epoch=len(train_loader),
            pct_start=0.1
        )
    # Generate data loaders
    # Set how many epocs you want
    num_epochs = 10
    for epoch in range(num_epochs):
        print(epoch)
        train_loss = train_epoch(model, train_loader, optimizers, scheduler,criterion)
        val_loss, val_acc = evaluate(model, val_loader, criterion)
        print(val_acc)
        trial.report(val_loss, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return val_loss
# Do you want to minimize or maximize the objective (val_loss?)
study = optuna.create_study(direction="minimize")
# How many trials? optuna has several optimization algoirthms included
study.optimize(objective, n_trials=20, timeout=600)
#https://optuna.readthedocs.io/en/stable/tutorial/10_key_features/003_efficient_optimization_algorithms.html Oh god, there's so many of them.
print("Best trial:")
trial = study.best_trial
print("  Validation Loss:", trial.value)
print("  Best hyperparameters:", trial.params)

[I 2025-03-11 14:43:26,835] A new study created in memory with name: no-name-7b4bc429-afa2-4506-8f98-c2d7a10a5350


0
0.23138832997987926
1
0.23138832997987926
2
0.23138832997987926
3
0.23138832997987926
4
0.23138832997987926
5
0.6861167002012073
6
0.23138832997987926
7
0.23138832997987926
8
0.7535211267605634
9


[I 2025-03-11 14:46:32,645] Trial 0 finished with value: 0.6862265133042211 and parameters: {'embed_dim': 256, 'nhead': 2, 'num_layers': 3, 'dropout': 0.3996054369150499, 'lr': 0.00043456229507846773, 'batch_size': 512}. Best is trial 0 with value: 0.6862265133042211.


0.7535211267605634
0
0.23138832997987926
1
0.23138832997987926
2
0.7454728370221329
3
0.6740442655935613
4
0.6629778672032193
5
0.7313883299798792
6
0.7323943661971831
7
0.7152917505030181
8
0.7022132796780685
9


[I 2025-03-11 14:49:07,465] Trial 1 finished with value: 0.5758691293372954 and parameters: {'embed_dim': 64, 'nhead': 4, 'num_layers': 3, 'dropout': 0.18354112854748, 'lr': 0.006730967320945705, 'batch_size': 512}. Best is trial 1 with value: 0.5758691293372954.


0.7152917505030181
0
0.7686116700201208
1
0.7344064386317908
2
0.704225352112676
3
0.6096579476861167
4
0.6971830985915493
5
0.7132796780684104
6
0.6247484909456741
7
0.6579476861167002
8
0.630784708249497
9


[I 2025-03-11 14:51:40,166] Trial 2 finished with value: 0.6005610419471019 and parameters: {'embed_dim': 64, 'nhead': 1, 'num_layers': 3, 'dropout': 0.43695408096549937, 'lr': 0.004604097537256577, 'batch_size': 128}. Best is trial 1 with value: 0.5758691293372954.


0.670020120724346
0
0.23138832997987926
1
0.23138832997987926
2
0.5583501006036218
3
0.7505030181086519
4
0.7364185110663984
5
0.6368209255533199
6
0.7344064386317908
7
0.7354124748490946
8
0.676056338028169
9


[I 2025-03-11 14:54:57,349] Trial 3 finished with value: 0.5706504096447582 and parameters: {'embed_dim': 256, 'nhead': 2, 'num_layers': 3, 'dropout': 0.29331254792873557, 'lr': 0.004957341273659826, 'batch_size': 512}. Best is trial 3 with value: 0.5706504096447582.


0.6770623742454729
Best trial:
  Validation Loss: 0.5706504096447582
  Best hyperparameters: {'embed_dim': 256, 'nhead': 2, 'num_layers': 3, 'dropout': 0.29331254792873557, 'lr': 0.004957341273659826, 'batch_size': 512}
