In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/crater-segmentation/Ytrain2_b.npy
/kaggle/input/crater-segmentation/Xtrain2_b.npy


In [None]:
X = np.load('Xtrain2_b.npy')
y = np.load('Ytrain2_b.npy')

In [3]:
print(X.shape,y.shape)

(547, 2304) (547, 2304)


In [4]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import ColorJitter, Resize
from torchvision.transforms.functional import InterpolationMode
from torch.cuda.amp import autocast, GradScaler
from transformers import SegformerForSemanticSegmentation, SegformerImageProcessor
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

# Check GPU availability and set device
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Number of GPUs: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

class CraterDataset(Dataset):
    def __init__(self, images, masks, processor, transform=None):
        self.images = images
        self.masks = masks
        self.processor = processor
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = self.images[idx]
        mask = self.masks[idx]
        image_rgb = np.repeat(image.reshape(48, 48, 1), 3, axis=-1)
        
        # Apply ColorJitter augmentation if transform exists
        if self.transform and torch.rand(1) < 0.5:  # 50% chance to apply
            image_tensor = torch.from_numpy(image_rgb).permute(2, 0, 1)
            image_tensor = self.transform(image_tensor)
            image_rgb = image_tensor.permute(1, 2, 0).numpy()

        inputs = self.processor(
            images=image_rgb, 
            segmentation_maps=mask, 
            return_tensors="pt",
            do_rescale=False
        )
        return {
            'pixel_values': inputs['pixel_values'].squeeze(),
            'labels': inputs['labels'].squeeze()
        }

def calculate_validation_accuracy(model, X_test_normalized, y_test, processor):
    resize_transform = Resize(
        size=(48, 48),
        interpolation=InterpolationMode.BILINEAR,
        antialias=True
    )
    
    model.eval()
    predictions = np.zeros((len(X_test_normalized), 48, 48))
    
    with torch.no_grad():
        for i, image in enumerate(X_test_normalized):
            image_rgb = np.repeat(image.reshape(48, 48, 1), 3, axis=-1)
            inputs = processor(
                images=image_rgb,
                return_tensors="pt",
                do_rescale=False
            )
            pixel_values = inputs['pixel_values'].cuda()
            outputs = model(pixel_values=pixel_values)
            logits = outputs.logits
            probs = torch.softmax(logits, dim=1)
            pred = probs[0, 1].cpu().numpy()
            
            if pred.shape != (48, 48):
                pred_tensor = torch.from_numpy(pred).unsqueeze(0)
                pred_resized = resize_transform(pred_tensor).squeeze().numpy()
                predictions[i] = pred_resized
            else:
                predictions[i] = pred
    
    y_true_flat = y_test.reshape(-1, 48, 48).flatten()
    y_pred_flat = predictions.flatten()
    y_pred_flat = (y_pred_flat > 0.5).astype(int)
    
    return balanced_accuracy_score(y_true_flat, y_pred_flat)

print("Starting data preparation...")

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape}, Test set size: {X_test.shape}")

# Reshape and normalize data
X_train_reshaped = X_train.reshape(-1, 48, 48, 1)
y_train_reshaped = y_train.reshape(-1, 48, 48)
X_train_normalized = X_train_reshaped / 255.0

# Reshape and normalize test data
X_test_reshaped = X_test.reshape(-1, 48, 48, 1)
X_test_normalized = X_test_reshaped / 255.0

print("Initializing model and processor...")

# Initialize model and processor
processor = SegformerImageProcessor.from_pretrained("nvidia/mit-b3")
model = SegformerForSemanticSegmentation.from_pretrained(
    "nvidia/mit-b3",
    num_labels=2,
    ignore_mismatched_sizes=True
)

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)
model = model.cuda()

# Data augmentation
color_jitter = ColorJitter(
    brightness=0.2,
    contrast=0.2,
    saturation=0.2,
    hue=0.1
)

# Create dataset and dataloader with augmentation
train_dataset = CraterDataset(
    X_train_normalized, 
    y_train_reshaped, 
    processor,
    transform=color_jitter
)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)

# Training setup
optimizer = torch.optim.AdamW([
    {'params': model.module.segformer.parameters(), 'lr': 1e-5},
    {'params': model.module.decode_head.parameters(), 'lr': 1e-4}
])

scheduler = lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer,
    T_0=5,
    T_mult=2,
    eta_min=1e-6
)

scaler = GradScaler()

# Training loop
best_balanced_accuracy = 0
patience = 100  # Changed to 20
patience_counter = 0
num_epochs = 100  # Changed to 100

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    batch_count = 0
    
    for batch_idx, batch in enumerate(train_dataloader):
        pixel_values = batch['pixel_values'].cuda()
        labels = batch['labels'].cuda()
        
        with autocast():
            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss.mean()
        
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        total_loss += loss.item()
        batch_count += 1
        
        if batch_idx % 5 == 0:
            print(f"Epoch: {epoch}, Batch: {batch_idx}, Loss: {loss.item():.4f}")
            print(f"Learning rates: {[group['lr'] for group in optimizer.param_groups]}")
            for i in range(torch.cuda.device_count()):
                memory_allocated = torch.cuda.memory_allocated(i) / 1e9
                print(f"GPU {i} memory allocated: {memory_allocated:.2f} GB")
    
    avg_loss = total_loss / batch_count
    
    # Calculate validation balanced accuracy
    val_balanced_accuracy = calculate_validation_accuracy(model, X_test_normalized, y_test, processor)
    
    print(f"Epoch: {epoch}")
    print(f"Average Training Loss: {avg_loss:.4f}")
    print(f"Validation Balanced Accuracy: {val_balanced_accuracy:.4f}")
    
    scheduler.step()
    
    # Early stopping and model saving based on balanced accuracy
    if val_balanced_accuracy > best_balanced_accuracy:
        best_balanced_accuracy = val_balanced_accuracy
        patience_counter = 0
        # Save best model
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'loss': avg_loss,
            'balanced_accuracy': val_balanced_accuracy,
        }, 'best_model.pt')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping triggered after {epoch + 1} epochs - No improvement in balanced accuracy")
            break
    
    # Regular checkpoint saving
    if (epoch + 1) % 5 == 0:
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'loss': avg_loss,
            'balanced_accuracy': val_balanced_accuracy,
        }, f'checkpoint_epoch_{epoch+1}.pt')

CUDA available: True
Number of GPUs: 2
GPU 0: Tesla T4
GPU 1: Tesla T4
Starting data preparation...
Training set size: (437, 2304), Test set size: (110, 2304)
Initializing model and processor...


preprocessor_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

  return func(*args, **kwargs)


config.json:   0%|          | 0.00/70.0k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/179M [00:00<?, ?B/s]

Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/mit-b3 and are newly initialized: ['decode_head.batch_norm.bias', 'decode_head.batch_norm.num_batches_tracked', 'decode_head.batch_norm.running_mean', 'decode_head.batch_norm.running_var', 'decode_head.batch_norm.weight', 'decode_head.classifier.bias', 'decode_head.classifier.weight', 'decode_head.linear_c.0.proj.bias', 'decode_head.linear_c.0.proj.weight', 'decode_head.linear_c.1.proj.bias', 'decode_head.linear_c.1.proj.weight', 'decode_head.linear_c.2.proj.bias', 'decode_head.linear_c.2.proj.weight', 'decode_head.linear_c.3.proj.bias', 'decode_head.linear_c.3.proj.weight', 'decode_head.linear_fuse.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 2 GPUs!


  scaler = GradScaler()
  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 0, Batch: 0, Loss: 0.7261
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.83 GB
GPU 1 memory allocated: 0.01 GB
Epoch: 0, Batch: 5, Loss: 0.4513
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.84 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 0, Batch: 10, Loss: 0.5543
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.84 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 0, Batch: 15, Loss: 0.4136
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.84 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 0, Batch: 20, Loss: 0.4891
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.84 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 0, Batch: 25, Loss: 0.4840
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.84 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 0, Batch: 30, Loss: 0.3865
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.84 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 0, Batch: 35, Loss: 0.5339
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.

  self.pid = os.fork()


Epoch: 0
Average Training Loss: 0.4499
Validation Balanced Accuracy: 0.7450


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 1, Batch: 0, Loss: 0.3712
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 1, Batch: 5, Loss: 0.5598
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 1, Batch: 10, Loss: 0.3997
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 1, Batch: 15, Loss: 0.4874
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 1, Batch: 20, Loss: 0.2994
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 1, Batch: 25, Loss: 0.3282
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 1, Batch: 30, Loss: 0.2656
Learning

  self.pid = os.fork()


Epoch: 1
Average Training Loss: 0.3689
Validation Balanced Accuracy: 0.7545


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 2, Batch: 0, Loss: 0.2753
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 2, Batch: 5, Loss: 0.3546
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 2, Batch: 10, Loss: 0.4254
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 2, Batch: 15, Loss: 0.2760
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 2, Batch: 20, Loss: 0.3517
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 2, Batch: 25, Loss: 0.2939
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 2, Batch: 30, Loss: 0.4247
Learning

  self.pid = os.fork()


Epoch: 2
Average Training Loss: 0.3291
Validation Balanced Accuracy: 0.7756


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 3, Batch: 0, Loss: 0.4628
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 3, Batch: 5, Loss: 0.2947
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 3, Batch: 10, Loss: 0.2696
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 3, Batch: 15, Loss: 0.2186
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 3, Batch: 20, Loss: 0.3052
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 3, Batch: 25, Loss: 0.3098
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 3, Batch: 30, Loss: 0.3

  self.pid = os.fork()


Epoch: 3
Average Training Loss: 0.3109
Validation Balanced Accuracy: 0.7841


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 4, Batch: 0, Loss: 0.2991
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 4, Batch: 5, Loss: 0.2654
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 4, Batch: 10, Loss: 0.2897
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 4, Batch: 15, Loss: 0.3117
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 4, Batch: 20, Loss: 0.3131
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 4, Batch: 25, Loss: 0.3357
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 4, Batch: 30, Los

  self.pid = os.fork()


Epoch: 4
Average Training Loss: 0.2999
Validation Balanced Accuracy: 0.7905


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 5, Batch: 0, Loss: 0.2300
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 5, Batch: 5, Loss: 0.4140
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 5, Batch: 10, Loss: 0.2165
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 5, Batch: 15, Loss: 0.3265
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 5, Batch: 20, Loss: 0.3028
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 5, Batch: 25, Loss: 0.2638
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 5, Batch: 30, Loss: 0.3101
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 5, Batch: 35, Loss: 0.2592
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.

  self.pid = os.fork()


Epoch: 5
Average Training Loss: 0.3038
Validation Balanced Accuracy: 0.7938


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 6, Batch: 0, Loss: 0.2138
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 6, Batch: 5, Loss: 0.3610
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 6, Batch: 10, Loss: 0.2038
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 6, Batch: 15, Loss: 0.1880
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 6, Batch: 20, Loss: 0.5354
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 6, Batch: 25, Loss: 0.3677
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 6, Batch: 30, Loss: 0.3416
Le

  self.pid = os.fork()


Epoch: 6
Average Training Loss: 0.2898
Validation Balanced Accuracy: 0.7904


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 7, Batch: 0, Loss: 0.2527
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 7, Batch: 5, Loss: 0.2298
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 7, Batch: 10, Loss: 0.2564
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 7, Batch: 15, Loss: 0.2272
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 7, Batch: 20, Loss: 0.3271
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 7, Batch: 25, Loss: 0.1518
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 7, Batch: 30, Loss: 0.2217
Learning

  self.pid = os.fork()


Epoch: 7
Average Training Loss: 0.2659
Validation Balanced Accuracy: 0.8048


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 8, Batch: 0, Loss: 0.1750
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 8, Batch: 5, Loss: 0.3590
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 8, Batch: 10, Loss: 0.2163
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 8, Batch: 15, Loss: 0.2332
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 8, Batch: 20, Loss: 0.2019
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 8, Batch: 25, Loss: 0.1935
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 8, Batch: 30, Loss: 0.2897
Learning

  self.pid = os.fork()


Epoch: 8
Average Training Loss: 0.2519
Validation Balanced Accuracy: 0.7941


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 9, Batch: 0, Loss: 0.2328
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 9, Batch: 5, Loss: 0.2594
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 9, Batch: 10, Loss: 0.1894
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 9, Batch: 15, Loss: 0.1961
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 9, Batch: 20, Loss: 0.2288
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 9, Batch: 25, Loss: 0.1603
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 9, Batch: 30, Loss: 0.2483
Learning

  self.pid = os.fork()


Epoch: 9
Average Training Loss: 0.2378
Validation Balanced Accuracy: 0.7928


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 10, Batch: 0, Loss: 0.2207
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 10, Batch: 5, Loss: 0.1790
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 10, Batch: 10, Loss: 0.2036
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 10, Batch: 15, Loss: 0.2062
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 10, Batch: 20, Loss: 0.2943
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 10, Batch: 25, Loss: 0.2005
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 10, Batch: 30, Loss: 0.1409
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 10, Batch: 35, Loss: 0.1897
Learning rates: [5.5e-

  self.pid = os.fork()


Epoch: 10
Average Training Loss: 0.2216
Validation Balanced Accuracy: 0.8132


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 11, Batch: 0, Loss: 0.1520
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 11, Batch: 5, Loss: 0.2775
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 11, Batch: 10, Loss: 0.2029
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 11, Batch: 15, Loss: 0.2508
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 11, Batch: 20, Loss: 0.1868
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 11, Batch: 25, Loss: 0.1976
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 11, Batch: 30, Lo

  self.pid = os.fork()


Epoch: 11
Average Training Loss: 0.2071
Validation Balanced Accuracy: 0.8095


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 12, Batch: 0, Loss: 0.1833
Learning rates: [2.8549663646838717e-06, 2.1404630011522586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 12, Batch: 5, Loss: 0.2379
Learning rates: [2.8549663646838717e-06, 2.1404630011522586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 12, Batch: 10, Loss: 0.1340
Learning rates: [2.8549663646838717e-06, 2.1404630011522586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 12, Batch: 15, Loss: 0.1483
Learning rates: [2.8549663646838717e-06, 2.1404630011522586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 12, Batch: 20, Loss: 0.1840
Learning rates: [2.8549663646838717e-06, 2.1404630011522586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 12, Batch: 25, Loss: 0.1142
Learning rates: [2.8549663646838717e-06, 2.1404630011522586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 12, Batch: 

  self.pid = os.fork()


Epoch: 12
Average Training Loss: 0.2007
Validation Balanced Accuracy: 0.8148


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 13, Batch: 0, Loss: 0.1982
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 13, Batch: 5, Loss: 0.2125
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 13, Batch: 10, Loss: 0.1657
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 13, Batch: 15, Loss: 0.1704
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 13, Batch: 20, Loss: 0.2015
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 13, Batch: 25, Loss: 0.2362
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 13, Batch: 

  self.pid = os.fork()


Epoch: 13
Average Training Loss: 0.1983
Validation Balanced Accuracy: 0.8158


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 14, Batch: 0, Loss: 0.2334
Learning rates: [1.220245676671809e-06, 3.4227024433899e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 14, Batch: 5, Loss: 0.2096
Learning rates: [1.220245676671809e-06, 3.4227024433899e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 14, Batch: 10, Loss: 0.1846
Learning rates: [1.220245676671809e-06, 3.4227024433899e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 14, Batch: 15, Loss: 0.1877
Learning rates: [1.220245676671809e-06, 3.4227024433899e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 14, Batch: 20, Loss: 0.1525
Learning rates: [1.220245676671809e-06, 3.4227024433899e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 14, Batch: 25, Loss: 0.2986
Learning rates: [1.220245676671809e-06, 3.4227024433899e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 14, Batch: 30, Loss: 0.1995
Learnin

  self.pid = os.fork()


Epoch: 14
Average Training Loss: 0.1927
Validation Balanced Accuracy: 0.8123


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 15, Batch: 0, Loss: 0.2350
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 15, Batch: 5, Loss: 0.2354
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 15, Batch: 10, Loss: 0.1837
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 15, Batch: 15, Loss: 0.1729
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 15, Batch: 20, Loss: 0.2163
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 15, Batch: 25, Loss: 0.2420
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 15, Batch: 30, Loss: 0.1636
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 15, Batch: 35, Loss: 0.2521
Learning rates: [1e-05, 0.0001]
GPU 0 memory alloc

  self.pid = os.fork()


Epoch: 15
Average Training Loss: 0.1998
Validation Balanced Accuracy: 0.8137


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 16, Batch: 0, Loss: 0.1767
Learning rates: [9.94459753267812e-06, 9.939057285945933e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 16, Batch: 5, Loss: 0.1599
Learning rates: [9.94459753267812e-06, 9.939057285945933e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 16, Batch: 10, Loss: 0.1960
Learning rates: [9.94459753267812e-06, 9.939057285945933e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 16, Batch: 15, Loss: 0.2628
Learning rates: [9.94459753267812e-06, 9.939057285945933e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 16, Batch: 20, Loss: 0.2031
Learning rates: [9.94459753267812e-06, 9.939057285945933e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 16, Batch: 25, Loss: 0.1514
Learning rates: [9.94459753267812e-06, 9.939057285945933e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 16, Batch: 30, Loss: 0.2204
L

  self.pid = os.fork()


Epoch: 16
Average Training Loss: 0.1912
Validation Balanced Accuracy: 0.8100


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 17, Batch: 0, Loss: 0.1477
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 17, Batch: 5, Loss: 0.1794
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 17, Batch: 10, Loss: 0.2498
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 17, Batch: 15, Loss: 0.2111
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 17, Batch: 20, Loss: 0.2012
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 17, Batch: 25, Loss: 0.2128
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 17, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 17
Average Training Loss: 0.1867
Validation Balanced Accuracy: 0.8041


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 18, Batch: 0, Loss: 0.1519
Learning rates: [9.509529358847657e-06, 9.460482294732421e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 18, Batch: 5, Loss: 0.1865
Learning rates: [9.509529358847657e-06, 9.460482294732421e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 18, Batch: 10, Loss: 0.1703
Learning rates: [9.509529358847657e-06, 9.460482294732421e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 18, Batch: 15, Loss: 0.1657
Learning rates: [9.509529358847657e-06, 9.460482294732421e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 18, Batch: 20, Loss: 0.1553
Learning rates: [9.509529358847657e-06, 9.460482294732421e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 18, Batch: 25, Loss: 0.1644
Learning rates: [9.509529358847657e-06, 9.460482294732421e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 18, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 18
Average Training Loss: 0.1696
Validation Balanced Accuracy: 0.8155


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 19, Batch: 0, Loss: 0.1689
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 19, Batch: 5, Loss: 0.1574
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 19, Batch: 10, Loss: 0.1699
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 19, Batch: 15, Loss: 0.1947
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 19, Batch: 20, Loss: 0.1692
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 19, Batch: 25, Loss: 0.1600
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 19, Batch: 30, Loss: 0.1960
L

  self.pid = os.fork()


Epoch: 19
Average Training Loss: 0.1707
Validation Balanced Accuracy: 0.8150


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 20, Batch: 0, Loss: 0.1639
Learning rates: [8.681980515339464e-06, 8.55017856687341e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 20, Batch: 5, Loss: 0.2175
Learning rates: [8.681980515339464e-06, 8.55017856687341e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 20, Batch: 10, Loss: 0.1219
Learning rates: [8.681980515339464e-06, 8.55017856687341e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 20, Batch: 15, Loss: 0.1294
Learning rates: [8.681980515339464e-06, 8.55017856687341e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 20, Batch: 20, Loss: 0.1514
Learning rates: [8.681980515339464e-06, 8.55017856687341e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 20, Batch: 25, Loss: 0.1345
Learning rates: [8.681980515339464e-06, 8.55017856687341e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 20, Batch: 30, Loss: 0.1648
L

  self.pid = os.fork()


Epoch: 20
Average Training Loss: 0.1575
Validation Balanced Accuracy: 0.8225


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 21, Batch: 0, Loss: 0.1339
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 21, Batch: 5, Loss: 0.1815
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 21, Batch: 10, Loss: 0.1307
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 21, Batch: 15, Loss: 0.2019
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 21, Batch: 20, Loss: 0.1769
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 21, Batch: 25, Loss: 0.1151
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 21, Batch: 30, Loss: 0.1644
L

  self.pid = os.fork()


Epoch: 21
Average Training Loss: 0.1491
Validation Balanced Accuracy: 0.8186


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 22, Batch: 0, Loss: 0.1682
Learning rates: [7.542957248827961e-06, 7.297252973710757e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 22, Batch: 5, Loss: 0.1193
Learning rates: [7.542957248827961e-06, 7.297252973710757e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 22, Batch: 10, Loss: 0.2329
Learning rates: [7.542957248827961e-06, 7.297252973710757e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 22, Batch: 15, Loss: 0.1917
Learning rates: [7.542957248827961e-06, 7.297252973710757e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 22, Batch: 20, Loss: 0.1314
Learning rates: [7.542957248827961e-06, 7.297252973710757e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 22, Batch: 25, Loss: 0.1123
Learning rates: [7.542957248827961e-06, 7.297252973710757e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 22, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 22
Average Training Loss: 0.1424
Validation Balanced Accuracy: 0.8195


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 23, Batch: 0, Loss: 0.1772
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 23, Batch: 5, Loss: 0.1390
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 23, Batch: 10, Loss: 0.1613
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 23, Batch: 15, Loss: 0.1384
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 23, Batch: 20, Loss: 0.1667
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 23, Batch: 25, Loss: 0.1207
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 23, Batch: 30, Loss: 0.1162
L

  self.pid = os.fork()


Epoch: 23
Average Training Loss: 0.1425
Validation Balanced Accuracy: 0.8193


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 24, Batch: 0, Loss: 0.1258
Learning rates: [6.2039550926810394e-06, 5.8243506019491436e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 24, Batch: 5, Loss: 0.1637
Learning rates: [6.2039550926810394e-06, 5.8243506019491436e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 24, Batch: 10, Loss: 0.1420
Learning rates: [6.2039550926810394e-06, 5.8243506019491436e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 24, Batch: 15, Loss: 0.1423
Learning rates: [6.2039550926810394e-06, 5.8243506019491436e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 24, Batch: 20, Loss: 0.1496
Learning rates: [6.2039550926810394e-06, 5.8243506019491436e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 24, Batch: 25, Loss: 0.1597
Learning rates: [6.2039550926810394e-06, 5.8243506019491436e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 24, Batch: 

  self.pid = os.fork()


Epoch: 24
Average Training Loss: 0.1365
Validation Balanced Accuracy: 0.8148


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 25, Batch: 0, Loss: 0.1165
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 25, Batch: 5, Loss: 0.1180
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 25, Batch: 10, Loss: 0.1301
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 25, Batch: 15, Loss: 0.1394
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 25, Batch: 20, Loss: 0.1048
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 25, Batch: 25, Loss: 0.1274
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 25, Batch: 30, Loss: 0.1388
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 25, Batch: 35, Loss: 0.1325
Learning rates: [5.5e-

  self.pid = os.fork()


Epoch: 25
Average Training Loss: 0.1331
Validation Balanced Accuracy: 0.8197


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 26, Batch: 0, Loss: 0.1016
Learning rates: [4.796044907318962e-06, 4.2756493980508586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 26, Batch: 5, Loss: 0.1217
Learning rates: [4.796044907318962e-06, 4.2756493980508586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 26, Batch: 10, Loss: 0.0999
Learning rates: [4.796044907318962e-06, 4.2756493980508586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 26, Batch: 15, Loss: 0.1174
Learning rates: [4.796044907318962e-06, 4.2756493980508586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 26, Batch: 20, Loss: 0.1222
Learning rates: [4.796044907318962e-06, 4.2756493980508586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 26, Batch: 25, Loss: 0.1194
Learning rates: [4.796044907318962e-06, 4.2756493980508586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 26, Batch: 30, Lo

  self.pid = os.fork()


Epoch: 26
Average Training Loss: 0.1312
Validation Balanced Accuracy: 0.8187


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 27, Batch: 0, Loss: 0.1312
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 27, Batch: 5, Loss: 0.1523
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 27, Batch: 10, Loss: 0.1308
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 27, Batch: 15, Loss: 0.1675
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 27, Batch: 20, Loss: 0.1260
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 27, Batch: 25, Loss: 0.1073
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 27, Batch: 30, Lo

  self.pid = os.fork()


Epoch: 27
Average Training Loss: 0.1271
Validation Balanced Accuracy: 0.8180


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 28, Batch: 0, Loss: 0.1473
Learning rates: [3.45704275117204e-06, 2.8027470262892437e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 28, Batch: 5, Loss: 0.1271
Learning rates: [3.45704275117204e-06, 2.8027470262892437e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 28, Batch: 10, Loss: 0.1500
Learning rates: [3.45704275117204e-06, 2.8027470262892437e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 28, Batch: 15, Loss: 0.1147
Learning rates: [3.45704275117204e-06, 2.8027470262892437e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 28, Batch: 20, Loss: 0.1138
Learning rates: [3.45704275117204e-06, 2.8027470262892437e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 28, Batch: 25, Loss: 0.1405
Learning rates: [3.45704275117204e-06, 2.8027470262892437e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 28, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 28
Average Training Loss: 0.1252
Validation Balanced Accuracy: 0.8182


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 29, Batch: 0, Loss: 0.1364
Learning rates: [2.8549663646838717e-06, 2.1404630011522586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 29, Batch: 5, Loss: 0.1156
Learning rates: [2.8549663646838717e-06, 2.1404630011522586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 29, Batch: 10, Loss: 0.1019
Learning rates: [2.8549663646838717e-06, 2.1404630011522586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 29, Batch: 15, Loss: 0.1260
Learning rates: [2.8549663646838717e-06, 2.1404630011522586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 29, Batch: 20, Loss: 0.1171
Learning rates: [2.8549663646838717e-06, 2.1404630011522586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 29, Batch: 25, Loss: 0.1041
Learning rates: [2.8549663646838717e-06, 2.1404630011522586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 29, Batch: 

  self.pid = os.fork()


Epoch: 29
Average Training Loss: 0.1211
Validation Balanced Accuracy: 0.8162


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 30, Batch: 0, Loss: 0.1249
Learning rates: [2.3180194846605362e-06, 1.54982143312659e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 30, Batch: 5, Loss: 0.1110
Learning rates: [2.3180194846605362e-06, 1.54982143312659e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 30, Batch: 10, Loss: 0.1107
Learning rates: [2.3180194846605362e-06, 1.54982143312659e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 30, Batch: 15, Loss: 0.1208
Learning rates: [2.3180194846605362e-06, 1.54982143312659e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 30, Batch: 20, Loss: 0.0809
Learning rates: [2.3180194846605362e-06, 1.54982143312659e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 30, Batch: 25, Loss: 0.1192
Learning rates: [2.3180194846605362e-06, 1.54982143312659e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 30, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 30
Average Training Loss: 0.1215
Validation Balanced Accuracy: 0.8184


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 31, Batch: 0, Loss: 0.1183
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 31, Batch: 5, Loss: 0.1458
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 31, Batch: 10, Loss: 0.1361
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 31, Batch: 15, Loss: 0.0836
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 31, Batch: 20, Loss: 0.1286
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 31, Batch: 25, Loss: 0.1501
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 31, Batch: 

  self.pid = os.fork()


Epoch: 31
Average Training Loss: 0.1205
Validation Balanced Accuracy: 0.8175


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 32, Batch: 0, Loss: 0.1261
Learning rates: [1.4904706411523449e-06, 6.395177052675795e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 32, Batch: 5, Loss: 0.1349
Learning rates: [1.4904706411523449e-06, 6.395177052675795e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 32, Batch: 10, Loss: 0.0812
Learning rates: [1.4904706411523449e-06, 6.395177052675795e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 32, Batch: 15, Loss: 0.1070
Learning rates: [1.4904706411523449e-06, 6.395177052675795e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 32, Batch: 20, Loss: 0.1444
Learning rates: [1.4904706411523449e-06, 6.395177052675795e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 32, Batch: 25, Loss: 0.1169
Learning rates: [1.4904706411523449e-06, 6.395177052675795e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 32, Batch: 30, Lo

  self.pid = os.fork()


Epoch: 32
Average Training Loss: 0.1187
Validation Balanced Accuracy: 0.8199


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 33, Batch: 0, Loss: 0.1139
Learning rates: [1.220245676671809e-06, 3.4227024433899e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 33, Batch: 5, Loss: 0.1161
Learning rates: [1.220245676671809e-06, 3.4227024433899e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 33, Batch: 10, Loss: 0.1068
Learning rates: [1.220245676671809e-06, 3.4227024433899e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 33, Batch: 15, Loss: 0.1274
Learning rates: [1.220245676671809e-06, 3.4227024433899e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 33, Batch: 20, Loss: 0.0817
Learning rates: [1.220245676671809e-06, 3.4227024433899e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 33, Batch: 25, Loss: 0.0903
Learning rates: [1.220245676671809e-06, 3.4227024433899e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 33, Batch: 30, Loss: 0.1429
Learnin

  self.pid = os.fork()


Epoch: 33
Average Training Loss: 0.1167
Validation Balanced Accuracy: 0.8187


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 34, Batch: 0, Loss: 0.1410
Learning rates: [1.0554024673218804e-06, 1.6094271405406859e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 34, Batch: 5, Loss: 0.1445
Learning rates: [1.0554024673218804e-06, 1.6094271405406859e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 34, Batch: 10, Loss: 0.1170
Learning rates: [1.0554024673218804e-06, 1.6094271405406859e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 34, Batch: 15, Loss: 0.1082
Learning rates: [1.0554024673218804e-06, 1.6094271405406859e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 34, Batch: 20, Loss: 0.1404
Learning rates: [1.0554024673218804e-06, 1.6094271405406859e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 34, Batch: 25, Loss: 0.1242
Learning rates: [1.0554024673218804e-06, 1.6094271405406859e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 34, Batch: 

  self.pid = os.fork()


Epoch: 34
Average Training Loss: 0.1194
Validation Balanced Accuracy: 0.8194


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 35, Batch: 0, Loss: 0.1236
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 35, Batch: 5, Loss: 0.1108
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 35, Batch: 10, Loss: 0.0989
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 35, Batch: 15, Loss: 0.1139
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 35, Batch: 20, Loss: 0.1235
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 35, Batch: 25, Loss: 0.0814
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 35, Batch: 30, Loss: 0.1255
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 35, Batch: 35, Loss: 0.1463
Learning rates: [1e-05, 0.0001]
GPU 0 memory alloc

  self.pid = os.fork()


Epoch: 35
Average Training Loss: 0.1227
Validation Balanced Accuracy: 0.8120


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 36, Batch: 0, Loss: 0.1073
Learning rates: [9.986128001799077e-06, 9.984740801978984e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 36, Batch: 5, Loss: 0.1288
Learning rates: [9.986128001799077e-06, 9.984740801978984e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 36, Batch: 10, Loss: 0.1087
Learning rates: [9.986128001799077e-06, 9.984740801978984e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 36, Batch: 15, Loss: 0.0994
Learning rates: [9.986128001799077e-06, 9.984740801978984e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 36, Batch: 20, Loss: 0.1123
Learning rates: [9.986128001799077e-06, 9.984740801978984e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 36, Batch: 25, Loss: 0.1073
Learning rates: [9.986128001799077e-06, 9.984740801978984e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 36, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 36
Average Training Loss: 0.1241
Validation Balanced Accuracy: 0.8199


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 37, Batch: 0, Loss: 0.1250
Learning rates: [9.94459753267812e-06, 9.939057285945933e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 37, Batch: 5, Loss: 0.1040
Learning rates: [9.94459753267812e-06, 9.939057285945933e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 37, Batch: 10, Loss: 0.1463
Learning rates: [9.94459753267812e-06, 9.939057285945933e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 37, Batch: 15, Loss: 0.1196
Learning rates: [9.94459753267812e-06, 9.939057285945933e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 37, Batch: 20, Loss: 0.0998
Learning rates: [9.94459753267812e-06, 9.939057285945933e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 37, Batch: 25, Loss: 0.1070
Learning rates: [9.94459753267812e-06, 9.939057285945933e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 37, Batch: 30, Loss: 0.0987
L

  self.pid = os.fork()


Epoch: 37
Average Training Loss: 0.1223
Validation Balanced Accuracy: 0.8233


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 38, Batch: 0, Loss: 0.1164
Learning rates: [9.875664641789545e-06, 9.8632311059685e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 38, Batch: 5, Loss: 0.1263
Learning rates: [9.875664641789545e-06, 9.8632311059685e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 38, Batch: 10, Loss: 0.0913
Learning rates: [9.875664641789545e-06, 9.8632311059685e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 38, Batch: 15, Loss: 0.1305
Learning rates: [9.875664641789545e-06, 9.8632311059685e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 38, Batch: 20, Loss: 0.1565
Learning rates: [9.875664641789545e-06, 9.8632311059685e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 38, Batch: 25, Loss: 0.1110
Learning rates: [9.875664641789545e-06, 9.8632311059685e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 38, Batch: 30, Loss: 0.1217
Learnin

  self.pid = os.fork()


Epoch: 38
Average Training Loss: 0.1219
Validation Balanced Accuracy: 0.8189


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 39, Batch: 0, Loss: 0.1208
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 39, Batch: 5, Loss: 0.1088
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 39, Batch: 10, Loss: 0.0942
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 39, Batch: 15, Loss: 0.1060
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 39, Batch: 20, Loss: 0.0845
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 39, Batch: 25, Loss: 0.1251
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 39, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 39
Average Training Loss: 0.1151
Validation Balanced Accuracy: 0.8147


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 40, Batch: 0, Loss: 0.0994
Learning rates: [9.65745789630079e-06, 9.623203685930869e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 40, Batch: 5, Loss: 0.1304
Learning rates: [9.65745789630079e-06, 9.623203685930869e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 40, Batch: 10, Loss: 0.2272
Learning rates: [9.65745789630079e-06, 9.623203685930869e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 40, Batch: 15, Loss: 0.1157
Learning rates: [9.65745789630079e-06, 9.623203685930869e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 40, Batch: 20, Loss: 0.1099
Learning rates: [9.65745789630079e-06, 9.623203685930869e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 40, Batch: 25, Loss: 0.1382
Learning rates: [9.65745789630079e-06, 9.623203685930869e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 40, Batch: 30, Loss: 0.0903
L

  self.pid = os.fork()


Epoch: 40
Average Training Loss: 0.1133
Validation Balanced Accuracy: 0.8203


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 41, Batch: 0, Loss: 0.0959
Learning rates: [9.509529358847657e-06, 9.460482294732421e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 41, Batch: 5, Loss: 0.1273
Learning rates: [9.509529358847657e-06, 9.460482294732421e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 41, Batch: 10, Loss: 0.1155
Learning rates: [9.509529358847657e-06, 9.460482294732421e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 41, Batch: 15, Loss: 0.1501
Learning rates: [9.509529358847657e-06, 9.460482294732421e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 41, Batch: 20, Loss: 0.1041
Learning rates: [9.509529358847657e-06, 9.460482294732421e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 41, Batch: 25, Loss: 0.1147
Learning rates: [9.509529358847657e-06, 9.460482294732421e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 41, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 41
Average Training Loss: 0.1130
Validation Balanced Accuracy: 0.8193


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 42, Batch: 0, Loss: 0.1066
Learning rates: [9.336880739593415e-06, 9.270568813552756e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 42, Batch: 5, Loss: 0.1229
Learning rates: [9.336880739593415e-06, 9.270568813552756e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 42, Batch: 10, Loss: 0.1121
Learning rates: [9.336880739593415e-06, 9.270568813552756e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 42, Batch: 15, Loss: 0.0846
Learning rates: [9.336880739593415e-06, 9.270568813552756e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 42, Batch: 20, Loss: 0.1143
Learning rates: [9.336880739593415e-06, 9.270568813552756e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 42, Batch: 25, Loss: 0.0972
Learning rates: [9.336880739593415e-06, 9.270568813552756e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 42, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 42
Average Training Loss: 0.1070
Validation Balanced Accuracy: 0.8128


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 43, Batch: 0, Loss: 0.1136
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 43, Batch: 5, Loss: 0.1113
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 43, Batch: 10, Loss: 0.1062
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 43, Batch: 15, Loss: 0.0894
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 43, Batch: 20, Loss: 0.0841
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 43, Batch: 25, Loss: 0.1057
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 43, Batch: 30, Loss: 0.1101
L

  self.pid = os.fork()


Epoch: 43
Average Training Loss: 0.1055
Validation Balanced Accuracy: 0.8211


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 44, Batch: 0, Loss: 0.0969
Learning rates: [8.92182684520014e-06, 8.814009529720155e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 44, Batch: 5, Loss: 0.1062
Learning rates: [8.92182684520014e-06, 8.814009529720155e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 44, Batch: 10, Loss: 0.1120
Learning rates: [8.92182684520014e-06, 8.814009529720155e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 44, Batch: 15, Loss: 0.1215
Learning rates: [8.92182684520014e-06, 8.814009529720155e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 44, Batch: 20, Loss: 0.0656
Learning rates: [8.92182684520014e-06, 8.814009529720155e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 44, Batch: 25, Loss: 0.0991
Learning rates: [8.92182684520014e-06, 8.814009529720155e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 44, Batch: 30, Loss: 0.0944
L

  self.pid = os.fork()


Epoch: 44
Average Training Loss: 0.1019
Validation Balanced Accuracy: 0.8207


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 45, Batch: 0, Loss: 0.1053
Learning rates: [8.681980515339464e-06, 8.55017856687341e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 45, Batch: 5, Loss: 0.0774
Learning rates: [8.681980515339464e-06, 8.55017856687341e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 45, Batch: 10, Loss: 0.0984
Learning rates: [8.681980515339464e-06, 8.55017856687341e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 45, Batch: 15, Loss: 0.1213
Learning rates: [8.681980515339464e-06, 8.55017856687341e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 45, Batch: 20, Loss: 0.0905
Learning rates: [8.681980515339464e-06, 8.55017856687341e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 45, Batch: 25, Loss: 0.0875
Learning rates: [8.681980515339464e-06, 8.55017856687341e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 45, Batch: 30, Loss: 0.0881
L

  self.pid = os.fork()


Epoch: 45
Average Training Loss: 0.0990
Validation Balanced Accuracy: 0.8219


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 46, Batch: 0, Loss: 0.0827
Learning rates: [8.422516217485828e-06, 8.26476783923441e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 46, Batch: 5, Loss: 0.0896
Learning rates: [8.422516217485828e-06, 8.26476783923441e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 46, Batch: 10, Loss: 0.1121
Learning rates: [8.422516217485828e-06, 8.26476783923441e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 46, Batch: 15, Loss: 0.1021
Learning rates: [8.422516217485828e-06, 8.26476783923441e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 46, Batch: 20, Loss: 0.0906
Learning rates: [8.422516217485828e-06, 8.26476783923441e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 46, Batch: 25, Loss: 0.0956
Learning rates: [8.422516217485828e-06, 8.26476783923441e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 46, Batch: 30, Loss: 0.1128
L

  self.pid = os.fork()


Epoch: 46
Average Training Loss: 0.0977
Validation Balanced Accuracy: 0.8167


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 47, Batch: 0, Loss: 0.1223
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 47, Batch: 5, Loss: 0.0710
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 47, Batch: 10, Loss: 0.0873
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 47, Batch: 15, Loss: 0.0843
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 47, Batch: 20, Loss: 0.0946
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 47, Batch: 25, Loss: 0.0893
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 47, Batch: 30, Loss: 0.0929
L

  self.pid = os.fork()


Epoch: 47
Average Training Loss: 0.0961
Validation Balanced Accuracy: 0.8169


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 48, Batch: 0, Loss: 0.1122
Learning rates: [7.85124354122177e-06, 7.636367895343946e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 48, Batch: 5, Loss: 0.1050
Learning rates: [7.85124354122177e-06, 7.636367895343946e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 48, Batch: 10, Loss: 0.1070
Learning rates: [7.85124354122177e-06, 7.636367895343946e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 48, Batch: 15, Loss: 0.0867
Learning rates: [7.85124354122177e-06, 7.636367895343946e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 48, Batch: 20, Loss: 0.0994
Learning rates: [7.85124354122177e-06, 7.636367895343946e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 48, Batch: 25, Loss: 0.0987
Learning rates: [7.85124354122177e-06, 7.636367895343946e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 48, Batch: 30, Loss: 0.0884
L

  self.pid = os.fork()


Epoch: 48
Average Training Loss: 0.0954
Validation Balanced Accuracy: 0.8203


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 49, Batch: 0, Loss: 0.0909
Learning rates: [7.542957248827961e-06, 7.297252973710757e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 49, Batch: 5, Loss: 0.0750
Learning rates: [7.542957248827961e-06, 7.297252973710757e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 49, Batch: 10, Loss: 0.0846
Learning rates: [7.542957248827961e-06, 7.297252973710757e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 49, Batch: 15, Loss: 0.0825
Learning rates: [7.542957248827961e-06, 7.297252973710757e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 49, Batch: 20, Loss: 0.1151
Learning rates: [7.542957248827961e-06, 7.297252973710757e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 49, Batch: 25, Loss: 0.1219
Learning rates: [7.542957248827961e-06, 7.297252973710757e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 49, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 49
Average Training Loss: 0.0953
Validation Balanced Accuracy: 0.8199


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 50, Batch: 0, Loss: 0.0922
Learning rates: [7.222075445642904e-06, 6.944282990207195e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 50, Batch: 5, Loss: 0.0841
Learning rates: [7.222075445642904e-06, 6.944282990207195e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 50, Batch: 10, Loss: 0.0910
Learning rates: [7.222075445642904e-06, 6.944282990207195e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 50, Batch: 15, Loss: 0.0969
Learning rates: [7.222075445642904e-06, 6.944282990207195e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 50, Batch: 20, Loss: 0.0809
Learning rates: [7.222075445642904e-06, 6.944282990207195e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 50, Batch: 25, Loss: 0.1135
Learning rates: [7.222075445642904e-06, 6.944282990207195e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 50, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 50
Average Training Loss: 0.0967
Validation Balanced Accuracy: 0.8224


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 51, Batch: 0, Loss: 0.1078
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 51, Batch: 5, Loss: 0.0866
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 51, Batch: 10, Loss: 0.0753
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 51, Batch: 15, Loss: 0.0949
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 51, Batch: 20, Loss: 0.0816
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 51, Batch: 25, Loss: 0.0818
Learning rates: [6.890576474687264e-06, 6.57963412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 51, Batch: 30, Loss: 0.1117
L

  self.pid = os.fork()


Epoch: 51
Average Training Loss: 0.0938
Validation Balanced Accuracy: 0.8228


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 52, Batch: 0, Loss: 0.0789
Learning rates: [6.550504137351575e-06, 6.205554551086733e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 52, Batch: 5, Loss: 0.0879
Learning rates: [6.550504137351575e-06, 6.205554551086733e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 52, Batch: 10, Loss: 0.0882
Learning rates: [6.550504137351575e-06, 6.205554551086733e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 52, Batch: 15, Loss: 0.1075
Learning rates: [6.550504137351575e-06, 6.205554551086733e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 52, Batch: 20, Loss: 0.1168
Learning rates: [6.550504137351575e-06, 6.205554551086733e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 52, Batch: 25, Loss: 0.0814
Learning rates: [6.550504137351575e-06, 6.205554551086733e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 52, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 52
Average Training Loss: 0.0900
Validation Balanced Accuracy: 0.8223


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 53, Batch: 0, Loss: 0.0873
Learning rates: [6.2039550926810394e-06, 5.8243506019491436e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 53, Batch: 5, Loss: 0.0954
Learning rates: [6.2039550926810394e-06, 5.8243506019491436e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 53, Batch: 10, Loss: 0.0778
Learning rates: [6.2039550926810394e-06, 5.8243506019491436e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 53, Batch: 15, Loss: 0.0822
Learning rates: [6.2039550926810394e-06, 5.8243506019491436e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 53, Batch: 20, Loss: 0.1100
Learning rates: [6.2039550926810394e-06, 5.8243506019491436e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 53, Batch: 25, Loss: 0.0983
Learning rates: [6.2039550926810394e-06, 5.8243506019491436e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 53, Batch: 

  self.pid = os.fork()


Epoch: 53
Average Training Loss: 0.0883
Validation Balanced Accuracy: 0.8211


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 54, Batch: 0, Loss: 0.0875
Learning rates: [5.853065930775303e-06, 5.438372523852833e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 54, Batch: 5, Loss: 0.1170
Learning rates: [5.853065930775303e-06, 5.438372523852833e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 54, Batch: 10, Loss: 0.0975
Learning rates: [5.853065930775303e-06, 5.438372523852833e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 54, Batch: 15, Loss: 0.0760
Learning rates: [5.853065930775303e-06, 5.438372523852833e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 54, Batch: 20, Loss: 0.0956
Learning rates: [5.853065930775303e-06, 5.438372523852833e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 54, Batch: 25, Loss: 0.0695
Learning rates: [5.853065930775303e-06, 5.438372523852833e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 54, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 54
Average Training Loss: 0.0856
Validation Balanced Accuracy: 0.8205


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 55, Batch: 0, Loss: 0.1091
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 55, Batch: 5, Loss: 0.1010
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 55, Batch: 10, Loss: 0.0837
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 55, Batch: 15, Loss: 0.0950
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 55, Batch: 20, Loss: 0.1095
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 55, Batch: 25, Loss: 0.0900
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 55, Batch: 30, Loss: 0.0687
Learning rates: [5.5e-06, 5.05e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 55, Batch: 35, Loss: 0.0799
Learning rates: [5.5e-

  self.pid = os.fork()


Epoch: 55
Average Training Loss: 0.0855
Validation Balanced Accuracy: 0.8202


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 56, Batch: 0, Loss: 0.0984
Learning rates: [5.146934069224698e-06, 4.661627476147168e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 56, Batch: 5, Loss: 0.0779
Learning rates: [5.146934069224698e-06, 4.661627476147168e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 56, Batch: 10, Loss: 0.0929
Learning rates: [5.146934069224698e-06, 4.661627476147168e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 56, Batch: 15, Loss: 0.0804
Learning rates: [5.146934069224698e-06, 4.661627476147168e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 56, Batch: 20, Loss: 0.0778
Learning rates: [5.146934069224698e-06, 4.661627476147168e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 56, Batch: 25, Loss: 0.0743
Learning rates: [5.146934069224698e-06, 4.661627476147168e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 56, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 56
Average Training Loss: 0.0867
Validation Balanced Accuracy: 0.8214


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 57, Batch: 0, Loss: 0.1168
Learning rates: [4.796044907318962e-06, 4.2756493980508586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 57, Batch: 5, Loss: 0.0936
Learning rates: [4.796044907318962e-06, 4.2756493980508586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 57, Batch: 10, Loss: 0.0916
Learning rates: [4.796044907318962e-06, 4.2756493980508586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 57, Batch: 15, Loss: 0.0876
Learning rates: [4.796044907318962e-06, 4.2756493980508586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 57, Batch: 20, Loss: 0.0822
Learning rates: [4.796044907318962e-06, 4.2756493980508586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 57, Batch: 25, Loss: 0.0796
Learning rates: [4.796044907318962e-06, 4.2756493980508586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 57, Batch: 30, Lo

  self.pid = os.fork()


Epoch: 57
Average Training Loss: 0.0834
Validation Balanced Accuracy: 0.8246


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 58, Batch: 0, Loss: 0.0719
Learning rates: [4.4494958626484265e-06, 3.894445448913269e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 58, Batch: 5, Loss: 0.0765
Learning rates: [4.4494958626484265e-06, 3.894445448913269e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 58, Batch: 10, Loss: 0.0934
Learning rates: [4.4494958626484265e-06, 3.894445448913269e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 58, Batch: 15, Loss: 0.0804
Learning rates: [4.4494958626484265e-06, 3.894445448913269e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 58, Batch: 20, Loss: 0.0658
Learning rates: [4.4494958626484265e-06, 3.894445448913269e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 58, Batch: 25, Loss: 0.1009
Learning rates: [4.4494958626484265e-06, 3.894445448913269e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 58, Batch: 30, Lo

  self.pid = os.fork()


Epoch: 58
Average Training Loss: 0.0831
Validation Balanced Accuracy: 0.8189


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 59, Batch: 0, Loss: 0.0735
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 59, Batch: 5, Loss: 0.0962
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 59, Batch: 10, Loss: 0.0829
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 59, Batch: 15, Loss: 0.0716
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 59, Batch: 20, Loss: 0.0739
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 59, Batch: 25, Loss: 0.0802
Learning rates: [4.109423525312737e-06, 3.5203658778440106e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 59, Batch: 30, Lo

  self.pid = os.fork()


Epoch: 59
Average Training Loss: 0.0815
Validation Balanced Accuracy: 0.8184


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 60, Batch: 0, Loss: 0.1082
Learning rates: [3.777924554357096e-06, 3.1557170097928055e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 60, Batch: 5, Loss: 0.0827
Learning rates: [3.777924554357096e-06, 3.1557170097928055e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 60, Batch: 10, Loss: 0.0657
Learning rates: [3.777924554357096e-06, 3.1557170097928055e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 60, Batch: 15, Loss: 0.1036
Learning rates: [3.777924554357096e-06, 3.1557170097928055e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 60, Batch: 20, Loss: 0.0801
Learning rates: [3.777924554357096e-06, 3.1557170097928055e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 60, Batch: 25, Loss: 0.0793
Learning rates: [3.777924554357096e-06, 3.1557170097928055e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 60, Batch: 30, Lo

  self.pid = os.fork()


Epoch: 60
Average Training Loss: 0.0825
Validation Balanced Accuracy: 0.8211


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 61, Batch: 0, Loss: 0.0779
Learning rates: [3.45704275117204e-06, 2.8027470262892437e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 61, Batch: 5, Loss: 0.0707
Learning rates: [3.45704275117204e-06, 2.8027470262892437e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 61, Batch: 10, Loss: 0.0746
Learning rates: [3.45704275117204e-06, 2.8027470262892437e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 61, Batch: 15, Loss: 0.0904
Learning rates: [3.45704275117204e-06, 2.8027470262892437e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 61, Batch: 20, Loss: 0.0699
Learning rates: [3.45704275117204e-06, 2.8027470262892437e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 61, Batch: 25, Loss: 0.0874
Learning rates: [3.45704275117204e-06, 2.8027470262892437e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 61, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 61
Average Training Loss: 0.0816
Validation Balanced Accuracy: 0.8205


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 62, Batch: 0, Loss: 0.0988
Learning rates: [3.1487564587782306e-06, 2.4636321046560538e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 62, Batch: 5, Loss: 0.0984
Learning rates: [3.1487564587782306e-06, 2.4636321046560538e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 62, Batch: 10, Loss: 0.0756
Learning rates: [3.1487564587782306e-06, 2.4636321046560538e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 62, Batch: 15, Loss: 0.0805
Learning rates: [3.1487564587782306e-06, 2.4636321046560538e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 62, Batch: 20, Loss: 0.0624
Learning rates: [3.1487564587782306e-06, 2.4636321046560538e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 62, Batch: 25, Loss: 0.0952
Learning rates: [3.1487564587782306e-06, 2.4636321046560538e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 62, Batch: 

  self.pid = os.fork()


Epoch: 62
Average Training Loss: 0.0796
Validation Balanced Accuracy: 0.8196


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 63, Batch: 0, Loss: 0.0740
Learning rates: [2.8549663646838717e-06, 2.1404630011522586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 63, Batch: 5, Loss: 0.0690
Learning rates: [2.8549663646838717e-06, 2.1404630011522586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 63, Batch: 10, Loss: 0.0851
Learning rates: [2.8549663646838717e-06, 2.1404630011522586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 63, Batch: 15, Loss: 0.0665
Learning rates: [2.8549663646838717e-06, 2.1404630011522586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 63, Batch: 20, Loss: 0.0809
Learning rates: [2.8549663646838717e-06, 2.1404630011522586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 63, Batch: 25, Loss: 0.0732
Learning rates: [2.8549663646838717e-06, 2.1404630011522586e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 63, Batch: 

  self.pid = os.fork()


Epoch: 63
Average Training Loss: 0.0815
Validation Balanced Accuracy: 0.8182


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 64, Batch: 0, Loss: 0.0710
Learning rates: [2.577483782514174e-06, 1.8352321607655915e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 64, Batch: 5, Loss: 0.1007
Learning rates: [2.577483782514174e-06, 1.8352321607655915e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 64, Batch: 10, Loss: 0.0934
Learning rates: [2.577483782514174e-06, 1.8352321607655915e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 64, Batch: 15, Loss: 0.0727
Learning rates: [2.577483782514174e-06, 1.8352321607655915e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 64, Batch: 20, Loss: 0.0732
Learning rates: [2.577483782514174e-06, 1.8352321607655915e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 64, Batch: 25, Loss: 0.0841
Learning rates: [2.577483782514174e-06, 1.8352321607655915e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 64, Batch: 30, Lo

  self.pid = os.fork()


Epoch: 64
Average Training Loss: 0.0792
Validation Balanced Accuracy: 0.8213


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 65, Batch: 0, Loss: 0.0657
Learning rates: [2.3180194846605362e-06, 1.54982143312659e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 65, Batch: 5, Loss: 0.0738
Learning rates: [2.3180194846605362e-06, 1.54982143312659e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 65, Batch: 10, Loss: 0.0678
Learning rates: [2.3180194846605362e-06, 1.54982143312659e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 65, Batch: 15, Loss: 0.0915
Learning rates: [2.3180194846605362e-06, 1.54982143312659e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 65, Batch: 20, Loss: 0.0895
Learning rates: [2.3180194846605362e-06, 1.54982143312659e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 65, Batch: 25, Loss: 0.0905
Learning rates: [2.3180194846605362e-06, 1.54982143312659e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 65, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 65
Average Training Loss: 0.0785
Validation Balanced Accuracy: 0.8208


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 66, Batch: 0, Loss: 0.0788
Learning rates: [2.0781731547998605e-06, 1.285990470279847e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 66, Batch: 5, Loss: 0.0712
Learning rates: [2.0781731547998605e-06, 1.285990470279847e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 66, Batch: 10, Loss: 0.0845
Learning rates: [2.0781731547998605e-06, 1.285990470279847e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 66, Batch: 15, Loss: 0.0655
Learning rates: [2.0781731547998605e-06, 1.285990470279847e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 66, Batch: 20, Loss: 0.1048
Learning rates: [2.0781731547998605e-06, 1.285990470279847e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 66, Batch: 25, Loss: 0.0932
Learning rates: [2.0781731547998605e-06, 1.285990470279847e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 66, Batch: 30, Lo

  self.pid = os.fork()


Epoch: 66
Average Training Loss: 0.0793
Validation Balanced Accuracy: 0.8216


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 67, Batch: 0, Loss: 0.0685
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 67, Batch: 5, Loss: 0.0775
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 67, Batch: 10, Loss: 0.0877
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 67, Batch: 15, Loss: 0.0825
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 67, Batch: 20, Loss: 0.0913
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 67, Batch: 25, Loss: 0.0836
Learning rates: [1.8594235253127369e-06, 1.0453658778440109e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 67, Batch: 

  self.pid = os.fork()


Epoch: 67
Average Training Loss: 0.0784
Validation Balanced Accuracy: 0.8206


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 68, Batch: 0, Loss: 0.1025
Learning rates: [1.6631192604065851e-06, 8.294311864472437e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 68, Batch: 5, Loss: 0.0788
Learning rates: [1.6631192604065851e-06, 8.294311864472437e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 68, Batch: 10, Loss: 0.0671
Learning rates: [1.6631192604065851e-06, 8.294311864472437e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 68, Batch: 15, Loss: 0.1025
Learning rates: [1.6631192604065851e-06, 8.294311864472437e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 68, Batch: 20, Loss: 0.0825
Learning rates: [1.6631192604065851e-06, 8.294311864472437e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 68, Batch: 25, Loss: 0.0862
Learning rates: [1.6631192604065851e-06, 8.294311864472437e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 68, Batch: 30, Lo

  self.pid = os.fork()


Epoch: 68
Average Training Loss: 0.0801
Validation Balanced Accuracy: 0.8205


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 69, Batch: 0, Loss: 0.0723
Learning rates: [1.4904706411523449e-06, 6.395177052675795e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 69, Batch: 5, Loss: 0.0801
Learning rates: [1.4904706411523449e-06, 6.395177052675795e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 69, Batch: 10, Loss: 0.0830
Learning rates: [1.4904706411523449e-06, 6.395177052675795e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 69, Batch: 15, Loss: 0.0870
Learning rates: [1.4904706411523449e-06, 6.395177052675795e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 69, Batch: 20, Loss: 0.0936
Learning rates: [1.4904706411523449e-06, 6.395177052675795e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 69, Batch: 25, Loss: 0.0924
Learning rates: [1.4904706411523449e-06, 6.395177052675795e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 69, Batch: 30, Lo

  self.pid = os.fork()


Epoch: 69
Average Training Loss: 0.0796
Validation Balanced Accuracy: 0.8210


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 70, Batch: 0, Loss: 0.0691
Learning rates: [1.3425421036992096e-06, 4.767963140691306e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 70, Batch: 5, Loss: 0.0762
Learning rates: [1.3425421036992096e-06, 4.767963140691306e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 70, Batch: 10, Loss: 0.0694
Learning rates: [1.3425421036992096e-06, 4.767963140691306e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 70, Batch: 15, Loss: 0.0703
Learning rates: [1.3425421036992096e-06, 4.767963140691306e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 70, Batch: 20, Loss: 0.0752
Learning rates: [1.3425421036992096e-06, 4.767963140691306e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 70, Batch: 25, Loss: 0.0825
Learning rates: [1.3425421036992096e-06, 4.767963140691306e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 70, Batch: 30, Lo

  self.pid = os.fork()


Epoch: 70
Average Training Loss: 0.0782
Validation Balanced Accuracy: 0.8207


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 71, Batch: 0, Loss: 0.0710
Learning rates: [1.220245676671809e-06, 3.4227024433899e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 71, Batch: 5, Loss: 0.0695
Learning rates: [1.220245676671809e-06, 3.4227024433899e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 71, Batch: 10, Loss: 0.0621
Learning rates: [1.220245676671809e-06, 3.4227024433899e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 71, Batch: 15, Loss: 0.0716
Learning rates: [1.220245676671809e-06, 3.4227024433899e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 71, Batch: 20, Loss: 0.0888
Learning rates: [1.220245676671809e-06, 3.4227024433899e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 71, Batch: 25, Loss: 0.0609
Learning rates: [1.220245676671809e-06, 3.4227024433899e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 71, Batch: 30, Loss: 0.0673
Learnin

  self.pid = os.fork()


Epoch: 71
Average Training Loss: 0.0762
Validation Balanced Accuracy: 0.8212


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 72, Batch: 0, Loss: 0.0677
Learning rates: [1.1243353582104554e-06, 2.36768894031501e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 72, Batch: 5, Loss: 0.0724
Learning rates: [1.1243353582104554e-06, 2.36768894031501e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 72, Batch: 10, Loss: 0.0986
Learning rates: [1.1243353582104554e-06, 2.36768894031501e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 72, Batch: 15, Loss: 0.1294
Learning rates: [1.1243353582104554e-06, 2.36768894031501e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 72, Batch: 20, Loss: 0.0760
Learning rates: [1.1243353582104554e-06, 2.36768894031501e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 72, Batch: 25, Loss: 0.0820
Learning rates: [1.1243353582104554e-06, 2.36768894031501e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 72, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 72
Average Training Loss: 0.0788
Validation Balanced Accuracy: 0.8205


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 73, Batch: 0, Loss: 0.0927
Learning rates: [1.0554024673218804e-06, 1.6094271405406859e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 73, Batch: 5, Loss: 0.0617
Learning rates: [1.0554024673218804e-06, 1.6094271405406859e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 73, Batch: 10, Loss: 0.0663
Learning rates: [1.0554024673218804e-06, 1.6094271405406859e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 73, Batch: 15, Loss: 0.0633
Learning rates: [1.0554024673218804e-06, 1.6094271405406859e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 73, Batch: 20, Loss: 0.0829
Learning rates: [1.0554024673218804e-06, 1.6094271405406859e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 73, Batch: 25, Loss: 0.0924
Learning rates: [1.0554024673218804e-06, 1.6094271405406859e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 73, Batch: 

  self.pid = os.fork()


Epoch: 73
Average Training Loss: 0.0769
Validation Balanced Accuracy: 0.8205


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 74, Batch: 0, Loss: 0.0923
Learning rates: [1.013871998200924e-06, 1.1525919802101657e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 74, Batch: 5, Loss: 0.0740
Learning rates: [1.013871998200924e-06, 1.1525919802101657e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 74, Batch: 10, Loss: 0.0958
Learning rates: [1.013871998200924e-06, 1.1525919802101657e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 74, Batch: 15, Loss: 0.0652
Learning rates: [1.013871998200924e-06, 1.1525919802101657e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 74, Batch: 20, Loss: 0.0888
Learning rates: [1.013871998200924e-06, 1.1525919802101657e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 74, Batch: 25, Loss: 0.0720
Learning rates: [1.013871998200924e-06, 1.1525919802101657e-06]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 74, Batch: 30, Lo

  self.pid = os.fork()


Epoch: 74
Average Training Loss: 0.0765
Validation Balanced Accuracy: 0.8197


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 75, Batch: 0, Loss: 0.0783
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 75, Batch: 5, Loss: 0.0814
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 75, Batch: 10, Loss: 0.0811
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 75, Batch: 15, Loss: 0.0907
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 75, Batch: 20, Loss: 0.0649
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 75, Batch: 25, Loss: 0.0974
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 75, Batch: 30, Loss: 0.0587
Learning rates: [1e-05, 0.0001]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 75, Batch: 35, Loss: 0.0850
Learning rates: [1e-05, 0.0001]
GPU 0 memory alloc

  self.pid = os.fork()


Epoch: 75
Average Training Loss: 0.0850
Validation Balanced Accuracy: 0.8160


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 76, Batch: 0, Loss: 0.0861
Learning rates: [9.996530663083255e-06, 9.996183729391579e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 76, Batch: 5, Loss: 0.0814
Learning rates: [9.996530663083255e-06, 9.996183729391579e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 76, Batch: 10, Loss: 0.0759
Learning rates: [9.996530663083255e-06, 9.996183729391579e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 76, Batch: 15, Loss: 0.0738
Learning rates: [9.996530663083255e-06, 9.996183729391579e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 76, Batch: 20, Loss: 0.0755
Learning rates: [9.996530663083255e-06, 9.996183729391579e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 76, Batch: 25, Loss: 0.1202
Learning rates: [9.996530663083255e-06, 9.996183729391579e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 76, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 76
Average Training Loss: 0.0871
Validation Balanced Accuracy: 0.8199


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 77, Batch: 0, Loss: 0.0938
Learning rates: [9.986128001799077e-06, 9.984740801978984e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 77, Batch: 5, Loss: 0.0924
Learning rates: [9.986128001799077e-06, 9.984740801978984e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 77, Batch: 10, Loss: 0.0832
Learning rates: [9.986128001799077e-06, 9.984740801978984e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 77, Batch: 15, Loss: 0.1142
Learning rates: [9.986128001799077e-06, 9.984740801978984e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 77, Batch: 20, Loss: 0.1081
Learning rates: [9.986128001799077e-06, 9.984740801978984e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 77, Batch: 25, Loss: 0.0838
Learning rates: [9.986128001799077e-06, 9.984740801978984e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 77, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 77
Average Training Loss: 0.0854
Validation Balanced Accuracy: 0.8231


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 78, Batch: 0, Loss: 0.0904
Learning rates: [9.96880805629717e-06, 9.965688861926886e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 78, Batch: 5, Loss: 0.0803
Learning rates: [9.96880805629717e-06, 9.965688861926886e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 78, Batch: 10, Loss: 0.0805
Learning rates: [9.96880805629717e-06, 9.965688861926886e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 78, Batch: 15, Loss: 0.0827
Learning rates: [9.96880805629717e-06, 9.965688861926886e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 78, Batch: 20, Loss: 0.0879
Learning rates: [9.96880805629717e-06, 9.965688861926886e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 78, Batch: 25, Loss: 0.0628
Learning rates: [9.96880805629717e-06, 9.965688861926886e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 78, Batch: 30, Loss: 0.0828
L

  self.pid = os.fork()


Epoch: 78
Average Training Loss: 0.0828
Validation Balanced Accuracy: 0.8230


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 79, Batch: 0, Loss: 0.0727
Learning rates: [9.94459753267812e-06, 9.939057285945933e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 79, Batch: 5, Loss: 0.0677
Learning rates: [9.94459753267812e-06, 9.939057285945933e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 79, Batch: 10, Loss: 0.0787
Learning rates: [9.94459753267812e-06, 9.939057285945933e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 79, Batch: 15, Loss: 0.0779
Learning rates: [9.94459753267812e-06, 9.939057285945933e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 79, Batch: 20, Loss: 0.0786
Learning rates: [9.94459753267812e-06, 9.939057285945933e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 79, Batch: 25, Loss: 0.0758
Learning rates: [9.94459753267812e-06, 9.939057285945933e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 79, Batch: 30, Loss: 0.0624
L

  self.pid = os.fork()


Epoch: 79
Average Training Loss: 0.0806
Validation Balanced Accuracy: 0.8208


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 80, Batch: 0, Loss: 0.0651
Learning rates: [9.913533761814537e-06, 9.90488713799599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 80, Batch: 5, Loss: 0.0830
Learning rates: [9.913533761814537e-06, 9.90488713799599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 80, Batch: 10, Loss: 0.0728
Learning rates: [9.913533761814537e-06, 9.90488713799599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 80, Batch: 15, Loss: 0.0809
Learning rates: [9.913533761814537e-06, 9.90488713799599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 80, Batch: 20, Loss: 0.0774
Learning rates: [9.913533761814537e-06, 9.90488713799599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 80, Batch: 25, Loss: 0.0872
Learning rates: [9.913533761814537e-06, 9.90488713799599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 80, Batch: 30, Loss: 0.0896
L

  self.pid = os.fork()


Epoch: 80
Average Training Loss: 0.0812
Validation Balanced Accuracy: 0.8218


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 81, Batch: 0, Loss: 0.0621
Learning rates: [9.875664641789545e-06, 9.8632311059685e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 81, Batch: 5, Loss: 0.0730
Learning rates: [9.875664641789545e-06, 9.8632311059685e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 81, Batch: 10, Loss: 0.0857
Learning rates: [9.875664641789545e-06, 9.8632311059685e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 81, Batch: 15, Loss: 0.0700
Learning rates: [9.875664641789545e-06, 9.8632311059685e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 81, Batch: 20, Loss: 0.0799
Learning rates: [9.875664641789545e-06, 9.8632311059685e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 81, Batch: 25, Loss: 0.0766
Learning rates: [9.875664641789545e-06, 9.8632311059685e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 81, Batch: 30, Loss: 0.0636
Learnin

  self.pid = os.fork()


Epoch: 81
Average Training Loss: 0.0784
Validation Balanced Accuracy: 0.8254


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 82, Batch: 0, Loss: 0.0819
Learning rates: [9.831048564041412e-06, 9.814153420445554e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 82, Batch: 5, Loss: 0.0720
Learning rates: [9.831048564041412e-06, 9.814153420445554e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 82, Batch: 10, Loss: 0.0726
Learning rates: [9.831048564041412e-06, 9.814153420445554e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 82, Batch: 15, Loss: 0.0787
Learning rates: [9.831048564041412e-06, 9.814153420445554e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 82, Batch: 20, Loss: 0.0705
Learning rates: [9.831048564041412e-06, 9.814153420445554e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 82, Batch: 25, Loss: 0.0854
Learning rates: [9.831048564041412e-06, 9.814153420445554e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 82, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 82
Average Training Loss: 0.0766
Validation Balanced Accuracy: 0.8200


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 83, Batch: 0, Loss: 0.0583
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 83, Batch: 5, Loss: 0.0885
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 83, Batch: 10, Loss: 0.0816
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 83, Batch: 15, Loss: 0.0693
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 83, Batch: 20, Loss: 0.0764
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 83, Batch: 25, Loss: 0.0667
Learning rates: [9.779754323328192e-06, 9.757729755661011e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 83, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 83
Average Training Loss: 0.0740
Validation Balanced Accuracy: 0.8154


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 84, Batch: 0, Loss: 0.0666
Learning rates: [9.72186101165118e-06, 9.694047112816297e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 84, Batch: 5, Loss: 0.0747
Learning rates: [9.72186101165118e-06, 9.694047112816297e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 84, Batch: 10, Loss: 0.0576
Learning rates: [9.72186101165118e-06, 9.694047112816297e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 84, Batch: 15, Loss: 0.0699
Learning rates: [9.72186101165118e-06, 9.694047112816297e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 84, Batch: 20, Loss: 0.0794
Learning rates: [9.72186101165118e-06, 9.694047112816297e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 84, Batch: 25, Loss: 0.0878
Learning rates: [9.72186101165118e-06, 9.694047112816297e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 84, Batch: 30, Loss: 0.0727
L

  self.pid = os.fork()


Epoch: 84
Average Training Loss: 0.0744
Validation Balanced Accuracy: 0.8203


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 85, Batch: 0, Loss: 0.0684
Learning rates: [9.65745789630079e-06, 9.623203685930869e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 85, Batch: 5, Loss: 0.0761
Learning rates: [9.65745789630079e-06, 9.623203685930869e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 85, Batch: 10, Loss: 0.0718
Learning rates: [9.65745789630079e-06, 9.623203685930869e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 85, Batch: 15, Loss: 0.0667
Learning rates: [9.65745789630079e-06, 9.623203685930869e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 85, Batch: 20, Loss: 0.0740
Learning rates: [9.65745789630079e-06, 9.623203685930869e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 85, Batch: 25, Loss: 0.0631
Learning rates: [9.65745789630079e-06, 9.623203685930869e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 85, Batch: 30, Loss: 0.0613
L

  self.pid = os.fork()


Epoch: 85
Average Training Loss: 0.0727
Validation Balanced Accuracy: 0.8202


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 86, Batch: 0, Loss: 0.0663
Learning rates: [9.586644282212866e-06, 9.545308710434153e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 86, Batch: 5, Loss: 0.0827
Learning rates: [9.586644282212866e-06, 9.545308710434153e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 86, Batch: 10, Loss: 0.0732
Learning rates: [9.586644282212866e-06, 9.545308710434153e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 86, Batch: 15, Loss: 0.0820
Learning rates: [9.586644282212866e-06, 9.545308710434153e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 86, Batch: 20, Loss: 0.0747
Learning rates: [9.586644282212866e-06, 9.545308710434153e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 86, Batch: 25, Loss: 0.0926
Learning rates: [9.586644282212866e-06, 9.545308710434153e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 86, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 86
Average Training Loss: 0.0745
Validation Balanced Accuracy: 0.8214


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 87, Batch: 0, Loss: 0.0818
Learning rates: [9.509529358847657e-06, 9.460482294732421e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 87, Batch: 5, Loss: 0.0654
Learning rates: [9.509529358847657e-06, 9.460482294732421e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 87, Batch: 10, Loss: 0.0613
Learning rates: [9.509529358847657e-06, 9.460482294732421e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 87, Batch: 15, Loss: 0.0665
Learning rates: [9.509529358847657e-06, 9.460482294732421e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 87, Batch: 20, Loss: 0.0630
Learning rates: [9.509529358847657e-06, 9.460482294732421e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 87, Batch: 25, Loss: 0.0731
Learning rates: [9.509529358847657e-06, 9.460482294732421e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 87, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 87
Average Training Loss: 0.0718
Validation Balanced Accuracy: 0.8258


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 88, Batch: 0, Loss: 0.0675
Learning rates: [9.426232031827589e-06, 9.368855235010347e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 88, Batch: 5, Loss: 0.0705
Learning rates: [9.426232031827589e-06, 9.368855235010347e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 88, Batch: 10, Loss: 0.0618
Learning rates: [9.426232031827589e-06, 9.368855235010347e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 88, Batch: 15, Loss: 0.0796
Learning rates: [9.426232031827589e-06, 9.368855235010347e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 88, Batch: 20, Loss: 0.0699
Learning rates: [9.426232031827589e-06, 9.368855235010347e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 88, Batch: 25, Loss: 0.0532
Learning rates: [9.426232031827589e-06, 9.368855235010347e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 88, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 88
Average Training Loss: 0.0697
Validation Balanced Accuracy: 0.8181


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 89, Batch: 0, Loss: 0.1362
Learning rates: [9.336880739593415e-06, 9.270568813552756e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 89, Batch: 5, Loss: 0.0691
Learning rates: [9.336880739593415e-06, 9.270568813552756e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 89, Batch: 10, Loss: 0.0674
Learning rates: [9.336880739593415e-06, 9.270568813552756e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 89, Batch: 15, Loss: 0.0629
Learning rates: [9.336880739593415e-06, 9.270568813552756e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 89, Batch: 20, Loss: 0.0886
Learning rates: [9.336880739593415e-06, 9.270568813552756e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 89, Batch: 25, Loss: 0.0820
Learning rates: [9.336880739593415e-06, 9.270568813552756e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 89, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 89
Average Training Loss: 0.0733
Validation Balanced Accuracy: 0.8205


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 90, Batch: 0, Loss: 0.0730
Learning rates: [9.241613255361455e-06, 9.1657745808976e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 90, Batch: 5, Loss: 0.0628
Learning rates: [9.241613255361455e-06, 9.1657745808976e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 90, Batch: 10, Loss: 0.0706
Learning rates: [9.241613255361455e-06, 9.1657745808976e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 90, Batch: 15, Loss: 0.0654
Learning rates: [9.241613255361455e-06, 9.1657745808976e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 90, Batch: 20, Loss: 0.0687
Learning rates: [9.241613255361455e-06, 9.1657745808976e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 90, Batch: 25, Loss: 0.0854
Learning rates: [9.241613255361455e-06, 9.1657745808976e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 90, Batch: 30, Loss: 0.0672
Learnin

  self.pid = os.fork()


Epoch: 90
Average Training Loss: 0.0701
Validation Balanced Accuracy: 0.8228


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 91, Batch: 0, Loss: 0.0659
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 91, Batch: 5, Loss: 0.0662
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 91, Batch: 10, Loss: 0.0595
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 91, Batch: 15, Loss: 0.0526
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 91, Batch: 20, Loss: 0.0688
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 91, Batch: 25, Loss: 0.0909
Learning rates: [9.140576474687265e-06, 9.05463412215599e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 91, Batch: 30, Loss: 0.0745
L

  self.pid = os.fork()


Epoch: 91
Average Training Loss: 0.0703
Validation Balanced Accuracy: 0.8213


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 92, Batch: 0, Loss: 0.0535
Learning rates: [9.033926188963353e-06, 8.937318807859687e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 92, Batch: 5, Loss: 0.0786
Learning rates: [9.033926188963353e-06, 8.937318807859687e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 92, Batch: 10, Loss: 0.0782
Learning rates: [9.033926188963353e-06, 8.937318807859687e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 92, Batch: 15, Loss: 0.0571
Learning rates: [9.033926188963353e-06, 8.937318807859687e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 92, Batch: 20, Loss: 0.0766
Learning rates: [9.033926188963353e-06, 8.937318807859687e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 92, Batch: 25, Loss: 0.0651
Learning rates: [9.033926188963353e-06, 8.937318807859687e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 92, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 92
Average Training Loss: 0.0699
Validation Balanced Accuracy: 0.8208


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 93, Batch: 0, Loss: 0.0705
Learning rates: [8.92182684520014e-06, 8.814009529720155e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 93, Batch: 5, Loss: 0.0662
Learning rates: [8.92182684520014e-06, 8.814009529720155e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 93, Batch: 10, Loss: 0.0814
Learning rates: [8.92182684520014e-06, 8.814009529720155e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 93, Batch: 15, Loss: 0.0542
Learning rates: [8.92182684520014e-06, 8.814009529720155e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 93, Batch: 20, Loss: 0.0734
Learning rates: [8.92182684520014e-06, 8.814009529720155e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 93, Batch: 25, Loss: 0.0782
Learning rates: [8.92182684520014e-06, 8.814009529720155e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 93, Batch: 30, Loss: 0.0705
L

  self.pid = os.fork()


Epoch: 93
Average Training Loss: 0.0685
Validation Balanced Accuracy: 0.8194


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 94, Batch: 0, Loss: 0.0585
Learning rates: [8.804451292460586e-06, 8.684896421706644e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 94, Batch: 5, Loss: 0.0671
Learning rates: [8.804451292460586e-06, 8.684896421706644e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 94, Batch: 10, Loss: 0.0616
Learning rates: [8.804451292460586e-06, 8.684896421706644e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 94, Batch: 15, Loss: 0.0716
Learning rates: [8.804451292460586e-06, 8.684896421706644e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 94, Batch: 20, Loss: 0.0605
Learning rates: [8.804451292460586e-06, 8.684896421706644e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 94, Batch: 25, Loss: 0.0576
Learning rates: [8.804451292460586e-06, 8.684896421706644e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 94, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 94
Average Training Loss: 0.0651
Validation Balanced Accuracy: 0.8203


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 95, Batch: 0, Loss: 0.0755
Learning rates: [8.681980515339464e-06, 8.55017856687341e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 95, Batch: 5, Loss: 0.0697
Learning rates: [8.681980515339464e-06, 8.55017856687341e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 95, Batch: 10, Loss: 0.0852
Learning rates: [8.681980515339464e-06, 8.55017856687341e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 95, Batch: 15, Loss: 0.0576
Learning rates: [8.681980515339464e-06, 8.55017856687341e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 95, Batch: 20, Loss: 0.0529
Learning rates: [8.681980515339464e-06, 8.55017856687341e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 95, Batch: 25, Loss: 0.0607
Learning rates: [8.681980515339464e-06, 8.55017856687341e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 95, Batch: 30, Loss: 0.0642
L

  self.pid = os.fork()


Epoch: 95
Average Training Loss: 0.0677
Validation Balanced Accuracy: 0.8200


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 96, Batch: 0, Loss: 0.0721
Learning rates: [8.554603354898239e-06, 8.410063690388063e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 96, Batch: 5, Loss: 0.0619
Learning rates: [8.554603354898239e-06, 8.410063690388063e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 96, Batch: 10, Loss: 0.0724
Learning rates: [8.554603354898239e-06, 8.410063690388063e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 96, Batch: 15, Loss: 0.0634
Learning rates: [8.554603354898239e-06, 8.410063690388063e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 96, Batch: 20, Loss: 0.0626
Learning rates: [8.554603354898239e-06, 8.410063690388063e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 96, Batch: 25, Loss: 0.0669
Learning rates: [8.554603354898239e-06, 8.410063690388063e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 96, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 96
Average Training Loss: 0.0671
Validation Balanced Accuracy: 0.8247


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 97, Batch: 0, Loss: 0.0648
Learning rates: [8.422516217485828e-06, 8.26476783923441e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 97, Batch: 5, Loss: 0.0637
Learning rates: [8.422516217485828e-06, 8.26476783923441e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 97, Batch: 10, Loss: 0.0669
Learning rates: [8.422516217485828e-06, 8.26476783923441e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 97, Batch: 15, Loss: 0.0718
Learning rates: [8.422516217485828e-06, 8.26476783923441e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 97, Batch: 20, Loss: 0.0680
Learning rates: [8.422516217485828e-06, 8.26476783923441e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 97, Batch: 25, Loss: 0.0676
Learning rates: [8.422516217485828e-06, 8.26476783923441e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 97, Batch: 30, Loss: 0.0852
L

  self.pid = os.fork()


Epoch: 97
Average Training Loss: 0.0665
Validation Balanced Accuracy: 0.8229


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 98, Batch: 0, Loss: 0.0579
Learning rates: [8.285922771894254e-06, 8.114515049083679e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 98, Batch: 5, Loss: 0.0661
Learning rates: [8.285922771894254e-06, 8.114515049083679e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 98, Batch: 10, Loss: 0.0593
Learning rates: [8.285922771894254e-06, 8.114515049083679e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 98, Batch: 15, Loss: 0.0606
Learning rates: [8.285922771894254e-06, 8.114515049083679e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 98, Batch: 20, Loss: 0.0714
Learning rates: [8.285922771894254e-06, 8.114515049083679e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 98, Batch: 25, Loss: 0.0638
Learning rates: [8.285922771894254e-06, 8.114515049083679e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 98, Batch: 30, Loss: 0.

  self.pid = os.fork()


Epoch: 98
Average Training Loss: 0.0662
Validation Balanced Accuracy: 0.8185


  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch: 99, Batch: 0, Loss: 0.0579
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 99, Batch: 5, Loss: 0.0550
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 99, Batch: 10, Loss: 0.0691
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 99, Batch: 15, Loss: 0.0660
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 99, Batch: 20, Loss: 0.0517
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 99, Batch: 25, Loss: 0.0751
Learning rates: [8.14503363531613e-06, 7.959536998847742e-05]
GPU 0 memory allocated: 0.85 GB
GPU 1 memory allocated: 0.02 GB
Epoch: 99, Batch: 30, Loss: 0.0868
L

  self.pid = os.fork()


Epoch: 99
Average Training Loss: 0.0656
Validation Balanced Accuracy: 0.8247


In [7]:
import torch
from transformers import SegformerForSemanticSegmentation, SegformerImageProcessor
import numpy as np
from torchvision.transforms import Resize
from torchvision.transforms.functional import InterpolationMode

def evaluate_model_on_test_set(model, X_test_normalized, y_test, processor):
    resize_transform = Resize(
        size=(48, 48),
        interpolation=InterpolationMode.BILINEAR,
        antialias=True
    )
    
    model.eval()
    predictions = torch.zeros((len(X_test_normalized), 48, 48)).cuda()
    y_test_tensor = torch.from_numpy(y_test.reshape(-1, 48, 48)).cuda()
    
    with torch.no_grad():
        for i, image in enumerate(X_test_normalized):
            image_rgb = np.repeat(image.reshape(48, 48, 1), 3, axis=-1)
            inputs = processor(
                images=image_rgb,
                return_tensors="pt",
                do_rescale=False
            )
            pixel_values = inputs['pixel_values'].cuda()
            outputs = model(pixel_values=pixel_values)
            logits = outputs.logits
            probs = torch.softmax(logits, dim=1)
            pred = probs[0, 1]
            
            if pred.shape != (48, 48):
                pred = resize_transform(pred.unsqueeze(0)).squeeze()
            predictions[i] = pred
    
    predictions = (predictions > 0.5).float()
    y_true_flat = y_test_tensor.flatten()
    y_pred_flat = predictions.flatten()
    
    # Calculate metrics
    tp = torch.sum((y_true_flat == 1) & (y_pred_flat == 1)).float()
    tn = torch.sum((y_true_flat == 0) & (y_pred_flat == 0)).float()
    fp = torch.sum((y_true_flat == 0) & (y_pred_flat == 1)).float()
    fn = torch.sum((y_true_flat == 1) & (y_pred_flat == 0)).float()
    
    sensitivity = tp / (tp + fn + 1e-7)
    specificity = tn / (tn + fp + 1e-7)
    balanced_acc = (sensitivity + specificity) / 2
    
    return {
        'balanced_accuracy': balanced_acc.item(),
        'sensitivity': sensitivity.item(),
        'specificity': specificity.item(),
        'true_positives': tp.item(),
        'true_negatives': tn.item(),
        'false_positives': fp.item(),
        'false_negatives': fn.item()
    }

# Initialize model and processor
processor = SegformerImageProcessor.from_pretrained("nvidia/mit-b3")
model = SegformerForSemanticSegmentation.from_pretrained(
    "nvidia/mit-b3",
    num_labels=2,
    ignore_mismatched_sizes=True
).cuda()

# Load the best checkpoint
checkpoint = torch.load('best_model.pt')
model.load_state_dict(checkpoint['model_state_dict'])

# Evaluate the model
metrics = evaluate_model_on_test_set(model, X_test_normalized, y_test, processor)

# Print results
print("\nTest Set Metrics:")
print(f"Balanced Accuracy: {metrics['balanced_accuracy']:.4f}")
print(f"Sensitivity: {metrics['sensitivity']:.4f}")
print(f"Specificity: {metrics['specificity']:.4f}")
print("\nDetailed Counts:")
print(f"True Positives: {metrics['true_positives']}")
print(f"True Negatives: {metrics['true_negatives']}")
print(f"False Positives: {metrics['false_positives']}")
print(f"False Negatives: {metrics['false_negatives']}")

  return func(*args, **kwargs)
Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/mit-b3 and are newly initialized: ['decode_head.batch_norm.bias', 'decode_head.batch_norm.num_batches_tracked', 'decode_head.batch_norm.running_mean', 'decode_head.batch_norm.running_var', 'decode_head.batch_norm.weight', 'decode_head.classifier.bias', 'decode_head.classifier.weight', 'decode_head.linear_c.0.proj.bias', 'decode_head.linear_c.0.proj.weight', 'decode_head.linear_c.1.proj.bias', 'decode_head.linear_c.1.proj.weight', 'decode_head.linear_c.2.proj.bias', 'decode_head.linear_c.2.proj.weight', 'decode_head.linear_c.3.proj.bias', 'decode_head.linear_c.3.proj.weight', 'decode_head.linear_fuse.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  checkpoint = torch.load('best_model.pt')


RuntimeError: Error(s) in loading state_dict for SegformerForSemanticSegmentation:
	Missing key(s) in state_dict: "segformer.encoder.patch_embeddings.0.proj.weight", "segformer.encoder.patch_embeddings.0.proj.bias", "segformer.encoder.patch_embeddings.0.layer_norm.weight", "segformer.encoder.patch_embeddings.0.layer_norm.bias", "segformer.encoder.patch_embeddings.1.proj.weight", "segformer.encoder.patch_embeddings.1.proj.bias", "segformer.encoder.patch_embeddings.1.layer_norm.weight", "segformer.encoder.patch_embeddings.1.layer_norm.bias", "segformer.encoder.patch_embeddings.2.proj.weight", "segformer.encoder.patch_embeddings.2.proj.bias", "segformer.encoder.patch_embeddings.2.layer_norm.weight", "segformer.encoder.patch_embeddings.2.layer_norm.bias", "segformer.encoder.patch_embeddings.3.proj.weight", "segformer.encoder.patch_embeddings.3.proj.bias", "segformer.encoder.patch_embeddings.3.layer_norm.weight", "segformer.encoder.patch_embeddings.3.layer_norm.bias", "segformer.encoder.block.0.0.layer_norm_1.weight", "segformer.encoder.block.0.0.layer_norm_1.bias", "segformer.encoder.block.0.0.attention.self.query.weight", "segformer.encoder.block.0.0.attention.self.query.bias", "segformer.encoder.block.0.0.attention.self.key.weight", "segformer.encoder.block.0.0.attention.self.key.bias", "segformer.encoder.block.0.0.attention.self.value.weight", "segformer.encoder.block.0.0.attention.self.value.bias", "segformer.encoder.block.0.0.attention.self.sr.weight", "segformer.encoder.block.0.0.attention.self.sr.bias", "segformer.encoder.block.0.0.attention.self.layer_norm.weight", "segformer.encoder.block.0.0.attention.self.layer_norm.bias", "segformer.encoder.block.0.0.attention.output.dense.weight", "segformer.encoder.block.0.0.attention.output.dense.bias", "segformer.encoder.block.0.0.layer_norm_2.weight", "segformer.encoder.block.0.0.layer_norm_2.bias", "segformer.encoder.block.0.0.mlp.dense1.weight", "segformer.encoder.block.0.0.mlp.dense1.bias", "segformer.encoder.block.0.0.mlp.dwconv.dwconv.weight", "segformer.encoder.block.0.0.mlp.dwconv.dwconv.bias", "segformer.encoder.block.0.0.mlp.dense2.weight", "segformer.encoder.block.0.0.mlp.dense2.bias", "segformer.encoder.block.0.1.layer_norm_1.weight", "segformer.encoder.block.0.1.layer_norm_1.bias", "segformer.encoder.block.0.1.attention.self.query.weight", "segformer.encoder.block.0.1.attention.self.query.bias", "segformer.encoder.block.0.1.attention.self.key.weight", "segformer.encoder.block.0.1.attention.self.key.bias", "segformer.encoder.block.0.1.attention.self.value.weight", "segformer.encoder.block.0.1.attention.self.value.bias", "segformer.encoder.block.0.1.attention.self.sr.weight", "segformer.encoder.block.0.1.attention.self.sr.bias", "segformer.encoder.block.0.1.attention.self.layer_norm.weight", "segformer.encoder.block.0.1.attention.self.layer_norm.bias", "segformer.encoder.block.0.1.attention.output.dense.weight", "segformer.encoder.block.0.1.attention.output.dense.bias", "segformer.encoder.block.0.1.layer_norm_2.weight", "segformer.encoder.block.0.1.layer_norm_2.bias", "segformer.encoder.block.0.1.mlp.dense1.weight", "segformer.encoder.block.0.1.mlp.dense1.bias", "segformer.encoder.block.0.1.mlp.dwconv.dwconv.weight", "segformer.encoder.block.0.1.mlp.dwconv.dwconv.bias", "segformer.encoder.block.0.1.mlp.dense2.weight", "segformer.encoder.block.0.1.mlp.dense2.bias", "segformer.encoder.block.0.2.layer_norm_1.weight", "segformer.encoder.block.0.2.layer_norm_1.bias", "segformer.encoder.block.0.2.attention.self.query.weight", "segformer.encoder.block.0.2.attention.self.query.bias", "segformer.encoder.block.0.2.attention.self.key.weight", "segformer.encoder.block.0.2.attention.self.key.bias", "segformer.encoder.block.0.2.attention.self.value.weight", "segformer.encoder.block.0.2.attention.self.value.bias", "segformer.encoder.block.0.2.attention.self.sr.weight", "segformer.encoder.block.0.2.attention.self.sr.bias", "segformer.encoder.block.0.2.attention.self.layer_norm.weight", "segformer.encoder.block.0.2.attention.self.layer_norm.bias", "segformer.encoder.block.0.2.attention.output.dense.weight", "segformer.encoder.block.0.2.attention.output.dense.bias", "segformer.encoder.block.0.2.layer_norm_2.weight", "segformer.encoder.block.0.2.layer_norm_2.bias", "segformer.encoder.block.0.2.mlp.dense1.weight", "segformer.encoder.block.0.2.mlp.dense1.bias", "segformer.encoder.block.0.2.mlp.dwconv.dwconv.weight", "segformer.encoder.block.0.2.mlp.dwconv.dwconv.bias", "segformer.encoder.block.0.2.mlp.dense2.weight", "segformer.encoder.block.0.2.mlp.dense2.bias", "segformer.encoder.block.1.0.layer_norm_1.weight", "segformer.encoder.block.1.0.layer_norm_1.bias", "segformer.encoder.block.1.0.attention.self.query.weight", "segformer.encoder.block.1.0.attention.self.query.bias", "segformer.encoder.block.1.0.attention.self.key.weight", "segformer.encoder.block.1.0.attention.self.key.bias", "segformer.encoder.block.1.0.attention.self.value.weight", "segformer.encoder.block.1.0.attention.self.value.bias", "segformer.encoder.block.1.0.attention.self.sr.weight", "segformer.encoder.block.1.0.attention.self.sr.bias", "segformer.encoder.block.1.0.attention.self.layer_norm.weight", "segformer.encoder.block.1.0.attention.self.layer_norm.bias", "segformer.encoder.block.1.0.attention.output.dense.weight", "segformer.encoder.block.1.0.attention.output.dense.bias", "segformer.encoder.block.1.0.layer_norm_2.weight", "segformer.encoder.block.1.0.layer_norm_2.bias", "segformer.encoder.block.1.0.mlp.dense1.weight", "segformer.encoder.block.1.0.mlp.dense1.bias", "segformer.encoder.block.1.0.mlp.dwconv.dwconv.weight", "segformer.encoder.block.1.0.mlp.dwconv.dwconv.bias", "segformer.encoder.block.1.0.mlp.dense2.weight", "segformer.encoder.block.1.0.mlp.dense2.bias", "segformer.encoder.block.1.1.layer_norm_1.weight", "segformer.encoder.block.1.1.layer_norm_1.bias", "segformer.encoder.block.1.1.attention.self.query.weight", "segformer.encoder.block.1.1.attention.self.query.bias", "segformer.encoder.block.1.1.attention.self.key.weight", "segformer.encoder.block.1.1.attention.self.key.bias", "segformer.encoder.block.1.1.attention.self.value.weight", "segformer.encoder.block.1.1.attention.self.value.bias", "segformer.encoder.block.1.1.attention.self.sr.weight", "segformer.encoder.block.1.1.attention.self.sr.bias", "segformer.encoder.block.1.1.attention.self.layer_norm.weight", "segformer.encoder.block.1.1.attention.self.layer_norm.bias", "segformer.encoder.block.1.1.attention.output.dense.weight", "segformer.encoder.block.1.1.attention.output.dense.bias", "segformer.encoder.block.1.1.layer_norm_2.weight", "segformer.encoder.block.1.1.layer_norm_2.bias", "segformer.encoder.block.1.1.mlp.dense1.weight", "segformer.encoder.block.1.1.mlp.dense1.bias", "segformer.encoder.block.1.1.mlp.dwconv.dwconv.weight", "segformer.encoder.block.1.1.mlp.dwconv.dwconv.bias", "segformer.encoder.block.1.1.mlp.dense2.weight", "segformer.encoder.block.1.1.mlp.dense2.bias", "segformer.encoder.block.1.2.layer_norm_1.weight", "segformer.encoder.block.1.2.layer_norm_1.bias", "segformer.encoder.block.1.2.attention.self.query.weight", "segformer.encoder.block.1.2.attention.self.query.bias", "segformer.encoder.block.1.2.attention.self.key.weight", "segformer.encoder.block.1.2.attention.self.key.bias", "segformer.encoder.block.1.2.attention.self.value.weight", "segformer.encoder.block.1.2.attention.self.value.bias", "segformer.encoder.block.1.2.attention.self.sr.weight", "segformer.encoder.block.1.2.attention.self.sr.bias", "segformer.encoder.block.1.2.attention.self.layer_norm.weight", "segformer.encoder.block.1.2.attention.self.layer_norm.bias", "segformer.encoder.block.1.2.attention.output.dense.weight", "segformer.encoder.block.1.2.attention.output.dense.bias", "segformer.encoder.block.1.2.layer_norm_2.weight", "segformer.encoder.block.1.2.layer_norm_2.bias", "segformer.encoder.block.1.2.mlp.dense1.weight", "segformer.encoder.block.1.2.mlp.dense1.bias", "segformer.encoder.block.1.2.mlp.dwconv.dwconv.weight", "segformer.encoder.block.1.2.mlp.dwconv.dwconv.bias", "segformer.encoder.block.1.2.mlp.dense2.weight", "segformer.encoder.block.1.2.mlp.dense2.bias", "segformer.encoder.block.1.3.layer_norm_1.weight", "segformer.encoder.block.1.3.layer_norm_1.bias", "segformer.encoder.block.1.3.attention.self.query.weight", "segformer.encoder.block.1.3.attention.self.query.bias", "segformer.encoder.block.1.3.attention.self.key.weight", "segformer.encoder.block.1.3.attention.self.key.bias", "segformer.encoder.block.1.3.attention.self.value.weight", "segformer.encoder.block.1.3.attention.self.value.bias", "segformer.encoder.block.1.3.attention.self.sr.weight", "segformer.encoder.block.1.3.attention.self.sr.bias", "segformer.encoder.block.1.3.attention.self.layer_norm.weight", "segformer.encoder.block.1.3.attention.self.layer_norm.bias", "segformer.encoder.block.1.3.attention.output.dense.weight", "segformer.encoder.block.1.3.attention.output.dense.bias", "segformer.encoder.block.1.3.layer_norm_2.weight", "segformer.encoder.block.1.3.layer_norm_2.bias", "segformer.encoder.block.1.3.mlp.dense1.weight", "segformer.encoder.block.1.3.mlp.dense1.bias", "segformer.encoder.block.1.3.mlp.dwconv.dwconv.weight", "segformer.encoder.block.1.3.mlp.dwconv.dwconv.bias", "segformer.encoder.block.1.3.mlp.dense2.weight", "segformer.encoder.block.1.3.mlp.dense2.bias", "segformer.encoder.block.2.0.layer_norm_1.weight", "segformer.encoder.block.2.0.layer_norm_1.bias", "segformer.encoder.block.2.0.attention.self.query.weight", "segformer.encoder.block.2.0.attention.self.query.bias", "segformer.encoder.block.2.0.attention.self.key.weight", "segformer.encoder.block.2.0.attention.self.key.bias", "segformer.encoder.block.2.0.attention.self.value.weight", "segformer.encoder.block.2.0.attention.self.value.bias", "segformer.encoder.block.2.0.attention.self.sr.weight", "segformer.encoder.block.2.0.attention.self.sr.bias", "segformer.encoder.block.2.0.attention.self.layer_norm.weight", "segformer.encoder.block.2.0.attention.self.layer_norm.bias", "segformer.encoder.block.2.0.attention.output.dense.weight", "segformer.encoder.block.2.0.attention.output.dense.bias", "segformer.encoder.block.2.0.layer_norm_2.weight", "segformer.encoder.block.2.0.layer_norm_2.bias", "segformer.encoder.block.2.0.mlp.dense1.weight", "segformer.encoder.block.2.0.mlp.dense1.bias", "segformer.encoder.block.2.0.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.0.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.0.mlp.dense2.weight", "segformer.encoder.block.2.0.mlp.dense2.bias", "segformer.encoder.block.2.1.layer_norm_1.weight", "segformer.encoder.block.2.1.layer_norm_1.bias", "segformer.encoder.block.2.1.attention.self.query.weight", "segformer.encoder.block.2.1.attention.self.query.bias", "segformer.encoder.block.2.1.attention.self.key.weight", "segformer.encoder.block.2.1.attention.self.key.bias", "segformer.encoder.block.2.1.attention.self.value.weight", "segformer.encoder.block.2.1.attention.self.value.bias", "segformer.encoder.block.2.1.attention.self.sr.weight", "segformer.encoder.block.2.1.attention.self.sr.bias", "segformer.encoder.block.2.1.attention.self.layer_norm.weight", "segformer.encoder.block.2.1.attention.self.layer_norm.bias", "segformer.encoder.block.2.1.attention.output.dense.weight", "segformer.encoder.block.2.1.attention.output.dense.bias", "segformer.encoder.block.2.1.layer_norm_2.weight", "segformer.encoder.block.2.1.layer_norm_2.bias", "segformer.encoder.block.2.1.mlp.dense1.weight", "segformer.encoder.block.2.1.mlp.dense1.bias", "segformer.encoder.block.2.1.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.1.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.1.mlp.dense2.weight", "segformer.encoder.block.2.1.mlp.dense2.bias", "segformer.encoder.block.2.2.layer_norm_1.weight", "segformer.encoder.block.2.2.layer_norm_1.bias", "segformer.encoder.block.2.2.attention.self.query.weight", "segformer.encoder.block.2.2.attention.self.query.bias", "segformer.encoder.block.2.2.attention.self.key.weight", "segformer.encoder.block.2.2.attention.self.key.bias", "segformer.encoder.block.2.2.attention.self.value.weight", "segformer.encoder.block.2.2.attention.self.value.bias", "segformer.encoder.block.2.2.attention.self.sr.weight", "segformer.encoder.block.2.2.attention.self.sr.bias", "segformer.encoder.block.2.2.attention.self.layer_norm.weight", "segformer.encoder.block.2.2.attention.self.layer_norm.bias", "segformer.encoder.block.2.2.attention.output.dense.weight", "segformer.encoder.block.2.2.attention.output.dense.bias", "segformer.encoder.block.2.2.layer_norm_2.weight", "segformer.encoder.block.2.2.layer_norm_2.bias", "segformer.encoder.block.2.2.mlp.dense1.weight", "segformer.encoder.block.2.2.mlp.dense1.bias", "segformer.encoder.block.2.2.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.2.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.2.mlp.dense2.weight", "segformer.encoder.block.2.2.mlp.dense2.bias", "segformer.encoder.block.2.3.layer_norm_1.weight", "segformer.encoder.block.2.3.layer_norm_1.bias", "segformer.encoder.block.2.3.attention.self.query.weight", "segformer.encoder.block.2.3.attention.self.query.bias", "segformer.encoder.block.2.3.attention.self.key.weight", "segformer.encoder.block.2.3.attention.self.key.bias", "segformer.encoder.block.2.3.attention.self.value.weight", "segformer.encoder.block.2.3.attention.self.value.bias", "segformer.encoder.block.2.3.attention.self.sr.weight", "segformer.encoder.block.2.3.attention.self.sr.bias", "segformer.encoder.block.2.3.attention.self.layer_norm.weight", "segformer.encoder.block.2.3.attention.self.layer_norm.bias", "segformer.encoder.block.2.3.attention.output.dense.weight", "segformer.encoder.block.2.3.attention.output.dense.bias", "segformer.encoder.block.2.3.layer_norm_2.weight", "segformer.encoder.block.2.3.layer_norm_2.bias", "segformer.encoder.block.2.3.mlp.dense1.weight", "segformer.encoder.block.2.3.mlp.dense1.bias", "segformer.encoder.block.2.3.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.3.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.3.mlp.dense2.weight", "segformer.encoder.block.2.3.mlp.dense2.bias", "segformer.encoder.block.2.4.layer_norm_1.weight", "segformer.encoder.block.2.4.layer_norm_1.bias", "segformer.encoder.block.2.4.attention.self.query.weight", "segformer.encoder.block.2.4.attention.self.query.bias", "segformer.encoder.block.2.4.attention.self.key.weight", "segformer.encoder.block.2.4.attention.self.key.bias", "segformer.encoder.block.2.4.attention.self.value.weight", "segformer.encoder.block.2.4.attention.self.value.bias", "segformer.encoder.block.2.4.attention.self.sr.weight", "segformer.encoder.block.2.4.attention.self.sr.bias", "segformer.encoder.block.2.4.attention.self.layer_norm.weight", "segformer.encoder.block.2.4.attention.self.layer_norm.bias", "segformer.encoder.block.2.4.attention.output.dense.weight", "segformer.encoder.block.2.4.attention.output.dense.bias", "segformer.encoder.block.2.4.layer_norm_2.weight", "segformer.encoder.block.2.4.layer_norm_2.bias", "segformer.encoder.block.2.4.mlp.dense1.weight", "segformer.encoder.block.2.4.mlp.dense1.bias", "segformer.encoder.block.2.4.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.4.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.4.mlp.dense2.weight", "segformer.encoder.block.2.4.mlp.dense2.bias", "segformer.encoder.block.2.5.layer_norm_1.weight", "segformer.encoder.block.2.5.layer_norm_1.bias", "segformer.encoder.block.2.5.attention.self.query.weight", "segformer.encoder.block.2.5.attention.self.query.bias", "segformer.encoder.block.2.5.attention.self.key.weight", "segformer.encoder.block.2.5.attention.self.key.bias", "segformer.encoder.block.2.5.attention.self.value.weight", "segformer.encoder.block.2.5.attention.self.value.bias", "segformer.encoder.block.2.5.attention.self.sr.weight", "segformer.encoder.block.2.5.attention.self.sr.bias", "segformer.encoder.block.2.5.attention.self.layer_norm.weight", "segformer.encoder.block.2.5.attention.self.layer_norm.bias", "segformer.encoder.block.2.5.attention.output.dense.weight", "segformer.encoder.block.2.5.attention.output.dense.bias", "segformer.encoder.block.2.5.layer_norm_2.weight", "segformer.encoder.block.2.5.layer_norm_2.bias", "segformer.encoder.block.2.5.mlp.dense1.weight", "segformer.encoder.block.2.5.mlp.dense1.bias", "segformer.encoder.block.2.5.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.5.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.5.mlp.dense2.weight", "segformer.encoder.block.2.5.mlp.dense2.bias", "segformer.encoder.block.2.6.layer_norm_1.weight", "segformer.encoder.block.2.6.layer_norm_1.bias", "segformer.encoder.block.2.6.attention.self.query.weight", "segformer.encoder.block.2.6.attention.self.query.bias", "segformer.encoder.block.2.6.attention.self.key.weight", "segformer.encoder.block.2.6.attention.self.key.bias", "segformer.encoder.block.2.6.attention.self.value.weight", "segformer.encoder.block.2.6.attention.self.value.bias", "segformer.encoder.block.2.6.attention.self.sr.weight", "segformer.encoder.block.2.6.attention.self.sr.bias", "segformer.encoder.block.2.6.attention.self.layer_norm.weight", "segformer.encoder.block.2.6.attention.self.layer_norm.bias", "segformer.encoder.block.2.6.attention.output.dense.weight", "segformer.encoder.block.2.6.attention.output.dense.bias", "segformer.encoder.block.2.6.layer_norm_2.weight", "segformer.encoder.block.2.6.layer_norm_2.bias", "segformer.encoder.block.2.6.mlp.dense1.weight", "segformer.encoder.block.2.6.mlp.dense1.bias", "segformer.encoder.block.2.6.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.6.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.6.mlp.dense2.weight", "segformer.encoder.block.2.6.mlp.dense2.bias", "segformer.encoder.block.2.7.layer_norm_1.weight", "segformer.encoder.block.2.7.layer_norm_1.bias", "segformer.encoder.block.2.7.attention.self.query.weight", "segformer.encoder.block.2.7.attention.self.query.bias", "segformer.encoder.block.2.7.attention.self.key.weight", "segformer.encoder.block.2.7.attention.self.key.bias", "segformer.encoder.block.2.7.attention.self.value.weight", "segformer.encoder.block.2.7.attention.self.value.bias", "segformer.encoder.block.2.7.attention.self.sr.weight", "segformer.encoder.block.2.7.attention.self.sr.bias", "segformer.encoder.block.2.7.attention.self.layer_norm.weight", "segformer.encoder.block.2.7.attention.self.layer_norm.bias", "segformer.encoder.block.2.7.attention.output.dense.weight", "segformer.encoder.block.2.7.attention.output.dense.bias", "segformer.encoder.block.2.7.layer_norm_2.weight", "segformer.encoder.block.2.7.layer_norm_2.bias", "segformer.encoder.block.2.7.mlp.dense1.weight", "segformer.encoder.block.2.7.mlp.dense1.bias", "segformer.encoder.block.2.7.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.7.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.7.mlp.dense2.weight", "segformer.encoder.block.2.7.mlp.dense2.bias", "segformer.encoder.block.2.8.layer_norm_1.weight", "segformer.encoder.block.2.8.layer_norm_1.bias", "segformer.encoder.block.2.8.attention.self.query.weight", "segformer.encoder.block.2.8.attention.self.query.bias", "segformer.encoder.block.2.8.attention.self.key.weight", "segformer.encoder.block.2.8.attention.self.key.bias", "segformer.encoder.block.2.8.attention.self.value.weight", "segformer.encoder.block.2.8.attention.self.value.bias", "segformer.encoder.block.2.8.attention.self.sr.weight", "segformer.encoder.block.2.8.attention.self.sr.bias", "segformer.encoder.block.2.8.attention.self.layer_norm.weight", "segformer.encoder.block.2.8.attention.self.layer_norm.bias", "segformer.encoder.block.2.8.attention.output.dense.weight", "segformer.encoder.block.2.8.attention.output.dense.bias", "segformer.encoder.block.2.8.layer_norm_2.weight", "segformer.encoder.block.2.8.layer_norm_2.bias", "segformer.encoder.block.2.8.mlp.dense1.weight", "segformer.encoder.block.2.8.mlp.dense1.bias", "segformer.encoder.block.2.8.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.8.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.8.mlp.dense2.weight", "segformer.encoder.block.2.8.mlp.dense2.bias", "segformer.encoder.block.2.9.layer_norm_1.weight", "segformer.encoder.block.2.9.layer_norm_1.bias", "segformer.encoder.block.2.9.attention.self.query.weight", "segformer.encoder.block.2.9.attention.self.query.bias", "segformer.encoder.block.2.9.attention.self.key.weight", "segformer.encoder.block.2.9.attention.self.key.bias", "segformer.encoder.block.2.9.attention.self.value.weight", "segformer.encoder.block.2.9.attention.self.value.bias", "segformer.encoder.block.2.9.attention.self.sr.weight", "segformer.encoder.block.2.9.attention.self.sr.bias", "segformer.encoder.block.2.9.attention.self.layer_norm.weight", "segformer.encoder.block.2.9.attention.self.layer_norm.bias", "segformer.encoder.block.2.9.attention.output.dense.weight", "segformer.encoder.block.2.9.attention.output.dense.bias", "segformer.encoder.block.2.9.layer_norm_2.weight", "segformer.encoder.block.2.9.layer_norm_2.bias", "segformer.encoder.block.2.9.mlp.dense1.weight", "segformer.encoder.block.2.9.mlp.dense1.bias", "segformer.encoder.block.2.9.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.9.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.9.mlp.dense2.weight", "segformer.encoder.block.2.9.mlp.dense2.bias", "segformer.encoder.block.2.10.layer_norm_1.weight", "segformer.encoder.block.2.10.layer_norm_1.bias", "segformer.encoder.block.2.10.attention.self.query.weight", "segformer.encoder.block.2.10.attention.self.query.bias", "segformer.encoder.block.2.10.attention.self.key.weight", "segformer.encoder.block.2.10.attention.self.key.bias", "segformer.encoder.block.2.10.attention.self.value.weight", "segformer.encoder.block.2.10.attention.self.value.bias", "segformer.encoder.block.2.10.attention.self.sr.weight", "segformer.encoder.block.2.10.attention.self.sr.bias", "segformer.encoder.block.2.10.attention.self.layer_norm.weight", "segformer.encoder.block.2.10.attention.self.layer_norm.bias", "segformer.encoder.block.2.10.attention.output.dense.weight", "segformer.encoder.block.2.10.attention.output.dense.bias", "segformer.encoder.block.2.10.layer_norm_2.weight", "segformer.encoder.block.2.10.layer_norm_2.bias", "segformer.encoder.block.2.10.mlp.dense1.weight", "segformer.encoder.block.2.10.mlp.dense1.bias", "segformer.encoder.block.2.10.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.10.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.10.mlp.dense2.weight", "segformer.encoder.block.2.10.mlp.dense2.bias", "segformer.encoder.block.2.11.layer_norm_1.weight", "segformer.encoder.block.2.11.layer_norm_1.bias", "segformer.encoder.block.2.11.attention.self.query.weight", "segformer.encoder.block.2.11.attention.self.query.bias", "segformer.encoder.block.2.11.attention.self.key.weight", "segformer.encoder.block.2.11.attention.self.key.bias", "segformer.encoder.block.2.11.attention.self.value.weight", "segformer.encoder.block.2.11.attention.self.value.bias", "segformer.encoder.block.2.11.attention.self.sr.weight", "segformer.encoder.block.2.11.attention.self.sr.bias", "segformer.encoder.block.2.11.attention.self.layer_norm.weight", "segformer.encoder.block.2.11.attention.self.layer_norm.bias", "segformer.encoder.block.2.11.attention.output.dense.weight", "segformer.encoder.block.2.11.attention.output.dense.bias", "segformer.encoder.block.2.11.layer_norm_2.weight", "segformer.encoder.block.2.11.layer_norm_2.bias", "segformer.encoder.block.2.11.mlp.dense1.weight", "segformer.encoder.block.2.11.mlp.dense1.bias", "segformer.encoder.block.2.11.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.11.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.11.mlp.dense2.weight", "segformer.encoder.block.2.11.mlp.dense2.bias", "segformer.encoder.block.2.12.layer_norm_1.weight", "segformer.encoder.block.2.12.layer_norm_1.bias", "segformer.encoder.block.2.12.attention.self.query.weight", "segformer.encoder.block.2.12.attention.self.query.bias", "segformer.encoder.block.2.12.attention.self.key.weight", "segformer.encoder.block.2.12.attention.self.key.bias", "segformer.encoder.block.2.12.attention.self.value.weight", "segformer.encoder.block.2.12.attention.self.value.bias", "segformer.encoder.block.2.12.attention.self.sr.weight", "segformer.encoder.block.2.12.attention.self.sr.bias", "segformer.encoder.block.2.12.attention.self.layer_norm.weight", "segformer.encoder.block.2.12.attention.self.layer_norm.bias", "segformer.encoder.block.2.12.attention.output.dense.weight", "segformer.encoder.block.2.12.attention.output.dense.bias", "segformer.encoder.block.2.12.layer_norm_2.weight", "segformer.encoder.block.2.12.layer_norm_2.bias", "segformer.encoder.block.2.12.mlp.dense1.weight", "segformer.encoder.block.2.12.mlp.dense1.bias", "segformer.encoder.block.2.12.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.12.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.12.mlp.dense2.weight", "segformer.encoder.block.2.12.mlp.dense2.bias", "segformer.encoder.block.2.13.layer_norm_1.weight", "segformer.encoder.block.2.13.layer_norm_1.bias", "segformer.encoder.block.2.13.attention.self.query.weight", "segformer.encoder.block.2.13.attention.self.query.bias", "segformer.encoder.block.2.13.attention.self.key.weight", "segformer.encoder.block.2.13.attention.self.key.bias", "segformer.encoder.block.2.13.attention.self.value.weight", "segformer.encoder.block.2.13.attention.self.value.bias", "segformer.encoder.block.2.13.attention.self.sr.weight", "segformer.encoder.block.2.13.attention.self.sr.bias", "segformer.encoder.block.2.13.attention.self.layer_norm.weight", "segformer.encoder.block.2.13.attention.self.layer_norm.bias", "segformer.encoder.block.2.13.attention.output.dense.weight", "segformer.encoder.block.2.13.attention.output.dense.bias", "segformer.encoder.block.2.13.layer_norm_2.weight", "segformer.encoder.block.2.13.layer_norm_2.bias", "segformer.encoder.block.2.13.mlp.dense1.weight", "segformer.encoder.block.2.13.mlp.dense1.bias", "segformer.encoder.block.2.13.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.13.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.13.mlp.dense2.weight", "segformer.encoder.block.2.13.mlp.dense2.bias", "segformer.encoder.block.2.14.layer_norm_1.weight", "segformer.encoder.block.2.14.layer_norm_1.bias", "segformer.encoder.block.2.14.attention.self.query.weight", "segformer.encoder.block.2.14.attention.self.query.bias", "segformer.encoder.block.2.14.attention.self.key.weight", "segformer.encoder.block.2.14.attention.self.key.bias", "segformer.encoder.block.2.14.attention.self.value.weight", "segformer.encoder.block.2.14.attention.self.value.bias", "segformer.encoder.block.2.14.attention.self.sr.weight", "segformer.encoder.block.2.14.attention.self.sr.bias", "segformer.encoder.block.2.14.attention.self.layer_norm.weight", "segformer.encoder.block.2.14.attention.self.layer_norm.bias", "segformer.encoder.block.2.14.attention.output.dense.weight", "segformer.encoder.block.2.14.attention.output.dense.bias", "segformer.encoder.block.2.14.layer_norm_2.weight", "segformer.encoder.block.2.14.layer_norm_2.bias", "segformer.encoder.block.2.14.mlp.dense1.weight", "segformer.encoder.block.2.14.mlp.dense1.bias", "segformer.encoder.block.2.14.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.14.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.14.mlp.dense2.weight", "segformer.encoder.block.2.14.mlp.dense2.bias", "segformer.encoder.block.2.15.layer_norm_1.weight", "segformer.encoder.block.2.15.layer_norm_1.bias", "segformer.encoder.block.2.15.attention.self.query.weight", "segformer.encoder.block.2.15.attention.self.query.bias", "segformer.encoder.block.2.15.attention.self.key.weight", "segformer.encoder.block.2.15.attention.self.key.bias", "segformer.encoder.block.2.15.attention.self.value.weight", "segformer.encoder.block.2.15.attention.self.value.bias", "segformer.encoder.block.2.15.attention.self.sr.weight", "segformer.encoder.block.2.15.attention.self.sr.bias", "segformer.encoder.block.2.15.attention.self.layer_norm.weight", "segformer.encoder.block.2.15.attention.self.layer_norm.bias", "segformer.encoder.block.2.15.attention.output.dense.weight", "segformer.encoder.block.2.15.attention.output.dense.bias", "segformer.encoder.block.2.15.layer_norm_2.weight", "segformer.encoder.block.2.15.layer_norm_2.bias", "segformer.encoder.block.2.15.mlp.dense1.weight", "segformer.encoder.block.2.15.mlp.dense1.bias", "segformer.encoder.block.2.15.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.15.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.15.mlp.dense2.weight", "segformer.encoder.block.2.15.mlp.dense2.bias", "segformer.encoder.block.2.16.layer_norm_1.weight", "segformer.encoder.block.2.16.layer_norm_1.bias", "segformer.encoder.block.2.16.attention.self.query.weight", "segformer.encoder.block.2.16.attention.self.query.bias", "segformer.encoder.block.2.16.attention.self.key.weight", "segformer.encoder.block.2.16.attention.self.key.bias", "segformer.encoder.block.2.16.attention.self.value.weight", "segformer.encoder.block.2.16.attention.self.value.bias", "segformer.encoder.block.2.16.attention.self.sr.weight", "segformer.encoder.block.2.16.attention.self.sr.bias", "segformer.encoder.block.2.16.attention.self.layer_norm.weight", "segformer.encoder.block.2.16.attention.self.layer_norm.bias", "segformer.encoder.block.2.16.attention.output.dense.weight", "segformer.encoder.block.2.16.attention.output.dense.bias", "segformer.encoder.block.2.16.layer_norm_2.weight", "segformer.encoder.block.2.16.layer_norm_2.bias", "segformer.encoder.block.2.16.mlp.dense1.weight", "segformer.encoder.block.2.16.mlp.dense1.bias", "segformer.encoder.block.2.16.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.16.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.16.mlp.dense2.weight", "segformer.encoder.block.2.16.mlp.dense2.bias", "segformer.encoder.block.2.17.layer_norm_1.weight", "segformer.encoder.block.2.17.layer_norm_1.bias", "segformer.encoder.block.2.17.attention.self.query.weight", "segformer.encoder.block.2.17.attention.self.query.bias", "segformer.encoder.block.2.17.attention.self.key.weight", "segformer.encoder.block.2.17.attention.self.key.bias", "segformer.encoder.block.2.17.attention.self.value.weight", "segformer.encoder.block.2.17.attention.self.value.bias", "segformer.encoder.block.2.17.attention.self.sr.weight", "segformer.encoder.block.2.17.attention.self.sr.bias", "segformer.encoder.block.2.17.attention.self.layer_norm.weight", "segformer.encoder.block.2.17.attention.self.layer_norm.bias", "segformer.encoder.block.2.17.attention.output.dense.weight", "segformer.encoder.block.2.17.attention.output.dense.bias", "segformer.encoder.block.2.17.layer_norm_2.weight", "segformer.encoder.block.2.17.layer_norm_2.bias", "segformer.encoder.block.2.17.mlp.dense1.weight", "segformer.encoder.block.2.17.mlp.dense1.bias", "segformer.encoder.block.2.17.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.17.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.17.mlp.dense2.weight", "segformer.encoder.block.2.17.mlp.dense2.bias", "segformer.encoder.block.3.0.layer_norm_1.weight", "segformer.encoder.block.3.0.layer_norm_1.bias", "segformer.encoder.block.3.0.attention.self.query.weight", "segformer.encoder.block.3.0.attention.self.query.bias", "segformer.encoder.block.3.0.attention.self.key.weight", "segformer.encoder.block.3.0.attention.self.key.bias", "segformer.encoder.block.3.0.attention.self.value.weight", "segformer.encoder.block.3.0.attention.self.value.bias", "segformer.encoder.block.3.0.attention.output.dense.weight", "segformer.encoder.block.3.0.attention.output.dense.bias", "segformer.encoder.block.3.0.layer_norm_2.weight", "segformer.encoder.block.3.0.layer_norm_2.bias", "segformer.encoder.block.3.0.mlp.dense1.weight", "segformer.encoder.block.3.0.mlp.dense1.bias", "segformer.encoder.block.3.0.mlp.dwconv.dwconv.weight", "segformer.encoder.block.3.0.mlp.dwconv.dwconv.bias", "segformer.encoder.block.3.0.mlp.dense2.weight", "segformer.encoder.block.3.0.mlp.dense2.bias", "segformer.encoder.block.3.1.layer_norm_1.weight", "segformer.encoder.block.3.1.layer_norm_1.bias", "segformer.encoder.block.3.1.attention.self.query.weight", "segformer.encoder.block.3.1.attention.self.query.bias", "segformer.encoder.block.3.1.attention.self.key.weight", "segformer.encoder.block.3.1.attention.self.key.bias", "segformer.encoder.block.3.1.attention.self.value.weight", "segformer.encoder.block.3.1.attention.self.value.bias", "segformer.encoder.block.3.1.attention.output.dense.weight", "segformer.encoder.block.3.1.attention.output.dense.bias", "segformer.encoder.block.3.1.layer_norm_2.weight", "segformer.encoder.block.3.1.layer_norm_2.bias", "segformer.encoder.block.3.1.mlp.dense1.weight", "segformer.encoder.block.3.1.mlp.dense1.bias", "segformer.encoder.block.3.1.mlp.dwconv.dwconv.weight", "segformer.encoder.block.3.1.mlp.dwconv.dwconv.bias", "segformer.encoder.block.3.1.mlp.dense2.weight", "segformer.encoder.block.3.1.mlp.dense2.bias", "segformer.encoder.block.3.2.layer_norm_1.weight", "segformer.encoder.block.3.2.layer_norm_1.bias", "segformer.encoder.block.3.2.attention.self.query.weight", "segformer.encoder.block.3.2.attention.self.query.bias", "segformer.encoder.block.3.2.attention.self.key.weight", "segformer.encoder.block.3.2.attention.self.key.bias", "segformer.encoder.block.3.2.attention.self.value.weight", "segformer.encoder.block.3.2.attention.self.value.bias", "segformer.encoder.block.3.2.attention.output.dense.weight", "segformer.encoder.block.3.2.attention.output.dense.bias", "segformer.encoder.block.3.2.layer_norm_2.weight", "segformer.encoder.block.3.2.layer_norm_2.bias", "segformer.encoder.block.3.2.mlp.dense1.weight", "segformer.encoder.block.3.2.mlp.dense1.bias", "segformer.encoder.block.3.2.mlp.dwconv.dwconv.weight", "segformer.encoder.block.3.2.mlp.dwconv.dwconv.bias", "segformer.encoder.block.3.2.mlp.dense2.weight", "segformer.encoder.block.3.2.mlp.dense2.bias", "segformer.encoder.layer_norm.0.weight", "segformer.encoder.layer_norm.0.bias", "segformer.encoder.layer_norm.1.weight", "segformer.encoder.layer_norm.1.bias", "segformer.encoder.layer_norm.2.weight", "segformer.encoder.layer_norm.2.bias", "segformer.encoder.layer_norm.3.weight", "segformer.encoder.layer_norm.3.bias", "decode_head.linear_c.0.proj.weight", "decode_head.linear_c.0.proj.bias", "decode_head.linear_c.1.proj.weight", "decode_head.linear_c.1.proj.bias", "decode_head.linear_c.2.proj.weight", "decode_head.linear_c.2.proj.bias", "decode_head.linear_c.3.proj.weight", "decode_head.linear_c.3.proj.bias", "decode_head.linear_fuse.weight", "decode_head.batch_norm.weight", "decode_head.batch_norm.bias", "decode_head.batch_norm.running_mean", "decode_head.batch_norm.running_var", "decode_head.classifier.weight", "decode_head.classifier.bias". 
	Unexpected key(s) in state_dict: "module.segformer.encoder.patch_embeddings.0.proj.weight", "module.segformer.encoder.patch_embeddings.0.proj.bias", "module.segformer.encoder.patch_embeddings.0.layer_norm.weight", "module.segformer.encoder.patch_embeddings.0.layer_norm.bias", "module.segformer.encoder.patch_embeddings.1.proj.weight", "module.segformer.encoder.patch_embeddings.1.proj.bias", "module.segformer.encoder.patch_embeddings.1.layer_norm.weight", "module.segformer.encoder.patch_embeddings.1.layer_norm.bias", "module.segformer.encoder.patch_embeddings.2.proj.weight", "module.segformer.encoder.patch_embeddings.2.proj.bias", "module.segformer.encoder.patch_embeddings.2.layer_norm.weight", "module.segformer.encoder.patch_embeddings.2.layer_norm.bias", "module.segformer.encoder.patch_embeddings.3.proj.weight", "module.segformer.encoder.patch_embeddings.3.proj.bias", "module.segformer.encoder.patch_embeddings.3.layer_norm.weight", "module.segformer.encoder.patch_embeddings.3.layer_norm.bias", "module.segformer.encoder.block.0.0.layer_norm_1.weight", "module.segformer.encoder.block.0.0.layer_norm_1.bias", "module.segformer.encoder.block.0.0.attention.self.query.weight", "module.segformer.encoder.block.0.0.attention.self.query.bias", "module.segformer.encoder.block.0.0.attention.self.key.weight", "module.segformer.encoder.block.0.0.attention.self.key.bias", "module.segformer.encoder.block.0.0.attention.self.value.weight", "module.segformer.encoder.block.0.0.attention.self.value.bias", "module.segformer.encoder.block.0.0.attention.self.sr.weight", "module.segformer.encoder.block.0.0.attention.self.sr.bias", "module.segformer.encoder.block.0.0.attention.self.layer_norm.weight", "module.segformer.encoder.block.0.0.attention.self.layer_norm.bias", "module.segformer.encoder.block.0.0.attention.output.dense.weight", "module.segformer.encoder.block.0.0.attention.output.dense.bias", "module.segformer.encoder.block.0.0.layer_norm_2.weight", "module.segformer.encoder.block.0.0.layer_norm_2.bias", "module.segformer.encoder.block.0.0.mlp.dense1.weight", "module.segformer.encoder.block.0.0.mlp.dense1.bias", "module.segformer.encoder.block.0.0.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.0.0.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.0.0.mlp.dense2.weight", "module.segformer.encoder.block.0.0.mlp.dense2.bias", "module.segformer.encoder.block.0.1.layer_norm_1.weight", "module.segformer.encoder.block.0.1.layer_norm_1.bias", "module.segformer.encoder.block.0.1.attention.self.query.weight", "module.segformer.encoder.block.0.1.attention.self.query.bias", "module.segformer.encoder.block.0.1.attention.self.key.weight", "module.segformer.encoder.block.0.1.attention.self.key.bias", "module.segformer.encoder.block.0.1.attention.self.value.weight", "module.segformer.encoder.block.0.1.attention.self.value.bias", "module.segformer.encoder.block.0.1.attention.self.sr.weight", "module.segformer.encoder.block.0.1.attention.self.sr.bias", "module.segformer.encoder.block.0.1.attention.self.layer_norm.weight", "module.segformer.encoder.block.0.1.attention.self.layer_norm.bias", "module.segformer.encoder.block.0.1.attention.output.dense.weight", "module.segformer.encoder.block.0.1.attention.output.dense.bias", "module.segformer.encoder.block.0.1.layer_norm_2.weight", "module.segformer.encoder.block.0.1.layer_norm_2.bias", "module.segformer.encoder.block.0.1.mlp.dense1.weight", "module.segformer.encoder.block.0.1.mlp.dense1.bias", "module.segformer.encoder.block.0.1.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.0.1.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.0.1.mlp.dense2.weight", "module.segformer.encoder.block.0.1.mlp.dense2.bias", "module.segformer.encoder.block.0.2.layer_norm_1.weight", "module.segformer.encoder.block.0.2.layer_norm_1.bias", "module.segformer.encoder.block.0.2.attention.self.query.weight", "module.segformer.encoder.block.0.2.attention.self.query.bias", "module.segformer.encoder.block.0.2.attention.self.key.weight", "module.segformer.encoder.block.0.2.attention.self.key.bias", "module.segformer.encoder.block.0.2.attention.self.value.weight", "module.segformer.encoder.block.0.2.attention.self.value.bias", "module.segformer.encoder.block.0.2.attention.self.sr.weight", "module.segformer.encoder.block.0.2.attention.self.sr.bias", "module.segformer.encoder.block.0.2.attention.self.layer_norm.weight", "module.segformer.encoder.block.0.2.attention.self.layer_norm.bias", "module.segformer.encoder.block.0.2.attention.output.dense.weight", "module.segformer.encoder.block.0.2.attention.output.dense.bias", "module.segformer.encoder.block.0.2.layer_norm_2.weight", "module.segformer.encoder.block.0.2.layer_norm_2.bias", "module.segformer.encoder.block.0.2.mlp.dense1.weight", "module.segformer.encoder.block.0.2.mlp.dense1.bias", "module.segformer.encoder.block.0.2.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.0.2.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.0.2.mlp.dense2.weight", "module.segformer.encoder.block.0.2.mlp.dense2.bias", "module.segformer.encoder.block.1.0.layer_norm_1.weight", "module.segformer.encoder.block.1.0.layer_norm_1.bias", "module.segformer.encoder.block.1.0.attention.self.query.weight", "module.segformer.encoder.block.1.0.attention.self.query.bias", "module.segformer.encoder.block.1.0.attention.self.key.weight", "module.segformer.encoder.block.1.0.attention.self.key.bias", "module.segformer.encoder.block.1.0.attention.self.value.weight", "module.segformer.encoder.block.1.0.attention.self.value.bias", "module.segformer.encoder.block.1.0.attention.self.sr.weight", "module.segformer.encoder.block.1.0.attention.self.sr.bias", "module.segformer.encoder.block.1.0.attention.self.layer_norm.weight", "module.segformer.encoder.block.1.0.attention.self.layer_norm.bias", "module.segformer.encoder.block.1.0.attention.output.dense.weight", "module.segformer.encoder.block.1.0.attention.output.dense.bias", "module.segformer.encoder.block.1.0.layer_norm_2.weight", "module.segformer.encoder.block.1.0.layer_norm_2.bias", "module.segformer.encoder.block.1.0.mlp.dense1.weight", "module.segformer.encoder.block.1.0.mlp.dense1.bias", "module.segformer.encoder.block.1.0.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.1.0.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.1.0.mlp.dense2.weight", "module.segformer.encoder.block.1.0.mlp.dense2.bias", "module.segformer.encoder.block.1.1.layer_norm_1.weight", "module.segformer.encoder.block.1.1.layer_norm_1.bias", "module.segformer.encoder.block.1.1.attention.self.query.weight", "module.segformer.encoder.block.1.1.attention.self.query.bias", "module.segformer.encoder.block.1.1.attention.self.key.weight", "module.segformer.encoder.block.1.1.attention.self.key.bias", "module.segformer.encoder.block.1.1.attention.self.value.weight", "module.segformer.encoder.block.1.1.attention.self.value.bias", "module.segformer.encoder.block.1.1.attention.self.sr.weight", "module.segformer.encoder.block.1.1.attention.self.sr.bias", "module.segformer.encoder.block.1.1.attention.self.layer_norm.weight", "module.segformer.encoder.block.1.1.attention.self.layer_norm.bias", "module.segformer.encoder.block.1.1.attention.output.dense.weight", "module.segformer.encoder.block.1.1.attention.output.dense.bias", "module.segformer.encoder.block.1.1.layer_norm_2.weight", "module.segformer.encoder.block.1.1.layer_norm_2.bias", "module.segformer.encoder.block.1.1.mlp.dense1.weight", "module.segformer.encoder.block.1.1.mlp.dense1.bias", "module.segformer.encoder.block.1.1.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.1.1.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.1.1.mlp.dense2.weight", "module.segformer.encoder.block.1.1.mlp.dense2.bias", "module.segformer.encoder.block.1.2.layer_norm_1.weight", "module.segformer.encoder.block.1.2.layer_norm_1.bias", "module.segformer.encoder.block.1.2.attention.self.query.weight", "module.segformer.encoder.block.1.2.attention.self.query.bias", "module.segformer.encoder.block.1.2.attention.self.key.weight", "module.segformer.encoder.block.1.2.attention.self.key.bias", "module.segformer.encoder.block.1.2.attention.self.value.weight", "module.segformer.encoder.block.1.2.attention.self.value.bias", "module.segformer.encoder.block.1.2.attention.self.sr.weight", "module.segformer.encoder.block.1.2.attention.self.sr.bias", "module.segformer.encoder.block.1.2.attention.self.layer_norm.weight", "module.segformer.encoder.block.1.2.attention.self.layer_norm.bias", "module.segformer.encoder.block.1.2.attention.output.dense.weight", "module.segformer.encoder.block.1.2.attention.output.dense.bias", "module.segformer.encoder.block.1.2.layer_norm_2.weight", "module.segformer.encoder.block.1.2.layer_norm_2.bias", "module.segformer.encoder.block.1.2.mlp.dense1.weight", "module.segformer.encoder.block.1.2.mlp.dense1.bias", "module.segformer.encoder.block.1.2.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.1.2.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.1.2.mlp.dense2.weight", "module.segformer.encoder.block.1.2.mlp.dense2.bias", "module.segformer.encoder.block.1.3.layer_norm_1.weight", "module.segformer.encoder.block.1.3.layer_norm_1.bias", "module.segformer.encoder.block.1.3.attention.self.query.weight", "module.segformer.encoder.block.1.3.attention.self.query.bias", "module.segformer.encoder.block.1.3.attention.self.key.weight", "module.segformer.encoder.block.1.3.attention.self.key.bias", "module.segformer.encoder.block.1.3.attention.self.value.weight", "module.segformer.encoder.block.1.3.attention.self.value.bias", "module.segformer.encoder.block.1.3.attention.self.sr.weight", "module.segformer.encoder.block.1.3.attention.self.sr.bias", "module.segformer.encoder.block.1.3.attention.self.layer_norm.weight", "module.segformer.encoder.block.1.3.attention.self.layer_norm.bias", "module.segformer.encoder.block.1.3.attention.output.dense.weight", "module.segformer.encoder.block.1.3.attention.output.dense.bias", "module.segformer.encoder.block.1.3.layer_norm_2.weight", "module.segformer.encoder.block.1.3.layer_norm_2.bias", "module.segformer.encoder.block.1.3.mlp.dense1.weight", "module.segformer.encoder.block.1.3.mlp.dense1.bias", "module.segformer.encoder.block.1.3.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.1.3.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.1.3.mlp.dense2.weight", "module.segformer.encoder.block.1.3.mlp.dense2.bias", "module.segformer.encoder.block.2.0.layer_norm_1.weight", "module.segformer.encoder.block.2.0.layer_norm_1.bias", "module.segformer.encoder.block.2.0.attention.self.query.weight", "module.segformer.encoder.block.2.0.attention.self.query.bias", "module.segformer.encoder.block.2.0.attention.self.key.weight", "module.segformer.encoder.block.2.0.attention.self.key.bias", "module.segformer.encoder.block.2.0.attention.self.value.weight", "module.segformer.encoder.block.2.0.attention.self.value.bias", "module.segformer.encoder.block.2.0.attention.self.sr.weight", "module.segformer.encoder.block.2.0.attention.self.sr.bias", "module.segformer.encoder.block.2.0.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.0.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.0.attention.output.dense.weight", "module.segformer.encoder.block.2.0.attention.output.dense.bias", "module.segformer.encoder.block.2.0.layer_norm_2.weight", "module.segformer.encoder.block.2.0.layer_norm_2.bias", "module.segformer.encoder.block.2.0.mlp.dense1.weight", "module.segformer.encoder.block.2.0.mlp.dense1.bias", "module.segformer.encoder.block.2.0.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.0.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.0.mlp.dense2.weight", "module.segformer.encoder.block.2.0.mlp.dense2.bias", "module.segformer.encoder.block.2.1.layer_norm_1.weight", "module.segformer.encoder.block.2.1.layer_norm_1.bias", "module.segformer.encoder.block.2.1.attention.self.query.weight", "module.segformer.encoder.block.2.1.attention.self.query.bias", "module.segformer.encoder.block.2.1.attention.self.key.weight", "module.segformer.encoder.block.2.1.attention.self.key.bias", "module.segformer.encoder.block.2.1.attention.self.value.weight", "module.segformer.encoder.block.2.1.attention.self.value.bias", "module.segformer.encoder.block.2.1.attention.self.sr.weight", "module.segformer.encoder.block.2.1.attention.self.sr.bias", "module.segformer.encoder.block.2.1.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.1.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.1.attention.output.dense.weight", "module.segformer.encoder.block.2.1.attention.output.dense.bias", "module.segformer.encoder.block.2.1.layer_norm_2.weight", "module.segformer.encoder.block.2.1.layer_norm_2.bias", "module.segformer.encoder.block.2.1.mlp.dense1.weight", "module.segformer.encoder.block.2.1.mlp.dense1.bias", "module.segformer.encoder.block.2.1.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.1.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.1.mlp.dense2.weight", "module.segformer.encoder.block.2.1.mlp.dense2.bias", "module.segformer.encoder.block.2.2.layer_norm_1.weight", "module.segformer.encoder.block.2.2.layer_norm_1.bias", "module.segformer.encoder.block.2.2.attention.self.query.weight", "module.segformer.encoder.block.2.2.attention.self.query.bias", "module.segformer.encoder.block.2.2.attention.self.key.weight", "module.segformer.encoder.block.2.2.attention.self.key.bias", "module.segformer.encoder.block.2.2.attention.self.value.weight", "module.segformer.encoder.block.2.2.attention.self.value.bias", "module.segformer.encoder.block.2.2.attention.self.sr.weight", "module.segformer.encoder.block.2.2.attention.self.sr.bias", "module.segformer.encoder.block.2.2.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.2.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.2.attention.output.dense.weight", "module.segformer.encoder.block.2.2.attention.output.dense.bias", "module.segformer.encoder.block.2.2.layer_norm_2.weight", "module.segformer.encoder.block.2.2.layer_norm_2.bias", "module.segformer.encoder.block.2.2.mlp.dense1.weight", "module.segformer.encoder.block.2.2.mlp.dense1.bias", "module.segformer.encoder.block.2.2.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.2.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.2.mlp.dense2.weight", "module.segformer.encoder.block.2.2.mlp.dense2.bias", "module.segformer.encoder.block.2.3.layer_norm_1.weight", "module.segformer.encoder.block.2.3.layer_norm_1.bias", "module.segformer.encoder.block.2.3.attention.self.query.weight", "module.segformer.encoder.block.2.3.attention.self.query.bias", "module.segformer.encoder.block.2.3.attention.self.key.weight", "module.segformer.encoder.block.2.3.attention.self.key.bias", "module.segformer.encoder.block.2.3.attention.self.value.weight", "module.segformer.encoder.block.2.3.attention.self.value.bias", "module.segformer.encoder.block.2.3.attention.self.sr.weight", "module.segformer.encoder.block.2.3.attention.self.sr.bias", "module.segformer.encoder.block.2.3.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.3.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.3.attention.output.dense.weight", "module.segformer.encoder.block.2.3.attention.output.dense.bias", "module.segformer.encoder.block.2.3.layer_norm_2.weight", "module.segformer.encoder.block.2.3.layer_norm_2.bias", "module.segformer.encoder.block.2.3.mlp.dense1.weight", "module.segformer.encoder.block.2.3.mlp.dense1.bias", "module.segformer.encoder.block.2.3.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.3.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.3.mlp.dense2.weight", "module.segformer.encoder.block.2.3.mlp.dense2.bias", "module.segformer.encoder.block.2.4.layer_norm_1.weight", "module.segformer.encoder.block.2.4.layer_norm_1.bias", "module.segformer.encoder.block.2.4.attention.self.query.weight", "module.segformer.encoder.block.2.4.attention.self.query.bias", "module.segformer.encoder.block.2.4.attention.self.key.weight", "module.segformer.encoder.block.2.4.attention.self.key.bias", "module.segformer.encoder.block.2.4.attention.self.value.weight", "module.segformer.encoder.block.2.4.attention.self.value.bias", "module.segformer.encoder.block.2.4.attention.self.sr.weight", "module.segformer.encoder.block.2.4.attention.self.sr.bias", "module.segformer.encoder.block.2.4.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.4.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.4.attention.output.dense.weight", "module.segformer.encoder.block.2.4.attention.output.dense.bias", "module.segformer.encoder.block.2.4.layer_norm_2.weight", "module.segformer.encoder.block.2.4.layer_norm_2.bias", "module.segformer.encoder.block.2.4.mlp.dense1.weight", "module.segformer.encoder.block.2.4.mlp.dense1.bias", "module.segformer.encoder.block.2.4.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.4.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.4.mlp.dense2.weight", "module.segformer.encoder.block.2.4.mlp.dense2.bias", "module.segformer.encoder.block.2.5.layer_norm_1.weight", "module.segformer.encoder.block.2.5.layer_norm_1.bias", "module.segformer.encoder.block.2.5.attention.self.query.weight", "module.segformer.encoder.block.2.5.attention.self.query.bias", "module.segformer.encoder.block.2.5.attention.self.key.weight", "module.segformer.encoder.block.2.5.attention.self.key.bias", "module.segformer.encoder.block.2.5.attention.self.value.weight", "module.segformer.encoder.block.2.5.attention.self.value.bias", "module.segformer.encoder.block.2.5.attention.self.sr.weight", "module.segformer.encoder.block.2.5.attention.self.sr.bias", "module.segformer.encoder.block.2.5.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.5.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.5.attention.output.dense.weight", "module.segformer.encoder.block.2.5.attention.output.dense.bias", "module.segformer.encoder.block.2.5.layer_norm_2.weight", "module.segformer.encoder.block.2.5.layer_norm_2.bias", "module.segformer.encoder.block.2.5.mlp.dense1.weight", "module.segformer.encoder.block.2.5.mlp.dense1.bias", "module.segformer.encoder.block.2.5.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.5.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.5.mlp.dense2.weight", "module.segformer.encoder.block.2.5.mlp.dense2.bias", "module.segformer.encoder.block.2.6.layer_norm_1.weight", "module.segformer.encoder.block.2.6.layer_norm_1.bias", "module.segformer.encoder.block.2.6.attention.self.query.weight", "module.segformer.encoder.block.2.6.attention.self.query.bias", "module.segformer.encoder.block.2.6.attention.self.key.weight", "module.segformer.encoder.block.2.6.attention.self.key.bias", "module.segformer.encoder.block.2.6.attention.self.value.weight", "module.segformer.encoder.block.2.6.attention.self.value.bias", "module.segformer.encoder.block.2.6.attention.self.sr.weight", "module.segformer.encoder.block.2.6.attention.self.sr.bias", "module.segformer.encoder.block.2.6.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.6.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.6.attention.output.dense.weight", "module.segformer.encoder.block.2.6.attention.output.dense.bias", "module.segformer.encoder.block.2.6.layer_norm_2.weight", "module.segformer.encoder.block.2.6.layer_norm_2.bias", "module.segformer.encoder.block.2.6.mlp.dense1.weight", "module.segformer.encoder.block.2.6.mlp.dense1.bias", "module.segformer.encoder.block.2.6.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.6.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.6.mlp.dense2.weight", "module.segformer.encoder.block.2.6.mlp.dense2.bias", "module.segformer.encoder.block.2.7.layer_norm_1.weight", "module.segformer.encoder.block.2.7.layer_norm_1.bias", "module.segformer.encoder.block.2.7.attention.self.query.weight", "module.segformer.encoder.block.2.7.attention.self.query.bias", "module.segformer.encoder.block.2.7.attention.self.key.weight", "module.segformer.encoder.block.2.7.attention.self.key.bias", "module.segformer.encoder.block.2.7.attention.self.value.weight", "module.segformer.encoder.block.2.7.attention.self.value.bias", "module.segformer.encoder.block.2.7.attention.self.sr.weight", "module.segformer.encoder.block.2.7.attention.self.sr.bias", "module.segformer.encoder.block.2.7.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.7.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.7.attention.output.dense.weight", "module.segformer.encoder.block.2.7.attention.output.dense.bias", "module.segformer.encoder.block.2.7.layer_norm_2.weight", "module.segformer.encoder.block.2.7.layer_norm_2.bias", "module.segformer.encoder.block.2.7.mlp.dense1.weight", "module.segformer.encoder.block.2.7.mlp.dense1.bias", "module.segformer.encoder.block.2.7.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.7.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.7.mlp.dense2.weight", "module.segformer.encoder.block.2.7.mlp.dense2.bias", "module.segformer.encoder.block.2.8.layer_norm_1.weight", "module.segformer.encoder.block.2.8.layer_norm_1.bias", "module.segformer.encoder.block.2.8.attention.self.query.weight", "module.segformer.encoder.block.2.8.attention.self.query.bias", "module.segformer.encoder.block.2.8.attention.self.key.weight", "module.segformer.encoder.block.2.8.attention.self.key.bias", "module.segformer.encoder.block.2.8.attention.self.value.weight", "module.segformer.encoder.block.2.8.attention.self.value.bias", "module.segformer.encoder.block.2.8.attention.self.sr.weight", "module.segformer.encoder.block.2.8.attention.self.sr.bias", "module.segformer.encoder.block.2.8.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.8.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.8.attention.output.dense.weight", "module.segformer.encoder.block.2.8.attention.output.dense.bias", "module.segformer.encoder.block.2.8.layer_norm_2.weight", "module.segformer.encoder.block.2.8.layer_norm_2.bias", "module.segformer.encoder.block.2.8.mlp.dense1.weight", "module.segformer.encoder.block.2.8.mlp.dense1.bias", "module.segformer.encoder.block.2.8.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.8.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.8.mlp.dense2.weight", "module.segformer.encoder.block.2.8.mlp.dense2.bias", "module.segformer.encoder.block.2.9.layer_norm_1.weight", "module.segformer.encoder.block.2.9.layer_norm_1.bias", "module.segformer.encoder.block.2.9.attention.self.query.weight", "module.segformer.encoder.block.2.9.attention.self.query.bias", "module.segformer.encoder.block.2.9.attention.self.key.weight", "module.segformer.encoder.block.2.9.attention.self.key.bias", "module.segformer.encoder.block.2.9.attention.self.value.weight", "module.segformer.encoder.block.2.9.attention.self.value.bias", "module.segformer.encoder.block.2.9.attention.self.sr.weight", "module.segformer.encoder.block.2.9.attention.self.sr.bias", "module.segformer.encoder.block.2.9.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.9.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.9.attention.output.dense.weight", "module.segformer.encoder.block.2.9.attention.output.dense.bias", "module.segformer.encoder.block.2.9.layer_norm_2.weight", "module.segformer.encoder.block.2.9.layer_norm_2.bias", "module.segformer.encoder.block.2.9.mlp.dense1.weight", "module.segformer.encoder.block.2.9.mlp.dense1.bias", "module.segformer.encoder.block.2.9.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.9.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.9.mlp.dense2.weight", "module.segformer.encoder.block.2.9.mlp.dense2.bias", "module.segformer.encoder.block.2.10.layer_norm_1.weight", "module.segformer.encoder.block.2.10.layer_norm_1.bias", "module.segformer.encoder.block.2.10.attention.self.query.weight", "module.segformer.encoder.block.2.10.attention.self.query.bias", "module.segformer.encoder.block.2.10.attention.self.key.weight", "module.segformer.encoder.block.2.10.attention.self.key.bias", "module.segformer.encoder.block.2.10.attention.self.value.weight", "module.segformer.encoder.block.2.10.attention.self.value.bias", "module.segformer.encoder.block.2.10.attention.self.sr.weight", "module.segformer.encoder.block.2.10.attention.self.sr.bias", "module.segformer.encoder.block.2.10.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.10.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.10.attention.output.dense.weight", "module.segformer.encoder.block.2.10.attention.output.dense.bias", "module.segformer.encoder.block.2.10.layer_norm_2.weight", "module.segformer.encoder.block.2.10.layer_norm_2.bias", "module.segformer.encoder.block.2.10.mlp.dense1.weight", "module.segformer.encoder.block.2.10.mlp.dense1.bias", "module.segformer.encoder.block.2.10.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.10.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.10.mlp.dense2.weight", "module.segformer.encoder.block.2.10.mlp.dense2.bias", "module.segformer.encoder.block.2.11.layer_norm_1.weight", "module.segformer.encoder.block.2.11.layer_norm_1.bias", "module.segformer.encoder.block.2.11.attention.self.query.weight", "module.segformer.encoder.block.2.11.attention.self.query.bias", "module.segformer.encoder.block.2.11.attention.self.key.weight", "module.segformer.encoder.block.2.11.attention.self.key.bias", "module.segformer.encoder.block.2.11.attention.self.value.weight", "module.segformer.encoder.block.2.11.attention.self.value.bias", "module.segformer.encoder.block.2.11.attention.self.sr.weight", "module.segformer.encoder.block.2.11.attention.self.sr.bias", "module.segformer.encoder.block.2.11.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.11.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.11.attention.output.dense.weight", "module.segformer.encoder.block.2.11.attention.output.dense.bias", "module.segformer.encoder.block.2.11.layer_norm_2.weight", "module.segformer.encoder.block.2.11.layer_norm_2.bias", "module.segformer.encoder.block.2.11.mlp.dense1.weight", "module.segformer.encoder.block.2.11.mlp.dense1.bias", "module.segformer.encoder.block.2.11.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.11.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.11.mlp.dense2.weight", "module.segformer.encoder.block.2.11.mlp.dense2.bias", "module.segformer.encoder.block.2.12.layer_norm_1.weight", "module.segformer.encoder.block.2.12.layer_norm_1.bias", "module.segformer.encoder.block.2.12.attention.self.query.weight", "module.segformer.encoder.block.2.12.attention.self.query.bias", "module.segformer.encoder.block.2.12.attention.self.key.weight", "module.segformer.encoder.block.2.12.attention.self.key.bias", "module.segformer.encoder.block.2.12.attention.self.value.weight", "module.segformer.encoder.block.2.12.attention.self.value.bias", "module.segformer.encoder.block.2.12.attention.self.sr.weight", "module.segformer.encoder.block.2.12.attention.self.sr.bias", "module.segformer.encoder.block.2.12.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.12.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.12.attention.output.dense.weight", "module.segformer.encoder.block.2.12.attention.output.dense.bias", "module.segformer.encoder.block.2.12.layer_norm_2.weight", "module.segformer.encoder.block.2.12.layer_norm_2.bias", "module.segformer.encoder.block.2.12.mlp.dense1.weight", "module.segformer.encoder.block.2.12.mlp.dense1.bias", "module.segformer.encoder.block.2.12.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.12.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.12.mlp.dense2.weight", "module.segformer.encoder.block.2.12.mlp.dense2.bias", "module.segformer.encoder.block.2.13.layer_norm_1.weight", "module.segformer.encoder.block.2.13.layer_norm_1.bias", "module.segformer.encoder.block.2.13.attention.self.query.weight", "module.segformer.encoder.block.2.13.attention.self.query.bias", "module.segformer.encoder.block.2.13.attention.self.key.weight", "module.segformer.encoder.block.2.13.attention.self.key.bias", "module.segformer.encoder.block.2.13.attention.self.value.weight", "module.segformer.encoder.block.2.13.attention.self.value.bias", "module.segformer.encoder.block.2.13.attention.self.sr.weight", "module.segformer.encoder.block.2.13.attention.self.sr.bias", "module.segformer.encoder.block.2.13.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.13.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.13.attention.output.dense.weight", "module.segformer.encoder.block.2.13.attention.output.dense.bias", "module.segformer.encoder.block.2.13.layer_norm_2.weight", "module.segformer.encoder.block.2.13.layer_norm_2.bias", "module.segformer.encoder.block.2.13.mlp.dense1.weight", "module.segformer.encoder.block.2.13.mlp.dense1.bias", "module.segformer.encoder.block.2.13.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.13.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.13.mlp.dense2.weight", "module.segformer.encoder.block.2.13.mlp.dense2.bias", "module.segformer.encoder.block.2.14.layer_norm_1.weight", "module.segformer.encoder.block.2.14.layer_norm_1.bias", "module.segformer.encoder.block.2.14.attention.self.query.weight", "module.segformer.encoder.block.2.14.attention.self.query.bias", "module.segformer.encoder.block.2.14.attention.self.key.weight", "module.segformer.encoder.block.2.14.attention.self.key.bias", "module.segformer.encoder.block.2.14.attention.self.value.weight", "module.segformer.encoder.block.2.14.attention.self.value.bias", "module.segformer.encoder.block.2.14.attention.self.sr.weight", "module.segformer.encoder.block.2.14.attention.self.sr.bias", "module.segformer.encoder.block.2.14.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.14.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.14.attention.output.dense.weight", "module.segformer.encoder.block.2.14.attention.output.dense.bias", "module.segformer.encoder.block.2.14.layer_norm_2.weight", "module.segformer.encoder.block.2.14.layer_norm_2.bias", "module.segformer.encoder.block.2.14.mlp.dense1.weight", "module.segformer.encoder.block.2.14.mlp.dense1.bias", "module.segformer.encoder.block.2.14.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.14.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.14.mlp.dense2.weight", "module.segformer.encoder.block.2.14.mlp.dense2.bias", "module.segformer.encoder.block.2.15.layer_norm_1.weight", "module.segformer.encoder.block.2.15.layer_norm_1.bias", "module.segformer.encoder.block.2.15.attention.self.query.weight", "module.segformer.encoder.block.2.15.attention.self.query.bias", "module.segformer.encoder.block.2.15.attention.self.key.weight", "module.segformer.encoder.block.2.15.attention.self.key.bias", "module.segformer.encoder.block.2.15.attention.self.value.weight", "module.segformer.encoder.block.2.15.attention.self.value.bias", "module.segformer.encoder.block.2.15.attention.self.sr.weight", "module.segformer.encoder.block.2.15.attention.self.sr.bias", "module.segformer.encoder.block.2.15.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.15.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.15.attention.output.dense.weight", "module.segformer.encoder.block.2.15.attention.output.dense.bias", "module.segformer.encoder.block.2.15.layer_norm_2.weight", "module.segformer.encoder.block.2.15.layer_norm_2.bias", "module.segformer.encoder.block.2.15.mlp.dense1.weight", "module.segformer.encoder.block.2.15.mlp.dense1.bias", "module.segformer.encoder.block.2.15.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.15.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.15.mlp.dense2.weight", "module.segformer.encoder.block.2.15.mlp.dense2.bias", "module.segformer.encoder.block.2.16.layer_norm_1.weight", "module.segformer.encoder.block.2.16.layer_norm_1.bias", "module.segformer.encoder.block.2.16.attention.self.query.weight", "module.segformer.encoder.block.2.16.attention.self.query.bias", "module.segformer.encoder.block.2.16.attention.self.key.weight", "module.segformer.encoder.block.2.16.attention.self.key.bias", "module.segformer.encoder.block.2.16.attention.self.value.weight", "module.segformer.encoder.block.2.16.attention.self.value.bias", "module.segformer.encoder.block.2.16.attention.self.sr.weight", "module.segformer.encoder.block.2.16.attention.self.sr.bias", "module.segformer.encoder.block.2.16.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.16.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.16.attention.output.dense.weight", "module.segformer.encoder.block.2.16.attention.output.dense.bias", "module.segformer.encoder.block.2.16.layer_norm_2.weight", "module.segformer.encoder.block.2.16.layer_norm_2.bias", "module.segformer.encoder.block.2.16.mlp.dense1.weight", "module.segformer.encoder.block.2.16.mlp.dense1.bias", "module.segformer.encoder.block.2.16.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.16.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.16.mlp.dense2.weight", "module.segformer.encoder.block.2.16.mlp.dense2.bias", "module.segformer.encoder.block.2.17.layer_norm_1.weight", "module.segformer.encoder.block.2.17.layer_norm_1.bias", "module.segformer.encoder.block.2.17.attention.self.query.weight", "module.segformer.encoder.block.2.17.attention.self.query.bias", "module.segformer.encoder.block.2.17.attention.self.key.weight", "module.segformer.encoder.block.2.17.attention.self.key.bias", "module.segformer.encoder.block.2.17.attention.self.value.weight", "module.segformer.encoder.block.2.17.attention.self.value.bias", "module.segformer.encoder.block.2.17.attention.self.sr.weight", "module.segformer.encoder.block.2.17.attention.self.sr.bias", "module.segformer.encoder.block.2.17.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.17.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.17.attention.output.dense.weight", "module.segformer.encoder.block.2.17.attention.output.dense.bias", "module.segformer.encoder.block.2.17.layer_norm_2.weight", "module.segformer.encoder.block.2.17.layer_norm_2.bias", "module.segformer.encoder.block.2.17.mlp.dense1.weight", "module.segformer.encoder.block.2.17.mlp.dense1.bias", "module.segformer.encoder.block.2.17.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.17.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.17.mlp.dense2.weight", "module.segformer.encoder.block.2.17.mlp.dense2.bias", "module.segformer.encoder.block.3.0.layer_norm_1.weight", "module.segformer.encoder.block.3.0.layer_norm_1.bias", "module.segformer.encoder.block.3.0.attention.self.query.weight", "module.segformer.encoder.block.3.0.attention.self.query.bias", "module.segformer.encoder.block.3.0.attention.self.key.weight", "module.segformer.encoder.block.3.0.attention.self.key.bias", "module.segformer.encoder.block.3.0.attention.self.value.weight", "module.segformer.encoder.block.3.0.attention.self.value.bias", "module.segformer.encoder.block.3.0.attention.output.dense.weight", "module.segformer.encoder.block.3.0.attention.output.dense.bias", "module.segformer.encoder.block.3.0.layer_norm_2.weight", "module.segformer.encoder.block.3.0.layer_norm_2.bias", "module.segformer.encoder.block.3.0.mlp.dense1.weight", "module.segformer.encoder.block.3.0.mlp.dense1.bias", "module.segformer.encoder.block.3.0.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.3.0.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.3.0.mlp.dense2.weight", "module.segformer.encoder.block.3.0.mlp.dense2.bias", "module.segformer.encoder.block.3.1.layer_norm_1.weight", "module.segformer.encoder.block.3.1.layer_norm_1.bias", "module.segformer.encoder.block.3.1.attention.self.query.weight", "module.segformer.encoder.block.3.1.attention.self.query.bias", "module.segformer.encoder.block.3.1.attention.self.key.weight", "module.segformer.encoder.block.3.1.attention.self.key.bias", "module.segformer.encoder.block.3.1.attention.self.value.weight", "module.segformer.encoder.block.3.1.attention.self.value.bias", "module.segformer.encoder.block.3.1.attention.output.dense.weight", "module.segformer.encoder.block.3.1.attention.output.dense.bias", "module.segformer.encoder.block.3.1.layer_norm_2.weight", "module.segformer.encoder.block.3.1.layer_norm_2.bias", "module.segformer.encoder.block.3.1.mlp.dense1.weight", "module.segformer.encoder.block.3.1.mlp.dense1.bias", "module.segformer.encoder.block.3.1.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.3.1.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.3.1.mlp.dense2.weight", "module.segformer.encoder.block.3.1.mlp.dense2.bias", "module.segformer.encoder.block.3.2.layer_norm_1.weight", "module.segformer.encoder.block.3.2.layer_norm_1.bias", "module.segformer.encoder.block.3.2.attention.self.query.weight", "module.segformer.encoder.block.3.2.attention.self.query.bias", "module.segformer.encoder.block.3.2.attention.self.key.weight", "module.segformer.encoder.block.3.2.attention.self.key.bias", "module.segformer.encoder.block.3.2.attention.self.value.weight", "module.segformer.encoder.block.3.2.attention.self.value.bias", "module.segformer.encoder.block.3.2.attention.output.dense.weight", "module.segformer.encoder.block.3.2.attention.output.dense.bias", "module.segformer.encoder.block.3.2.layer_norm_2.weight", "module.segformer.encoder.block.3.2.layer_norm_2.bias", "module.segformer.encoder.block.3.2.mlp.dense1.weight", "module.segformer.encoder.block.3.2.mlp.dense1.bias", "module.segformer.encoder.block.3.2.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.3.2.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.3.2.mlp.dense2.weight", "module.segformer.encoder.block.3.2.mlp.dense2.bias", "module.segformer.encoder.layer_norm.0.weight", "module.segformer.encoder.layer_norm.0.bias", "module.segformer.encoder.layer_norm.1.weight", "module.segformer.encoder.layer_norm.1.bias", "module.segformer.encoder.layer_norm.2.weight", "module.segformer.encoder.layer_norm.2.bias", "module.segformer.encoder.layer_norm.3.weight", "module.segformer.encoder.layer_norm.3.bias", "module.decode_head.linear_c.0.proj.weight", "module.decode_head.linear_c.0.proj.bias", "module.decode_head.linear_c.1.proj.weight", "module.decode_head.linear_c.1.proj.bias", "module.decode_head.linear_c.2.proj.weight", "module.decode_head.linear_c.2.proj.bias", "module.decode_head.linear_c.3.proj.weight", "module.decode_head.linear_c.3.proj.bias", "module.decode_head.linear_fuse.weight", "module.decode_head.batch_norm.weight", "module.decode_head.batch_norm.bias", "module.decode_head.batch_norm.running_mean", "module.decode_head.batch_norm.running_var", "module.decode_head.batch_norm.num_batches_tracked", "module.decode_head.classifier.weight", "module.decode_head.classifier.bias". 

In [43]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torch.cuda.amp import autocast, GradScaler
from transformers import SegformerForSemanticSegmentation, SegformerImageProcessor
from sklearn.model_selection import train_test_split

# Check GPU availability and set device
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Number of GPUs: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

class CraterDataset(Dataset):
    def __init__(self, images, masks, processor, transform=None):
        self.images = images
        self.masks = masks
        self.processor = processor
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = self.images[idx]
        mask = self.masks[idx]
        image_rgb = np.repeat(image.reshape(48, 48, 1), 3, axis=-1)
        
        if self.transform:
            image_tensor = torch.from_numpy(image_rgb).permute(2, 0, 1)
            image_tensor = self.transform(image_tensor)
            image_rgb = image_tensor.permute(1, 2, 0).numpy()

        inputs = self.processor(
            images=image_rgb, 
            segmentation_maps=mask, 
            return_tensors="pt",
            do_rescale=False
        )
        return {
            'pixel_values': inputs['pixel_values'].squeeze(),
            'labels': inputs['labels'].squeeze()
        }

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def train_epoch(model, train_loader, optimizer, scheduler, scaler, device, gradient_accumulation_steps):
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    
    for batch_idx, batch in enumerate(train_loader):
        with autocast():
            outputs = model(
                pixel_values=batch['pixel_values'].to(device),
                labels=batch['labels'].to(device)
            )
            loss = outputs.loss.mean() / gradient_accumulation_steps
        
        scaler.scale(loss).backward()
        
        if (batch_idx + 1) % gradient_accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()
        
        total_loss += loss.item() * gradient_accumulation_steps
        
        if batch_idx % 5 == 0:
            print(f"Batch: {batch_idx}, Loss: {loss.item()*gradient_accumulation_steps:.4f}")
            
    return total_loss / len(train_loader)

def validate(model, val_loader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            outputs = model(
                pixel_values=batch['pixel_values'].to(device),
                labels=batch['labels'].to(device)
            )
            loss = outputs.loss.mean()
            total_loss += loss.item()
    return total_loss / len(val_loader)

print("Starting data preparation...")

# Split data into train, validation, and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Reshape and normalize data
X_train_reshaped = X_train.reshape(-1, 48, 48, 1)
X_val_reshaped = X_val.reshape(-1, 48, 48, 1)
y_train_reshaped = y_train.reshape(-1, 48, 48)
y_val_reshaped = y_val.reshape(-1, 48, 48)

X_train_normalized = X_train_reshaped / 255.0
X_val_normalized = X_val_reshaped / 255.0

print("Initializing model and processor...")

# Initialize model and processor
processor = SegformerImageProcessor.from_pretrained("nvidia/mit-b5")
model = SegformerForSemanticSegmentation.from_pretrained(
    "nvidia/mit-b5",
    num_labels=2,
    ignore_mismatched_sizes=True
)

print(f"Model Parameters: {count_parameters(model):,}")

# Move model to GPU and enable data parallel
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = nn.DataParallel(model)
model = model.cuda()

# Data augmentation
transform = transforms.Compose([
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomRotation(10),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5)
])

# Create datasets
train_dataset = CraterDataset(X_train_normalized, y_train_reshaped, processor, transform=transform)
val_dataset = CraterDataset(X_val_normalized, y_val_reshaped, processor, transform=None)

# Create dataloaders
train_dataloader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

# Training setup
optimizer = torch.optim.AdamW([
    {'params': model.module.segformer.parameters(), 'lr': 5e-6},
    {'params': model.module.decode_head.parameters(), 'lr': 5e-5}
])

num_epochs = 30
steps_per_epoch = len(train_dataloader)
total_steps = num_epochs * steps_per_epoch

scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=[5e-5, 5e-4],
    epochs=num_epochs,
    steps_per_epoch=steps_per_epoch,
    pct_start=0.3,
    anneal_strategy='cos'
)

scaler = GradScaler()
gradient_accumulation_steps = 4
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
best_val_loss = float('inf')
patience = 5
patience_counter = 0

# Training loop
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    
    train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, 
                           scaler, device, gradient_accumulation_steps)
    val_loss = validate(model, val_dataloader, device)
    
    print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'val_loss': val_loss,
            'train_loss': train_loss
        }, 'best_model.pt')
        print(f"Saved new best model with validation loss: {val_loss:.4f}")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping triggered after {epoch + 1} epochs")
            break

print("Training completed!")

CUDA available: True
Number of GPUs: 2
GPU 0: Tesla T4
GPU 1: Tesla T4
Starting data preparation...
Initializing model and processor...


preprocessor_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

  return func(*args, **kwargs)


config.json:   0%|          | 0.00/70.0k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/328M [00:00<?, ?B/s]

Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/mit-b5 and are newly initialized: ['decode_head.batch_norm.bias', 'decode_head.batch_norm.num_batches_tracked', 'decode_head.batch_norm.running_mean', 'decode_head.batch_norm.running_var', 'decode_head.batch_norm.weight', 'decode_head.classifier.bias', 'decode_head.classifier.weight', 'decode_head.linear_c.0.proj.bias', 'decode_head.linear_c.0.proj.weight', 'decode_head.linear_c.1.proj.bias', 'decode_head.linear_c.1.proj.weight', 'decode_head.linear_c.2.proj.bias', 'decode_head.linear_c.2.proj.weight', 'decode_head.linear_c.3.proj.bias', 'decode_head.linear_c.3.proj.weight', 'decode_head.linear_fuse.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Parameters: 84,594,882
Using 2 GPUs!

Epoch 1/30


  scaler = GradScaler()
  self.pid = os.fork()
  with autocast():
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Batch: 0, Loss: 0.7051
Batch: 5, Loss: 0.7081
Batch: 10, Loss: 0.7014
Batch: 15, Loss: 0.6943
Batch: 20, Loss: 0.6492
Batch: 25, Loss: 0.6818
Batch: 30, Loss: 0.6723
Batch: 35, Loss: 0.6643
Batch: 40, Loss: 0.6563
Batch: 45, Loss: 0.6894


  self.pid = os.fork()


Epoch 1: Train Loss = 0.6776, Val Loss = 0.6120
Saved new best model with validation loss: 0.6120

Epoch 2/30
Batch: 0, Loss: 0.6412
Batch: 5, Loss: 0.6391
Batch: 10, Loss: 0.6498
Batch: 15, Loss: 0.6234
Batch: 20, Loss: 0.5952
Batch: 25, Loss: 0.6085
Batch: 30, Loss: 0.6365
Batch: 35, Loss: 0.5877
Batch: 40, Loss: 0.6053
Batch: 45, Loss: 0.6046
Epoch 2: Train Loss = 0.6251, Val Loss = 0.5499
Saved new best model with validation loss: 0.5499

Epoch 3/30
Batch: 0, Loss: 0.5552
Batch: 5, Loss: 0.6012
Batch: 10, Loss: 0.6128
Batch: 15, Loss: 0.5749
Batch: 20, Loss: 0.5750
Batch: 25, Loss: 0.5832
Batch: 30, Loss: 0.5652
Batch: 35, Loss: 0.6221
Batch: 40, Loss: 0.6072
Batch: 45, Loss: 0.5603
Epoch 3: Train Loss = 0.6006, Val Loss = 0.5155
Saved new best model with validation loss: 0.5155

Epoch 4/30
Batch: 0, Loss: 0.5317
Batch: 5, Loss: 0.5897
Batch: 10, Loss: 0.5831
Batch: 15, Loss: 0.5833
Batch: 20, Loss: 0.5630
Batch: 25, Loss: 0.6070
Batch: 30, Loss: 0.5715
Batch: 35, Loss: 0.5676
Batc

In [6]:
from torchvision.transforms import Resize
from torchvision.transforms.functional import InterpolationMode

# Create resize transform
resize_transform = Resize(
    size=(48, 48),
    interpolation=InterpolationMode.BILINEAR,
    antialias=True
)

# Evaluate model
model.eval()
X_test_reshaped = X_test.reshape(-1, 48, 48, 1)
X_test_normalized = X_test_reshaped / 255.0

print("Making predictions on test set...")
predictions = np.zeros((len(X_test_normalized), 48, 48))

for i, image in enumerate(X_test_normalized):
    if i % 20 == 0:
        print(f"Predicting image {i}/{len(X_test_normalized)}")
    pred = predict(image, model, processor)
    
    # Resize prediction using torchvision if necessary
    if pred.shape != (48, 48):
        pred_tensor = torch.from_numpy(pred).unsqueeze(0)  # Add batch and channel dims
        pred_resized = resize_transform(pred_tensor).squeeze().numpy()
        predictions[i] = pred_resized
    else:
        predictions[i] = pred

print(f"Ground truth shape: {y_test.reshape(-1, 48, 48).shape}")
print(f"Predictions shape: {predictions.shape}")

# Calculate balanced accuracy
y_true_flat = y_test.reshape(-1, 48, 48).flatten()
y_pred_flat = predictions.flatten()
y_pred_flat = (y_pred_flat > 0.5).astype(int)

print(f"Flattened shapes - True: {y_true_flat.shape}, Pred: {y_pred_flat.shape}")
balanced_acc = balanced_accuracy_score(y_true_flat, y_pred_flat)
print(f"Final Balanced Accuracy: {balanced_acc:.4f}")

Making predictions on test set...
Predicting image 0/110


NameError: name 'predict' is not defined

In [8]:
import torch
import numpy as np
from transformers import SegformerForSemanticSegmentation, SegformerImageProcessor
from torchvision.transforms import Resize
from torchvision.transforms.functional import InterpolationMode
from sklearn.metrics import balanced_accuracy_score

def predict(image, model, processor):
    """
    Make prediction for a single image
    
    Args:
        image: numpy array of shape (48, 48, 1)
        model: trained SegFormer model
        processor: SegFormer processor
    
    Returns:
        numpy array of shape (48, 48) with probability predictions
    """
    model.eval()
    with torch.no_grad():
        # Convert image to RGB
        image_rgb = np.repeat(image.reshape(48, 48, 1), 3, axis=-1)
        
        # Process image
        inputs = processor(
            images=image_rgb,
            return_tensors="pt",
            do_rescale=False
        )
        
        # Move to GPU
        pixel_values = inputs['pixel_values'].cuda()
        
        # Get predictions
        outputs = model(pixel_values=pixel_values)
        logits = outputs.logits
        
        # Convert to probabilities and move to CPU
        probs = torch.softmax(logits, dim=1)
        pred = probs[0, 1].cpu().numpy()  # Take class 1 probability
        
        return pred

# Initialize model and processor
processor = SegformerImageProcessor.from_pretrained("nvidia/mit-b3")
model = SegformerForSemanticSegmentation.from_pretrained(
    "nvidia/mit-b3",
    num_labels=2,
    ignore_mismatched_sizes=True
).cuda()

# Load the best checkpoint
checkpoint = torch.load('best_model.pt')
model.load_state_dict(checkpoint['model_state_dict'])

# Create resize transform
resize_transform = Resize(
    size=(48, 48),
    interpolation=InterpolationMode.BILINEAR,
    antialias=True
)

# Evaluate model
model.eval()
X_test_reshaped = X_test.reshape(-1, 48, 48, 1)
X_test_normalized = X_test_reshaped / 255.0

print("Making predictions on test set...")
predictions = np.zeros((len(X_test_normalized), 48, 48))

for i, image in enumerate(X_test_normalized):
    if i % 20 == 0:
        print(f"Predicting image {i}/{len(X_test_normalized)}")
    pred = predict(image, model, processor)
    
    # Resize prediction using torchvision if necessary
    if pred.shape != (48, 48):
        pred_tensor = torch.from_numpy(pred).unsqueeze(0)  # Add batch and channel dims
        pred_resized = resize_transform(pred_tensor).squeeze().numpy()
        predictions[i] = pred_resized
    else:
        predictions[i] = pred

print(f"Ground truth shape: {y_test.reshape(-1, 48, 48).shape}")
print(f"Predictions shape: {predictions.shape}")

# Calculate balanced accuracy
y_true_flat = y_test.reshape(-1, 48, 48).flatten()
y_pred_flat = predictions.flatten()
y_pred_flat = (y_pred_flat > 0.5).astype(int)

print(f"Flattened shapes - True: {y_true_flat.shape}, Pred: {y_pred_flat.shape}")
balanced_acc = balanced_accuracy_score(y_true_flat, y_pred_flat)
print(f"Final Balanced Accuracy: {balanced_acc:.4f}")

# Calculate additional metrics
tp = np.sum((y_true_flat == 1) & (y_pred_flat == 1))
tn = np.sum((y_true_flat == 0) & (y_pred_flat == 0))
fp = np.sum((y_true_flat == 0) & (y_pred_flat == 1))
fn = np.sum((y_true_flat == 1) & (y_pred_flat == 0))

sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

print("\nDetailed Metrics:")
print(f"Sensitivity (True Positive Rate): {sensitivity:.4f}")
print(f"Specificity (True Negative Rate): {specificity:.4f}")
print(f"True Positives: {tp}")
print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")

Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/mit-b3 and are newly initialized: ['decode_head.batch_norm.bias', 'decode_head.batch_norm.num_batches_tracked', 'decode_head.batch_norm.running_mean', 'decode_head.batch_norm.running_var', 'decode_head.batch_norm.weight', 'decode_head.classifier.bias', 'decode_head.classifier.weight', 'decode_head.linear_c.0.proj.bias', 'decode_head.linear_c.0.proj.weight', 'decode_head.linear_c.1.proj.bias', 'decode_head.linear_c.1.proj.weight', 'decode_head.linear_c.2.proj.bias', 'decode_head.linear_c.2.proj.weight', 'decode_head.linear_c.3.proj.bias', 'decode_head.linear_c.3.proj.weight', 'decode_head.linear_fuse.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  checkpoint = torch.load('best_model.pt')


RuntimeError: Error(s) in loading state_dict for SegformerForSemanticSegmentation:
	Missing key(s) in state_dict: "segformer.encoder.patch_embeddings.0.proj.weight", "segformer.encoder.patch_embeddings.0.proj.bias", "segformer.encoder.patch_embeddings.0.layer_norm.weight", "segformer.encoder.patch_embeddings.0.layer_norm.bias", "segformer.encoder.patch_embeddings.1.proj.weight", "segformer.encoder.patch_embeddings.1.proj.bias", "segformer.encoder.patch_embeddings.1.layer_norm.weight", "segformer.encoder.patch_embeddings.1.layer_norm.bias", "segformer.encoder.patch_embeddings.2.proj.weight", "segformer.encoder.patch_embeddings.2.proj.bias", "segformer.encoder.patch_embeddings.2.layer_norm.weight", "segformer.encoder.patch_embeddings.2.layer_norm.bias", "segformer.encoder.patch_embeddings.3.proj.weight", "segformer.encoder.patch_embeddings.3.proj.bias", "segformer.encoder.patch_embeddings.3.layer_norm.weight", "segformer.encoder.patch_embeddings.3.layer_norm.bias", "segformer.encoder.block.0.0.layer_norm_1.weight", "segformer.encoder.block.0.0.layer_norm_1.bias", "segformer.encoder.block.0.0.attention.self.query.weight", "segformer.encoder.block.0.0.attention.self.query.bias", "segformer.encoder.block.0.0.attention.self.key.weight", "segformer.encoder.block.0.0.attention.self.key.bias", "segformer.encoder.block.0.0.attention.self.value.weight", "segformer.encoder.block.0.0.attention.self.value.bias", "segformer.encoder.block.0.0.attention.self.sr.weight", "segformer.encoder.block.0.0.attention.self.sr.bias", "segformer.encoder.block.0.0.attention.self.layer_norm.weight", "segformer.encoder.block.0.0.attention.self.layer_norm.bias", "segformer.encoder.block.0.0.attention.output.dense.weight", "segformer.encoder.block.0.0.attention.output.dense.bias", "segformer.encoder.block.0.0.layer_norm_2.weight", "segformer.encoder.block.0.0.layer_norm_2.bias", "segformer.encoder.block.0.0.mlp.dense1.weight", "segformer.encoder.block.0.0.mlp.dense1.bias", "segformer.encoder.block.0.0.mlp.dwconv.dwconv.weight", "segformer.encoder.block.0.0.mlp.dwconv.dwconv.bias", "segformer.encoder.block.0.0.mlp.dense2.weight", "segformer.encoder.block.0.0.mlp.dense2.bias", "segformer.encoder.block.0.1.layer_norm_1.weight", "segformer.encoder.block.0.1.layer_norm_1.bias", "segformer.encoder.block.0.1.attention.self.query.weight", "segformer.encoder.block.0.1.attention.self.query.bias", "segformer.encoder.block.0.1.attention.self.key.weight", "segformer.encoder.block.0.1.attention.self.key.bias", "segformer.encoder.block.0.1.attention.self.value.weight", "segformer.encoder.block.0.1.attention.self.value.bias", "segformer.encoder.block.0.1.attention.self.sr.weight", "segformer.encoder.block.0.1.attention.self.sr.bias", "segformer.encoder.block.0.1.attention.self.layer_norm.weight", "segformer.encoder.block.0.1.attention.self.layer_norm.bias", "segformer.encoder.block.0.1.attention.output.dense.weight", "segformer.encoder.block.0.1.attention.output.dense.bias", "segformer.encoder.block.0.1.layer_norm_2.weight", "segformer.encoder.block.0.1.layer_norm_2.bias", "segformer.encoder.block.0.1.mlp.dense1.weight", "segformer.encoder.block.0.1.mlp.dense1.bias", "segformer.encoder.block.0.1.mlp.dwconv.dwconv.weight", "segformer.encoder.block.0.1.mlp.dwconv.dwconv.bias", "segformer.encoder.block.0.1.mlp.dense2.weight", "segformer.encoder.block.0.1.mlp.dense2.bias", "segformer.encoder.block.0.2.layer_norm_1.weight", "segformer.encoder.block.0.2.layer_norm_1.bias", "segformer.encoder.block.0.2.attention.self.query.weight", "segformer.encoder.block.0.2.attention.self.query.bias", "segformer.encoder.block.0.2.attention.self.key.weight", "segformer.encoder.block.0.2.attention.self.key.bias", "segformer.encoder.block.0.2.attention.self.value.weight", "segformer.encoder.block.0.2.attention.self.value.bias", "segformer.encoder.block.0.2.attention.self.sr.weight", "segformer.encoder.block.0.2.attention.self.sr.bias", "segformer.encoder.block.0.2.attention.self.layer_norm.weight", "segformer.encoder.block.0.2.attention.self.layer_norm.bias", "segformer.encoder.block.0.2.attention.output.dense.weight", "segformer.encoder.block.0.2.attention.output.dense.bias", "segformer.encoder.block.0.2.layer_norm_2.weight", "segformer.encoder.block.0.2.layer_norm_2.bias", "segformer.encoder.block.0.2.mlp.dense1.weight", "segformer.encoder.block.0.2.mlp.dense1.bias", "segformer.encoder.block.0.2.mlp.dwconv.dwconv.weight", "segformer.encoder.block.0.2.mlp.dwconv.dwconv.bias", "segformer.encoder.block.0.2.mlp.dense2.weight", "segformer.encoder.block.0.2.mlp.dense2.bias", "segformer.encoder.block.1.0.layer_norm_1.weight", "segformer.encoder.block.1.0.layer_norm_1.bias", "segformer.encoder.block.1.0.attention.self.query.weight", "segformer.encoder.block.1.0.attention.self.query.bias", "segformer.encoder.block.1.0.attention.self.key.weight", "segformer.encoder.block.1.0.attention.self.key.bias", "segformer.encoder.block.1.0.attention.self.value.weight", "segformer.encoder.block.1.0.attention.self.value.bias", "segformer.encoder.block.1.0.attention.self.sr.weight", "segformer.encoder.block.1.0.attention.self.sr.bias", "segformer.encoder.block.1.0.attention.self.layer_norm.weight", "segformer.encoder.block.1.0.attention.self.layer_norm.bias", "segformer.encoder.block.1.0.attention.output.dense.weight", "segformer.encoder.block.1.0.attention.output.dense.bias", "segformer.encoder.block.1.0.layer_norm_2.weight", "segformer.encoder.block.1.0.layer_norm_2.bias", "segformer.encoder.block.1.0.mlp.dense1.weight", "segformer.encoder.block.1.0.mlp.dense1.bias", "segformer.encoder.block.1.0.mlp.dwconv.dwconv.weight", "segformer.encoder.block.1.0.mlp.dwconv.dwconv.bias", "segformer.encoder.block.1.0.mlp.dense2.weight", "segformer.encoder.block.1.0.mlp.dense2.bias", "segformer.encoder.block.1.1.layer_norm_1.weight", "segformer.encoder.block.1.1.layer_norm_1.bias", "segformer.encoder.block.1.1.attention.self.query.weight", "segformer.encoder.block.1.1.attention.self.query.bias", "segformer.encoder.block.1.1.attention.self.key.weight", "segformer.encoder.block.1.1.attention.self.key.bias", "segformer.encoder.block.1.1.attention.self.value.weight", "segformer.encoder.block.1.1.attention.self.value.bias", "segformer.encoder.block.1.1.attention.self.sr.weight", "segformer.encoder.block.1.1.attention.self.sr.bias", "segformer.encoder.block.1.1.attention.self.layer_norm.weight", "segformer.encoder.block.1.1.attention.self.layer_norm.bias", "segformer.encoder.block.1.1.attention.output.dense.weight", "segformer.encoder.block.1.1.attention.output.dense.bias", "segformer.encoder.block.1.1.layer_norm_2.weight", "segformer.encoder.block.1.1.layer_norm_2.bias", "segformer.encoder.block.1.1.mlp.dense1.weight", "segformer.encoder.block.1.1.mlp.dense1.bias", "segformer.encoder.block.1.1.mlp.dwconv.dwconv.weight", "segformer.encoder.block.1.1.mlp.dwconv.dwconv.bias", "segformer.encoder.block.1.1.mlp.dense2.weight", "segformer.encoder.block.1.1.mlp.dense2.bias", "segformer.encoder.block.1.2.layer_norm_1.weight", "segformer.encoder.block.1.2.layer_norm_1.bias", "segformer.encoder.block.1.2.attention.self.query.weight", "segformer.encoder.block.1.2.attention.self.query.bias", "segformer.encoder.block.1.2.attention.self.key.weight", "segformer.encoder.block.1.2.attention.self.key.bias", "segformer.encoder.block.1.2.attention.self.value.weight", "segformer.encoder.block.1.2.attention.self.value.bias", "segformer.encoder.block.1.2.attention.self.sr.weight", "segformer.encoder.block.1.2.attention.self.sr.bias", "segformer.encoder.block.1.2.attention.self.layer_norm.weight", "segformer.encoder.block.1.2.attention.self.layer_norm.bias", "segformer.encoder.block.1.2.attention.output.dense.weight", "segformer.encoder.block.1.2.attention.output.dense.bias", "segformer.encoder.block.1.2.layer_norm_2.weight", "segformer.encoder.block.1.2.layer_norm_2.bias", "segformer.encoder.block.1.2.mlp.dense1.weight", "segformer.encoder.block.1.2.mlp.dense1.bias", "segformer.encoder.block.1.2.mlp.dwconv.dwconv.weight", "segformer.encoder.block.1.2.mlp.dwconv.dwconv.bias", "segformer.encoder.block.1.2.mlp.dense2.weight", "segformer.encoder.block.1.2.mlp.dense2.bias", "segformer.encoder.block.1.3.layer_norm_1.weight", "segformer.encoder.block.1.3.layer_norm_1.bias", "segformer.encoder.block.1.3.attention.self.query.weight", "segformer.encoder.block.1.3.attention.self.query.bias", "segformer.encoder.block.1.3.attention.self.key.weight", "segformer.encoder.block.1.3.attention.self.key.bias", "segformer.encoder.block.1.3.attention.self.value.weight", "segformer.encoder.block.1.3.attention.self.value.bias", "segformer.encoder.block.1.3.attention.self.sr.weight", "segformer.encoder.block.1.3.attention.self.sr.bias", "segformer.encoder.block.1.3.attention.self.layer_norm.weight", "segformer.encoder.block.1.3.attention.self.layer_norm.bias", "segformer.encoder.block.1.3.attention.output.dense.weight", "segformer.encoder.block.1.3.attention.output.dense.bias", "segformer.encoder.block.1.3.layer_norm_2.weight", "segformer.encoder.block.1.3.layer_norm_2.bias", "segformer.encoder.block.1.3.mlp.dense1.weight", "segformer.encoder.block.1.3.mlp.dense1.bias", "segformer.encoder.block.1.3.mlp.dwconv.dwconv.weight", "segformer.encoder.block.1.3.mlp.dwconv.dwconv.bias", "segformer.encoder.block.1.3.mlp.dense2.weight", "segformer.encoder.block.1.3.mlp.dense2.bias", "segformer.encoder.block.2.0.layer_norm_1.weight", "segformer.encoder.block.2.0.layer_norm_1.bias", "segformer.encoder.block.2.0.attention.self.query.weight", "segformer.encoder.block.2.0.attention.self.query.bias", "segformer.encoder.block.2.0.attention.self.key.weight", "segformer.encoder.block.2.0.attention.self.key.bias", "segformer.encoder.block.2.0.attention.self.value.weight", "segformer.encoder.block.2.0.attention.self.value.bias", "segformer.encoder.block.2.0.attention.self.sr.weight", "segformer.encoder.block.2.0.attention.self.sr.bias", "segformer.encoder.block.2.0.attention.self.layer_norm.weight", "segformer.encoder.block.2.0.attention.self.layer_norm.bias", "segformer.encoder.block.2.0.attention.output.dense.weight", "segformer.encoder.block.2.0.attention.output.dense.bias", "segformer.encoder.block.2.0.layer_norm_2.weight", "segformer.encoder.block.2.0.layer_norm_2.bias", "segformer.encoder.block.2.0.mlp.dense1.weight", "segformer.encoder.block.2.0.mlp.dense1.bias", "segformer.encoder.block.2.0.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.0.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.0.mlp.dense2.weight", "segformer.encoder.block.2.0.mlp.dense2.bias", "segformer.encoder.block.2.1.layer_norm_1.weight", "segformer.encoder.block.2.1.layer_norm_1.bias", "segformer.encoder.block.2.1.attention.self.query.weight", "segformer.encoder.block.2.1.attention.self.query.bias", "segformer.encoder.block.2.1.attention.self.key.weight", "segformer.encoder.block.2.1.attention.self.key.bias", "segformer.encoder.block.2.1.attention.self.value.weight", "segformer.encoder.block.2.1.attention.self.value.bias", "segformer.encoder.block.2.1.attention.self.sr.weight", "segformer.encoder.block.2.1.attention.self.sr.bias", "segformer.encoder.block.2.1.attention.self.layer_norm.weight", "segformer.encoder.block.2.1.attention.self.layer_norm.bias", "segformer.encoder.block.2.1.attention.output.dense.weight", "segformer.encoder.block.2.1.attention.output.dense.bias", "segformer.encoder.block.2.1.layer_norm_2.weight", "segformer.encoder.block.2.1.layer_norm_2.bias", "segformer.encoder.block.2.1.mlp.dense1.weight", "segformer.encoder.block.2.1.mlp.dense1.bias", "segformer.encoder.block.2.1.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.1.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.1.mlp.dense2.weight", "segformer.encoder.block.2.1.mlp.dense2.bias", "segformer.encoder.block.2.2.layer_norm_1.weight", "segformer.encoder.block.2.2.layer_norm_1.bias", "segformer.encoder.block.2.2.attention.self.query.weight", "segformer.encoder.block.2.2.attention.self.query.bias", "segformer.encoder.block.2.2.attention.self.key.weight", "segformer.encoder.block.2.2.attention.self.key.bias", "segformer.encoder.block.2.2.attention.self.value.weight", "segformer.encoder.block.2.2.attention.self.value.bias", "segformer.encoder.block.2.2.attention.self.sr.weight", "segformer.encoder.block.2.2.attention.self.sr.bias", "segformer.encoder.block.2.2.attention.self.layer_norm.weight", "segformer.encoder.block.2.2.attention.self.layer_norm.bias", "segformer.encoder.block.2.2.attention.output.dense.weight", "segformer.encoder.block.2.2.attention.output.dense.bias", "segformer.encoder.block.2.2.layer_norm_2.weight", "segformer.encoder.block.2.2.layer_norm_2.bias", "segformer.encoder.block.2.2.mlp.dense1.weight", "segformer.encoder.block.2.2.mlp.dense1.bias", "segformer.encoder.block.2.2.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.2.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.2.mlp.dense2.weight", "segformer.encoder.block.2.2.mlp.dense2.bias", "segformer.encoder.block.2.3.layer_norm_1.weight", "segformer.encoder.block.2.3.layer_norm_1.bias", "segformer.encoder.block.2.3.attention.self.query.weight", "segformer.encoder.block.2.3.attention.self.query.bias", "segformer.encoder.block.2.3.attention.self.key.weight", "segformer.encoder.block.2.3.attention.self.key.bias", "segformer.encoder.block.2.3.attention.self.value.weight", "segformer.encoder.block.2.3.attention.self.value.bias", "segformer.encoder.block.2.3.attention.self.sr.weight", "segformer.encoder.block.2.3.attention.self.sr.bias", "segformer.encoder.block.2.3.attention.self.layer_norm.weight", "segformer.encoder.block.2.3.attention.self.layer_norm.bias", "segformer.encoder.block.2.3.attention.output.dense.weight", "segformer.encoder.block.2.3.attention.output.dense.bias", "segformer.encoder.block.2.3.layer_norm_2.weight", "segformer.encoder.block.2.3.layer_norm_2.bias", "segformer.encoder.block.2.3.mlp.dense1.weight", "segformer.encoder.block.2.3.mlp.dense1.bias", "segformer.encoder.block.2.3.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.3.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.3.mlp.dense2.weight", "segformer.encoder.block.2.3.mlp.dense2.bias", "segformer.encoder.block.2.4.layer_norm_1.weight", "segformer.encoder.block.2.4.layer_norm_1.bias", "segformer.encoder.block.2.4.attention.self.query.weight", "segformer.encoder.block.2.4.attention.self.query.bias", "segformer.encoder.block.2.4.attention.self.key.weight", "segformer.encoder.block.2.4.attention.self.key.bias", "segformer.encoder.block.2.4.attention.self.value.weight", "segformer.encoder.block.2.4.attention.self.value.bias", "segformer.encoder.block.2.4.attention.self.sr.weight", "segformer.encoder.block.2.4.attention.self.sr.bias", "segformer.encoder.block.2.4.attention.self.layer_norm.weight", "segformer.encoder.block.2.4.attention.self.layer_norm.bias", "segformer.encoder.block.2.4.attention.output.dense.weight", "segformer.encoder.block.2.4.attention.output.dense.bias", "segformer.encoder.block.2.4.layer_norm_2.weight", "segformer.encoder.block.2.4.layer_norm_2.bias", "segformer.encoder.block.2.4.mlp.dense1.weight", "segformer.encoder.block.2.4.mlp.dense1.bias", "segformer.encoder.block.2.4.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.4.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.4.mlp.dense2.weight", "segformer.encoder.block.2.4.mlp.dense2.bias", "segformer.encoder.block.2.5.layer_norm_1.weight", "segformer.encoder.block.2.5.layer_norm_1.bias", "segformer.encoder.block.2.5.attention.self.query.weight", "segformer.encoder.block.2.5.attention.self.query.bias", "segformer.encoder.block.2.5.attention.self.key.weight", "segformer.encoder.block.2.5.attention.self.key.bias", "segformer.encoder.block.2.5.attention.self.value.weight", "segformer.encoder.block.2.5.attention.self.value.bias", "segformer.encoder.block.2.5.attention.self.sr.weight", "segformer.encoder.block.2.5.attention.self.sr.bias", "segformer.encoder.block.2.5.attention.self.layer_norm.weight", "segformer.encoder.block.2.5.attention.self.layer_norm.bias", "segformer.encoder.block.2.5.attention.output.dense.weight", "segformer.encoder.block.2.5.attention.output.dense.bias", "segformer.encoder.block.2.5.layer_norm_2.weight", "segformer.encoder.block.2.5.layer_norm_2.bias", "segformer.encoder.block.2.5.mlp.dense1.weight", "segformer.encoder.block.2.5.mlp.dense1.bias", "segformer.encoder.block.2.5.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.5.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.5.mlp.dense2.weight", "segformer.encoder.block.2.5.mlp.dense2.bias", "segformer.encoder.block.2.6.layer_norm_1.weight", "segformer.encoder.block.2.6.layer_norm_1.bias", "segformer.encoder.block.2.6.attention.self.query.weight", "segformer.encoder.block.2.6.attention.self.query.bias", "segformer.encoder.block.2.6.attention.self.key.weight", "segformer.encoder.block.2.6.attention.self.key.bias", "segformer.encoder.block.2.6.attention.self.value.weight", "segformer.encoder.block.2.6.attention.self.value.bias", "segformer.encoder.block.2.6.attention.self.sr.weight", "segformer.encoder.block.2.6.attention.self.sr.bias", "segformer.encoder.block.2.6.attention.self.layer_norm.weight", "segformer.encoder.block.2.6.attention.self.layer_norm.bias", "segformer.encoder.block.2.6.attention.output.dense.weight", "segformer.encoder.block.2.6.attention.output.dense.bias", "segformer.encoder.block.2.6.layer_norm_2.weight", "segformer.encoder.block.2.6.layer_norm_2.bias", "segformer.encoder.block.2.6.mlp.dense1.weight", "segformer.encoder.block.2.6.mlp.dense1.bias", "segformer.encoder.block.2.6.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.6.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.6.mlp.dense2.weight", "segformer.encoder.block.2.6.mlp.dense2.bias", "segformer.encoder.block.2.7.layer_norm_1.weight", "segformer.encoder.block.2.7.layer_norm_1.bias", "segformer.encoder.block.2.7.attention.self.query.weight", "segformer.encoder.block.2.7.attention.self.query.bias", "segformer.encoder.block.2.7.attention.self.key.weight", "segformer.encoder.block.2.7.attention.self.key.bias", "segformer.encoder.block.2.7.attention.self.value.weight", "segformer.encoder.block.2.7.attention.self.value.bias", "segformer.encoder.block.2.7.attention.self.sr.weight", "segformer.encoder.block.2.7.attention.self.sr.bias", "segformer.encoder.block.2.7.attention.self.layer_norm.weight", "segformer.encoder.block.2.7.attention.self.layer_norm.bias", "segformer.encoder.block.2.7.attention.output.dense.weight", "segformer.encoder.block.2.7.attention.output.dense.bias", "segformer.encoder.block.2.7.layer_norm_2.weight", "segformer.encoder.block.2.7.layer_norm_2.bias", "segformer.encoder.block.2.7.mlp.dense1.weight", "segformer.encoder.block.2.7.mlp.dense1.bias", "segformer.encoder.block.2.7.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.7.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.7.mlp.dense2.weight", "segformer.encoder.block.2.7.mlp.dense2.bias", "segformer.encoder.block.2.8.layer_norm_1.weight", "segformer.encoder.block.2.8.layer_norm_1.bias", "segformer.encoder.block.2.8.attention.self.query.weight", "segformer.encoder.block.2.8.attention.self.query.bias", "segformer.encoder.block.2.8.attention.self.key.weight", "segformer.encoder.block.2.8.attention.self.key.bias", "segformer.encoder.block.2.8.attention.self.value.weight", "segformer.encoder.block.2.8.attention.self.value.bias", "segformer.encoder.block.2.8.attention.self.sr.weight", "segformer.encoder.block.2.8.attention.self.sr.bias", "segformer.encoder.block.2.8.attention.self.layer_norm.weight", "segformer.encoder.block.2.8.attention.self.layer_norm.bias", "segformer.encoder.block.2.8.attention.output.dense.weight", "segformer.encoder.block.2.8.attention.output.dense.bias", "segformer.encoder.block.2.8.layer_norm_2.weight", "segformer.encoder.block.2.8.layer_norm_2.bias", "segformer.encoder.block.2.8.mlp.dense1.weight", "segformer.encoder.block.2.8.mlp.dense1.bias", "segformer.encoder.block.2.8.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.8.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.8.mlp.dense2.weight", "segformer.encoder.block.2.8.mlp.dense2.bias", "segformer.encoder.block.2.9.layer_norm_1.weight", "segformer.encoder.block.2.9.layer_norm_1.bias", "segformer.encoder.block.2.9.attention.self.query.weight", "segformer.encoder.block.2.9.attention.self.query.bias", "segformer.encoder.block.2.9.attention.self.key.weight", "segformer.encoder.block.2.9.attention.self.key.bias", "segformer.encoder.block.2.9.attention.self.value.weight", "segformer.encoder.block.2.9.attention.self.value.bias", "segformer.encoder.block.2.9.attention.self.sr.weight", "segformer.encoder.block.2.9.attention.self.sr.bias", "segformer.encoder.block.2.9.attention.self.layer_norm.weight", "segformer.encoder.block.2.9.attention.self.layer_norm.bias", "segformer.encoder.block.2.9.attention.output.dense.weight", "segformer.encoder.block.2.9.attention.output.dense.bias", "segformer.encoder.block.2.9.layer_norm_2.weight", "segformer.encoder.block.2.9.layer_norm_2.bias", "segformer.encoder.block.2.9.mlp.dense1.weight", "segformer.encoder.block.2.9.mlp.dense1.bias", "segformer.encoder.block.2.9.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.9.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.9.mlp.dense2.weight", "segformer.encoder.block.2.9.mlp.dense2.bias", "segformer.encoder.block.2.10.layer_norm_1.weight", "segformer.encoder.block.2.10.layer_norm_1.bias", "segformer.encoder.block.2.10.attention.self.query.weight", "segformer.encoder.block.2.10.attention.self.query.bias", "segformer.encoder.block.2.10.attention.self.key.weight", "segformer.encoder.block.2.10.attention.self.key.bias", "segformer.encoder.block.2.10.attention.self.value.weight", "segformer.encoder.block.2.10.attention.self.value.bias", "segformer.encoder.block.2.10.attention.self.sr.weight", "segformer.encoder.block.2.10.attention.self.sr.bias", "segformer.encoder.block.2.10.attention.self.layer_norm.weight", "segformer.encoder.block.2.10.attention.self.layer_norm.bias", "segformer.encoder.block.2.10.attention.output.dense.weight", "segformer.encoder.block.2.10.attention.output.dense.bias", "segformer.encoder.block.2.10.layer_norm_2.weight", "segformer.encoder.block.2.10.layer_norm_2.bias", "segformer.encoder.block.2.10.mlp.dense1.weight", "segformer.encoder.block.2.10.mlp.dense1.bias", "segformer.encoder.block.2.10.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.10.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.10.mlp.dense2.weight", "segformer.encoder.block.2.10.mlp.dense2.bias", "segformer.encoder.block.2.11.layer_norm_1.weight", "segformer.encoder.block.2.11.layer_norm_1.bias", "segformer.encoder.block.2.11.attention.self.query.weight", "segformer.encoder.block.2.11.attention.self.query.bias", "segformer.encoder.block.2.11.attention.self.key.weight", "segformer.encoder.block.2.11.attention.self.key.bias", "segformer.encoder.block.2.11.attention.self.value.weight", "segformer.encoder.block.2.11.attention.self.value.bias", "segformer.encoder.block.2.11.attention.self.sr.weight", "segformer.encoder.block.2.11.attention.self.sr.bias", "segformer.encoder.block.2.11.attention.self.layer_norm.weight", "segformer.encoder.block.2.11.attention.self.layer_norm.bias", "segformer.encoder.block.2.11.attention.output.dense.weight", "segformer.encoder.block.2.11.attention.output.dense.bias", "segformer.encoder.block.2.11.layer_norm_2.weight", "segformer.encoder.block.2.11.layer_norm_2.bias", "segformer.encoder.block.2.11.mlp.dense1.weight", "segformer.encoder.block.2.11.mlp.dense1.bias", "segformer.encoder.block.2.11.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.11.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.11.mlp.dense2.weight", "segformer.encoder.block.2.11.mlp.dense2.bias", "segformer.encoder.block.2.12.layer_norm_1.weight", "segformer.encoder.block.2.12.layer_norm_1.bias", "segformer.encoder.block.2.12.attention.self.query.weight", "segformer.encoder.block.2.12.attention.self.query.bias", "segformer.encoder.block.2.12.attention.self.key.weight", "segformer.encoder.block.2.12.attention.self.key.bias", "segformer.encoder.block.2.12.attention.self.value.weight", "segformer.encoder.block.2.12.attention.self.value.bias", "segformer.encoder.block.2.12.attention.self.sr.weight", "segformer.encoder.block.2.12.attention.self.sr.bias", "segformer.encoder.block.2.12.attention.self.layer_norm.weight", "segformer.encoder.block.2.12.attention.self.layer_norm.bias", "segformer.encoder.block.2.12.attention.output.dense.weight", "segformer.encoder.block.2.12.attention.output.dense.bias", "segformer.encoder.block.2.12.layer_norm_2.weight", "segformer.encoder.block.2.12.layer_norm_2.bias", "segformer.encoder.block.2.12.mlp.dense1.weight", "segformer.encoder.block.2.12.mlp.dense1.bias", "segformer.encoder.block.2.12.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.12.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.12.mlp.dense2.weight", "segformer.encoder.block.2.12.mlp.dense2.bias", "segformer.encoder.block.2.13.layer_norm_1.weight", "segformer.encoder.block.2.13.layer_norm_1.bias", "segformer.encoder.block.2.13.attention.self.query.weight", "segformer.encoder.block.2.13.attention.self.query.bias", "segformer.encoder.block.2.13.attention.self.key.weight", "segformer.encoder.block.2.13.attention.self.key.bias", "segformer.encoder.block.2.13.attention.self.value.weight", "segformer.encoder.block.2.13.attention.self.value.bias", "segformer.encoder.block.2.13.attention.self.sr.weight", "segformer.encoder.block.2.13.attention.self.sr.bias", "segformer.encoder.block.2.13.attention.self.layer_norm.weight", "segformer.encoder.block.2.13.attention.self.layer_norm.bias", "segformer.encoder.block.2.13.attention.output.dense.weight", "segformer.encoder.block.2.13.attention.output.dense.bias", "segformer.encoder.block.2.13.layer_norm_2.weight", "segformer.encoder.block.2.13.layer_norm_2.bias", "segformer.encoder.block.2.13.mlp.dense1.weight", "segformer.encoder.block.2.13.mlp.dense1.bias", "segformer.encoder.block.2.13.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.13.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.13.mlp.dense2.weight", "segformer.encoder.block.2.13.mlp.dense2.bias", "segformer.encoder.block.2.14.layer_norm_1.weight", "segformer.encoder.block.2.14.layer_norm_1.bias", "segformer.encoder.block.2.14.attention.self.query.weight", "segformer.encoder.block.2.14.attention.self.query.bias", "segformer.encoder.block.2.14.attention.self.key.weight", "segformer.encoder.block.2.14.attention.self.key.bias", "segformer.encoder.block.2.14.attention.self.value.weight", "segformer.encoder.block.2.14.attention.self.value.bias", "segformer.encoder.block.2.14.attention.self.sr.weight", "segformer.encoder.block.2.14.attention.self.sr.bias", "segformer.encoder.block.2.14.attention.self.layer_norm.weight", "segformer.encoder.block.2.14.attention.self.layer_norm.bias", "segformer.encoder.block.2.14.attention.output.dense.weight", "segformer.encoder.block.2.14.attention.output.dense.bias", "segformer.encoder.block.2.14.layer_norm_2.weight", "segformer.encoder.block.2.14.layer_norm_2.bias", "segformer.encoder.block.2.14.mlp.dense1.weight", "segformer.encoder.block.2.14.mlp.dense1.bias", "segformer.encoder.block.2.14.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.14.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.14.mlp.dense2.weight", "segformer.encoder.block.2.14.mlp.dense2.bias", "segformer.encoder.block.2.15.layer_norm_1.weight", "segformer.encoder.block.2.15.layer_norm_1.bias", "segformer.encoder.block.2.15.attention.self.query.weight", "segformer.encoder.block.2.15.attention.self.query.bias", "segformer.encoder.block.2.15.attention.self.key.weight", "segformer.encoder.block.2.15.attention.self.key.bias", "segformer.encoder.block.2.15.attention.self.value.weight", "segformer.encoder.block.2.15.attention.self.value.bias", "segformer.encoder.block.2.15.attention.self.sr.weight", "segformer.encoder.block.2.15.attention.self.sr.bias", "segformer.encoder.block.2.15.attention.self.layer_norm.weight", "segformer.encoder.block.2.15.attention.self.layer_norm.bias", "segformer.encoder.block.2.15.attention.output.dense.weight", "segformer.encoder.block.2.15.attention.output.dense.bias", "segformer.encoder.block.2.15.layer_norm_2.weight", "segformer.encoder.block.2.15.layer_norm_2.bias", "segformer.encoder.block.2.15.mlp.dense1.weight", "segformer.encoder.block.2.15.mlp.dense1.bias", "segformer.encoder.block.2.15.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.15.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.15.mlp.dense2.weight", "segformer.encoder.block.2.15.mlp.dense2.bias", "segformer.encoder.block.2.16.layer_norm_1.weight", "segformer.encoder.block.2.16.layer_norm_1.bias", "segformer.encoder.block.2.16.attention.self.query.weight", "segformer.encoder.block.2.16.attention.self.query.bias", "segformer.encoder.block.2.16.attention.self.key.weight", "segformer.encoder.block.2.16.attention.self.key.bias", "segformer.encoder.block.2.16.attention.self.value.weight", "segformer.encoder.block.2.16.attention.self.value.bias", "segformer.encoder.block.2.16.attention.self.sr.weight", "segformer.encoder.block.2.16.attention.self.sr.bias", "segformer.encoder.block.2.16.attention.self.layer_norm.weight", "segformer.encoder.block.2.16.attention.self.layer_norm.bias", "segformer.encoder.block.2.16.attention.output.dense.weight", "segformer.encoder.block.2.16.attention.output.dense.bias", "segformer.encoder.block.2.16.layer_norm_2.weight", "segformer.encoder.block.2.16.layer_norm_2.bias", "segformer.encoder.block.2.16.mlp.dense1.weight", "segformer.encoder.block.2.16.mlp.dense1.bias", "segformer.encoder.block.2.16.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.16.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.16.mlp.dense2.weight", "segformer.encoder.block.2.16.mlp.dense2.bias", "segformer.encoder.block.2.17.layer_norm_1.weight", "segformer.encoder.block.2.17.layer_norm_1.bias", "segformer.encoder.block.2.17.attention.self.query.weight", "segformer.encoder.block.2.17.attention.self.query.bias", "segformer.encoder.block.2.17.attention.self.key.weight", "segformer.encoder.block.2.17.attention.self.key.bias", "segformer.encoder.block.2.17.attention.self.value.weight", "segformer.encoder.block.2.17.attention.self.value.bias", "segformer.encoder.block.2.17.attention.self.sr.weight", "segformer.encoder.block.2.17.attention.self.sr.bias", "segformer.encoder.block.2.17.attention.self.layer_norm.weight", "segformer.encoder.block.2.17.attention.self.layer_norm.bias", "segformer.encoder.block.2.17.attention.output.dense.weight", "segformer.encoder.block.2.17.attention.output.dense.bias", "segformer.encoder.block.2.17.layer_norm_2.weight", "segformer.encoder.block.2.17.layer_norm_2.bias", "segformer.encoder.block.2.17.mlp.dense1.weight", "segformer.encoder.block.2.17.mlp.dense1.bias", "segformer.encoder.block.2.17.mlp.dwconv.dwconv.weight", "segformer.encoder.block.2.17.mlp.dwconv.dwconv.bias", "segformer.encoder.block.2.17.mlp.dense2.weight", "segformer.encoder.block.2.17.mlp.dense2.bias", "segformer.encoder.block.3.0.layer_norm_1.weight", "segformer.encoder.block.3.0.layer_norm_1.bias", "segformer.encoder.block.3.0.attention.self.query.weight", "segformer.encoder.block.3.0.attention.self.query.bias", "segformer.encoder.block.3.0.attention.self.key.weight", "segformer.encoder.block.3.0.attention.self.key.bias", "segformer.encoder.block.3.0.attention.self.value.weight", "segformer.encoder.block.3.0.attention.self.value.bias", "segformer.encoder.block.3.0.attention.output.dense.weight", "segformer.encoder.block.3.0.attention.output.dense.bias", "segformer.encoder.block.3.0.layer_norm_2.weight", "segformer.encoder.block.3.0.layer_norm_2.bias", "segformer.encoder.block.3.0.mlp.dense1.weight", "segformer.encoder.block.3.0.mlp.dense1.bias", "segformer.encoder.block.3.0.mlp.dwconv.dwconv.weight", "segformer.encoder.block.3.0.mlp.dwconv.dwconv.bias", "segformer.encoder.block.3.0.mlp.dense2.weight", "segformer.encoder.block.3.0.mlp.dense2.bias", "segformer.encoder.block.3.1.layer_norm_1.weight", "segformer.encoder.block.3.1.layer_norm_1.bias", "segformer.encoder.block.3.1.attention.self.query.weight", "segformer.encoder.block.3.1.attention.self.query.bias", "segformer.encoder.block.3.1.attention.self.key.weight", "segformer.encoder.block.3.1.attention.self.key.bias", "segformer.encoder.block.3.1.attention.self.value.weight", "segformer.encoder.block.3.1.attention.self.value.bias", "segformer.encoder.block.3.1.attention.output.dense.weight", "segformer.encoder.block.3.1.attention.output.dense.bias", "segformer.encoder.block.3.1.layer_norm_2.weight", "segformer.encoder.block.3.1.layer_norm_2.bias", "segformer.encoder.block.3.1.mlp.dense1.weight", "segformer.encoder.block.3.1.mlp.dense1.bias", "segformer.encoder.block.3.1.mlp.dwconv.dwconv.weight", "segformer.encoder.block.3.1.mlp.dwconv.dwconv.bias", "segformer.encoder.block.3.1.mlp.dense2.weight", "segformer.encoder.block.3.1.mlp.dense2.bias", "segformer.encoder.block.3.2.layer_norm_1.weight", "segformer.encoder.block.3.2.layer_norm_1.bias", "segformer.encoder.block.3.2.attention.self.query.weight", "segformer.encoder.block.3.2.attention.self.query.bias", "segformer.encoder.block.3.2.attention.self.key.weight", "segformer.encoder.block.3.2.attention.self.key.bias", "segformer.encoder.block.3.2.attention.self.value.weight", "segformer.encoder.block.3.2.attention.self.value.bias", "segformer.encoder.block.3.2.attention.output.dense.weight", "segformer.encoder.block.3.2.attention.output.dense.bias", "segformer.encoder.block.3.2.layer_norm_2.weight", "segformer.encoder.block.3.2.layer_norm_2.bias", "segformer.encoder.block.3.2.mlp.dense1.weight", "segformer.encoder.block.3.2.mlp.dense1.bias", "segformer.encoder.block.3.2.mlp.dwconv.dwconv.weight", "segformer.encoder.block.3.2.mlp.dwconv.dwconv.bias", "segformer.encoder.block.3.2.mlp.dense2.weight", "segformer.encoder.block.3.2.mlp.dense2.bias", "segformer.encoder.layer_norm.0.weight", "segformer.encoder.layer_norm.0.bias", "segformer.encoder.layer_norm.1.weight", "segformer.encoder.layer_norm.1.bias", "segformer.encoder.layer_norm.2.weight", "segformer.encoder.layer_norm.2.bias", "segformer.encoder.layer_norm.3.weight", "segformer.encoder.layer_norm.3.bias", "decode_head.linear_c.0.proj.weight", "decode_head.linear_c.0.proj.bias", "decode_head.linear_c.1.proj.weight", "decode_head.linear_c.1.proj.bias", "decode_head.linear_c.2.proj.weight", "decode_head.linear_c.2.proj.bias", "decode_head.linear_c.3.proj.weight", "decode_head.linear_c.3.proj.bias", "decode_head.linear_fuse.weight", "decode_head.batch_norm.weight", "decode_head.batch_norm.bias", "decode_head.batch_norm.running_mean", "decode_head.batch_norm.running_var", "decode_head.classifier.weight", "decode_head.classifier.bias". 
	Unexpected key(s) in state_dict: "module.segformer.encoder.patch_embeddings.0.proj.weight", "module.segformer.encoder.patch_embeddings.0.proj.bias", "module.segformer.encoder.patch_embeddings.0.layer_norm.weight", "module.segformer.encoder.patch_embeddings.0.layer_norm.bias", "module.segformer.encoder.patch_embeddings.1.proj.weight", "module.segformer.encoder.patch_embeddings.1.proj.bias", "module.segformer.encoder.patch_embeddings.1.layer_norm.weight", "module.segformer.encoder.patch_embeddings.1.layer_norm.bias", "module.segformer.encoder.patch_embeddings.2.proj.weight", "module.segformer.encoder.patch_embeddings.2.proj.bias", "module.segformer.encoder.patch_embeddings.2.layer_norm.weight", "module.segformer.encoder.patch_embeddings.2.layer_norm.bias", "module.segformer.encoder.patch_embeddings.3.proj.weight", "module.segformer.encoder.patch_embeddings.3.proj.bias", "module.segformer.encoder.patch_embeddings.3.layer_norm.weight", "module.segformer.encoder.patch_embeddings.3.layer_norm.bias", "module.segformer.encoder.block.0.0.layer_norm_1.weight", "module.segformer.encoder.block.0.0.layer_norm_1.bias", "module.segformer.encoder.block.0.0.attention.self.query.weight", "module.segformer.encoder.block.0.0.attention.self.query.bias", "module.segformer.encoder.block.0.0.attention.self.key.weight", "module.segformer.encoder.block.0.0.attention.self.key.bias", "module.segformer.encoder.block.0.0.attention.self.value.weight", "module.segformer.encoder.block.0.0.attention.self.value.bias", "module.segformer.encoder.block.0.0.attention.self.sr.weight", "module.segformer.encoder.block.0.0.attention.self.sr.bias", "module.segformer.encoder.block.0.0.attention.self.layer_norm.weight", "module.segformer.encoder.block.0.0.attention.self.layer_norm.bias", "module.segformer.encoder.block.0.0.attention.output.dense.weight", "module.segformer.encoder.block.0.0.attention.output.dense.bias", "module.segformer.encoder.block.0.0.layer_norm_2.weight", "module.segformer.encoder.block.0.0.layer_norm_2.bias", "module.segformer.encoder.block.0.0.mlp.dense1.weight", "module.segformer.encoder.block.0.0.mlp.dense1.bias", "module.segformer.encoder.block.0.0.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.0.0.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.0.0.mlp.dense2.weight", "module.segformer.encoder.block.0.0.mlp.dense2.bias", "module.segformer.encoder.block.0.1.layer_norm_1.weight", "module.segformer.encoder.block.0.1.layer_norm_1.bias", "module.segformer.encoder.block.0.1.attention.self.query.weight", "module.segformer.encoder.block.0.1.attention.self.query.bias", "module.segformer.encoder.block.0.1.attention.self.key.weight", "module.segformer.encoder.block.0.1.attention.self.key.bias", "module.segformer.encoder.block.0.1.attention.self.value.weight", "module.segformer.encoder.block.0.1.attention.self.value.bias", "module.segformer.encoder.block.0.1.attention.self.sr.weight", "module.segformer.encoder.block.0.1.attention.self.sr.bias", "module.segformer.encoder.block.0.1.attention.self.layer_norm.weight", "module.segformer.encoder.block.0.1.attention.self.layer_norm.bias", "module.segformer.encoder.block.0.1.attention.output.dense.weight", "module.segformer.encoder.block.0.1.attention.output.dense.bias", "module.segformer.encoder.block.0.1.layer_norm_2.weight", "module.segformer.encoder.block.0.1.layer_norm_2.bias", "module.segformer.encoder.block.0.1.mlp.dense1.weight", "module.segformer.encoder.block.0.1.mlp.dense1.bias", "module.segformer.encoder.block.0.1.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.0.1.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.0.1.mlp.dense2.weight", "module.segformer.encoder.block.0.1.mlp.dense2.bias", "module.segformer.encoder.block.0.2.layer_norm_1.weight", "module.segformer.encoder.block.0.2.layer_norm_1.bias", "module.segformer.encoder.block.0.2.attention.self.query.weight", "module.segformer.encoder.block.0.2.attention.self.query.bias", "module.segformer.encoder.block.0.2.attention.self.key.weight", "module.segformer.encoder.block.0.2.attention.self.key.bias", "module.segformer.encoder.block.0.2.attention.self.value.weight", "module.segformer.encoder.block.0.2.attention.self.value.bias", "module.segformer.encoder.block.0.2.attention.self.sr.weight", "module.segformer.encoder.block.0.2.attention.self.sr.bias", "module.segformer.encoder.block.0.2.attention.self.layer_norm.weight", "module.segformer.encoder.block.0.2.attention.self.layer_norm.bias", "module.segformer.encoder.block.0.2.attention.output.dense.weight", "module.segformer.encoder.block.0.2.attention.output.dense.bias", "module.segformer.encoder.block.0.2.layer_norm_2.weight", "module.segformer.encoder.block.0.2.layer_norm_2.bias", "module.segformer.encoder.block.0.2.mlp.dense1.weight", "module.segformer.encoder.block.0.2.mlp.dense1.bias", "module.segformer.encoder.block.0.2.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.0.2.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.0.2.mlp.dense2.weight", "module.segformer.encoder.block.0.2.mlp.dense2.bias", "module.segformer.encoder.block.1.0.layer_norm_1.weight", "module.segformer.encoder.block.1.0.layer_norm_1.bias", "module.segformer.encoder.block.1.0.attention.self.query.weight", "module.segformer.encoder.block.1.0.attention.self.query.bias", "module.segformer.encoder.block.1.0.attention.self.key.weight", "module.segformer.encoder.block.1.0.attention.self.key.bias", "module.segformer.encoder.block.1.0.attention.self.value.weight", "module.segformer.encoder.block.1.0.attention.self.value.bias", "module.segformer.encoder.block.1.0.attention.self.sr.weight", "module.segformer.encoder.block.1.0.attention.self.sr.bias", "module.segformer.encoder.block.1.0.attention.self.layer_norm.weight", "module.segformer.encoder.block.1.0.attention.self.layer_norm.bias", "module.segformer.encoder.block.1.0.attention.output.dense.weight", "module.segformer.encoder.block.1.0.attention.output.dense.bias", "module.segformer.encoder.block.1.0.layer_norm_2.weight", "module.segformer.encoder.block.1.0.layer_norm_2.bias", "module.segformer.encoder.block.1.0.mlp.dense1.weight", "module.segformer.encoder.block.1.0.mlp.dense1.bias", "module.segformer.encoder.block.1.0.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.1.0.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.1.0.mlp.dense2.weight", "module.segformer.encoder.block.1.0.mlp.dense2.bias", "module.segformer.encoder.block.1.1.layer_norm_1.weight", "module.segformer.encoder.block.1.1.layer_norm_1.bias", "module.segformer.encoder.block.1.1.attention.self.query.weight", "module.segformer.encoder.block.1.1.attention.self.query.bias", "module.segformer.encoder.block.1.1.attention.self.key.weight", "module.segformer.encoder.block.1.1.attention.self.key.bias", "module.segformer.encoder.block.1.1.attention.self.value.weight", "module.segformer.encoder.block.1.1.attention.self.value.bias", "module.segformer.encoder.block.1.1.attention.self.sr.weight", "module.segformer.encoder.block.1.1.attention.self.sr.bias", "module.segformer.encoder.block.1.1.attention.self.layer_norm.weight", "module.segformer.encoder.block.1.1.attention.self.layer_norm.bias", "module.segformer.encoder.block.1.1.attention.output.dense.weight", "module.segformer.encoder.block.1.1.attention.output.dense.bias", "module.segformer.encoder.block.1.1.layer_norm_2.weight", "module.segformer.encoder.block.1.1.layer_norm_2.bias", "module.segformer.encoder.block.1.1.mlp.dense1.weight", "module.segformer.encoder.block.1.1.mlp.dense1.bias", "module.segformer.encoder.block.1.1.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.1.1.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.1.1.mlp.dense2.weight", "module.segformer.encoder.block.1.1.mlp.dense2.bias", "module.segformer.encoder.block.1.2.layer_norm_1.weight", "module.segformer.encoder.block.1.2.layer_norm_1.bias", "module.segformer.encoder.block.1.2.attention.self.query.weight", "module.segformer.encoder.block.1.2.attention.self.query.bias", "module.segformer.encoder.block.1.2.attention.self.key.weight", "module.segformer.encoder.block.1.2.attention.self.key.bias", "module.segformer.encoder.block.1.2.attention.self.value.weight", "module.segformer.encoder.block.1.2.attention.self.value.bias", "module.segformer.encoder.block.1.2.attention.self.sr.weight", "module.segformer.encoder.block.1.2.attention.self.sr.bias", "module.segformer.encoder.block.1.2.attention.self.layer_norm.weight", "module.segformer.encoder.block.1.2.attention.self.layer_norm.bias", "module.segformer.encoder.block.1.2.attention.output.dense.weight", "module.segformer.encoder.block.1.2.attention.output.dense.bias", "module.segformer.encoder.block.1.2.layer_norm_2.weight", "module.segformer.encoder.block.1.2.layer_norm_2.bias", "module.segformer.encoder.block.1.2.mlp.dense1.weight", "module.segformer.encoder.block.1.2.mlp.dense1.bias", "module.segformer.encoder.block.1.2.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.1.2.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.1.2.mlp.dense2.weight", "module.segformer.encoder.block.1.2.mlp.dense2.bias", "module.segformer.encoder.block.1.3.layer_norm_1.weight", "module.segformer.encoder.block.1.3.layer_norm_1.bias", "module.segformer.encoder.block.1.3.attention.self.query.weight", "module.segformer.encoder.block.1.3.attention.self.query.bias", "module.segformer.encoder.block.1.3.attention.self.key.weight", "module.segformer.encoder.block.1.3.attention.self.key.bias", "module.segformer.encoder.block.1.3.attention.self.value.weight", "module.segformer.encoder.block.1.3.attention.self.value.bias", "module.segformer.encoder.block.1.3.attention.self.sr.weight", "module.segformer.encoder.block.1.3.attention.self.sr.bias", "module.segformer.encoder.block.1.3.attention.self.layer_norm.weight", "module.segformer.encoder.block.1.3.attention.self.layer_norm.bias", "module.segformer.encoder.block.1.3.attention.output.dense.weight", "module.segformer.encoder.block.1.3.attention.output.dense.bias", "module.segformer.encoder.block.1.3.layer_norm_2.weight", "module.segformer.encoder.block.1.3.layer_norm_2.bias", "module.segformer.encoder.block.1.3.mlp.dense1.weight", "module.segformer.encoder.block.1.3.mlp.dense1.bias", "module.segformer.encoder.block.1.3.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.1.3.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.1.3.mlp.dense2.weight", "module.segformer.encoder.block.1.3.mlp.dense2.bias", "module.segformer.encoder.block.2.0.layer_norm_1.weight", "module.segformer.encoder.block.2.0.layer_norm_1.bias", "module.segformer.encoder.block.2.0.attention.self.query.weight", "module.segformer.encoder.block.2.0.attention.self.query.bias", "module.segformer.encoder.block.2.0.attention.self.key.weight", "module.segformer.encoder.block.2.0.attention.self.key.bias", "module.segformer.encoder.block.2.0.attention.self.value.weight", "module.segformer.encoder.block.2.0.attention.self.value.bias", "module.segformer.encoder.block.2.0.attention.self.sr.weight", "module.segformer.encoder.block.2.0.attention.self.sr.bias", "module.segformer.encoder.block.2.0.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.0.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.0.attention.output.dense.weight", "module.segformer.encoder.block.2.0.attention.output.dense.bias", "module.segformer.encoder.block.2.0.layer_norm_2.weight", "module.segformer.encoder.block.2.0.layer_norm_2.bias", "module.segformer.encoder.block.2.0.mlp.dense1.weight", "module.segformer.encoder.block.2.0.mlp.dense1.bias", "module.segformer.encoder.block.2.0.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.0.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.0.mlp.dense2.weight", "module.segformer.encoder.block.2.0.mlp.dense2.bias", "module.segformer.encoder.block.2.1.layer_norm_1.weight", "module.segformer.encoder.block.2.1.layer_norm_1.bias", "module.segformer.encoder.block.2.1.attention.self.query.weight", "module.segformer.encoder.block.2.1.attention.self.query.bias", "module.segformer.encoder.block.2.1.attention.self.key.weight", "module.segformer.encoder.block.2.1.attention.self.key.bias", "module.segformer.encoder.block.2.1.attention.self.value.weight", "module.segformer.encoder.block.2.1.attention.self.value.bias", "module.segformer.encoder.block.2.1.attention.self.sr.weight", "module.segformer.encoder.block.2.1.attention.self.sr.bias", "module.segformer.encoder.block.2.1.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.1.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.1.attention.output.dense.weight", "module.segformer.encoder.block.2.1.attention.output.dense.bias", "module.segformer.encoder.block.2.1.layer_norm_2.weight", "module.segformer.encoder.block.2.1.layer_norm_2.bias", "module.segformer.encoder.block.2.1.mlp.dense1.weight", "module.segformer.encoder.block.2.1.mlp.dense1.bias", "module.segformer.encoder.block.2.1.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.1.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.1.mlp.dense2.weight", "module.segformer.encoder.block.2.1.mlp.dense2.bias", "module.segformer.encoder.block.2.2.layer_norm_1.weight", "module.segformer.encoder.block.2.2.layer_norm_1.bias", "module.segformer.encoder.block.2.2.attention.self.query.weight", "module.segformer.encoder.block.2.2.attention.self.query.bias", "module.segformer.encoder.block.2.2.attention.self.key.weight", "module.segformer.encoder.block.2.2.attention.self.key.bias", "module.segformer.encoder.block.2.2.attention.self.value.weight", "module.segformer.encoder.block.2.2.attention.self.value.bias", "module.segformer.encoder.block.2.2.attention.self.sr.weight", "module.segformer.encoder.block.2.2.attention.self.sr.bias", "module.segformer.encoder.block.2.2.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.2.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.2.attention.output.dense.weight", "module.segformer.encoder.block.2.2.attention.output.dense.bias", "module.segformer.encoder.block.2.2.layer_norm_2.weight", "module.segformer.encoder.block.2.2.layer_norm_2.bias", "module.segformer.encoder.block.2.2.mlp.dense1.weight", "module.segformer.encoder.block.2.2.mlp.dense1.bias", "module.segformer.encoder.block.2.2.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.2.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.2.mlp.dense2.weight", "module.segformer.encoder.block.2.2.mlp.dense2.bias", "module.segformer.encoder.block.2.3.layer_norm_1.weight", "module.segformer.encoder.block.2.3.layer_norm_1.bias", "module.segformer.encoder.block.2.3.attention.self.query.weight", "module.segformer.encoder.block.2.3.attention.self.query.bias", "module.segformer.encoder.block.2.3.attention.self.key.weight", "module.segformer.encoder.block.2.3.attention.self.key.bias", "module.segformer.encoder.block.2.3.attention.self.value.weight", "module.segformer.encoder.block.2.3.attention.self.value.bias", "module.segformer.encoder.block.2.3.attention.self.sr.weight", "module.segformer.encoder.block.2.3.attention.self.sr.bias", "module.segformer.encoder.block.2.3.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.3.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.3.attention.output.dense.weight", "module.segformer.encoder.block.2.3.attention.output.dense.bias", "module.segformer.encoder.block.2.3.layer_norm_2.weight", "module.segformer.encoder.block.2.3.layer_norm_2.bias", "module.segformer.encoder.block.2.3.mlp.dense1.weight", "module.segformer.encoder.block.2.3.mlp.dense1.bias", "module.segformer.encoder.block.2.3.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.3.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.3.mlp.dense2.weight", "module.segformer.encoder.block.2.3.mlp.dense2.bias", "module.segformer.encoder.block.2.4.layer_norm_1.weight", "module.segformer.encoder.block.2.4.layer_norm_1.bias", "module.segformer.encoder.block.2.4.attention.self.query.weight", "module.segformer.encoder.block.2.4.attention.self.query.bias", "module.segformer.encoder.block.2.4.attention.self.key.weight", "module.segformer.encoder.block.2.4.attention.self.key.bias", "module.segformer.encoder.block.2.4.attention.self.value.weight", "module.segformer.encoder.block.2.4.attention.self.value.bias", "module.segformer.encoder.block.2.4.attention.self.sr.weight", "module.segformer.encoder.block.2.4.attention.self.sr.bias", "module.segformer.encoder.block.2.4.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.4.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.4.attention.output.dense.weight", "module.segformer.encoder.block.2.4.attention.output.dense.bias", "module.segformer.encoder.block.2.4.layer_norm_2.weight", "module.segformer.encoder.block.2.4.layer_norm_2.bias", "module.segformer.encoder.block.2.4.mlp.dense1.weight", "module.segformer.encoder.block.2.4.mlp.dense1.bias", "module.segformer.encoder.block.2.4.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.4.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.4.mlp.dense2.weight", "module.segformer.encoder.block.2.4.mlp.dense2.bias", "module.segformer.encoder.block.2.5.layer_norm_1.weight", "module.segformer.encoder.block.2.5.layer_norm_1.bias", "module.segformer.encoder.block.2.5.attention.self.query.weight", "module.segformer.encoder.block.2.5.attention.self.query.bias", "module.segformer.encoder.block.2.5.attention.self.key.weight", "module.segformer.encoder.block.2.5.attention.self.key.bias", "module.segformer.encoder.block.2.5.attention.self.value.weight", "module.segformer.encoder.block.2.5.attention.self.value.bias", "module.segformer.encoder.block.2.5.attention.self.sr.weight", "module.segformer.encoder.block.2.5.attention.self.sr.bias", "module.segformer.encoder.block.2.5.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.5.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.5.attention.output.dense.weight", "module.segformer.encoder.block.2.5.attention.output.dense.bias", "module.segformer.encoder.block.2.5.layer_norm_2.weight", "module.segformer.encoder.block.2.5.layer_norm_2.bias", "module.segformer.encoder.block.2.5.mlp.dense1.weight", "module.segformer.encoder.block.2.5.mlp.dense1.bias", "module.segformer.encoder.block.2.5.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.5.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.5.mlp.dense2.weight", "module.segformer.encoder.block.2.5.mlp.dense2.bias", "module.segformer.encoder.block.2.6.layer_norm_1.weight", "module.segformer.encoder.block.2.6.layer_norm_1.bias", "module.segformer.encoder.block.2.6.attention.self.query.weight", "module.segformer.encoder.block.2.6.attention.self.query.bias", "module.segformer.encoder.block.2.6.attention.self.key.weight", "module.segformer.encoder.block.2.6.attention.self.key.bias", "module.segformer.encoder.block.2.6.attention.self.value.weight", "module.segformer.encoder.block.2.6.attention.self.value.bias", "module.segformer.encoder.block.2.6.attention.self.sr.weight", "module.segformer.encoder.block.2.6.attention.self.sr.bias", "module.segformer.encoder.block.2.6.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.6.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.6.attention.output.dense.weight", "module.segformer.encoder.block.2.6.attention.output.dense.bias", "module.segformer.encoder.block.2.6.layer_norm_2.weight", "module.segformer.encoder.block.2.6.layer_norm_2.bias", "module.segformer.encoder.block.2.6.mlp.dense1.weight", "module.segformer.encoder.block.2.6.mlp.dense1.bias", "module.segformer.encoder.block.2.6.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.6.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.6.mlp.dense2.weight", "module.segformer.encoder.block.2.6.mlp.dense2.bias", "module.segformer.encoder.block.2.7.layer_norm_1.weight", "module.segformer.encoder.block.2.7.layer_norm_1.bias", "module.segformer.encoder.block.2.7.attention.self.query.weight", "module.segformer.encoder.block.2.7.attention.self.query.bias", "module.segformer.encoder.block.2.7.attention.self.key.weight", "module.segformer.encoder.block.2.7.attention.self.key.bias", "module.segformer.encoder.block.2.7.attention.self.value.weight", "module.segformer.encoder.block.2.7.attention.self.value.bias", "module.segformer.encoder.block.2.7.attention.self.sr.weight", "module.segformer.encoder.block.2.7.attention.self.sr.bias", "module.segformer.encoder.block.2.7.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.7.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.7.attention.output.dense.weight", "module.segformer.encoder.block.2.7.attention.output.dense.bias", "module.segformer.encoder.block.2.7.layer_norm_2.weight", "module.segformer.encoder.block.2.7.layer_norm_2.bias", "module.segformer.encoder.block.2.7.mlp.dense1.weight", "module.segformer.encoder.block.2.7.mlp.dense1.bias", "module.segformer.encoder.block.2.7.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.7.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.7.mlp.dense2.weight", "module.segformer.encoder.block.2.7.mlp.dense2.bias", "module.segformer.encoder.block.2.8.layer_norm_1.weight", "module.segformer.encoder.block.2.8.layer_norm_1.bias", "module.segformer.encoder.block.2.8.attention.self.query.weight", "module.segformer.encoder.block.2.8.attention.self.query.bias", "module.segformer.encoder.block.2.8.attention.self.key.weight", "module.segformer.encoder.block.2.8.attention.self.key.bias", "module.segformer.encoder.block.2.8.attention.self.value.weight", "module.segformer.encoder.block.2.8.attention.self.value.bias", "module.segformer.encoder.block.2.8.attention.self.sr.weight", "module.segformer.encoder.block.2.8.attention.self.sr.bias", "module.segformer.encoder.block.2.8.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.8.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.8.attention.output.dense.weight", "module.segformer.encoder.block.2.8.attention.output.dense.bias", "module.segformer.encoder.block.2.8.layer_norm_2.weight", "module.segformer.encoder.block.2.8.layer_norm_2.bias", "module.segformer.encoder.block.2.8.mlp.dense1.weight", "module.segformer.encoder.block.2.8.mlp.dense1.bias", "module.segformer.encoder.block.2.8.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.8.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.8.mlp.dense2.weight", "module.segformer.encoder.block.2.8.mlp.dense2.bias", "module.segformer.encoder.block.2.9.layer_norm_1.weight", "module.segformer.encoder.block.2.9.layer_norm_1.bias", "module.segformer.encoder.block.2.9.attention.self.query.weight", "module.segformer.encoder.block.2.9.attention.self.query.bias", "module.segformer.encoder.block.2.9.attention.self.key.weight", "module.segformer.encoder.block.2.9.attention.self.key.bias", "module.segformer.encoder.block.2.9.attention.self.value.weight", "module.segformer.encoder.block.2.9.attention.self.value.bias", "module.segformer.encoder.block.2.9.attention.self.sr.weight", "module.segformer.encoder.block.2.9.attention.self.sr.bias", "module.segformer.encoder.block.2.9.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.9.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.9.attention.output.dense.weight", "module.segformer.encoder.block.2.9.attention.output.dense.bias", "module.segformer.encoder.block.2.9.layer_norm_2.weight", "module.segformer.encoder.block.2.9.layer_norm_2.bias", "module.segformer.encoder.block.2.9.mlp.dense1.weight", "module.segformer.encoder.block.2.9.mlp.dense1.bias", "module.segformer.encoder.block.2.9.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.9.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.9.mlp.dense2.weight", "module.segformer.encoder.block.2.9.mlp.dense2.bias", "module.segformer.encoder.block.2.10.layer_norm_1.weight", "module.segformer.encoder.block.2.10.layer_norm_1.bias", "module.segformer.encoder.block.2.10.attention.self.query.weight", "module.segformer.encoder.block.2.10.attention.self.query.bias", "module.segformer.encoder.block.2.10.attention.self.key.weight", "module.segformer.encoder.block.2.10.attention.self.key.bias", "module.segformer.encoder.block.2.10.attention.self.value.weight", "module.segformer.encoder.block.2.10.attention.self.value.bias", "module.segformer.encoder.block.2.10.attention.self.sr.weight", "module.segformer.encoder.block.2.10.attention.self.sr.bias", "module.segformer.encoder.block.2.10.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.10.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.10.attention.output.dense.weight", "module.segformer.encoder.block.2.10.attention.output.dense.bias", "module.segformer.encoder.block.2.10.layer_norm_2.weight", "module.segformer.encoder.block.2.10.layer_norm_2.bias", "module.segformer.encoder.block.2.10.mlp.dense1.weight", "module.segformer.encoder.block.2.10.mlp.dense1.bias", "module.segformer.encoder.block.2.10.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.10.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.10.mlp.dense2.weight", "module.segformer.encoder.block.2.10.mlp.dense2.bias", "module.segformer.encoder.block.2.11.layer_norm_1.weight", "module.segformer.encoder.block.2.11.layer_norm_1.bias", "module.segformer.encoder.block.2.11.attention.self.query.weight", "module.segformer.encoder.block.2.11.attention.self.query.bias", "module.segformer.encoder.block.2.11.attention.self.key.weight", "module.segformer.encoder.block.2.11.attention.self.key.bias", "module.segformer.encoder.block.2.11.attention.self.value.weight", "module.segformer.encoder.block.2.11.attention.self.value.bias", "module.segformer.encoder.block.2.11.attention.self.sr.weight", "module.segformer.encoder.block.2.11.attention.self.sr.bias", "module.segformer.encoder.block.2.11.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.11.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.11.attention.output.dense.weight", "module.segformer.encoder.block.2.11.attention.output.dense.bias", "module.segformer.encoder.block.2.11.layer_norm_2.weight", "module.segformer.encoder.block.2.11.layer_norm_2.bias", "module.segformer.encoder.block.2.11.mlp.dense1.weight", "module.segformer.encoder.block.2.11.mlp.dense1.bias", "module.segformer.encoder.block.2.11.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.11.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.11.mlp.dense2.weight", "module.segformer.encoder.block.2.11.mlp.dense2.bias", "module.segformer.encoder.block.2.12.layer_norm_1.weight", "module.segformer.encoder.block.2.12.layer_norm_1.bias", "module.segformer.encoder.block.2.12.attention.self.query.weight", "module.segformer.encoder.block.2.12.attention.self.query.bias", "module.segformer.encoder.block.2.12.attention.self.key.weight", "module.segformer.encoder.block.2.12.attention.self.key.bias", "module.segformer.encoder.block.2.12.attention.self.value.weight", "module.segformer.encoder.block.2.12.attention.self.value.bias", "module.segformer.encoder.block.2.12.attention.self.sr.weight", "module.segformer.encoder.block.2.12.attention.self.sr.bias", "module.segformer.encoder.block.2.12.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.12.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.12.attention.output.dense.weight", "module.segformer.encoder.block.2.12.attention.output.dense.bias", "module.segformer.encoder.block.2.12.layer_norm_2.weight", "module.segformer.encoder.block.2.12.layer_norm_2.bias", "module.segformer.encoder.block.2.12.mlp.dense1.weight", "module.segformer.encoder.block.2.12.mlp.dense1.bias", "module.segformer.encoder.block.2.12.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.12.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.12.mlp.dense2.weight", "module.segformer.encoder.block.2.12.mlp.dense2.bias", "module.segformer.encoder.block.2.13.layer_norm_1.weight", "module.segformer.encoder.block.2.13.layer_norm_1.bias", "module.segformer.encoder.block.2.13.attention.self.query.weight", "module.segformer.encoder.block.2.13.attention.self.query.bias", "module.segformer.encoder.block.2.13.attention.self.key.weight", "module.segformer.encoder.block.2.13.attention.self.key.bias", "module.segformer.encoder.block.2.13.attention.self.value.weight", "module.segformer.encoder.block.2.13.attention.self.value.bias", "module.segformer.encoder.block.2.13.attention.self.sr.weight", "module.segformer.encoder.block.2.13.attention.self.sr.bias", "module.segformer.encoder.block.2.13.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.13.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.13.attention.output.dense.weight", "module.segformer.encoder.block.2.13.attention.output.dense.bias", "module.segformer.encoder.block.2.13.layer_norm_2.weight", "module.segformer.encoder.block.2.13.layer_norm_2.bias", "module.segformer.encoder.block.2.13.mlp.dense1.weight", "module.segformer.encoder.block.2.13.mlp.dense1.bias", "module.segformer.encoder.block.2.13.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.13.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.13.mlp.dense2.weight", "module.segformer.encoder.block.2.13.mlp.dense2.bias", "module.segformer.encoder.block.2.14.layer_norm_1.weight", "module.segformer.encoder.block.2.14.layer_norm_1.bias", "module.segformer.encoder.block.2.14.attention.self.query.weight", "module.segformer.encoder.block.2.14.attention.self.query.bias", "module.segformer.encoder.block.2.14.attention.self.key.weight", "module.segformer.encoder.block.2.14.attention.self.key.bias", "module.segformer.encoder.block.2.14.attention.self.value.weight", "module.segformer.encoder.block.2.14.attention.self.value.bias", "module.segformer.encoder.block.2.14.attention.self.sr.weight", "module.segformer.encoder.block.2.14.attention.self.sr.bias", "module.segformer.encoder.block.2.14.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.14.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.14.attention.output.dense.weight", "module.segformer.encoder.block.2.14.attention.output.dense.bias", "module.segformer.encoder.block.2.14.layer_norm_2.weight", "module.segformer.encoder.block.2.14.layer_norm_2.bias", "module.segformer.encoder.block.2.14.mlp.dense1.weight", "module.segformer.encoder.block.2.14.mlp.dense1.bias", "module.segformer.encoder.block.2.14.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.14.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.14.mlp.dense2.weight", "module.segformer.encoder.block.2.14.mlp.dense2.bias", "module.segformer.encoder.block.2.15.layer_norm_1.weight", "module.segformer.encoder.block.2.15.layer_norm_1.bias", "module.segformer.encoder.block.2.15.attention.self.query.weight", "module.segformer.encoder.block.2.15.attention.self.query.bias", "module.segformer.encoder.block.2.15.attention.self.key.weight", "module.segformer.encoder.block.2.15.attention.self.key.bias", "module.segformer.encoder.block.2.15.attention.self.value.weight", "module.segformer.encoder.block.2.15.attention.self.value.bias", "module.segformer.encoder.block.2.15.attention.self.sr.weight", "module.segformer.encoder.block.2.15.attention.self.sr.bias", "module.segformer.encoder.block.2.15.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.15.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.15.attention.output.dense.weight", "module.segformer.encoder.block.2.15.attention.output.dense.bias", "module.segformer.encoder.block.2.15.layer_norm_2.weight", "module.segformer.encoder.block.2.15.layer_norm_2.bias", "module.segformer.encoder.block.2.15.mlp.dense1.weight", "module.segformer.encoder.block.2.15.mlp.dense1.bias", "module.segformer.encoder.block.2.15.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.15.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.15.mlp.dense2.weight", "module.segformer.encoder.block.2.15.mlp.dense2.bias", "module.segformer.encoder.block.2.16.layer_norm_1.weight", "module.segformer.encoder.block.2.16.layer_norm_1.bias", "module.segformer.encoder.block.2.16.attention.self.query.weight", "module.segformer.encoder.block.2.16.attention.self.query.bias", "module.segformer.encoder.block.2.16.attention.self.key.weight", "module.segformer.encoder.block.2.16.attention.self.key.bias", "module.segformer.encoder.block.2.16.attention.self.value.weight", "module.segformer.encoder.block.2.16.attention.self.value.bias", "module.segformer.encoder.block.2.16.attention.self.sr.weight", "module.segformer.encoder.block.2.16.attention.self.sr.bias", "module.segformer.encoder.block.2.16.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.16.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.16.attention.output.dense.weight", "module.segformer.encoder.block.2.16.attention.output.dense.bias", "module.segformer.encoder.block.2.16.layer_norm_2.weight", "module.segformer.encoder.block.2.16.layer_norm_2.bias", "module.segformer.encoder.block.2.16.mlp.dense1.weight", "module.segformer.encoder.block.2.16.mlp.dense1.bias", "module.segformer.encoder.block.2.16.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.16.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.16.mlp.dense2.weight", "module.segformer.encoder.block.2.16.mlp.dense2.bias", "module.segformer.encoder.block.2.17.layer_norm_1.weight", "module.segformer.encoder.block.2.17.layer_norm_1.bias", "module.segformer.encoder.block.2.17.attention.self.query.weight", "module.segformer.encoder.block.2.17.attention.self.query.bias", "module.segformer.encoder.block.2.17.attention.self.key.weight", "module.segformer.encoder.block.2.17.attention.self.key.bias", "module.segformer.encoder.block.2.17.attention.self.value.weight", "module.segformer.encoder.block.2.17.attention.self.value.bias", "module.segformer.encoder.block.2.17.attention.self.sr.weight", "module.segformer.encoder.block.2.17.attention.self.sr.bias", "module.segformer.encoder.block.2.17.attention.self.layer_norm.weight", "module.segformer.encoder.block.2.17.attention.self.layer_norm.bias", "module.segformer.encoder.block.2.17.attention.output.dense.weight", "module.segformer.encoder.block.2.17.attention.output.dense.bias", "module.segformer.encoder.block.2.17.layer_norm_2.weight", "module.segformer.encoder.block.2.17.layer_norm_2.bias", "module.segformer.encoder.block.2.17.mlp.dense1.weight", "module.segformer.encoder.block.2.17.mlp.dense1.bias", "module.segformer.encoder.block.2.17.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.2.17.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.2.17.mlp.dense2.weight", "module.segformer.encoder.block.2.17.mlp.dense2.bias", "module.segformer.encoder.block.3.0.layer_norm_1.weight", "module.segformer.encoder.block.3.0.layer_norm_1.bias", "module.segformer.encoder.block.3.0.attention.self.query.weight", "module.segformer.encoder.block.3.0.attention.self.query.bias", "module.segformer.encoder.block.3.0.attention.self.key.weight", "module.segformer.encoder.block.3.0.attention.self.key.bias", "module.segformer.encoder.block.3.0.attention.self.value.weight", "module.segformer.encoder.block.3.0.attention.self.value.bias", "module.segformer.encoder.block.3.0.attention.output.dense.weight", "module.segformer.encoder.block.3.0.attention.output.dense.bias", "module.segformer.encoder.block.3.0.layer_norm_2.weight", "module.segformer.encoder.block.3.0.layer_norm_2.bias", "module.segformer.encoder.block.3.0.mlp.dense1.weight", "module.segformer.encoder.block.3.0.mlp.dense1.bias", "module.segformer.encoder.block.3.0.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.3.0.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.3.0.mlp.dense2.weight", "module.segformer.encoder.block.3.0.mlp.dense2.bias", "module.segformer.encoder.block.3.1.layer_norm_1.weight", "module.segformer.encoder.block.3.1.layer_norm_1.bias", "module.segformer.encoder.block.3.1.attention.self.query.weight", "module.segformer.encoder.block.3.1.attention.self.query.bias", "module.segformer.encoder.block.3.1.attention.self.key.weight", "module.segformer.encoder.block.3.1.attention.self.key.bias", "module.segformer.encoder.block.3.1.attention.self.value.weight", "module.segformer.encoder.block.3.1.attention.self.value.bias", "module.segformer.encoder.block.3.1.attention.output.dense.weight", "module.segformer.encoder.block.3.1.attention.output.dense.bias", "module.segformer.encoder.block.3.1.layer_norm_2.weight", "module.segformer.encoder.block.3.1.layer_norm_2.bias", "module.segformer.encoder.block.3.1.mlp.dense1.weight", "module.segformer.encoder.block.3.1.mlp.dense1.bias", "module.segformer.encoder.block.3.1.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.3.1.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.3.1.mlp.dense2.weight", "module.segformer.encoder.block.3.1.mlp.dense2.bias", "module.segformer.encoder.block.3.2.layer_norm_1.weight", "module.segformer.encoder.block.3.2.layer_norm_1.bias", "module.segformer.encoder.block.3.2.attention.self.query.weight", "module.segformer.encoder.block.3.2.attention.self.query.bias", "module.segformer.encoder.block.3.2.attention.self.key.weight", "module.segformer.encoder.block.3.2.attention.self.key.bias", "module.segformer.encoder.block.3.2.attention.self.value.weight", "module.segformer.encoder.block.3.2.attention.self.value.bias", "module.segformer.encoder.block.3.2.attention.output.dense.weight", "module.segformer.encoder.block.3.2.attention.output.dense.bias", "module.segformer.encoder.block.3.2.layer_norm_2.weight", "module.segformer.encoder.block.3.2.layer_norm_2.bias", "module.segformer.encoder.block.3.2.mlp.dense1.weight", "module.segformer.encoder.block.3.2.mlp.dense1.bias", "module.segformer.encoder.block.3.2.mlp.dwconv.dwconv.weight", "module.segformer.encoder.block.3.2.mlp.dwconv.dwconv.bias", "module.segformer.encoder.block.3.2.mlp.dense2.weight", "module.segformer.encoder.block.3.2.mlp.dense2.bias", "module.segformer.encoder.layer_norm.0.weight", "module.segformer.encoder.layer_norm.0.bias", "module.segformer.encoder.layer_norm.1.weight", "module.segformer.encoder.layer_norm.1.bias", "module.segformer.encoder.layer_norm.2.weight", "module.segformer.encoder.layer_norm.2.bias", "module.segformer.encoder.layer_norm.3.weight", "module.segformer.encoder.layer_norm.3.bias", "module.decode_head.linear_c.0.proj.weight", "module.decode_head.linear_c.0.proj.bias", "module.decode_head.linear_c.1.proj.weight", "module.decode_head.linear_c.1.proj.bias", "module.decode_head.linear_c.2.proj.weight", "module.decode_head.linear_c.2.proj.bias", "module.decode_head.linear_c.3.proj.weight", "module.decode_head.linear_c.3.proj.bias", "module.decode_head.linear_fuse.weight", "module.decode_head.batch_norm.weight", "module.decode_head.batch_norm.bias", "module.decode_head.batch_norm.running_mean", "module.decode_head.batch_norm.running_var", "module.decode_head.batch_norm.num_batches_tracked", "module.decode_head.classifier.weight", "module.decode_head.classifier.bias". 

In [9]:
import torch
import numpy as np
from transformers import SegformerForSemanticSegmentation, SegformerImageProcessor
from torchvision.transforms import Resize
from torchvision.transforms.functional import InterpolationMode
from sklearn.metrics import balanced_accuracy_score

def remove_module_prefix(state_dict):
    """Remove 'module.' prefix from state_dict keys if present"""
    new_state_dict = {}
    for key, value in state_dict.items():
        if key.startswith('module.'):
            new_key = key[7:]  # Remove 'module.' prefix
        else:
            new_key = key
        new_state_dict[new_key] = value
    return new_state_dict

def predict(image, model, processor):
    model.eval()
    with torch.no_grad():
        image_rgb = np.repeat(image.reshape(48, 48, 1), 3, axis=-1)
        inputs = processor(
            images=image_rgb,
            return_tensors="pt",
            do_rescale=False
        )
        pixel_values = inputs['pixel_values'].cuda()
        outputs = model(pixel_values=pixel_values)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        pred = probs[0, 1].cpu().numpy()
        return pred

# Initialize model and processor
processor = SegformerImageProcessor.from_pretrained("nvidia/mit-b3")
model = SegformerForSemanticSegmentation.from_pretrained(
    "nvidia/mit-b3",
    num_labels=2,
    ignore_mismatched_sizes=True
).cuda()

# Load the best checkpoint
checkpoint = torch.load('best_model.pt')
state_dict = remove_module_prefix(checkpoint['model_state_dict'])
model.load_state_dict(state_dict)

# Create resize transform
resize_transform = Resize(
    size=(48, 48),
    interpolation=InterpolationMode.BILINEAR,
    antialias=True
)

# Evaluate model
model.eval()
X_test_reshaped = X_test.reshape(-1, 48, 48, 1)
X_test_normalized = X_test_reshaped / 255.0

print("Making predictions on test set...")
predictions = np.zeros((len(X_test_normalized), 48, 48))

for i, image in enumerate(X_test_normalized):
    if i % 20 == 0:
        print(f"Predicting image {i}/{len(X_test_normalized)}")
    pred = predict(image, model, processor)
    
    if pred.shape != (48, 48):
        pred_tensor = torch.from_numpy(pred).unsqueeze(0)
        pred_resized = resize_transform(pred_tensor).squeeze().numpy()
        predictions[i] = pred_resized
    else:
        predictions[i] = pred

print(f"Ground truth shape: {y_test.reshape(-1, 48, 48).shape}")
print(f"Predictions shape: {predictions.shape}")

# Calculate metrics
y_true_flat = y_test.reshape(-1, 48, 48).flatten()
y_pred_flat = predictions.flatten()
y_pred_flat = (y_pred_flat > 0.5).astype(int)

print(f"Flattened shapes - True: {y_true_flat.shape}, Pred: {y_pred_flat.shape}")
balanced_acc = balanced_accuracy_score(y_true_flat, y_pred_flat)
print(f"Final Balanced Accuracy: {balanced_acc:.4f}")

# Additional metrics
tp = np.sum((y_true_flat == 1) & (y_pred_flat == 1))
tn = np.sum((y_true_flat == 0) & (y_pred_flat == 0))
fp = np.sum((y_true_flat == 0) & (y_pred_flat == 1))
fn = np.sum((y_true_flat == 1) & (y_pred_flat == 0))

sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)

print("\nDetailed Metrics:")
print(f"Sensitivity (True Positive Rate): {sensitivity:.4f}")
print(f"Specificity (True Negative Rate): {specificity:.4f}")
print(f"True Positives: {tp}")
print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")

Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/mit-b3 and are newly initialized: ['decode_head.batch_norm.bias', 'decode_head.batch_norm.num_batches_tracked', 'decode_head.batch_norm.running_mean', 'decode_head.batch_norm.running_var', 'decode_head.batch_norm.weight', 'decode_head.classifier.bias', 'decode_head.classifier.weight', 'decode_head.linear_c.0.proj.bias', 'decode_head.linear_c.0.proj.weight', 'decode_head.linear_c.1.proj.bias', 'decode_head.linear_c.1.proj.weight', 'decode_head.linear_c.2.proj.bias', 'decode_head.linear_c.2.proj.weight', 'decode_head.linear_c.3.proj.bias', 'decode_head.linear_c.3.proj.weight', 'decode_head.linear_fuse.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  checkpoint = torch.load('best_model.pt')


Making predictions on test set...
Predicting image 0/110
Predicting image 20/110
Predicting image 40/110
Predicting image 60/110
Predicting image 80/110
Predicting image 100/110
Ground truth shape: (110, 48, 48)
Predictions shape: (110, 48, 48)
Flattened shapes - True: (253440,), Pred: (253440,)
Final Balanced Accuracy: 0.8258

Detailed Metrics:
Sensitivity (True Positive Rate): 0.7271
Specificity (True Negative Rate): 0.9246
True Positives: 54176
True Negatives: 165435
False Positives: 13495
False Negatives: 20334


In [10]:
X_final = np.load('/kaggle/input/crater-segmentation/Xtest2_b.npy')

In [11]:
import torch
import numpy as np
from transformers import SegformerForSemanticSegmentation, SegformerImageProcessor
from torchvision.transforms import Resize
from torchvision.transforms.functional import InterpolationMode

def remove_module_prefix(state_dict):
    """Remove 'module.' prefix from state_dict keys if present"""
    new_state_dict = {}
    for key, value in state_dict.items():
        if key.startswith('module.'):
            new_key = key[7:]  # Remove 'module.' prefix
        else:
            new_key = key
        new_state_dict[new_key] = value
    return new_state_dict

def predict(image, model, processor):
    model.eval()
    with torch.no_grad():
        image_rgb = np.repeat(image.reshape(48, 48, 1), 3, axis=-1)
        inputs = processor(
            images=image_rgb,
            return_tensors="pt",
            do_rescale=False
        )
        pixel_values = inputs['pixel_values'].cuda()
        outputs = model(pixel_values=pixel_values)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        pred = probs[0, 1].cpu().numpy()
        return pred

# Load and prepare X_final
X_final = np.load('/kaggle/input/crater-segmentation/Xtest2_b.npy')
X_final_normalized = X_final / 255.0

# Initialize model and processor
processor = SegformerImageProcessor.from_pretrained("nvidia/mit-b3")
model = SegformerForSemanticSegmentation.from_pretrained(
    "nvidia/mit-b3",
    num_labels=2,
    ignore_mismatched_sizes=True
).cuda()

# Load the best checkpoint
checkpoint = torch.load('best_model.pt')
state_dict = remove_module_prefix(checkpoint['model_state_dict'])
model.load_state_dict(state_dict)

# Create resize transform
resize_transform = Resize(
    size=(48, 48),
    interpolation=InterpolationMode.BILINEAR,
    antialias=True
)

# Make predictions
print("Making predictions on X_final...")
predictions = np.zeros((len(X_final_normalized), 48, 48))

for i, image in enumerate(X_final_normalized):
    if i % 20 == 0:
        print(f"Predicting image {i}/{len(X_final_normalized)}")
    pred = predict(image, model, processor)
    
    if pred.shape != (48, 48):
        pred_tensor = torch.from_numpy(pred).unsqueeze(0)
        pred_resized = resize_transform(pred_tensor).squeeze().numpy()
        predictions[i] = pred_resized
    else:
        predictions[i] = pred

# Convert predictions to binary (0 or 1)
predictions_binary = (predictions > 0.5).astype(np.uint8)

# Save predictions
#np.save('predictions_final.npy', predictions_binary)
print("Predictions saved to predictions_final.npy")

# Print some statistics
print("\nPrediction Statistics:")
print(f"Shape: {predictions_binary.shape}")
print(f"Number of positive predictions: {np.sum(predictions_binary == 1)}")
print(f"Number of negative predictions: {np.sum(predictions_binary == 0)}")
print(f"Percentage of positive predictions: {100 * np.mean(predictions_binary):.2f}%")

  return func(*args, **kwargs)
Some weights of SegformerForSemanticSegmentation were not initialized from the model checkpoint at nvidia/mit-b3 and are newly initialized: ['decode_head.batch_norm.bias', 'decode_head.batch_norm.num_batches_tracked', 'decode_head.batch_norm.running_mean', 'decode_head.batch_norm.running_var', 'decode_head.batch_norm.weight', 'decode_head.classifier.bias', 'decode_head.classifier.weight', 'decode_head.linear_c.0.proj.bias', 'decode_head.linear_c.0.proj.weight', 'decode_head.linear_c.1.proj.bias', 'decode_head.linear_c.1.proj.weight', 'decode_head.linear_c.2.proj.bias', 'decode_head.linear_c.2.proj.weight', 'decode_head.linear_c.3.proj.bias', 'decode_head.linear_c.3.proj.weight', 'decode_head.linear_fuse.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  checkpoint = torch.load('best_model.pt')


Making predictions on X_final...
Predicting image 0/196
Predicting image 20/196
Predicting image 40/196
Predicting image 60/196
Predicting image 80/196
Predicting image 100/196
Predicting image 120/196
Predicting image 140/196
Predicting image 160/196
Predicting image 180/196
Predictions saved to predictions_final.npy

Prediction Statistics:
Shape: (196, 48, 48)
Number of positive predictions: 114312
Number of negative predictions: 337272
Percentage of positive predictions: 25.31%


In [14]:
predictions_binary = predictions_binary.reshape(196,48**2)

In [16]:
predictions_binary.shape

(196, 2304)

In [18]:
np.save('final_predictions.npy',predictions_binary) ##FINAL

In [None]:
import numpy as np
from matplotlib import pyplot as plt

Xtrain2_a = np.load('/kaggle/input/datasetmachinelearning4/Xtrain2_a.npy')
Ytrain2_a = np.load('/kaggle/input/datasetmachinelearning4/Ytrain2_a.npy')
Xtrain2_b = np.load('/kaggle/input/datasetmachinelearning4/Xtrain2_b.npy')
Ytrain2_b = np.load('/kaggle/input/datasetmachinelearning4/Ytrain2_b.npy')

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, Dense

# Simple CNN model
model_a = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(7, 7, 1)),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_a.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D, concatenate
from tensorflow.keras.models import Model

def unet_model(input_size=(48, 48, 1)):
    inputs = Input(input_size)
    
    # Encoder
    conv1 = Conv2D(64, (3, 3), activation='relu', padding='same')(inputs)
    pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)
    
    conv2 = Conv2D(128, (3, 3), activation='relu', padding='same')(pool1)
    pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)
    
    # Bottleneck
    conv3 = Conv2D(256, (3, 3), activation='relu', padding='same')(pool2)
    
    # Decoder
    up4 = UpSampling2D(size=(2, 2))(conv3)
    merge4 = concatenate([conv2, up4], axis=3)
    conv4 = Conv2D(128, (3, 3), activation='relu', padding='same')(merge4)
    
    up5 = UpSampling2D(size=(2, 2))(conv4)
    merge5 = concatenate([conv1, up5], axis=3)
    conv5 = Conv2D(64, (3, 3), activation='relu', padding='same')(merge5)
    
    conv6 = Conv2D(1, (1, 1), activation='sigmoid')(conv5)
    
    return Model(inputs=inputs, outputs=conv6)

model_b = unet_model()
model_b.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
import numpy as np

# Reshape Xtrain2_a to have the shape (num_samples, 7, 7, 1)
Xtrain2_a = Xtrain2_a.reshape(-1, 7, 7, 1)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, Dense

# Simple CNN model for format a
model_a = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(7, 7, 1)),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model_a.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the models
model_a.fit(Xtrain2_a, Ytrain2_a, epochs=10, batch_size=32)
model_b.fit(Xtrain2_b, Ytrain2_b, epochs=10, batch_size=32)

import numpy as np
from sklearn.metrics import balanced_accuracy_score

# Predict
preds_a = model_a.predict(Xtrain2_a).flatten()
preds_b = model_b.predict(Xtrain2_b).flatten()

# Round predictions to get binary outputs
preds_a_rounded = np.round(preds_a)
preds_b_rounded = np.round(preds_b)

# Compute balanced accuracy
balanced_acc_a = balanced_accuracy_score(Ytrain2_a.flatten(), preds_a_rounded)
balanced_acc_b = balanced_accuracy_score(Ytrain2_b.flatten(), preds_b_rounded)

print("Balanced Accuracy for Format A:", balanced_acc_a)
print("Balanced Accuracy for Format B:", balanced_acc_b)