In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset
import numpy as np
from tqdm import tqdm
import random
import os
import matplotlib.pyplot as plt

# --- Configuration for Data Loading and SimpleNN ---
BATCH_SIZE = 64
NUM_EPOCHS_SIMPLE_NN = 3 # For initial training of Class 1/2 models and target models (if generated)
LEARNING_RATE_SIMPLE_NN = 0.001

# Define the classes for each subset (same as before)
CLASS1_LABELS = [0, 1, 2, 3]
CLASS2_LABELS = [2, 3, 4, 5]
TARGET_MODEL_LABELS = [0, 1, 2, 3, 4] # Labels for the "ideal updated Model 1"

# Evaluation labels for the final predicted model
EVAL_SET_A_LABELS = [0, 1, 2, 3, 4] # Reverted to focus on digits 0-4
EVAL_SET_B_LABELS = [5] # Still defined, but its loss component will not be part of meta-loss
ALL_EVAL_DIGITS = [0, 1, 2, 3, 4, 5]

# --- Configuration for MetaModel Training ---
# Increased parameters for more rigorous training
NUM_TARGET_MODELS = 50 # Number of "ideal updated Model 1" models to generate (if needed)
NUM_META_MODEL_TRAINING_PAIRS = 1000 # Increased training pairs (from 500)
META_MODEL_EPOCHS = 200 # Increased epochs (from 100)
META_MODEL_LEARNING_RATE = 0.0005
META_MODEL_BATCH_SIZE = 64

# New parameter for sampled internal evaluation - now using more batches
NUM_BATCHES_FOR_INTERNAL_EVAL = 20 # Increased sampled batches for custom loss calculation (from 5)

# Interval to print MetaModel training loss (in batches)
META_LOSS_PRINT_INTERVAL = 25

# Set device to GPU if available, otherwise CPU
# Prioritize MPS for Apple Silicon, then CUDA, then CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu"))
print(f"Using device: {DEVICE}")

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
elif torch.backends.mps.is_available(): # For Apple Silicon
    # MPS does not have a manual_seed_all equivalent for all operations.
    # Deterministic behavior for MPS is generally handled by setting seeds for CPU/GPU.
    # For full reproducibility, you might need to disable MPS for some operations,
    # or accept minor variations.
    pass


# --- Data Loading and Preprocessing (MNIST) ---
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)) # MNIST mean and std
])

print("Loading MNIST dataset...")
full_train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
full_test_dataset = datasets.MNIST('./data', train=False, download=True, transform=transform)
print("MNIST dataset loaded.")

# --- Dataset Subset Creation Function ---
def create_subset(dataset, labels_to_include):
    """Filters a dataset to include only samples with specified labels."""
    indices = []
    for i, (_, label) in enumerate(dataset):
        if label in labels_to_include:
            indices.append(i)
    return Subset(dataset, indices)

# Create datasets for SimpleNN training and evaluation
class1_train_dataset = create_subset(full_train_dataset, CLASS1_LABELS)
class2_train_dataset = create_subset(full_train_dataset, CLASS2_LABELS)
target_model_train_dataset = create_subset(full_train_dataset, TARGET_MODEL_LABELS)

# Evaluation datasets for the final predicted model
# These loaders will also be used *inside* the MetaModel's training loop for custom loss
eval_loader_A_internal = DataLoader(create_subset(full_test_dataset, EVAL_SET_A_LABELS), batch_size=INTERNAL_EVAL_BATCH_SIZE, shuffle=True) # Shuffle for sampling
eval_loader_B_internal = DataLoader(create_subset(full_test_dataset, EVAL_SET_B_LABELS), batch_size=INTERNAL_EVAL_BATCH_SIZE, shuffle=True) # Shuffle for sampling


# Create individual digit test datasets for detailed tracing
individual_digit_test_loaders = {}
for digit in ALL_EVAL_DIGITS:
    digit_dataset = create_subset(full_test_dataset, [digit])
    if len(digit_dataset) > 0:
        individual_digit_test_loaders[digit] = DataLoader(digit_dataset, batch_size=BATCH_SIZE, shuffle=False)

# --- SimpleNN Model Definition (ONE HIDDEN LAYER REMOVED) ---
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        # Input layer: 28*28 = 784 pixels
        # Single Hidden layer: 8 neurons (reduced from 16)
        # Output layer: 10 neurons (for digits 0-9)
        self.fc1 = nn.Linear(28 * 28, 10) # Input to the single hidden layer, reduced to 8 neurons
        # self.relu1 = nn.ReLU()
        # self.fc3 = nn.Linear(16, 10) # Direct connection from hidden layer to output, input size matches fc1 output

    def forward(self, x):
        x = x.view(-1, 28 * 28)
        x = self.fc1(x)
        # x = self.relu1(x)
        # x = self.fc3(x) # Direct connection
        return x

# --- Helper Functions for Flattening/Unflattening State Dicts ---
def flatten_state_dict(state_dict):
    """Flattens a state_dict into a single 1D tensor."""
    return torch.cat([param.view(-1) for param in state_dict.values()])

def unflatten_tensor_to_state_dict(flat_tensor, model_template_state_dict):
    """Unflattens a 1D tensor back into a state_dict, using a template."""
    new_state_dict = {}
    offset = 0
    for name, param in model_template_state_dict.items():
        num_elements = param.numel()
        # Ensure the tensor is on the correct device and has requires_grad if needed
        # For loading into a model that will be evaluated for loss, it needs to track gradients.
        new_state_dict[name] = flat_tensor[offset:offset + num_elements].view(param.shape).to(param.device)
        offset += num_elements
    return new_state_dict

# Get a template model to determine state_dict structure and size
template_model = SimpleNN() # Re-initialize to get the new, smaller architecture's state_dict
template_state_dict = template_model.state_dict()
FLATTENED_STATE_DICT_SIZE = flatten_state_dict(template_state_dict).numel()
print(f"Flattened SimpleNN state_dict size: {FLATTENED_STATE_DICT_SIZE}")

# --- MetaModel Definition (REDUCED CAPACITY) ---
class MetaModel(nn.Module):
    def __init__(self, input_size, output_size):
        super(MetaModel, self).__init__()
        # Input: flattened state_dict of Model 1 + flattened state_dict of (Model 2 - Model 1)
        # Output: flattened state_dict of the *difference* to add to Model 1
        # Hidden layer sizes significantly reduced
        self.fc1 = nn.Linear(input_size, 512) # Reduced from 2048
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(512, 256) # Reduced from 1024
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(256, 128) # Reduced from 512
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(128, output_size) # Output is the flattened state_dict

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.fc4(x)
        return x

# --- Training and Evaluation Functions (SimpleNN) ---
def train_simple_nn(model, train_loader, criterion, optimizer, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        for data, target in train_loader:
            data, target = data.to(DEVICE), target.to(DEVICE)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

def evaluate_simple_nn_accuracy(model, data_loader):
    """Evaluates a SimpleNN model and returns accuracy (for final reporting)."""
    model.eval() # Set to eval mode, no gradients needed for this accuracy calculation
    correct = 0
    total = 0
    if len(data_loader.dataset) == 0:
        return float('nan')
    with torch.no_grad(): # No gradients needed for accuracy calculation
        for data, target in data_loader:
            data, target = data.to(DEVICE), target.to(DEVICE)
            output = model(data)
            _, predicted = torch.max(output.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()
    return 100 * correct / total

def calculate_classification_loss(model, data_loader, criterion, num_batches_to_sample=None):
    """
    Calculates the average classification loss for a SimpleNN model over a DataLoader.
    Can sample a fixed number of batches for lighter computation.
    This function is designed to be differentiable for the MetaModel's loss.
    """
    model.train() # Keep in train mode to ensure gradients are tracked
    total_loss = 0.0
    num_batches_processed = 0
    
    # Create an iterator for the DataLoader
    data_iter = iter(data_loader)

    for _ in range(num_batches_to_sample if num_batches_to_sample is not None else len(data_loader)):
        try:
            data, target = next(data_iter)
        except StopIteration:
            # Reached end of dataset, reset iterator for continuous sampling
            data_iter = iter(data_loader)
            data, target = next(data_iter) # Get next batch from the beginning

        data, target = data.to(DEVICE), target.to(DEVICE)
        output = model(data)
        loss = criterion(output, target)
        total_loss += loss # Accumulate loss directly (not .item())
        num_batches_processed += 1
    
    return total_loss / num_batches_processed if num_batches_processed > 0 else torch.tensor(0.0, device=DEVICE)


# --- Main Logic ---
# Consistent SAVE_DIR with the first script
SAVE_DIR = './trained_models_smallest'
PLOTS_DIR = './meta_plots'
os.makedirs(SAVE_DIR, exist_ok=True)
os.makedirs(PLOTS_DIR, exist_ok=True)

# Define paths for saved models (corrected filenames)
class1_save_path = os.path.join(SAVE_DIR, 'class1_models_weights.pt')
class2_save_path = os.path.join(SAVE_DIR, 'class2_models_weights.pt')
target_models_save_path = os.path.join(SAVE_DIR, 'target_updated_models.pt') # Still used for generating target models if not found

# 1. Load initial Class 1 and Class 2 models (STRICTLY LOAD)
class1_models = []
class2_models = []

print("Attempting to load pre-generated Class 1 and Class 2 models...")
try:
    if os.path.exists(class1_save_path):
        class1_models = torch.load(class1_save_path, map_location=DEVICE, weights_only=True)
        print(f"Loaded {len(class1_models)} Class 1 models.")
    else:
        raise FileNotFoundError(f"Class 1 model weights file not found at {class1_save_path}. Please run the 'MNIST Subset Neural Network Experiment' Canvas first to generate these models.")

    if os.path.exists(class2_save_path):
        class2_models = torch.load(class2_save_path, map_location=DEVICE, weights_only=True)
        print(f"Loaded {len(class2_models)} Class 2 models.")
    else:
        raise FileNotFoundError(f"Class 2 model weights file not found at {class2_save_path}. Please run the 'MNIST Subset Neural Network Experiment' Canvas first to generate these models.")

except FileNotFoundError as e:
    print(f"Error: {e}")
    print("Exiting. Please ensure the initial Class 1 and Class 2 models are generated and saved in the correct directory.")
    exit() # Exit if essential files are missing
except Exception as e:
    print(f"An unexpected error occurred while loading initial models: {e}")
    exit()

# 2. Load "Target Updated Model 1" weights (models trained on 0,1,2,3,4)
target_updated_models = []
print("Attempting to load pre-generated Target Updated Model 1 weights...")
try:
    if os.path.exists(target_models_save_path):
        target_updated_models = torch.load(target_models_save_path, map_location=DEVICE, weights_only=True)
        print(f"Loaded {len(target_updated_models)} Target Updated Model 1 models.")
    else:
        # This part remains a fallback to generate target models if they don't exist,
        # as they are the "ground truth" for what the meta-model should aim to produce.
        print(f"Target updated model weights file not found at {target_models_save_path}. Generating them now...")
        for _ in tqdm(range(NUM_TARGET_MODELS), desc="Generating Target Updated Models"):
            model = SimpleNN().to(DEVICE)
            train_simple_nn(model, DataLoader(target_model_train_dataset, BATCH_SIZE, True), nn.CrossEntropyLoss(), optim.Adam(model.parameters(), LEARNING_RATE_SIMPLE_NN), NUM_EPOCHS_SIMPLE_NN * 2) # Train a bit more for good performance
            target_updated_models.append(model.state_dict())
        torch.save(target_updated_models, target_models_save_path)
        print("Target Updated Model 1 models generated and saved.")

except Exception as e:
    print(f"An unexpected error occurred while loading/generating target models: {e}")
    exit()

# Ensure we have enough models after loading/generating
if not class1_models or not class2_models or not target_updated_models:
    raise RuntimeError("Critical: Not enough models available after loading/generation. Something went wrong.")


# 3. Prepare training data for the MetaModel (inputs: original Model1, Model2-Model1 diff)
meta_train_inputs = []

print(f"Preparing {NUM_META_MODEL_TRAINING_PAIRS} training inputs for the MetaModel...")
for _ in tqdm(range(NUM_META_MODEL_TRAINING_PAIRS), desc="Preparing MetaModel data"):
    # Randomly select one Class 1 model and one Class 2 model
    original_class1_sd = random.choice(class1_models)
    class2_sd = random.choice(class2_models)

    # Flatten inputs
    flat_original_class1 = flatten_state_dict(original_class1_sd).to(DEVICE)
    flat_class2 = flatten_state_dict(class2_sd).to(DEVICE)
    
    # Calculate the difference in weights
    diff_m2_m1_flat = flat_class2 - flat_original_class1

    # Concatenate original Model 1 weights and the difference as MetaModel input
    meta_input = torch.cat((flat_original_class1, diff_m2_m1_flat))

    meta_train_inputs.append(meta_input)

# Convert lists to tensors for DataLoader
meta_train_inputs_tensor = torch.stack(meta_train_inputs)

# Create a dummy tensor for targets in TensorDataset, as targets are computed dynamically
dummy_targets = torch.zeros(meta_train_inputs_tensor.shape[0], 1, device=DEVICE)
meta_dataset = torch.utils.data.TensorDataset(meta_train_inputs_tensor, dummy_targets)
meta_dataloader = DataLoader(meta_dataset, batch_size=META_MODEL_BATCH_SIZE, shuffle=True)


# 4. Define and Train the MetaModel
meta_model_input_size = FLATTENED_STATE_DICT_SIZE * 2 # Original Model 1 + (Model 2 - Model 1)
meta_model_output_size = FLATTENED_STATE_DICT_SIZE # Predicted difference to add to Model 1

meta_model = MetaModel(meta_model_input_size, meta_model_output_size).to(DEVICE)
meta_criterion_ce = nn.CrossEntropyLoss() # Used for internal SimpleNN evaluation
meta_optimizer = optim.Adam(meta_model.parameters(), lr=META_MODEL_LEARNING_RATE)

print(f"\n--- Training the MetaModel with Custom Loss ---")
meta_train_losses = []
for epoch in tqdm(range(META_MODEL_EPOCHS), desc="Training MetaModel"):
    meta_model.train() # Set MetaModel to train mode
    epoch_loss_sum = 0.0 # To track loss for printing
    num_batches_in_epoch = 0

    for batch_idx, (batch_inputs, _) in enumerate(meta_dataloader): # Dummy targets are ignored
        # batch_inputs contains [original_model1_flat, diff_m2_m1_flat]
        original_model1_flat_batch = batch_inputs[:, :FLATTENED_STATE_DICT_SIZE]
        # diff_m2_m1_flat_batch = batch_inputs[:, FLATTENED_STATE_DICT_SIZE:] # Not directly used here but conceptually part of input

        batch_inputs = batch_inputs.to(DEVICE)
        meta_optimizer.zero_grad()

        # Get predicted flattened difference from MetaModel
        predicted_diff_for_m1_flat_batch = meta_model(batch_inputs) # This tensor has requires_grad=True

        batch_meta_loss = 0.0
        current_batch_loss_0_4 = 0.0 # Now representing loss on 0,1,2,3,4
        # current_batch_loss_5 is no longer used in the meta_loss calculation

        # Iterate over each predicted model in the batch
        for i in range(predicted_diff_for_m1_flat_batch.shape[0]):
            single_predicted_diff_for_m1_flat = predicted_diff_for_m1_flat_batch[i]
            single_original_model1_flat = original_model1_flat_batch[i] # Get the corresponding original Model 1

            # Calculate the predicted updated Model 1 weights
            predicted_updated_flat_m1 = single_original_model1_flat + single_predicted_diff_for_m1_flat
            
            # Unflatten predicted weights to form a SimpleNN's state_dict
            # Ensure these tensors retain their gradient tracking
            predicted_sd = unflatten_tensor_to_state_dict(predicted_updated_flat_m1, template_state_dict)
            
            # Instantiate a temporary SimpleNN and load the predicted state_dict
            temp_simple_nn = SimpleNN().to(DEVICE)
            temp_simple_nn.load_state_dict(predicted_sd)

            # Calculate loss components using the temporary SimpleNN on MNIST data
            # Sample a fixed number of batches for lighter computation
            loss_0_4 = calculate_classification_loss(temp_simple_nn, eval_loader_A_internal, meta_criterion_ce, num_batches_to_sample=NUM_BATCHES_FOR_INTERNAL_EVAL)
            # loss_5 is calculated but not used in the meta-loss for optimization
            # loss_5 = calculate_classification_loss(temp_simple_nn, eval_loader_B_internal, meta_criterion_ce, num_batches_to_sample=NUM_BATCHES_FOR_INTERNAL_EVAL)

            # Custom loss: Minimize loss on 0-4 ONLY
            batch_meta_loss += loss_0_4 # Only minimize loss on 0-4
            current_batch_loss_0_4 += loss_0_4.item()

        meta_loss = batch_meta_loss / predicted_diff_for_m1_flat_batch.shape[0] # Average over batch
        avg_loss_0_4 = current_batch_loss_0_4 / predicted_diff_for_m1_flat_batch.shape[0]
        # avg_loss_5 is not relevant for the meta-loss anymore, but can be calculated for monitoring if desired

        meta_loss.backward()
        meta_optimizer.step()
        meta_train_losses.append(meta_loss.item())
        epoch_loss_sum += meta_loss.item()
        num_batches_in_epoch += 1

        if (batch_idx + 1) % META_LOSS_PRINT_INTERVAL == 0:
            # Only print L_0-4 as it's the only component of the meta-loss
            print(f"    Epoch {epoch+1}/{META_MODEL_EPOCHS}, Batch {batch_idx+1}/{len(meta_dataloader)} - Avg Meta Loss: {epoch_loss_sum / num_batches_in_epoch:.4f} (L_0-4: {avg_loss_0_4:.4f})")
            epoch_loss_sum = 0.0 # Reset for next interval
            num_batches_in_epoch = 0 # Reset for next interval

    # Print final loss for the epoch if not already printed by interval
    if num_batches_in_epoch > 0:
        print(f"    Epoch {epoch+1}/{META_MODEL_EPOCHS}, Final Batch - Avg Meta Loss: {epoch_loss_sum / num_batches_in_epoch:.4f} (L_0-4: {avg_loss_0_4:.4f})")


print("MetaModel training complete.")

# Save the trained MetaModel
meta_model_save_path = os.path.join(SAVE_DIR, 'meta_model_custom_loss.pt')
torch.save(meta_model.state_dict(), meta_model_save_path)
print(f"MetaModel saved to {meta_model_save_path}")

# --- Plotting MetaModel Training Loss ---
plt.figure(figsize=(10, 6))
plt.plot(meta_train_losses)
plt.xlabel('MetaModel Training Step (Batch)')
plt.ylabel('Custom Loss (Loss_0-4)') # Updated label
plt.title('MetaModel Training Loss Over Steps (Custom Loss)')
plt.grid(True)
plot_save_path = os.path.join(PLOTS_DIR, 'meta_model_training_custom_loss.png')
plt.savefig(plot_save_path)
plt.close()


# 5. Evaluate the MetaModel's Output (by evaluating the predicted SimpleNN)
print("\n--- Evaluating the MetaModel's Output ---")

# Pick a random pair of initial Class 1 and Class 2 models for testing
test_original_class1_sd = random.choice(class1_models)
test_class2_sd = random.choice(class2_models)

# Flatten and concatenate for MetaModel input
flat_test_original_class1 = flatten_state_dict(test_original_class1_sd).to(DEVICE)
flat_test_class2 = flatten_state_dict(test_class2_sd).to(DEVICE)
diff_test_m2_m1_flat = flat_test_class2 - flat_test_original_class1
meta_test_input = torch.cat((flat_test_original_class1, diff_test_m2_m1_flat)).unsqueeze(0) # Add batch dimension

# Get predicted difference from MetaModel
meta_model.eval() # Set MetaModel to eval mode for final prediction
with torch.no_grad(): # No gradients needed for final prediction
    predicted_diff_for_m1_flat = meta_model(meta_test_input).squeeze(0) # Remove batch dimension

# Calculate the predicted updated Model 1 weights
predicted_updated_flat_m1 = flat_test_original_class1 + predicted_diff_for_m1_flat

# Unflatten predicted weights back into a state_dict
predicted_model_state_dict = unflatten_tensor_to_state_dict(predicted_updated_flat_m1.cpu(), template_state_dict)

# Instantiate a SimpleNN with the predicted weights
predicted_simple_nn = SimpleNN().to(DEVICE)
predicted_simple_nn.load_state_dict(predicted_model_state_dict)

# Evaluate the predicted SimpleNN on MNIST data (using accuracy)
print("\n--- Performance of the Model Predicted by MetaModel ---")
pred_acc_A = evaluate_simple_nn_accuracy(predicted_simple_nn, eval_loader_A_internal) # Use internal loaders for consistency
pred_acc_B = evaluate_simple_nn_accuracy(predicted_simple_nn, eval_loader_B_internal)

print(f"  Predicted Model Accuracy on {EVAL_SET_A_LABELS} (Accuracy A): {pred_acc_A:.2f}%")
print(f"  Predicted Model Accuracy on {EVAL_SET_B_LABELS} (Accuracy B): {pred_acc_B:.2f}%") # Still show for monitoring
print(f"  Predicted Model Custom Metric (A - B): {pred_acc_A - pred_acc_B:.2f}")

# Optional: Evaluate the original Class 1 model's performance for comparison
original_simple_nn = SimpleNN().to(DEVICE)
original_simple_nn.load_state_dict(test_original_class1_sd)
orig_acc_A = evaluate_simple_nn_accuracy(original_simple_nn, eval_loader_A_internal)
orig_acc_B = evaluate_simple_nn_accuracy(original_simple_nn, eval_loader_B_internal)
print(f"  Original Model Accuracy on {EVAL_SET_A_LABELS} (Accuracy A): {orig_acc_A:.2f}%")
print(f"  Original Model Accuracy on {EVAL_SET_B_LABELS} (Accuracy B): {orig_acc_B:.2f}%")
print(f"  Original Model Custom Metric (A - B): {orig_acc_A - orig_acc_B:.2f}")

print("\nMeta-Learning Experiment complete!")
print("Check the 'meta_plots' directory for visualizations of MetaModel training and output performance.")


Using device: mps
Loading MNIST dataset...
MNIST dataset loaded.
Flattened SimpleNN state_dict size: 7850
Attempting to load pre-generated Class 1 and Class 2 models...
Loaded 1000 Class 1 models.
Loaded 1000 Class 2 models.
Attempting to load pre-generated Target Updated Model 1 weights...
Loaded 50 Target Updated Model 1 models.
Preparing 1000 training inputs for the MetaModel...


Preparing MetaModel data: 100%|██████████| 1000/1000 [00:00<00:00, 11463.42it/s]



--- Training the MetaModel with Custom Loss ---


Training MetaModel:   0%|          | 1/200 [01:46<5:54:50, 106.99s/it]

    Epoch 1/200, Final Batch - Avg Meta Loss: 1.9395 (L_0-4: 1.9684)


Training MetaModel:   1%|          | 2/200 [03:32<5:49:50, 106.01s/it]

    Epoch 2/200, Final Batch - Avg Meta Loss: 1.9362 (L_0-4: 1.9595)


Training MetaModel:   2%|▏         | 3/200 [05:18<5:48:34, 106.17s/it]

    Epoch 3/200, Final Batch - Avg Meta Loss: 1.9309 (L_0-4: 1.9212)


Training MetaModel:   2%|▏         | 4/200 [07:04<5:46:04, 105.94s/it]

    Epoch 4/200, Final Batch - Avg Meta Loss: 1.9328 (L_0-4: 1.9432)


Training MetaModel:   2%|▎         | 5/200 [09:01<5:57:40, 110.05s/it]

    Epoch 5/200, Final Batch - Avg Meta Loss: 1.9333 (L_0-4: 1.9497)


Training MetaModel:   3%|▎         | 6/200 [10:55<6:00:21, 111.45s/it]

    Epoch 6/200, Final Batch - Avg Meta Loss: 1.9359 (L_0-4: 1.9149)


Training MetaModel:   4%|▎         | 7/200 [12:58<6:09:58, 115.02s/it]

    Epoch 7/200, Final Batch - Avg Meta Loss: 1.9326 (L_0-4: 1.9179)


Training MetaModel:   4%|▍         | 8/200 [14:52<6:07:00, 114.69s/it]

    Epoch 8/200, Final Batch - Avg Meta Loss: 1.9343 (L_0-4: 1.9514)


Training MetaModel:   4%|▍         | 9/200 [16:43<6:01:44, 113.63s/it]

    Epoch 9/200, Final Batch - Avg Meta Loss: 1.9314 (L_0-4: 1.9340)


Training MetaModel:   5%|▌         | 10/200 [18:33<5:56:48, 112.68s/it]

    Epoch 10/200, Final Batch - Avg Meta Loss: 1.9359 (L_0-4: 1.9488)


Training MetaModel:   6%|▌         | 11/200 [20:21<5:50:19, 111.22s/it]

    Epoch 11/200, Final Batch - Avg Meta Loss: 1.9358 (L_0-4: 1.9264)


Training MetaModel:   6%|▌         | 12/200 [22:09<5:44:40, 110.00s/it]

    Epoch 12/200, Final Batch - Avg Meta Loss: 1.9340 (L_0-4: 1.9483)


Training MetaModel:   6%|▋         | 13/200 [23:56<5:40:10, 109.15s/it]

    Epoch 13/200, Final Batch - Avg Meta Loss: 1.9361 (L_0-4: 1.9434)


Training MetaModel:   7%|▋         | 14/200 [25:43<5:36:19, 108.49s/it]

    Epoch 14/200, Final Batch - Avg Meta Loss: 1.9341 (L_0-4: 1.9404)


Training MetaModel:   8%|▊         | 15/200 [27:28<5:31:18, 107.45s/it]

    Epoch 15/200, Final Batch - Avg Meta Loss: 1.9354 (L_0-4: 1.9072)


Training MetaModel:   8%|▊         | 16/200 [29:13<5:27:48, 106.89s/it]

    Epoch 16/200, Final Batch - Avg Meta Loss: 1.9318 (L_0-4: 1.9301)


Training MetaModel:   8%|▊         | 17/200 [30:59<5:24:37, 106.43s/it]

    Epoch 17/200, Final Batch - Avg Meta Loss: 1.9331 (L_0-4: 1.9538)


Training MetaModel:   9%|▉         | 18/200 [32:44<5:22:00, 106.15s/it]

    Epoch 18/200, Final Batch - Avg Meta Loss: 1.9342 (L_0-4: 1.9031)


Training MetaModel:  10%|▉         | 19/200 [34:32<5:21:18, 106.51s/it]

    Epoch 19/200, Final Batch - Avg Meta Loss: 1.9379 (L_0-4: 1.9368)


Training MetaModel:  10%|█         | 20/200 [36:18<5:19:29, 106.50s/it]

    Epoch 20/200, Final Batch - Avg Meta Loss: 1.9345 (L_0-4: 1.9123)


Training MetaModel:  10%|█         | 21/200 [38:04<5:17:25, 106.40s/it]

    Epoch 21/200, Final Batch - Avg Meta Loss: 1.9349 (L_0-4: 1.9447)


Training MetaModel:  11%|█         | 22/200 [39:50<5:15:07, 106.22s/it]

    Epoch 22/200, Final Batch - Avg Meta Loss: 1.9319 (L_0-4: 1.9190)


Training MetaModel:  12%|█▏        | 23/200 [41:35<5:12:17, 105.86s/it]

    Epoch 23/200, Final Batch - Avg Meta Loss: 1.9346 (L_0-4: 1.9317)


Training MetaModel:  12%|█▏        | 24/200 [43:21<5:10:10, 105.74s/it]

    Epoch 24/200, Final Batch - Avg Meta Loss: 1.9342 (L_0-4: 1.9305)


Training MetaModel:  12%|█▎        | 25/200 [45:07<5:08:46, 105.87s/it]

    Epoch 25/200, Final Batch - Avg Meta Loss: 1.9340 (L_0-4: 1.9570)


Training MetaModel:  13%|█▎        | 26/200 [46:53<5:07:07, 105.91s/it]

    Epoch 26/200, Final Batch - Avg Meta Loss: 1.9369 (L_0-4: 1.9408)


Training MetaModel:  14%|█▎        | 27/200 [48:48<5:13:49, 108.84s/it]

    Epoch 27/200, Final Batch - Avg Meta Loss: 1.9332 (L_0-4: 1.9343)


Training MetaModel:  14%|█▍        | 28/200 [50:43<5:17:04, 110.61s/it]

    Epoch 28/200, Final Batch - Avg Meta Loss: 1.9335 (L_0-4: 1.9535)


Training MetaModel:  14%|█▍        | 29/200 [52:29<5:11:25, 109.27s/it]

    Epoch 29/200, Final Batch - Avg Meta Loss: 1.9348 (L_0-4: 1.9115)


Training MetaModel:  15%|█▌        | 30/200 [54:15<5:06:49, 108.29s/it]

    Epoch 30/200, Final Batch - Avg Meta Loss: 1.9324 (L_0-4: 1.9251)


Training MetaModel:  16%|█▌        | 31/200 [56:01<5:02:53, 107.54s/it]

    Epoch 31/200, Final Batch - Avg Meta Loss: 1.9340 (L_0-4: 1.9218)


Training MetaModel:  16%|█▌        | 32/200 [57:47<4:59:37, 107.01s/it]

    Epoch 32/200, Final Batch - Avg Meta Loss: 1.9357 (L_0-4: 1.9299)


Training MetaModel:  16%|█▋        | 33/200 [59:33<4:56:56, 106.69s/it]

    Epoch 33/200, Final Batch - Avg Meta Loss: 1.9297 (L_0-4: 1.9302)


Training MetaModel:  17%|█▋        | 34/200 [1:01:18<4:53:57, 106.25s/it]

    Epoch 34/200, Final Batch - Avg Meta Loss: 1.9352 (L_0-4: 1.9432)


Training MetaModel:  18%|█▊        | 35/200 [1:03:03<4:51:24, 105.97s/it]

    Epoch 35/200, Final Batch - Avg Meta Loss: 1.9384 (L_0-4: 1.9577)


Training MetaModel:  18%|█▊        | 36/200 [1:19:42<17:01:26, 373.70s/it]

    Epoch 36/200, Final Batch - Avg Meta Loss: 1.9332 (L_0-4: 1.9339)


Training MetaModel:  18%|█▊        | 37/200 [1:21:23<13:13:13, 291.98s/it]

    Epoch 37/200, Final Batch - Avg Meta Loss: 1.9328 (L_0-4: 1.9515)


Training MetaModel:  19%|█▉        | 38/200 [1:27:11<13:53:31, 308.71s/it]

    Epoch 38/200, Final Batch - Avg Meta Loss: 1.9336 (L_0-4: 1.9402)


Training MetaModel:  20%|█▉        | 39/200 [1:33:55<15:05:33, 337.47s/it]

    Epoch 39/200, Final Batch - Avg Meta Loss: 1.9330 (L_0-4: 1.9452)


Training MetaModel:  20%|██        | 40/200 [1:36:39<12:40:38, 285.24s/it]

    Epoch 40/200, Final Batch - Avg Meta Loss: 1.9327 (L_0-4: 1.9265)


Training MetaModel:  20%|██        | 41/200 [1:39:12<10:50:48, 245.59s/it]

    Epoch 41/200, Final Batch - Avg Meta Loss: 1.9332 (L_0-4: 1.9418)


Training MetaModel:  21%|██        | 42/200 [1:41:41<9:30:47, 216.76s/it] 

    Epoch 42/200, Final Batch - Avg Meta Loss: 1.9346 (L_0-4: 1.9105)


Training MetaModel:  22%|██▏       | 43/200 [1:44:04<8:28:52, 194.48s/it]

    Epoch 43/200, Final Batch - Avg Meta Loss: 1.9318 (L_0-4: 1.9387)


Training MetaModel:  22%|██▏       | 44/200 [1:46:24<7:43:15, 178.18s/it]

    Epoch 44/200, Final Batch - Avg Meta Loss: 1.9336 (L_0-4: 1.9416)


Training MetaModel:  22%|██▎       | 45/200 [1:48:42<7:09:18, 166.19s/it]

    Epoch 45/200, Final Batch - Avg Meta Loss: 1.9395 (L_0-4: 1.9297)


Training MetaModel:  23%|██▎       | 46/200 [1:51:00<6:44:24, 157.56s/it]

    Epoch 46/200, Final Batch - Avg Meta Loss: 1.9321 (L_0-4: 1.9410)


Training MetaModel:  24%|██▎       | 47/200 [1:53:16<6:25:26, 151.15s/it]

    Epoch 47/200, Final Batch - Avg Meta Loss: 1.9348 (L_0-4: 1.9344)


Training MetaModel:  24%|██▍       | 48/200 [1:55:32<6:11:29, 146.64s/it]

    Epoch 48/200, Final Batch - Avg Meta Loss: 1.9365 (L_0-4: 1.9579)


Training MetaModel:  24%|██▍       | 49/200 [1:57:54<6:05:59, 145.42s/it]

    Epoch 49/200, Final Batch - Avg Meta Loss: 1.9353 (L_0-4: 1.9374)


Training MetaModel:  25%|██▌       | 50/200 [2:00:10<5:56:21, 142.54s/it]

    Epoch 50/200, Final Batch - Avg Meta Loss: 1.9342 (L_0-4: 1.9253)


Training MetaModel:  26%|██▌       | 51/200 [2:02:27<5:49:50, 140.87s/it]

    Epoch 51/200, Final Batch - Avg Meta Loss: 1.9333 (L_0-4: 1.9454)


Training MetaModel:  26%|██▌       | 52/200 [2:04:47<5:46:23, 140.43s/it]

    Epoch 52/200, Final Batch - Avg Meta Loss: 1.9349 (L_0-4: 1.9424)


Training MetaModel:  26%|██▋       | 53/200 [2:07:10<5:46:04, 141.26s/it]

    Epoch 53/200, Final Batch - Avg Meta Loss: 1.9319 (L_0-4: 1.9460)


Training MetaModel:  27%|██▋       | 54/200 [2:09:28<5:41:13, 140.23s/it]

    Epoch 54/200, Final Batch - Avg Meta Loss: 1.9365 (L_0-4: 1.9485)


Training MetaModel:  28%|██▊       | 55/200 [2:11:58<5:46:35, 143.42s/it]

    Epoch 55/200, Final Batch - Avg Meta Loss: 1.9305 (L_0-4: 1.9441)


Training MetaModel:  28%|██▊       | 56/200 [2:14:16<5:40:05, 141.71s/it]

    Epoch 56/200, Final Batch - Avg Meta Loss: 1.9338 (L_0-4: 1.9463)


Training MetaModel:  28%|██▊       | 57/200 [2:16:32<5:33:35, 139.97s/it]

    Epoch 57/200, Final Batch - Avg Meta Loss: 1.9323 (L_0-4: 1.9205)


Training MetaModel:  29%|██▉       | 58/200 [2:28:09<12:06:59, 307.18s/it]

    Epoch 58/200, Final Batch - Avg Meta Loss: 1.9355 (L_0-4: 1.9341)


Training MetaModel:  30%|██▉       | 59/200 [2:29:50<9:36:29, 245.31s/it] 

    Epoch 59/200, Final Batch - Avg Meta Loss: 1.9351 (L_0-4: 1.9388)


Training MetaModel:  30%|███       | 60/200 [2:31:33<7:52:26, 202.48s/it]

    Epoch 60/200, Final Batch - Avg Meta Loss: 1.9337 (L_0-4: 1.9329)


Training MetaModel:  30%|███       | 61/200 [2:33:16<6:40:07, 172.72s/it]

    Epoch 61/200, Final Batch - Avg Meta Loss: 1.9360 (L_0-4: 1.9336)


Training MetaModel:  31%|███       | 62/200 [2:35:01<5:50:19, 152.32s/it]

    Epoch 62/200, Final Batch - Avg Meta Loss: 1.9344 (L_0-4: 1.9516)


Training MetaModel:  32%|███▏      | 63/200 [2:36:45<5:15:00, 137.96s/it]

    Epoch 63/200, Final Batch - Avg Meta Loss: 1.9346 (L_0-4: 1.9375)


Training MetaModel:  32%|███▏      | 64/200 [2:38:31<4:50:45, 128.27s/it]

    Epoch 64/200, Final Batch - Avg Meta Loss: 1.9333 (L_0-4: 1.9303)


Training MetaModel:  32%|███▎      | 65/200 [2:40:16<4:32:51, 121.27s/it]

    Epoch 65/200, Final Batch - Avg Meta Loss: 1.9335 (L_0-4: 1.9413)


Training MetaModel:  33%|███▎      | 66/200 [2:42:02<4:20:31, 116.65s/it]

    Epoch 66/200, Final Batch - Avg Meta Loss: 1.9338 (L_0-4: 1.9525)


Training MetaModel:  34%|███▎      | 67/200 [2:43:47<4:11:05, 113.28s/it]

    Epoch 67/200, Final Batch - Avg Meta Loss: 1.9364 (L_0-4: 1.9475)


Training MetaModel:  34%|███▍      | 68/200 [2:45:32<4:03:19, 110.60s/it]

    Epoch 68/200, Final Batch - Avg Meta Loss: 1.9344 (L_0-4: 1.9381)


Training MetaModel:  34%|███▍      | 69/200 [2:47:16<3:57:19, 108.70s/it]

    Epoch 69/200, Final Batch - Avg Meta Loss: 1.9319 (L_0-4: 1.9194)


Training MetaModel:  35%|███▌      | 70/200 [2:49:01<3:53:17, 107.67s/it]

    Epoch 70/200, Final Batch - Avg Meta Loss: 1.9323 (L_0-4: 1.8990)


Training MetaModel:  36%|███▌      | 71/200 [2:50:46<3:49:49, 106.89s/it]

    Epoch 71/200, Final Batch - Avg Meta Loss: 1.9373 (L_0-4: 1.9614)


Training MetaModel:  36%|███▌      | 72/200 [2:52:31<3:46:25, 106.14s/it]

    Epoch 72/200, Final Batch - Avg Meta Loss: 1.9373 (L_0-4: 1.9529)


Training MetaModel:  36%|███▋      | 73/200 [2:54:15<3:43:44, 105.71s/it]

    Epoch 73/200, Final Batch - Avg Meta Loss: 1.9340 (L_0-4: 1.9156)


Training MetaModel:  37%|███▋      | 74/200 [2:56:01<3:41:46, 105.61s/it]

    Epoch 74/200, Final Batch - Avg Meta Loss: 1.9362 (L_0-4: 1.9260)


Training MetaModel:  38%|███▊      | 75/200 [2:57:46<3:39:40, 105.44s/it]

    Epoch 75/200, Final Batch - Avg Meta Loss: 1.9307 (L_0-4: 1.9259)


Training MetaModel:  38%|███▊      | 76/200 [2:59:30<3:37:24, 105.20s/it]

    Epoch 76/200, Final Batch - Avg Meta Loss: 1.9338 (L_0-4: 1.9253)


Training MetaModel:  38%|███▊      | 77/200 [3:01:15<3:35:09, 104.96s/it]

    Epoch 77/200, Final Batch - Avg Meta Loss: 1.9324 (L_0-4: 1.9306)


Training MetaModel:  39%|███▉      | 78/200 [3:02:59<3:33:10, 104.84s/it]

    Epoch 78/200, Final Batch - Avg Meta Loss: 1.9314 (L_0-4: 1.9341)


Training MetaModel:  40%|███▉      | 79/200 [3:04:44<3:31:24, 104.83s/it]

    Epoch 79/200, Final Batch - Avg Meta Loss: 1.9336 (L_0-4: 1.9159)


Training MetaModel:  40%|████      | 80/200 [3:06:32<3:31:44, 105.87s/it]

    Epoch 80/200, Final Batch - Avg Meta Loss: 1.9321 (L_0-4: 1.9300)


Training MetaModel:  40%|████      | 81/200 [3:08:18<3:29:43, 105.74s/it]

    Epoch 81/200, Final Batch - Avg Meta Loss: 1.9327 (L_0-4: 1.9027)


Training MetaModel:  41%|████      | 82/200 [3:10:06<3:29:21, 106.45s/it]

    Epoch 82/200, Final Batch - Avg Meta Loss: 1.9320 (L_0-4: 1.9205)


Training MetaModel:  42%|████▏     | 83/200 [3:11:56<3:29:38, 107.51s/it]

    Epoch 83/200, Final Batch - Avg Meta Loss: 1.9319 (L_0-4: 1.9123)


Training MetaModel:  42%|████▏     | 84/200 [3:13:42<3:26:58, 107.06s/it]

    Epoch 84/200, Final Batch - Avg Meta Loss: 1.9332 (L_0-4: 1.9143)


Training MetaModel:  42%|████▎     | 85/200 [3:15:28<3:24:22, 106.63s/it]

    Epoch 85/200, Final Batch - Avg Meta Loss: 1.9318 (L_0-4: 1.9295)


Training MetaModel:  43%|████▎     | 86/200 [3:17:13<3:22:05, 106.36s/it]

    Epoch 86/200, Final Batch - Avg Meta Loss: 1.9356 (L_0-4: 1.9369)


Training MetaModel:  44%|████▎     | 87/200 [3:19:00<3:20:32, 106.48s/it]

    Epoch 87/200, Final Batch - Avg Meta Loss: 1.9363 (L_0-4: 1.9473)


Training MetaModel:  44%|████▍     | 88/200 [3:20:50<3:20:33, 107.44s/it]

    Epoch 88/200, Final Batch - Avg Meta Loss: 1.9349 (L_0-4: 1.9148)


Training MetaModel:  44%|████▍     | 89/200 [3:22:35<3:17:44, 106.89s/it]

    Epoch 89/200, Final Batch - Avg Meta Loss: 1.9330 (L_0-4: 1.9393)


Training MetaModel:  45%|████▌     | 90/200 [3:24:21<3:15:10, 106.46s/it]

    Epoch 90/200, Final Batch - Avg Meta Loss: 1.9333 (L_0-4: 1.9391)


Training MetaModel:  46%|████▌     | 91/200 [3:26:06<3:12:48, 106.13s/it]

    Epoch 91/200, Final Batch - Avg Meta Loss: 1.9329 (L_0-4: 1.9308)


Training MetaModel:  46%|████▌     | 92/200 [3:27:51<3:10:32, 105.86s/it]

    Epoch 92/200, Final Batch - Avg Meta Loss: 1.9340 (L_0-4: 1.9316)


Training MetaModel:  46%|████▋     | 93/200 [3:29:37<3:08:41, 105.81s/it]

    Epoch 93/200, Final Batch - Avg Meta Loss: 1.9339 (L_0-4: 1.9286)


Training MetaModel:  47%|████▋     | 94/200 [3:31:22<3:06:20, 105.48s/it]

    Epoch 94/200, Final Batch - Avg Meta Loss: 1.9373 (L_0-4: 1.9335)


Training MetaModel:  48%|████▊     | 95/200 [3:33:07<3:04:12, 105.26s/it]

    Epoch 95/200, Final Batch - Avg Meta Loss: 1.9352 (L_0-4: 1.9454)


Training MetaModel:  48%|████▊     | 96/200 [3:34:52<3:02:22, 105.22s/it]

    Epoch 96/200, Final Batch - Avg Meta Loss: 1.9320 (L_0-4: 1.9279)


Training MetaModel:  48%|████▊     | 97/200 [3:36:40<3:02:20, 106.22s/it]

    Epoch 97/200, Final Batch - Avg Meta Loss: 1.9340 (L_0-4: 1.9508)


Training MetaModel:  49%|████▉     | 98/200 [3:38:29<3:02:04, 107.11s/it]

    Epoch 98/200, Final Batch - Avg Meta Loss: 1.9353 (L_0-4: 1.9298)


Training MetaModel:  50%|████▉     | 99/200 [3:40:20<3:01:51, 108.04s/it]

    Epoch 99/200, Final Batch - Avg Meta Loss: 1.9332 (L_0-4: 1.9233)


Training MetaModel:  50%|█████     | 100/200 [3:42:08<3:00:16, 108.16s/it]

    Epoch 100/200, Final Batch - Avg Meta Loss: 1.9342 (L_0-4: 1.9179)


Training MetaModel:  50%|█████     | 101/200 [3:43:55<2:57:59, 107.87s/it]

    Epoch 101/200, Final Batch - Avg Meta Loss: 1.9350 (L_0-4: 1.9127)


Training MetaModel:  51%|█████     | 102/200 [3:45:41<2:55:04, 107.19s/it]

    Epoch 102/200, Final Batch - Avg Meta Loss: 1.9366 (L_0-4: 1.9207)


Training MetaModel:  52%|█████▏    | 103/200 [3:47:26<2:52:11, 106.51s/it]

    Epoch 103/200, Final Batch - Avg Meta Loss: 1.9365 (L_0-4: 1.9594)


Training MetaModel:  52%|█████▏    | 104/200 [3:49:11<2:49:41, 106.05s/it]

    Epoch 104/200, Final Batch - Avg Meta Loss: 1.9344 (L_0-4: 1.9354)


Training MetaModel:  52%|█████▎    | 105/200 [3:50:56<2:47:29, 105.78s/it]

    Epoch 105/200, Final Batch - Avg Meta Loss: 1.9348 (L_0-4: 1.9721)


Training MetaModel:  53%|█████▎    | 106/200 [3:52:41<2:45:15, 105.48s/it]

    Epoch 106/200, Final Batch - Avg Meta Loss: 1.9361 (L_0-4: 1.9320)


Training MetaModel:  54%|█████▎    | 107/200 [3:54:25<2:42:59, 105.16s/it]

    Epoch 107/200, Final Batch - Avg Meta Loss: 1.9333 (L_0-4: 1.9213)


Training MetaModel:  54%|█████▍    | 108/200 [3:56:10<2:41:07, 105.09s/it]

    Epoch 108/200, Final Batch - Avg Meta Loss: 1.9366 (L_0-4: 1.9349)


Training MetaModel:  55%|█████▍    | 109/200 [3:57:54<2:39:01, 104.86s/it]

    Epoch 109/200, Final Batch - Avg Meta Loss: 1.9370 (L_0-4: 1.9570)


Training MetaModel:  55%|█████▌    | 110/200 [3:59:39<2:37:03, 104.71s/it]

    Epoch 110/200, Final Batch - Avg Meta Loss: 1.9343 (L_0-4: 1.9217)


Training MetaModel:  56%|█████▌    | 111/200 [4:01:25<2:35:49, 105.05s/it]

    Epoch 111/200, Final Batch - Avg Meta Loss: 1.9347 (L_0-4: 1.9219)


Training MetaModel:  56%|█████▌    | 112/200 [4:03:10<2:34:15, 105.18s/it]

    Epoch 112/200, Final Batch - Avg Meta Loss: 1.9326 (L_0-4: 1.9097)


Training MetaModel:  56%|█████▋    | 113/200 [4:04:55<2:32:15, 105.01s/it]

    Epoch 113/200, Final Batch - Avg Meta Loss: 1.9320 (L_0-4: 1.9112)


Training MetaModel:  57%|█████▋    | 114/200 [4:06:39<2:30:15, 104.83s/it]

    Epoch 114/200, Final Batch - Avg Meta Loss: 1.9343 (L_0-4: 1.9346)


Training MetaModel:  57%|█████▊    | 115/200 [4:08:23<2:28:16, 104.66s/it]

    Epoch 115/200, Final Batch - Avg Meta Loss: 1.9369 (L_0-4: 1.9422)


Training MetaModel:  58%|█████▊    | 116/200 [4:10:09<2:26:47, 104.85s/it]

    Epoch 116/200, Final Batch - Avg Meta Loss: 1.9349 (L_0-4: 1.9504)


Training MetaModel:  58%|█████▊    | 117/200 [4:11:54<2:25:21, 105.08s/it]

    Epoch 117/200, Final Batch - Avg Meta Loss: 1.9332 (L_0-4: 1.9308)


Training MetaModel:  59%|█████▉    | 118/200 [4:13:40<2:23:45, 105.19s/it]

    Epoch 118/200, Final Batch - Avg Meta Loss: 1.9336 (L_0-4: 1.9204)


Training MetaModel:  60%|█████▉    | 119/200 [4:15:25<2:22:03, 105.23s/it]

    Epoch 119/200, Final Batch - Avg Meta Loss: 1.9334 (L_0-4: 1.9004)


Training MetaModel:  60%|██████    | 120/200 [4:17:12<2:20:53, 105.67s/it]

    Epoch 120/200, Final Batch - Avg Meta Loss: 1.9315 (L_0-4: 1.9155)


Training MetaModel:  60%|██████    | 121/200 [4:18:57<2:19:07, 105.67s/it]

    Epoch 121/200, Final Batch - Avg Meta Loss: 1.9349 (L_0-4: 1.9458)


Training MetaModel:  61%|██████    | 122/200 [4:20:43<2:17:31, 105.79s/it]

    Epoch 122/200, Final Batch - Avg Meta Loss: 1.9320 (L_0-4: 1.9365)


Training MetaModel:  62%|██████▏   | 123/200 [4:22:31<2:16:25, 106.31s/it]

    Epoch 123/200, Final Batch - Avg Meta Loss: 1.9331 (L_0-4: 1.9365)


Training MetaModel:  62%|██████▏   | 124/200 [4:24:18<2:14:49, 106.44s/it]

    Epoch 124/200, Final Batch - Avg Meta Loss: 1.9342 (L_0-4: 1.9559)


Training MetaModel:  62%|██████▎   | 125/200 [4:26:03<2:12:38, 106.11s/it]

    Epoch 125/200, Final Batch - Avg Meta Loss: 1.9343 (L_0-4: 1.9340)


Training MetaModel:  63%|██████▎   | 126/200 [4:27:49<2:10:47, 106.05s/it]

    Epoch 126/200, Final Batch - Avg Meta Loss: 1.9335 (L_0-4: 1.9168)


Training MetaModel:  64%|██████▎   | 127/200 [4:29:37<2:09:38, 106.55s/it]

    Epoch 127/200, Final Batch - Avg Meta Loss: 1.9335 (L_0-4: 1.9214)


Training MetaModel:  64%|██████▍   | 128/200 [4:31:22<2:07:14, 106.04s/it]

    Epoch 128/200, Final Batch - Avg Meta Loss: 1.9334 (L_0-4: 1.9568)


Training MetaModel:  64%|██████▍   | 129/200 [4:33:06<2:04:59, 105.63s/it]

    Epoch 129/200, Final Batch - Avg Meta Loss: 1.9343 (L_0-4: 1.9151)


Training MetaModel:  65%|██████▌   | 130/200 [4:34:51<2:03:05, 105.51s/it]

    Epoch 130/200, Final Batch - Avg Meta Loss: 1.9341 (L_0-4: 1.9235)


Training MetaModel:  66%|██████▌   | 131/200 [4:36:36<2:01:01, 105.25s/it]

    Epoch 131/200, Final Batch - Avg Meta Loss: 1.9345 (L_0-4: 1.9253)


Training MetaModel:  66%|██████▌   | 132/200 [4:38:21<1:59:00, 105.01s/it]

    Epoch 132/200, Final Batch - Avg Meta Loss: 1.9337 (L_0-4: 1.9512)


Training MetaModel:  66%|██████▋   | 133/200 [4:40:06<1:57:14, 104.99s/it]

    Epoch 133/200, Final Batch - Avg Meta Loss: 1.9351 (L_0-4: 1.9384)


Training MetaModel:  67%|██████▋   | 134/200 [4:41:51<1:55:36, 105.09s/it]

    Epoch 134/200, Final Batch - Avg Meta Loss: 1.9345 (L_0-4: 1.9468)


Training MetaModel:  68%|██████▊   | 135/200 [4:43:36<1:53:45, 105.00s/it]

    Epoch 135/200, Final Batch - Avg Meta Loss: 1.9362 (L_0-4: 1.9298)


Training MetaModel:  68%|██████▊   | 136/200 [4:45:20<1:51:43, 104.74s/it]

    Epoch 136/200, Final Batch - Avg Meta Loss: 1.9328 (L_0-4: 1.9468)


Training MetaModel:  68%|██████▊   | 137/200 [4:47:04<1:49:53, 104.66s/it]

    Epoch 137/200, Final Batch - Avg Meta Loss: 1.9322 (L_0-4: 1.8965)


Training MetaModel:  69%|██████▉   | 138/200 [4:48:49<1:48:16, 104.77s/it]

    Epoch 138/200, Final Batch - Avg Meta Loss: 1.9337 (L_0-4: 1.9451)


Training MetaModel:  70%|██████▉   | 139/200 [4:50:37<1:47:27, 105.70s/it]

    Epoch 139/200, Final Batch - Avg Meta Loss: 1.9346 (L_0-4: 1.9615)


Training MetaModel:  70%|███████   | 140/200 [4:52:23<1:45:48, 105.81s/it]

    Epoch 140/200, Final Batch - Avg Meta Loss: 1.9356 (L_0-4: 1.9408)


Training MetaModel:  70%|███████   | 141/200 [4:54:10<1:44:14, 106.02s/it]

    Epoch 141/200, Final Batch - Avg Meta Loss: 1.9330 (L_0-4: 1.9211)


Training MetaModel:  71%|███████   | 142/200 [4:55:55<1:42:18, 105.83s/it]

    Epoch 142/200, Final Batch - Avg Meta Loss: 1.9339 (L_0-4: 1.9341)


Training MetaModel:  72%|███████▏  | 143/200 [4:57:40<1:40:13, 105.51s/it]

    Epoch 143/200, Final Batch - Avg Meta Loss: 1.9335 (L_0-4: 1.9126)


Training MetaModel:  72%|███████▏  | 144/200 [4:59:25<1:38:14, 105.25s/it]

    Epoch 144/200, Final Batch - Avg Meta Loss: 1.9356 (L_0-4: 1.9578)


Training MetaModel:  72%|███████▎  | 145/200 [5:01:09<1:36:23, 105.16s/it]

    Epoch 145/200, Final Batch - Avg Meta Loss: 1.9335 (L_0-4: 1.9500)


Training MetaModel:  73%|███████▎  | 146/200 [5:02:54<1:34:31, 105.02s/it]

    Epoch 146/200, Final Batch - Avg Meta Loss: 1.9342 (L_0-4: 1.9585)


Training MetaModel:  74%|███████▎  | 147/200 [5:04:39<1:32:45, 105.00s/it]

    Epoch 147/200, Final Batch - Avg Meta Loss: 1.9338 (L_0-4: 1.9380)


Training MetaModel:  74%|███████▍  | 148/200 [5:06:24<1:30:53, 104.88s/it]

    Epoch 148/200, Final Batch - Avg Meta Loss: 1.9354 (L_0-4: 1.9476)


Training MetaModel:  74%|███████▍  | 149/200 [5:08:09<1:29:11, 104.92s/it]

    Epoch 149/200, Final Batch - Avg Meta Loss: 1.9322 (L_0-4: 1.9424)


Training MetaModel:  75%|███████▌  | 150/200 [5:09:54<1:27:27, 104.96s/it]

    Epoch 150/200, Final Batch - Avg Meta Loss: 1.9352 (L_0-4: 1.9311)


Training MetaModel:  76%|███████▌  | 151/200 [5:11:40<1:25:58, 105.27s/it]

    Epoch 151/200, Final Batch - Avg Meta Loss: 1.9383 (L_0-4: 1.9585)


Training MetaModel:  76%|███████▌  | 152/200 [5:13:27<1:24:39, 105.83s/it]

    Epoch 152/200, Final Batch - Avg Meta Loss: 1.9341 (L_0-4: 1.9357)


Training MetaModel:  76%|███████▋  | 153/200 [5:15:13<1:23:03, 106.03s/it]

    Epoch 153/200, Final Batch - Avg Meta Loss: 1.9358 (L_0-4: 1.9614)


Training MetaModel:  77%|███████▋  | 154/200 [5:17:00<1:21:31, 106.33s/it]

    Epoch 154/200, Final Batch - Avg Meta Loss: 1.9348 (L_0-4: 1.9222)


Training MetaModel:  78%|███████▊  | 155/200 [5:18:47<1:19:53, 106.52s/it]

    Epoch 155/200, Final Batch - Avg Meta Loss: 1.9378 (L_0-4: 1.9307)


Training MetaModel:  78%|███████▊  | 156/200 [5:20:35<1:18:15, 106.71s/it]

    Epoch 156/200, Final Batch - Avg Meta Loss: 1.9321 (L_0-4: 1.9362)


Training MetaModel:  78%|███████▊  | 157/200 [5:22:22<1:16:33, 106.83s/it]

    Epoch 157/200, Final Batch - Avg Meta Loss: 1.9340 (L_0-4: 1.9416)


Training MetaModel:  79%|███████▉  | 158/200 [5:24:09<1:14:49, 106.89s/it]

    Epoch 158/200, Final Batch - Avg Meta Loss: 1.9344 (L_0-4: 1.9323)


Training MetaModel:  80%|███████▉  | 159/200 [5:25:56<1:13:08, 107.03s/it]

    Epoch 159/200, Final Batch - Avg Meta Loss: 1.9338 (L_0-4: 1.9318)


Training MetaModel:  80%|████████  | 160/200 [5:27:43<1:11:17, 106.93s/it]

    Epoch 160/200, Final Batch - Avg Meta Loss: 1.9361 (L_0-4: 1.9266)


Training MetaModel:  80%|████████  | 161/200 [5:29:31<1:09:45, 107.33s/it]

    Epoch 161/200, Final Batch - Avg Meta Loss: 1.9373 (L_0-4: 1.9268)


Training MetaModel:  81%|████████  | 162/200 [5:31:19<1:08:05, 107.51s/it]

    Epoch 162/200, Final Batch - Avg Meta Loss: 1.9358 (L_0-4: 1.9249)


Training MetaModel:  82%|████████▏ | 163/200 [5:33:06<1:06:11, 107.34s/it]

    Epoch 163/200, Final Batch - Avg Meta Loss: 1.9325 (L_0-4: 1.9177)


Training MetaModel:  82%|████████▏ | 164/200 [5:34:53<1:04:20, 107.23s/it]

    Epoch 164/200, Final Batch - Avg Meta Loss: 1.9353 (L_0-4: 1.9295)


Training MetaModel:  82%|████████▎ | 165/200 [5:36:40<1:02:30, 107.15s/it]

    Epoch 165/200, Final Batch - Avg Meta Loss: 1.9356 (L_0-4: 1.9431)


Training MetaModel:  83%|████████▎ | 166/200 [5:38:26<1:00:29, 106.76s/it]

    Epoch 166/200, Final Batch - Avg Meta Loss: 1.9328 (L_0-4: 1.9317)


Training MetaModel:  84%|████████▎ | 167/200 [5:40:15<59:08, 107.54s/it]  

    Epoch 167/200, Final Batch - Avg Meta Loss: 1.9368 (L_0-4: 1.9404)


Training MetaModel:  84%|████████▍ | 168/200 [5:42:03<57:29, 107.80s/it]

    Epoch 168/200, Final Batch - Avg Meta Loss: 1.9358 (L_0-4: 1.9481)


Training MetaModel:  84%|████████▍ | 169/200 [5:43:50<55:30, 107.42s/it]

    Epoch 169/200, Final Batch - Avg Meta Loss: 1.9339 (L_0-4: 1.9298)


Training MetaModel:  85%|████████▌ | 170/200 [5:45:38<53:46, 107.56s/it]

    Epoch 170/200, Final Batch - Avg Meta Loss: 1.9333 (L_0-4: 1.9465)


Training MetaModel:  86%|████████▌ | 171/200 [5:47:24<51:49, 107.23s/it]

    Epoch 171/200, Final Batch - Avg Meta Loss: 1.9350 (L_0-4: 1.9094)


Training MetaModel:  86%|████████▌ | 172/200 [5:49:11<49:59, 107.11s/it]

    Epoch 172/200, Final Batch - Avg Meta Loss: 1.9346 (L_0-4: 1.9200)


Training MetaModel:  86%|████████▋ | 173/200 [5:50:59<48:14, 107.19s/it]

    Epoch 173/200, Final Batch - Avg Meta Loss: 1.9352 (L_0-4: 1.9495)


Training MetaModel:  87%|████████▋ | 174/200 [5:52:45<46:20, 106.96s/it]

    Epoch 174/200, Final Batch - Avg Meta Loss: 1.9293 (L_0-4: 1.9209)


Training MetaModel:  88%|████████▊ | 175/200 [5:54:32<44:35, 107.00s/it]

    Epoch 175/200, Final Batch - Avg Meta Loss: 1.9350 (L_0-4: 1.9389)


Training MetaModel:  88%|████████▊ | 176/200 [5:56:19<42:47, 106.99s/it]

    Epoch 176/200, Final Batch - Avg Meta Loss: 1.9314 (L_0-4: 1.9200)


Training MetaModel:  88%|████████▊ | 177/200 [5:58:06<41:00, 106.99s/it]

    Epoch 177/200, Final Batch - Avg Meta Loss: 1.9348 (L_0-4: 1.9302)


Training MetaModel:  89%|████████▉ | 178/200 [5:59:53<39:15, 107.09s/it]

    Epoch 178/200, Final Batch - Avg Meta Loss: 1.9354 (L_0-4: 1.9357)


Training MetaModel:  90%|████████▉ | 179/200 [6:01:43<37:43, 107.80s/it]

    Epoch 179/200, Final Batch - Avg Meta Loss: 1.9330 (L_0-4: 1.9546)


Training MetaModel:  90%|█████████ | 180/200 [6:03:30<35:52, 107.62s/it]

    Epoch 180/200, Final Batch - Avg Meta Loss: 1.9349 (L_0-4: 1.9308)


Training MetaModel:  90%|█████████ | 181/200 [6:05:17<34:03, 107.57s/it]

    Epoch 181/200, Final Batch - Avg Meta Loss: 1.9335 (L_0-4: 1.9356)


Training MetaModel:  91%|█████████ | 182/200 [6:07:04<32:12, 107.36s/it]

    Epoch 182/200, Final Batch - Avg Meta Loss: 1.9313 (L_0-4: 1.9152)


Training MetaModel:  92%|█████████▏| 183/200 [6:08:52<30:27, 107.47s/it]

    Epoch 183/200, Final Batch - Avg Meta Loss: 1.9320 (L_0-4: 1.9443)


Training MetaModel:  92%|█████████▏| 184/200 [6:10:39<28:39, 107.45s/it]

    Epoch 184/200, Final Batch - Avg Meta Loss: 1.9355 (L_0-4: 1.9424)


Training MetaModel:  92%|█████████▎| 185/200 [6:12:28<26:57, 107.86s/it]

    Epoch 185/200, Final Batch - Avg Meta Loss: 1.9345 (L_0-4: 1.9344)


Training MetaModel:  93%|█████████▎| 186/200 [6:14:17<25:14, 108.17s/it]

    Epoch 186/200, Final Batch - Avg Meta Loss: 1.9322 (L_0-4: 1.9392)


Training MetaModel:  94%|█████████▎| 187/200 [6:16:03<23:16, 107.40s/it]

    Epoch 187/200, Final Batch - Avg Meta Loss: 1.9368 (L_0-4: 1.9449)


Training MetaModel:  94%|█████████▍| 188/200 [6:17:48<21:20, 106.72s/it]

    Epoch 188/200, Final Batch - Avg Meta Loss: 1.9364 (L_0-4: 1.9367)


Training MetaModel:  94%|█████████▍| 189/200 [6:19:34<19:32, 106.58s/it]

    Epoch 189/200, Final Batch - Avg Meta Loss: 1.9337 (L_0-4: 1.9715)


Training MetaModel:  95%|█████████▌| 190/200 [6:21:20<17:42, 106.24s/it]

    Epoch 190/200, Final Batch - Avg Meta Loss: 1.9341 (L_0-4: 1.9001)


Training MetaModel:  96%|█████████▌| 191/200 [6:23:06<15:55, 106.18s/it]

    Epoch 191/200, Final Batch - Avg Meta Loss: 1.9374 (L_0-4: 1.9483)


Training MetaModel:  96%|█████████▌| 192/200 [6:24:53<14:13, 106.65s/it]

    Epoch 192/200, Final Batch - Avg Meta Loss: 1.9361 (L_0-4: 1.9508)


Training MetaModel:  96%|█████████▋| 193/200 [6:26:41<12:29, 107.01s/it]

    Epoch 193/200, Final Batch - Avg Meta Loss: 1.9317 (L_0-4: 1.9409)


Training MetaModel:  97%|█████████▋| 194/200 [6:28:29<10:43, 107.33s/it]

    Epoch 194/200, Final Batch - Avg Meta Loss: 1.9325 (L_0-4: 1.9163)


Training MetaModel:  98%|█████████▊| 195/200 [6:30:16<08:55, 107.00s/it]

    Epoch 195/200, Final Batch - Avg Meta Loss: 1.9363 (L_0-4: 1.9251)


Training MetaModel:  98%|█████████▊| 196/200 [6:32:02<07:07, 106.85s/it]

    Epoch 196/200, Final Batch - Avg Meta Loss: 1.9356 (L_0-4: 1.9358)


Training MetaModel:  98%|█████████▊| 197/200 [6:33:49<05:20, 106.84s/it]

    Epoch 197/200, Final Batch - Avg Meta Loss: 1.9331 (L_0-4: 1.9163)


Training MetaModel:  99%|█████████▉| 198/200 [6:35:34<03:32, 106.47s/it]

    Epoch 198/200, Final Batch - Avg Meta Loss: 1.9380 (L_0-4: 1.9544)


Training MetaModel: 100%|█████████▉| 199/200 [6:37:20<01:46, 106.26s/it]

    Epoch 199/200, Final Batch - Avg Meta Loss: 1.9347 (L_0-4: 1.9267)


Training MetaModel: 100%|██████████| 200/200 [6:39:07<00:00, 119.74s/it]

    Epoch 200/200, Final Batch - Avg Meta Loss: 1.9364 (L_0-4: 1.9122)
MetaModel training complete.
MetaModel saved to ./trained_models_smallest/meta_model_custom_loss.pt

--- Evaluating the MetaModel's Output ---






--- Performance of the Model Predicted by MetaModel ---
  Predicted Model Accuracy on [0, 1, 2, 3, 4] (Accuracy A): 77.00%
  Predicted Model Accuracy on [5] (Accuracy B): 0.00%
  Predicted Model Custom Metric (A - B): 77.00
  Original Model Accuracy on [0, 1, 2, 3, 4] (Accuracy A): 78.81%
  Original Model Accuracy on [5] (Accuracy B): 0.00%
  Original Model Custom Metric (A - B): 78.81

Meta-Learning Experiment complete!
Check the 'meta_plots' directory for visualizations of MetaModel training and output performance.


In [None]:
Using device: mps
Loading MNIST dataset...
MNIST dataset loaded.
Flattened SimpleNN state_dict size: 12730
Attempting to load pre-generated Class 1 and Class 2 models...
Loaded 20 Class 1 models.
Loaded 20 Class 2 models.
Attempting to load pre-generated Target Updated Model 1 weights...
Loaded 10 Target Updated Model 1 models.
Preparing 1000 training inputs for the MetaModel...
Preparing MetaModel data: 100%|██████████| 1000/1000 [00:00<00:00, 9507.53it/s]

--- Training the MetaModel with Custom Loss ---
Training MetaModel:   0%|          | 0/200 [00:00<?, ?it/s]    Epoch 1/200, Batch 25/63 - Avg Meta Loss: 1.8360 (L_0-4: 1.8562)
    Epoch 1/200, Batch 50/63 - Avg Meta Loss: 1.8438 (L_0-4: 1.8749)
Training MetaModel:   0%|          | 1/200 [16:59<56:20:47, 1019.34s/it]    Epoch 1/200, Final Batch - Avg Meta Loss: 1.8076 (L_0-4: 1.7700)
    Epoch 2/200, Batch 25/63 - Avg Meta Loss: 1.8532 (L_0-4: 1.9306)
    Epoch 2/200, Batch 50/63 - Avg Meta Loss: 1.8256 (L_0-4: 1.8492)
Training MetaModel:   1%|          | 2/200 [1:04:06<114:33:54, 2083.00s/it]    Epoch 2/200, Final Batch - Avg Meta Loss: 1.8135 (L_0-4: 1.7247)
    Epoch 3/200, Batch 25/63 - Avg Meta Loss: 1.8260 (L_0-4: 1.7034)
    Epoch 3/200, Batch 50/63 - Avg Meta Loss: 1.8374 (L_0-4: 1.8373)
Training MetaModel:   2%|▏         | 3/200 [1:21:22<87:48:42, 1604.68s/it]     Epoch 3/200, Final Batch - Avg Meta Loss: 1.8309 (L_0-4: 1.8743)
    Epoch 4/200, Batch 25/63 - Avg Meta Loss: 1.8200 (L_0-4: 1.7904)
    Epoch 4/200, Batch 50/63 - Avg Meta Loss: 1.8323 (L_0-4: 1.7775)
Training MetaModel:   2%|▏         | 4/200 [1:23:17<55:20:24, 1016.45s/it]    Epoch 4/200, Final Batch - Avg Meta Loss: 1.8365 (L_0-4: 1.6249)
    Epoch 5/200, Batch 25/63 - Avg Meta Loss: 1.8167 (L_0-4: 1.9721)
    Epoch 5/200, Batch 50/63 - Avg Meta Loss: 1.8270 (L_0-4: 1.8296)
Training MetaModel:   2%|▎         | 5/200 [1:25:12<37:28:00, 691.70s/it]     Epoch 5/200, Final Batch - Avg Meta Loss: 1.8616 (L_0-4: 1.6108)
    Epoch 6/200, Batch 25/63 - Avg Meta Loss: 1.8340 (L_0-4: 1.7992)
    Epoch 6/200, Batch 50/63 - Avg Meta Loss: 1.8423 (L_0-4: 2.0141)
Training MetaModel:   3%|▎         | 6/200 [1:44:00<45:16:09, 840.05s/it]    Epoch 6/200, Final Batch - Avg Meta Loss: 1.8046 (L_0-4: 1.7554)
    Epoch 7/200, Batch 25/63 - Avg Meta Loss: 1.8364 (L_0-4: 1.9645)
    Epoch 7/200, Batch 50/63 - Avg Meta Loss: 1.8302 (L_0-4: 1.8935)
Training MetaModel:   4%|▎         | 7/200 [1:45:56<32:20:31, 603.27s/it]    Epoch 7/200, Final Batch - Avg Meta Loss: 1.8075 (L_0-4: 1.7013)
    Epoch 8/200, Batch 25/63 - Avg Meta Loss: 1.8306 (L_0-4: 1.7396)
    Epoch 8/200, Batch 50/63 - Avg Meta Loss: 1.8310 (L_0-4: 1.7063)
Training MetaModel:   4%|▍         | 8/200 [2:18:34<55:10:19, 1034.48s/it]    Epoch 8/200, Final Batch - Avg Meta Loss: 1.8423 (L_0-4: 2.0280)
    Epoch 9/200, Batch 25/63 - Avg Meta Loss: 1.8353 (L_0-4: 1.9284)
    Epoch 9/200, Batch 50/63 - Avg Meta Loss: 1.8388 (L_0-4: 1.7543)
Training MetaModel:   4%|▍         | 9/200 [2:51:06<70:06:17, 1321.35s/it]    Epoch 9/200, Final Batch - Avg Meta Loss: 1.8083 (L_0-4: 1.6688)
    Epoch 10/200, Batch 25/63 - Avg Meta Loss: 1.8202 (L_0-4: 1.7861)
    Epoch 10/200, Batch 50/63 - Avg Meta Loss: 1.8595 (L_0-4: 1.8392)
Training MetaModel:   5%|▌         | 10/200 [3:02:32<59:23:17, 1125.25s/it]    Epoch 10/200, Final Batch - Avg Meta Loss: 1.7742 (L_0-4: 1.6124)
    Epoch 11/200, Batch 25/63 - Avg Meta Loss: 1.8508 (L_0-4: 1.7812)
    Epoch 11/200, Batch 50/63 - Avg Meta Loss: 1.8134 (L_0-4: 1.7450)
Training MetaModel:   6%|▌         | 11/200 [3:04:27<42:50:16, 815.96s/it]     Epoch 11/200, Final Batch - Avg Meta Loss: 1.8194 (L_0-4: 1.6707)
    Epoch 12/200, Batch 25/63 - Avg Meta Loss: 1.8328 (L_0-4: 1.6924)
    Epoch 12/200, Batch 50/63 - Avg Meta Loss: 1.8281 (L_0-4: 1.7495)
Training MetaModel:   6%|▌         | 12/200 [3:21:53<46:15:55, 885.93s/it]    Epoch 12/200, Final Batch - Avg Meta Loss: 1.8384 (L_0-4: 1.8846)
    Epoch 13/200, Batch 25/63 - Avg Meta Loss: 1.8037 (L_0-4: 1.9289)
    Epoch 13/200, Batch 50/63 - Avg Meta Loss: 1.8523 (L_0-4: 2.0146)
Training MetaModel:   6%|▋         | 13/200 [4:03:26<71:18:19, 1372.73s/it]    Epoch 13/200, Final Batch - Avg Meta Loss: 1.8401 (L_0-4: 1.7123)
    Epoch 14/200, Batch 25/63 - Avg Meta Loss: 1.8271 (L_0-4: 1.6825)
    Epoch 14/200, Batch 50/63 - Avg Meta Loss: 1.8213 (L_0-4: 1.8506)
Training MetaModel:   7%|▋         | 14/200 [4:27:50<72:20:54, 1400.30s/it]    Epoch 14/200, Final Batch - Avg Meta Loss: 1.8614 (L_0-4: 1.8105)
    Epoch 15/200, Batch 25/63 - Avg Meta Loss: 1.8407 (L_0-4: 1.8642)
    Epoch 15/200, Batch 50/63 - Avg Meta Loss: 1.8482 (L_0-4: 1.8113)
Training MetaModel:   8%|▊         | 15/200 [4:29:44<52:02:10, 1012.60s/it]    Epoch 15/200, Final Batch - Avg Meta Loss: 1.7842 (L_0-4: 1.8107)
    Epoch 16/200, Batch 25/63 - Avg Meta Loss: 1.8354 (L_0-4: 1.8120)
    Epoch 16/200, Batch 50/63 - Avg Meta Loss: 1.8310 (L_0-4: 1.8228)
Training MetaModel:   8%|▊         | 16/200 [4:48:34<53:33:36, 1047.92s/it]    Epoch 16/200, Final Batch - Avg Meta Loss: 1.8360 (L_0-4: 2.0460)
    Epoch 17/200, Batch 25/63 - Avg Meta Loss: 1.8371 (L_0-4: 1.9928)
    Epoch 17/200, Batch 50/63 - Avg Meta Loss: 1.8407 (L_0-4: 1.9415)
Training MetaModel:   8%|▊         | 17/200 [4:50:22<38:54:40, 765.47s/it]     Epoch 17/200, Final Batch - Avg Meta Loss: 1.8161 (L_0-4: 1.8338)
    Epoch 18/200, Batch 25/63 - Avg Meta Loss: 1.8487 (L_0-4: 1.8544)
    Epoch 18/200, Batch 50/63 - Avg Meta Loss: 1.8138 (L_0-4: 1.7967)
Training MetaModel:   9%|▉         | 18/200 [5:04:38<40:04:17, 792.62s/it]    Epoch 18/200, Final Batch - Avg Meta Loss: 1.8428 (L_0-4: 1.8466)
    Epoch 19/200, Batch 25/63 - Avg Meta Loss: 1.8498 (L_0-4: 1.7444)
    Epoch 19/200, Batch 50/63 - Avg Meta Loss: 1.8190 (L_0-4: 1.9204)
Training MetaModel:  10%|▉         | 19/200 [5:34:33<54:58:33, 1093.44s/it]    Epoch 19/200, Final Batch - Avg Meta Loss: 1.8202 (L_0-4: 1.8220)
    Epoch 20/200, Batch 25/63 - Avg Meta Loss: 1.8352 (L_0-4: 1.8357)
    Epoch 20/200, Batch 50/63 - Avg Meta Loss: 1.8329 (L_0-4: 1.7854)
Training MetaModel:  10%|█         | 20/200 [5:49:14<51:29:50, 1029.95s/it]    Epoch 20/200, Final Batch - Avg Meta Loss: 1.8328 (L_0-4: 1.7007)
    Epoch 21/200, Batch 25/63 - Avg Meta Loss: 1.8158 (L_0-4: 1.8976)
    Epoch 21/200, Batch 50/63 - Avg Meta Loss: 1.8366 (L_0-4: 1.7796)
Training MetaModel:  10%|█         | 21/200 [5:51:02<37:26:14, 752.93s/it]     Epoch 21/200, Final Batch - Avg Meta Loss: 1.8483 (L_0-4: 1.8057)
    Epoch 22/200, Batch 25/63 - Avg Meta Loss: 1.8165 (L_0-4: 1.7626)
    Epoch 22/200, Batch 50/63 - Avg Meta Loss: 1.8531 (L_0-4: 2.0057)
Training MetaModel:  11%|█         | 22/200 [5:52:50<27:40:00, 559.55s/it]    Epoch 22/200, Final Batch - Avg Meta Loss: 1.8377 (L_0-4: 1.9751)
    Epoch 23/200, Batch 25/63 - Avg Meta Loss: 1.8275 (L_0-4: 1.8515)
    Epoch 23/200, Batch 50/63 - Avg Meta Loss: 1.8382 (L_0-4: 1.8422)
Training MetaModel:  12%|█▏        | 23/200 [5:54:40<20:52:13, 424.48s/it]    Epoch 23/200, Final Batch - Avg Meta Loss: 1.8412 (L_0-4: 1.8374)
    Epoch 24/200, Batch 25/63 - Avg Meta Loss: 1.8307 (L_0-4: 1.8362)
    Epoch 24/200, Batch 50/63 - Avg Meta Loss: 1.8243 (L_0-4: 1.7848)
Training MetaModel:  12%|█▏        | 24/200 [5:56:30<16:08:27, 330.16s/it]    Epoch 24/200, Final Batch - Avg Meta Loss: 1.8717 (L_0-4: 2.0374)
    Epoch 25/200, Batch 25/63 - Avg Meta Loss: 1.8367 (L_0-4: 1.8124)
    Epoch 25/200, Batch 50/63 - Avg Meta Loss: 1.8357 (L_0-4: 1.7775)
Training MetaModel:  12%|█▎        | 25/200 [5:58:20<12:50:53, 264.30s/it]    Epoch 25/200, Final Batch - Avg Meta Loss: 1.8051 (L_0-4: 1.5530)
    Epoch 26/200, Batch 25/63 - Avg Meta Loss: 1.8320 (L_0-4: 1.8308)
    Epoch 26/200, Batch 50/63 - Avg Meta Loss: 1.8187 (L_0-4: 1.8345)
Training MetaModel:  13%|█▎        | 26/200 [6:00:11<10:33:08, 218.33s/it]    Epoch 26/200, Final Batch - Avg Meta Loss: 1.8516 (L_0-4: 1.8590)
    Epoch 27/200, Batch 25/63 - Avg Meta Loss: 1.8382 (L_0-4: 1.7721)
    Epoch 27/200, Batch 50/63 - Avg Meta Loss: 1.8198 (L_0-4: 1.7705)
Training MetaModel:  14%|█▎        | 27/200 [6:03:21<10:04:13, 209.56s/it]    Epoch 27/200, Final Batch - Avg Meta Loss: 1.8376 (L_0-4: 1.7083)
    Epoch 28/200, Batch 25/63 - Avg Meta Loss: 1.8291 (L_0-4: 1.6928)
    Epoch 28/200, Batch 50/63 - Avg Meta Loss: 1.8240 (L_0-4: 1.8016)
Training MetaModel:  14%|█▍        | 28/200 [6:09:30<12:18:12, 257.51s/it]    Epoch 28/200, Final Batch - Avg Meta Loss: 1.8394 (L_0-4: 1.6676)
    Epoch 29/200, Batch 25/63 - Avg Meta Loss: 1.8256 (L_0-4: 2.0726)
    Epoch 29/200, Batch 50/63 - Avg Meta Loss: 1.8382 (L_0-4: 2.0114)
Training MetaModel:  14%|█▍        | 29/200 [6:12:32<11:09:13, 234.82s/it]    Epoch 29/200, Final Batch - Avg Meta Loss: 1.8475 (L_0-4: 1.9027)
    Epoch 30/200, Batch 25/63 - Avg Meta Loss: 1.8007 (L_0-4: 1.6686)
    Epoch 30/200, Batch 50/63 - Avg Meta Loss: 1.8544 (L_0-4: 1.8956)
Training MetaModel:  15%|█▌        | 30/200 [6:15:09<9:59:28, 211.58s/it]     Epoch 30/200, Final Batch - Avg Meta Loss: 1.8455 (L_0-4: 1.8723)
    Epoch 31/200, Batch 25/63 - Avg Meta Loss: 1.8267 (L_0-4: 1.8197)
    Epoch 31/200, Batch 50/63 - Avg Meta Loss: 1.8353 (L_0-4: 1.9416)
Training MetaModel:  16%|█▌        | 31/200 [6:17:55<9:17:16, 197.85s/it]    Epoch 31/200, Final Batch - Avg Meta Loss: 1.8257 (L_0-4: 1.5058)
    Epoch 32/200, Batch 25/63 - Avg Meta Loss: 1.8201 (L_0-4: 1.8200)
    Epoch 32/200, Batch 50/63 - Avg Meta Loss: 1.8491 (L_0-4: 1.7909)
Training MetaModel:  16%|█▌        | 32/200 [6:20:48<8:53:32, 190.55s/it]    Epoch 32/200, Final Batch - Avg Meta Loss: 1.8207 (L_0-4: 1.7743)
    Epoch 33/200, Batch 25/63 - Avg Meta Loss: 1.8445 (L_0-4: 1.9730)
    Epoch 33/200, Batch 50/63 - Avg Meta Loss: 1.8246 (L_0-4: 1.7790)
Training MetaModel:  16%|█▋        | 33/200 [6:23:41<8:35:17, 185.13s/it]    Epoch 33/200, Final Batch - Avg Meta Loss: 1.8220 (L_0-4: 1.7811)
    Epoch 34/200, Batch 25/63 - Avg Meta Loss: 1.8315 (L_0-4: 1.7666)
    Epoch 34/200, Batch 50/63 - Avg Meta Loss: 1.8532 (L_0-4: 1.8278)
Training MetaModel:  17%|█▋        | 34/200 [6:26:30<8:18:26, 180.16s/it]    Epoch 34/200, Final Batch - Avg Meta Loss: 1.7918 (L_0-4: 1.8164)
    Epoch 35/200, Batch 25/63 - Avg Meta Loss: 1.8304 (L_0-4: 1.7158)
    Epoch 35/200, Batch 50/63 - Avg Meta Loss: 1.8311 (L_0-4: 1.7756)
Training MetaModel:  18%|█▊        | 35/200 [7:36:25<63:28:15, 1384.82s/it]    Epoch 35/200, Final Batch - Avg Meta Loss: 1.8268 (L_0-4: 1.6410)
    Epoch 36/200, Batch 25/63 - Avg Meta Loss: 1.8572 (L_0-4: 2.0331)
    Epoch 36/200, Batch 50/63 - Avg Meta Loss: 1.8030 (L_0-4: 1.9363)
Training MetaModel:  18%|█▊        | 36/200 [7:38:12<45:37:14, 1001.43s/it]    Epoch 36/200, Final Batch - Avg Meta Loss: 1.8343 (L_0-4: 1.6064)
    Epoch 37/200, Batch 25/63 - Avg Meta Loss: 1.8226 (L_0-4: 1.8192)
    Epoch 37/200, Batch 50/63 - Avg Meta Loss: 1.8486 (L_0-4: 1.8496)
Training MetaModel:  18%|█▊        | 37/200 [7:56:35<46:43:15, 1031.88s/it]    Epoch 37/200, Final Batch - Avg Meta Loss: 1.8338 (L_0-4: 1.8510)
    Epoch 38/200, Batch 25/63 - Avg Meta Loss: 1.8444 (L_0-4: 1.7722)
Training MetaModel:  18%|█▊        | 37/200 [8:01:56<35:23:07, 781.52s/it]