In [1]:
import time
import numpy as np
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.nn as nn
import torch

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from pytorch_lightning import Trainer
    
from typing import List
import copy
import os
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
from lightning.pytorch.loggers import WandbLogger
import wandb

if torch.cuda.is_available():
    torch.backends.cudnn.deterministic = True

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
SEED = 2024
pl.seed_everything(SEED)

Seed set to 2024


2024

In [3]:
wandb_project_name = 'MNIST_LORA'

In [4]:
class MNISTDataModule(pl.LightningDataModule):
    def __init__(self, data_dir: str = './data', batch_size: int = 64):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.transform = transforms.ToTensor()

    def prepare_data(self):
        # Download only
        datasets.MNIST(root=self.data_dir, train=True, download=True)
        datasets.MNIST(root=self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        # Transform and split datasets
        if stage == 'fit' or stage is None:
            mnist_full = datasets.MNIST(root=self.data_dir, train=True, transform=self.transform)
            self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])
        if stage == 'test' or stage is None:
            self.mnist_test = datasets.MNIST(root=self.data_dir, train=False, transform=self.transform)

    def train_dataloader(self):
        return DataLoader(self.mnist_train, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.mnist_val, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.mnist_test, batch_size=self.batch_size)

# Example of how to use the MNISTDataModule
batch_size = 64
mnist_data = MNISTDataModule(data_dir='./data', batch_size=batch_size)


In [5]:
# Hyperparameters
random_seed = 123
learning_rate = 0.005
num_epochs = 2

# Architecture
num_features = 784
num_hidden_1 = 128
num_hidden_2 = 256
num_classes = 10

class MultilayerPerceptron(pl.LightningModule):
    def __init__(self, num_features, num_hidden_1, num_hidden_2, num_classes, learning_rate):
        super().__init__()
        self.save_hyperparameters()

        self.layers = nn.Sequential(
            nn.Linear(num_features, num_hidden_1),
            nn.ReLU(),
            nn.Linear(num_hidden_1, num_hidden_2),
            nn.ReLU(),
            nn.Linear(num_hidden_2, num_classes)
        )

    def forward(self, x):
        return self.layers(x)

    def training_step(self, batch, batch_idx):
        features, targets = batch
        features = features.view(-1, 28*28)
        logits = self(features)
        loss = F.cross_entropy(logits, targets)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        preds = torch.argmax(logits, dim=1)
        acc = (preds == targets).float().mean()
        self.log('train_acc', acc, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        features, targets = batch
        features = features.view(-1, 28*28)
        logits = self(features)
        loss = F.cross_entropy(logits, targets)
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        preds = torch.argmax(logits, dim=1)
        acc = (preds == targets).float().mean()
        self.log('val_acc', acc, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        
    def test_step(self, batch, batch_idx):
        features, targets = batch
        features = features.view(-1, 28*28)
        logits = self(features)
        loss = F.cross_entropy(logits, targets)
        self.log('test_loss', loss)
        preds = torch.argmax(logits, dim=1)
        acc = (preds == targets).float().mean()
        self.log('test_acc', acc)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)
        return optimizer
    
model = MultilayerPerceptron(num_features, num_hidden_1, num_hidden_2, num_classes, learning_rate)

In [6]:
#wandb_logger = WandbLogger(project=wandb_project_name, log_model="all", name="baseline", group="baseline", save_dir="lightning_logs")
trainer = Trainer(max_epochs=num_epochs) #, logger=wandb_logger)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
c:\Users\43294881\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [7]:
trainer.fit(model, mnist_data)

You are using a CUDA device ('NVIDIA GeForce RTX 4070 Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type       | Params
--------------------------------------
0 | layers | Sequential | 136 K 
--------------------------------------
136 K     Trainable params
0         Non-trainable params
136 K     Total params
0.544     Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

c:\Users\43294881\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


                                                                           

c:\Users\43294881\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Epoch 1: 100%|██████████| 860/860 [00:08<00:00, 106.92it/s, v_num=36, train_loss_step=0.0947, train_acc_step=0.917, val_loss_step=0.000484, val_acc_step=1.000, val_loss_epoch=0.128, val_acc_epoch=0.964, train_loss_epoch=0.115, train_acc_epoch=0.965]

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 860/860 [00:08<00:00, 106.69it/s, v_num=36, train_loss_step=0.0947, train_acc_step=0.917, val_loss_step=0.000484, val_acc_step=1.000, val_loss_epoch=0.128, val_acc_epoch=0.964, train_loss_epoch=0.115, train_acc_epoch=0.965]


In [8]:
trainer.test(model, mnist_data)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\43294881\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 157/157 [00:00<00:00, 260.61it/s]


[{'test_loss': 0.11397630721330643, 'test_acc': 0.9661999940872192}]

In [10]:
wandb.finish()

## LoRa methods

In [9]:
# Original code from https://magazine.sebastianraschka.com/p/lora-and-dora-from-scratch

class LoRALayer(nn.Module):
    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
        self.A = nn.Parameter(torch.randn(in_dim, rank) * std_dev)
        self.B = nn.Parameter(torch.zeros(rank, out_dim))
        self.alpha = alpha

    def forward(self, x):
        x = self.alpha * (x @ self.A @ self.B)
        return x

    
class LinearWithLoRAMerged(nn.Module):
    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(
            linear.in_features, linear.out_features, rank, alpha
        )

    def forward(self, x):
        lora = self.lora.A @ self.lora.B
        combined_weight = self.linear.weight + self.lora.alpha*lora.T
        return F.linear(x, combined_weight, self.linear.bias)

    
# This DoRA code is equivalent to LinearWithDoRA
# Code inspired by https://github.com/catid/dora/blob/main/dora.py
class LinearWithDoRAMerged(nn.Module):
    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(
            linear.in_features, linear.out_features, rank, alpha
        )
        
        self.m = nn.Parameter(
            self.linear.weight.norm(p=2, dim=0, keepdim=True)
        )

    def forward(self, x):
        lora = self.lora.A @ self.lora.B
        numerator = self.linear.weight + self.lora.alpha*lora.T
        denominator = numerator.norm(p=2, dim=0, keepdim=True)
        directional_component = numerator / denominator
        new_weight = self.m * directional_component
        return F.linear(x, new_weight, self.linear.bias)
    
class LinearWithDoRAMoE(nn.Module):
    def __init__(self, linear, rank, alpha, num_experts):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(linear.in_features, linear.out_features, rank, alpha)
        
        # Crear múltiples m como expertos
        self.m_experts = nn.ParameterList([
            nn.Parameter(self.linear.weight.norm(p=2, dim=1, keepdim=True)) for _ in range(num_experts)
        ])
        
        # Capa lineal para generar puntuaciones de enrutamiento basadas en la entrada x
        self.routing_weights = nn.Parameter(torch.zeros(1, num_experts))

    def forward(self, x):
        lora = self.lora.A @ self.lora.B
        numerator = self.linear.weight + self.lora.alpha * lora.T
        denominator = numerator.norm(p=2, dim=1, keepdim=True)
        directional_component = numerator / denominator

        # Generar puntuaciones de enrutamiento para cada experto usando la entrada x
        routing_probs = F.softmax(self.routing_weights, dim=-1)  # Aplicar softmax para obtener probabilidades

        # Calcular la contribución ponderada de cada m experto
        m_combined = sum([m_expert*routing_probs[0, idx] for idx, m_expert in enumerate(self.m_experts)])
        new_weight = m_combined * directional_component
        
        return F.linear(x, new_weight, self.linear.bias)
    
# Lora neurons expert

class LoRAMixtureOfExpertsLayer(nn.Module):
    def __init__(self, in_dim, out_dim, rank, alpha, num_experts):
        super().__init__()
        std_dev = 1 / torch.sqrt(torch.tensor(rank).float())
        
        self.A = nn.Parameter(torch.randn(in_dim, rank) * std_dev)
        # Experts B
        self.B = nn.ParameterList([nn.Parameter(torch.zeros(rank, out_dim)) for _ in range(num_experts)])
        self.alpha = alpha
        self.num_experts = num_experts

    def forward(self, x):
        # Apply the linear transformation to the opertation x*A
        weights = F.softmax(self.alpha * (x @ self.A), dim=-1)
        
        # Now, weights can be used to weigh the contribution of each expert more differentially. 
        # For simplicity, here we will sum the outputs as before. 
        # TODO: Consider modifying this to use the weights effectively.
        expert_outputs = [weights @ b for b in self.B]
        x = sum(expert_outputs) / self.num_experts
        return x
    
class LinearWithLoRAMixtureOfExperts(nn.Module):
    def __init__(self, linear, rank, alpha, num_experts):
        super().__init__()
        self.linear = linear
        self.lora = LoRAMixtureOfExpertsLayer(
            linear.in_features, linear.out_features, rank, alpha, num_experts
        )

    def forward(self, x):
        return F.linear(x, self.linear.weight, self.linear.bias) + self.lora(x)

In [10]:
def freeze_linear_layers(model):
    for child in model.children():
        if isinstance(child, nn.Linear):
            for param in child.parameters():
                param.requires_grad = False
        else:
            # Recursively freeze linear layers in children modules
            freeze_linear_layers(child)

## Train

### Train Lora

In [11]:
model_lora = copy.deepcopy(model)

model_lora.layers[0] = LinearWithLoRAMerged(model.layers[0], rank=4, alpha=8) # alpha = 2*rank
model_lora.layers[2] = LinearWithLoRAMerged(model.layers[2], rank=4, alpha=8)
model_lora.layers[4] = LinearWithLoRAMerged(model.layers[4], rank=4, alpha=8)


freeze_linear_layers(model_lora)

# Check if linear layers are frozen
for name, param in model_lora.named_parameters():
    print(f"{name}: {param.requires_grad}")

layers.0.linear.weight: False
layers.0.linear.bias: False
layers.0.lora.A: True
layers.0.lora.B: True
layers.2.linear.weight: False
layers.2.linear.bias: False
layers.2.lora.A: True
layers.2.lora.B: True
layers.4.linear.weight: False
layers.4.linear.bias: False
layers.4.lora.A: True
layers.4.lora.B: True


In [12]:
#wandb_logger_lora = WandbLogger(project=wandb_project_name, log_model="all", name="lora", group="lora", save_dir="lightning_logs")
trainer_lora = Trainer(max_epochs=num_epochs) #, logger=wandb_logger_lora)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [13]:
trainer_lora.fit(model_lora, mnist_data)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type       | Params
--------------------------------------
0 | layers | Sequential | 142 K 
--------------------------------------
6.2 K     Trainable params
136 K     Non-trainable params
142 K     Total params
0.569     Total estimated model params size (MB)


Epoch 1: 100%|██████████| 860/860 [00:08<00:00, 105.78it/s, v_num=37, train_loss_step=0.0321, train_acc_step=1.000, val_loss_step=0.00929, val_acc_step=1.000, val_loss_epoch=0.0883, val_acc_epoch=0.973, train_loss_epoch=0.0903, train_acc_epoch=0.972]

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 860/860 [00:08<00:00, 105.55it/s, v_num=37, train_loss_step=0.0321, train_acc_step=1.000, val_loss_step=0.00929, val_acc_step=1.000, val_loss_epoch=0.0883, val_acc_epoch=0.973, train_loss_epoch=0.0903, train_acc_epoch=0.972]


In [14]:
trainer_lora.test(model_lora, mnist_data)
wandb.finish()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 157/157 [00:00<00:00, 229.54it/s]


### Train Dora

In [15]:
model_dora = copy.deepcopy(model)

model_dora.layers[0] = LinearWithDoRAMerged(model.layers[0], rank=4, alpha=8)
model_dora.layers[2] = LinearWithDoRAMerged(model.layers[2], rank=4, alpha=8)
model_dora.layers[4] = LinearWithDoRAMerged(model.layers[4], rank=4, alpha=8)

freeze_linear_layers(model_dora)

# Check if linear layers are frozen
for name, param in model_dora.named_parameters():
    print(f"{name}: {param.requires_grad}")

layers.0.m: True
layers.0.linear.weight: False
layers.0.linear.bias: False
layers.0.lora.A: True
layers.0.lora.B: True
layers.2.m: True
layers.2.linear.weight: False
layers.2.linear.bias: False
layers.2.lora.A: True
layers.2.lora.B: True
layers.4.m: True
layers.4.linear.weight: False
layers.4.linear.bias: False
layers.4.lora.A: True
layers.4.lora.B: True


In [16]:
#wandb_logger_dora = WandbLogger(project=wandb_project_name, log_model="all", name="dora", group="dora", save_dir="lightning_logs")
trainer_dora = Trainer(max_epochs=num_epochs) #, logger=wandb_logger_dora)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [17]:
trainer_dora.fit(model_dora, mnist_data)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type       | Params
--------------------------------------
0 | layers | Sequential | 143 K 
--------------------------------------
7.4 K     Trainable params
136 K     Non-trainable params
143 K     Total params
0.574     Total estimated model params size (MB)


                                                                            

c:\Users\43294881\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
c:\Users\43294881\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Epoch 1: 100%|██████████| 860/860 [00:09<00:00, 92.27it/s, v_num=38, train_loss_step=0.0217, train_acc_step=1.000, val_loss_step=0.214, val_acc_step=0.875, val_loss_epoch=0.0727, val_acc_epoch=0.976, train_loss_epoch=0.0746, train_acc_epoch=0.977]  

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 860/860 [00:09<00:00, 92.07it/s, v_num=38, train_loss_step=0.0217, train_acc_step=1.000, val_loss_step=0.214, val_acc_step=0.875, val_loss_epoch=0.0727, val_acc_epoch=0.976, train_loss_epoch=0.0746, train_acc_epoch=0.977]


In [18]:
trainer_dora.test(model_dora, mnist_data)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\43294881\AppData\Local\Programs\Python\Python311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 157/157 [00:00<00:00, 225.58it/s]


[{'test_loss': 0.10614069551229477, 'test_acc': 0.9682000279426575}]

In [19]:
wandb.finish()

### Train Lora Moe

In [20]:
model_lora_moe = copy.deepcopy(model)

In [21]:
model_lora_moe.layers[0] = LinearWithLoRAMixtureOfExperts(model_lora_moe.layers[0], rank=4, alpha=8, num_experts=8)
model_lora_moe.layers[2] = LinearWithLoRAMixtureOfExperts(model_lora_moe.layers[2], rank=4, alpha=8, num_experts=8)
model_lora_moe.layers[4] = LinearWithLoRAMixtureOfExperts(model_lora_moe.layers[4], rank=4, alpha=8, num_experts=8)

In [22]:
freeze_linear_layers(model_lora_moe)

# Check if linear layers are frozen
for name, param in model_lora_moe.named_parameters():
    print(f"{name}: {param.requires_grad}")

layers.0.linear.weight: False
layers.0.linear.bias: False
layers.0.lora.A: True
layers.0.lora.B.0: True
layers.0.lora.B.1: True
layers.0.lora.B.2: True
layers.0.lora.B.3: True
layers.0.lora.B.4: True
layers.0.lora.B.5: True
layers.0.lora.B.6: True
layers.0.lora.B.7: True
layers.2.linear.weight: False
layers.2.linear.bias: False
layers.2.lora.A: True
layers.2.lora.B.0: True
layers.2.lora.B.1: True
layers.2.lora.B.2: True
layers.2.lora.B.3: True
layers.2.lora.B.4: True
layers.2.lora.B.5: True
layers.2.lora.B.6: True
layers.2.lora.B.7: True
layers.4.linear.weight: False
layers.4.linear.bias: False
layers.4.lora.A: True
layers.4.lora.B.0: True
layers.4.lora.B.1: True
layers.4.lora.B.2: True
layers.4.lora.B.3: True
layers.4.lora.B.4: True
layers.4.lora.B.5: True
layers.4.lora.B.6: True
layers.4.lora.B.7: True


In [23]:
#wandb_logger_lora_moe = WandbLogger(project=wandb_project_name, log_model="all", name="lora_moe", group="lora_moe", save_dir="lightning_logs")
trainer_lora_moe = Trainer(max_epochs=num_epochs) #, logger=wandb_logger_lora_moe)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [24]:
trainer_lora_moe.fit(model_lora_moe, mnist_data)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type       | Params
--------------------------------------
0 | layers | Sequential | 153 K 
--------------------------------------
17.3 K    Trainable params
136 K     Non-trainable params
153 K     Total params
0.613     Total estimated model params size (MB)


Epoch 1: 100%|██████████| 860/860 [00:11<00:00, 77.27it/s, v_num=39, train_loss_step=0.0452, train_acc_step=1.000, val_loss_step=0.00486, val_acc_step=1.000, val_loss_epoch=0.0552, val_acc_epoch=0.981, train_loss_epoch=0.0559, train_acc_epoch=0.983]  

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 860/860 [00:11<00:00, 77.13it/s, v_num=39, train_loss_step=0.0452, train_acc_step=1.000, val_loss_step=0.00486, val_acc_step=1.000, val_loss_epoch=0.0552, val_acc_epoch=0.981, train_loss_epoch=0.0559, train_acc_epoch=0.983]


In [25]:
trainer_lora_moe.test(model_lora_moe, mnist_data)
wandb.finish()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 157/157 [00:00<00:00, 200.18it/s]


### Train Dora Moe

In [26]:
model_dora_moe = copy.deepcopy(model)
model_dora_moe.layers[0] = LinearWithDoRAMoE(model_dora_moe.layers[0], rank=4, alpha=8, num_experts=16)
model_dora_moe.layers[2] = LinearWithDoRAMoE(model_dora_moe.layers[2], rank=4, alpha=8, num_experts=8)
model_dora_moe.layers[4] = LinearWithDoRAMoE(model_dora_moe.layers[4], rank=4, alpha=8, num_experts=4)
freeze_linear_layers(model_dora_moe)

# Check if linear layers are frozen
for name, param in model_dora_moe.named_parameters():
    print(f"{name}: {param.requires_grad}")

layers.0.routing_weights: True
layers.0.linear.weight: False
layers.0.linear.bias: False
layers.0.lora.A: True
layers.0.lora.B: True
layers.0.m_experts.0: True
layers.0.m_experts.1: True
layers.0.m_experts.2: True
layers.0.m_experts.3: True
layers.0.m_experts.4: True
layers.0.m_experts.5: True
layers.0.m_experts.6: True
layers.0.m_experts.7: True
layers.0.m_experts.8: True
layers.0.m_experts.9: True
layers.0.m_experts.10: True
layers.0.m_experts.11: True
layers.0.m_experts.12: True
layers.0.m_experts.13: True
layers.0.m_experts.14: True
layers.0.m_experts.15: True
layers.2.routing_weights: True
layers.2.linear.weight: False
layers.2.linear.bias: False
layers.2.lora.A: True
layers.2.lora.B: True
layers.2.m_experts.0: True
layers.2.m_experts.1: True
layers.2.m_experts.2: True
layers.2.m_experts.3: True
layers.2.m_experts.4: True
layers.2.m_experts.5: True
layers.2.m_experts.6: True
layers.2.m_experts.7: True
layers.4.routing_weights: True
layers.4.linear.weight: False
layers.4.linear.bia

In [27]:
#wandb_logger_lora_moe = WandbLogger(project=wandb_project_name, log_model="all", name="Dora_M_moe", group="Dora_M_moe", save_dir="lightning_logs")
trainer_dora_moe = Trainer(max_epochs=num_epochs) #, logger=wandb_logger_lora_moe)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [28]:
trainer_dora_moe.fit(model_dora_moe, mnist_data)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type       | Params
--------------------------------------
0 | layers | Sequential | 146 K 
--------------------------------------
10.4 K    Trainable params
136 K     Non-trainable params
146 K     Total params
0.586     Total estimated model params size (MB)


Epoch 1: 100%|██████████| 860/860 [00:13<00:00, 62.94it/s, v_num=40, train_loss_step=0.183, train_acc_step=0.958, val_loss_step=0.000567, val_acc_step=1.000, val_loss_epoch=0.0857, val_acc_epoch=0.975, train_loss_epoch=0.0864, train_acc_epoch=0.974]

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 860/860 [00:13<00:00, 62.89it/s, v_num=40, train_loss_step=0.183, train_acc_step=0.958, val_loss_step=0.000567, val_acc_step=1.000, val_loss_epoch=0.0857, val_acc_epoch=0.975, train_loss_epoch=0.0864, train_acc_epoch=0.974]


In [29]:
trainer_dora_moe.test(model_dora_moe, mnist_data)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 157/157 [00:00<00:00, 159.27it/s]


[{'test_loss': 0.10055254399776459, 'test_acc': 0.9682000279426575}]

In [133]:
wandb.finish()

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅█
test_acc,▁
test_loss,▁
train_acc_epoch,▁█
train_acc_step,▆▆▅▆▆▆▅▃▁▆▆▁█▅▃▃▆▅▆▆█▃█▆▅▆▆▅▃▆▃█▃▃
train_loss_epoch,█▁
train_loss_step,▃▃▄▃▂▂▄▆▅▂▂▄▁▄▅▅▃▆▃▅▁█▂▃▃▂▂▃▅▄█▂▃▃
trainer/global_step,▁▂▃▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▆▇▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂█
val_acc_epoch,▁█
val_acc_step,▇▅▅▆█▆▇▇▃▁▆▅▇█▇▇▆▅▆▆▃▆▆▇▇▇▆▅▆▇▆▅▅▇▇▇▇▇▇█

0,1
epoch,2.0
test_acc,0.9667
test_loss,0.11059
train_acc_epoch,0.9754
train_acc_step,0.95312
train_loss_epoch,0.08196
train_loss_step,0.07976
trainer/global_step,1720.0
val_acc_epoch,0.976
val_acc_step,1.0
