In [1]:
import wandb
import numpy as np
import torch
import lightning
import copy
from pathlib import Path
from model_fusion.config import BASE_DATA_DIR, CHECKPOINT_DIR
from model_fusion.datasets import DataModuleType
from model_fusion.models import ModelType
from model_fusion.models.lightning import BaseModel 
from Experiments import lmc_experiment
from model_fusion import lmc_utils
from Experiments import baselines_experiment
from Experiments import otfusion_experiment
from Experiments import pyhessian_experiment
from model_fusion.train import setup_training, setup_testing


# Fix the state of NumPy's global random generator so that every
# numpy-based computation in this notebook is repeatable across runs.
NUMPY_SEED = 100
np.random.seed(seed=NUMPY_SEED)

  if not hasattr(numpy, tp_name):
  if not hasattr(numpy, tp_name):
  "lr_options": generate_power_seq(LEARNING_RATE_CIFAR, 11),
  contrastive_task: Union[FeatureMapContrastiveTask] = FeatureMapContrastiveTask("01, 02, 11"),
  self.nce_loss = AmdimNCELoss(tclip)


In [2]:
print("------- Loading models -------")

# select wandb run names
runA = '3bsofnmw'
runB = 'zp0c8n4p'#same init


def load_model_from_artifact(wandb_run, artifact_name):
    """Download a W&B model artifact and load the checkpoint it contains.

    Args:
        wandb_run: Active W&B run (as returned by ``wandb.init()``) used to
            declare the artifact as an input via ``use_artifact``.
        artifact_name: Fully qualified artifact name, e.g.
            ``'model-fusion/Model Fusion/model-<run_id>:best_k'``.

    Returns:
        The model restored by ``BaseModel.load_from_checkpoint`` from the
        ``model.ckpt`` file inside the downloaded artifact directory.
    """
    artifact = wandb_run.use_artifact(artifact_name, type='model')
    # Every artifact is downloaded into the same root and read back as
    # "model.ckpt", so the checkpoint is loaded immediately after its download.
    # NOTE(review): presumably a later download overwrites this file - confirm.
    artifact_dir = artifact.download(root=CHECKPOINT_DIR)
    return BaseModel.load_from_checkpoint(Path(artifact_dir)/"model.ckpt")


# Read the training configuration of run A through the public API.
# NOTE(review): only run A's config is inspected; this assumes run B was
# trained with the same datamodule/model settings - confirm.
api = wandb.Api()
source_run = api.run(f'model-fusion/Model Fusion/{runA}')

print(source_run.config)

batch_size = source_run.config['datamodule_hparams'].get('batch_size')

# The config stores enum members as strings such as 'DataModuleType.CIFAR10';
# keep the part after the dot, lower-cased, to build the enum member.
datamodule_type_str = source_run.config['datamodule_type'].split('.')[1].lower()
datamodule_type = DataModuleType(datamodule_type_str)
# Work on a copy so the fetched run config is not mutated, and switch data
# augmentation off for every evaluation performed below.
datamodule_hparams = {**source_run.config['datamodule_hparams'], 'data_augmentation': False}

model_type_str = source_run.config['model_type'].split('.')[1].lower()
model_type = ModelType(model_type_str)

model_hparams = source_run.config['model_hparams']

print(datamodule_hparams)
print(model_hparams)

checkpointA = f'model-fusion/Model Fusion/model-{runA}:best_k'
checkpointB = f'model-fusion/Model Fusion/model-{runB}:best_k'

# A live run is needed to consume artifacts; it is kept under the name `run`.
run = wandb.init()

modelA = load_model_from_artifact(run, checkpointA)
modelB = load_model_from_artifact(run, checkpointB)


------- Loading models -------


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


{'lr': 0.1, 'momentum': 0.9, 'optimizer': 'sgd', 'max_epochs': 200, 'min_epochs': 50, 'model_seed': 42, 'model_type': 'ModelType.RESNET18', 'loss_module': 'CrossEntropyLoss', 'lr_scheduler': 'plateau', 'weight_decay': 0.0001, 'model_hparams': {'bias': False, 'num_classes': 10, 'num_channels': 3}, 'early_stopping': True, 'datamodule_type': 'DataModuleType.CIFAR10', 'lr_decay_factor': 0.1, 'lightning_params': {'lr': 0.1, 'momentum': 0.9, 'optimizer': 'sgd', 'model_seed': 42, 'lr_scheduler': 'plateau', 'weight_decay': 0.0001, 'lr_decay_factor': 0.1, 'lr_monitor_metric': 'val_loss'}, 'lr_monitor_metric': 'val_loss', 'datamodule_hparams': {'seed': 42, 'data_dir': 'data', 'batch_size': 128, 'data_augmentation': True}, 'model_hparams/bias': False, 'model_hparams/num_classes': 10, 'model_hparams/num_channels': 3}
{'seed': 42, 'data_dir': 'data', 'batch_size': 128, 'data_augmentation': False}
{'bias': False, 'num_classes': 10, 'num_channels': 3}


[34m[1mwandb[0m: Currently logged in as: [33mframbelli[0m ([33mmodel-fusion[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact model-3bsofnmw:best_k, 85.20MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.6
[34m[1mwandb[0m: Downloading large artifact model-zp0c8n4p:best_k, 85.20MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.4


In [4]:
# Linear mode connectivity (LMC) barrier between the two base models,
# measured before any alignment is applied.
print("------- Computing LMC barrier before alignment-------")

# granularity=21: judging by the logged alphas (0.00, 0.05, ...), this is the
# number of interpolation points - TODO confirm against lmc_experiment.
lmc_before_alignment_kwargs = dict(
    datamodule_type=datamodule_type,
    modelA=modelA,
    modelB=modelB,
    granularity=21,
)
lmc_experiment.run_lmc(**lmc_before_alignment_kwargs)

------- Computing LMC barrier before alignment-------
Files already downloaded and verified
Files already downloaded and verified
Alpha: 0.00 (model 2), Train average loss: 0.09177 Train barrier:  0
Alpha: 1.00 (model 1), Train average loss: 0.02649 Train barrier:  0
Alpha: 0.05, Train average loss: 0.10136 Train barrier 0.012850078412493082
Alpha: 0.10, Train average loss: 0.14794 Train barrier 0.06269492242323028
Alpha: 0.15, Train average loss: 0.26096 Train barrier 0.1789803148764372
Alpha: 0.20, Train average loss: 0.48551 Train barrier 0.4067910722928577
Alpha: 0.25, Train average loss: 0.84993 Train barrier 0.7744818732211987
Alpha: 0.30, Train average loss: 1.30267 Train barrier 1.2304862293424872
Alpha: 0.35, Train average loss: 1.71573 Train barrier 1.6468094255754022
Alpha: 0.40, Train average loss: 2.00280 Train barrier 1.9371432654070855
Alpha: 0.45, Train average loss: 2.16256 Train barrier 2.1001688453595504
Alpha: 0.50, Train average loss: 2.21379 Train barrier 2.154655

In [6]:
# Reference points for the fusion experiment: prediction ensembling and
# vanilla (naive) weight averaging of the two base models.
print("------- Computing baselines -------")

# Tag that identifies this pair of runs in W&B.
wandb_tag = f'baselines-{runA}-{runB}'

baseline_kwargs = dict(
    datamodule_type=datamodule_type,
    datamodule_hparams=datamodule_hparams,
    model_type=model_type,
    model_hparams=model_hparams,
    modelA=modelA,
    modelB=modelB,
    wandb_tag=wandb_tag,
)
# The returned weight-averaged model is reused by later cells.
vanilla_averaging_model = baselines_experiment.run_baselines(**baseline_kwargs)

------- Computing baselines -------
------- Prediction based ensembling -------
------- Naive ensembling of weights -------
------- Evaluating baselines -------


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Files already downloaded and verified
Files already downloaded and verified


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\filos\OneDrive\Desktop\ETH\model-fusion\.venv\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


------- Evaluating base models -------
Testing DataLoader 0: 100%|██████████| 10/10 [00:02<00:00,  3.61it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      val_accuracy          0.9126999974250793
        val_loss            0.40655699372291565
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Testing DataLoader 0: 100%|██████████| 10/10 [00:02<00:00,  4.46it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      val_accuracy          0.9013999700546265
        val_loss            0.3815564215183258
───────────────────────────────────────────────

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Test set: Average loss: 0.3173, Accuracy: 92.13%
------- Evaluating vanilla averaging -------
Testing DataLoader 0: 100%|██████████| 10/10 [00:02<00:00,  3.46it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      val_accuracy          0.2531000077724457
        val_loss            2.2134900093078613
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


0,1
epoch,▁▁▁
trainer/global_step,▁▁▁
val_accuracy,██▁
val_loss,▁▁█

0,1
epoch,0.0
trainer/global_step,0.0
val_accuracy,0.2531
val_loss,2.21349


In [5]:
# Fuse the two base models with OT fusion; the experiment also evaluates the
# aligned version of model A.
print("------- Computing model fusion -------")

# Tag that identifies this pair of runs in W&B.
wandb_tag = f"ot_model_fusion-{runA}-{runB}"

otfusion_kwargs = dict(
    batch_size=batch_size,
    datamodule_type=datamodule_type,
    datamodule_hparams=datamodule_hparams,
    model_type=model_type,
    model_hparams=model_hparams,
    modelA=modelA,
    modelB=modelB,
    wandb_tag=wandb_tag,
)
# Two results come back: the fused model and model A after alignment.
otfusion_outputs = otfusion_experiment.run_otfusion(**otfusion_kwargs)
ot_fused_model, modelA_aligned = otfusion_outputs


------- Computing model fusion -------
------- Setting up parameters -------
{'seed': 42, 'data_dir': 'data', 'batch_size': 128, 'data_augmentation': False}
The parameters are: 
 {'eval_aligned': True, 'num_models': 2, 'width_ratio': 1, 'handle_skips': True, 'exact': True, 'activation_seed': 21, 'activation_histograms': True, 'ground_metric': 'euclidean', 'ground_metric_normalize': 'none', 'same_model': False, 'geom_ensemble_type': 'acts', 'act_num_samples': 200, 'skip_last_layer': False, 'skip_last_layer_type': 'average', 'softmax_temperature': 1, 'past_correction': True, 'correction': True, 'normalize_acts': False, 'normalize_wts': False, 'activation_normalize': False, 'center_acts': False, 'prelu_acts': False, 'pool_acts': False, 'pool_relu': False, 'importance': None, 'proper_marginals': False, 'not_squared': True, 'ground_metric_eff': False, 'dist_normalize': False, 'clip_gm': False, 'clip_min': 0, 'clip_max': 5, 'tmap_stats': False, 'ensemble_step': 0.5, 'reg': 0.01}
------- OT m

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Files already downloaded and verified
Files already downloaded and verified


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\filos\OneDrive\Desktop\ETH\model-fusion\.venv\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 79/79 [00:02<00:00, 30.57it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      val_accuracy          0.7124000191688538
        val_loss            0.8900654315948486
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


0,1
epoch,▁
trainer/global_step,▁
val_accuracy,▁
val_loss,▁

0,1
epoch,0.0
trainer/global_step,0.0
val_accuracy,0.7124
val_loss,0.89007


# model parameters:  21
# new parameters:  21
fusing:  model.conv1.weight
fusing:  model.layer1.0.conv1.weight
fusing:  model.layer1.0.conv2.weight
fusing:  model.layer1.1.conv1.weight
fusing:  model.layer1.1.conv2.weight
fusing:  model.layer2.0.conv1.weight
fusing:  model.layer2.0.conv2.weight
fusing:  model.layer2.0.shortcut.0.weight
fusing:  model.layer2.1.conv1.weight
fusing:  model.layer2.1.conv2.weight
fusing:  model.layer3.0.conv1.weight
fusing:  model.layer3.0.conv2.weight
fusing:  model.layer3.0.shortcut.0.weight
fusing:  model.layer3.1.conv1.weight
fusing:  model.layer3.1.conv2.weight
fusing:  model.layer4.0.conv1.weight
fusing:  model.layer4.0.conv2.weight
fusing:  model.layer4.0.shortcut.0.weight
fusing:  model.layer4.1.conv1.weight
fusing:  model.layer4.1.conv2.weight
fusing:  model.fc.weight
------- Evaluating ot fusion model -------


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Files already downloaded and verified
Files already downloaded and verified


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\filos\OneDrive\Desktop\ETH\model-fusion\.venv\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 79/79 [00:02<00:00, 29.82it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      val_accuracy          0.7318000197410583
        val_loss            0.9555891752243042
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


0,1
epoch,▁
trainer/global_step,▁
val_accuracy,▁
val_loss,▁

0,1
epoch,0.0
trainer/global_step,0.0
val_accuracy,0.7318
val_loss,0.95559


In [7]:
# LMC barrier between model A *after alignment* and the untouched model B.
print("------- Computing LMC barrier after alignment -------")

lmc_experiment.run_lmc(
    datamodule_type=datamodule_type,
    modelA=modelA_aligned,
    modelB=modelB,
    granularity=21
)

# Losses for ot fusion model and vanilla averaging model.
# Start from the shared datamodule hparams (this keeps the settings that were
# already being passed, e.g. data augmentation disabled) and apply the
# LMC-specific overrides. Previously this dict was built but never used, so
# the larger batch size and BASE_DATA_DIR were silently ignored.
datamodule_hparams_lmc = {
    **datamodule_hparams,
    'batch_size': 1024,
    'data_dir': BASE_DATA_DIR,
}
datamodule_lmc = datamodule_type.get_data_module(**datamodule_hparams_lmc)
datamodule_lmc.prepare_data()
# NOTE(review): the 'fit' stage presumably exposes the training split that
# compute_loss evaluates on - confirm against lmc_utils.compute_loss.
datamodule_lmc.setup('fit')

vanilla_loss = lmc_utils.compute_loss(vanilla_averaging_model, datamodule_lmc)
fused_loss = lmc_utils.compute_loss(ot_fused_model, datamodule_lmc)

print(f"Vanilla loss pre fine-tuning: {vanilla_loss}")
print(f"Fused loss pre fine-tuning: {fused_loss}")

------- Computing LMC barrier after alignment -------
Files already downloaded and verified
Files already downloaded and verified
Alpha: 0.00 (model 2), Train average loss: 0.09177 Train barrier:  0
Alpha: 1.00 (model 1), Train average loss: 0.81414 Train barrier:  0
Alpha: 0.05, Train average loss: 0.09569 Train barrier -0.0322019969433546
Alpha: 0.10, Train average loss: 0.11480 Train barrier -0.049205363176266365
Alpha: 0.15, Train average loss: 0.15272 Train barrier -0.047404595167438196
Alpha: 0.20, Train average loss: 0.21316 Train barrier -0.02308766757117378
Alpha: 0.25, Train average loss: 0.29869 Train barrier 0.026323302053411768
Alpha: 0.30, Train average loss: 0.40807 Train barrier 0.09958597238659855
Alpha: 0.35, Train average loss: 0.53543 Train barrier 0.19083508548703454
Alpha: 0.40, Train average loss: 0.67042 Train barrier 0.2897038124097718
Alpha: 0.45, Train average loss: 0.80036 Train barrier 0.3835220607681407
Alpha: 0.50, Train average loss: 0.91367 Train barrie

In [8]:
# PyHessian experiment on the two base models and the OT-fused model.
# Density computation is switched off (compute_density=False); each run is
# given its own figure file name.
print("------- Computing sharpness -------")

# (banner printed before the run, model to analyse, figure file name)
sharpness_targets = (
    ("------- Model A -------", modelA, 'modelA.pdf'),
    ("------- Model B -------", modelB, 'modelB.pdf'),
    ("------- OT fusion model -------", ot_fused_model, 'otmodel32.pdf'),
)

for banner, target_model, pdf_name in sharpness_targets:
    print(banner)
    # As in the original cell, `hessian_comp` ends up holding the last result.
    hessian_comp = pyhessian_experiment.run_pyhessian(
        datamodule_type=datamodule_type,
        model=target_model,
        compute_density=False,
        figure_name=pdf_name,
    )

Seed set to 42


------- Computing sharpness -------
------- Model A -------
Files already downloaded and verified
Files already downloaded and verified


  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


The top Hessian eigenvalue of this model is 2.3668


Seed set to 42



***Trace:  106.25496043329653
------- Model B -------
Files already downloaded and verified
Files already downloaded and verified
The top Hessian eigenvalue of this model is 2.6837


Seed set to 42



***Trace:  168.4502716064453
------- OT fusion model -------
Files already downloaded and verified
Files already downloaded and verified
The top Hessian eigenvalue of this model is 3.8327

***Trace:  122.28527577718098


In [9]:
# PyHessian experiment on the vanilla weight-averaged model.
print("------- Vanilla avg model -------")
vanilla_hessian_kwargs = dict(
    datamodule_type=datamodule_type,
    model=vanilla_averaging_model,
    compute_density=False,
    figure_name='vanilla_avg.pdf',
)
hessian_comp = pyhessian_experiment.run_pyhessian(**vanilla_hessian_kwargs)

Seed set to 42


------- Vanilla avg model -------
Files already downloaded and verified
Files already downloaded and verified
The top Hessian eigenvalue of this model is 0.6501

***Trace:  6.66019786786342


In [8]:
# PyHessian experiment on model A after it has been aligned to model B.
print("------- Model A aligned to B -------")
aligned_hessian_kwargs = dict(
    datamodule_type=datamodule_type,
    model=modelA_aligned,
    compute_density=False,
    figure_name='modelA_aligned.pdf',
)
hessian_comp = pyhessian_experiment.run_pyhessian(**aligned_hessian_kwargs)

Seed set to 42


------- Model A aligned to B -------
Files already downloaded and verified
Files already downloaded and verified


  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


The top Hessian eigenvalue of this model is 11.7027

***Trace:  513.2556228637695


In [None]:
# Persist the state dicts of both base models and of the aligned model A.
# NOTE(review): this is a machine-specific absolute path; consider deriving it
# from a configurable project directory before sharing the notebook.
save_path = r'C:\Users\filos\OneDrive\Desktop\ETH\model-fusion\Resnet_cifar10_models'

# Join paths with pathlib instead of a hard-coded '\\' separator, and make
# sure the target directory exists so torch.save does not fail on a fresh
# checkout.
save_dir = Path(save_path)
save_dir.mkdir(parents=True, exist_ok=True)

modelA_name = 'model128A.t7'
modelB_name = 'model128B.t7'
modelA_aligned_name = 'model128A_aligned_to128B.t7'

for model_to_save, file_name in (
    (modelA, modelA_name),
    (modelB, modelB_name),
    (modelA_aligned, modelA_aligned_name),
):
    # Only the parameters (state_dict) are stored, not the full module object.
    torch.save(model_to_save.state_dict(), save_dir / file_name)