In [2]:
import submitit

import torch
import random
import numpy as np

import pickle
import itertools
import argparse
import logging
import os
import pathlib
import time
import json
import math
import matplotlib.pyplot as plt
from torch.utils import data
from fvcore.common.config import CfgNode
from submitit.core.utils import FailedJobError
from itertools import cycle
from time import sleep
from multi_objective.main import main, get_config
from plotting.plotting import load_data, plot_row, generate_table, generate_table_taskwise

<Figure size 432x288 with 0 Axes>

In [3]:
# Select the GPU *before* CUDA is initialised: the CUDA runtime reads these
# variables once at initialisation, so setting them after torch.cuda.init()
# has no effect on the devices this process sees.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '6'
# os.environ["CUDA_VISIBLE_DEVICES"]='2'  # alternative device
torch.cuda.init()
# Displayed as the cell output so a failed initialisation is visible.
torch.cuda.is_initialized()

In [4]:
# Seeds to launch; every `execute(...)` call below submits one run per entry.
# Full sweep: seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
seeds = [0]

# Shared submitit executor; it is created with "tmp/submitit" as its folder.
executor = submitit.AutoExecutor(folder="tmp/submitit")

In [5]:
# Parameters applied to every submission made through `executor`.
# NOTE(review): the `slurm_`-prefixed option is presumably only honoured when
# the executor resolves to Slurm - confirm against the submitit docs.
executor.update_parameters(
    timeout_min=240,
    name='results',
    gpus_per_node=2,
    slurm_array_parallelism=100,
)

In [6]:
def percent_finished(jobs):
    """Return the fraction (0.0-1.0, not a percentage) of finished jobs.

    Args:
        jobs: Sized collection of objects exposing a `done()` method.

    Returns:
        The sum of the `done()` results divided by `len(jobs)`, or `None`
        when `jobs` is empty.
    """
    total = len(jobs)
    if not total:
        return None

    finished = 0
    for job in jobs:
        finished += job.done()
    return finished / total

def execute(config, seeds, world_size=1):
    """Submit one job array that calls `main` for every seed and every rank.

    Args:
        config: Base configuration. It is cloned and never modified; it must
            provide `clone()`, `merge_from_list()` and an `lr` attribute.
        seeds: Iterable of integer seeds; one run is launched per seed.
        world_size: Number of ranks per run. The learning rate is multiplied
            by this value.

    Returns:
        The result of `executor.map_array` for the submitted array, with
        submissions ordered by seed first and rank second
        (`len(seeds) * world_size` submissions in total).
    """
    base_cfg = config.clone()
    base_cfg.eval_every = 100
    base_cfg.test_eval_every = 100  # generate test results
    base_cfg.metrics = ['mcr', 'mcr']
    base_cfg.lr *= world_size

    # Build one argument tuple per (seed, rank) pair. Zipping
    # cycle(range(world_size)) against the per-seed lists instead would stop
    # at the number of seeds: a single seed with world_size=2 would submit
    # rank 0 only, and several seeds would each get a different rank.
    # NOTE(review): assumes all ranks of one run share the same tag and that
    # `main` pairs up ranks submitted as separate array entries - confirm.
    ranks, world_sizes, cfgs, tags = [], [], [], []
    for seed in seeds:
        seed_cfg = base_cfg.clone()
        seed_cfg.merge_from_list(['seed', seed])
        for rank in range(world_size):
            ranks.append(rank)
            world_sizes.append(world_size)
            cfgs.append(seed_cfg)
            tags.append(f"result_{seed:02d}")

    # Positional arguments of main: rank, world_size, cfg, tag
    return executor.map_array(main, ranks, world_sizes, cfgs, tags)

# Baselines

#### cosmos

In [26]:
# Load a pickled submission file from a submitit folder so the arguments the
# job was launched with can be inspected (see `p_data.args`).
# NOTE(review): the path is absolute and machine-specific; it will not exist
# on another host. pickle.load can execute arbitrary code from the file, so
# only open files this project wrote itself.
with open('/lfs/local/0/nomir/moo-mtl/tmp/submitit/80522_submitted.pkl', 'rb') as file:
    p_data = pickle.load(file)

In [32]:
p_data.args

(0,
 2,
 CfgNode({'dataset': 'celeba', 'dim': (5, 64, 64), 'augment_dim_for_cosmos': False, 'objectives': ['BinaryCrossEntropyLoss', 'BinaryCrossEntropyLoss'], 'task_ids': [16, 22], 'model_name': 'efficientnet-b4', 'channel_multiplier': 1.0, 'epochs': 30, 'num_workers': 4, 'checkpointing': True, 'lr_scheduler': 'CosineAnnealing', 'lr': 0.001, 'weight_decay': 0.0075, 'batch_size': 128, 'method': 'cosmos_orig', 'num_models': 5, 'approximate_mgda': False, 'normalization_type': 'none', 'alpha': 1.2, 'internal_solver_phn': 'linear', 'lamda': 2.0, 'population_size': 100, 'n_offsprings': 20, 'task_id': None, 'seed': 0, 'logdir': 'results', 'n_partitions': 24, 'eval_every': 100, 'train_eval_every': 0, 'test_eval_every': 100, 'reference_point': [1, 1], 'device': 'cuda', 'metrics': ['mcr', 'mcr']}),
 'result_00')

In [17]:
jobs_big = execute(get_config('configs/baselines/celeba/cosmos_orig.yaml'), seeds, world_size=2)

In [8]:
jobs = execute(get_config('configs/baselines/celeba/cosmos_orig.yaml'), seeds)

In [34]:
percent_finished(jobs)

1.0

In [20]:
percent_finished(jobs_big)

0.0

In [17]:
jobs_big[0].cancel()

Traceback (most recent call last):
  File "/lfs/turing3/0/nomir/mambaforge/envs/moo-mtl/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/lfs/turing3/0/nomir/mambaforge/envs/moo-mtl/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/lfs/turing3/0/nomir/mambaforge/envs/moo-mtl/lib/python3.9/site-packages/submitit/local/_local.py", line 16, in <module>
    controller.run()
  File "/lfs/turing3/0/nomir/mambaforge/envs/moo-mtl/lib/python3.9/site-packages/submitit/local/local.py", line 322, in run
    exit_codes = self.wait()
  File "/lfs/turing3/0/nomir/mambaforge/envs/moo-mtl/lib/python3.9/site-packages/submitit/local/local.py", line 313, in wait
    time.sleep(1.0 / freq)
KeyboardInterrupt


jobs = execute(get_config('configs/multi_fashion/cosmos.yaml'), seeds)

In [36]:
jobs = execute(get_config('configs/multi_fashion_mnist/cosmos.yaml'), seeds)

In [7]:
jobs = execute(get_config('configs/multi_mnist/cosmos_orig.yaml'), seeds)

In [38]:
jobs = execute(get_config('configs/multi_fashion/cosmos_orig.yaml'), seeds)

In [39]:
jobs = execute(get_config('configs/multi_fashion_mnist/cosmos_orig.yaml'), seeds)

#### mgda

In [40]:
jobs = execute(get_config('configs/multi_mnist/mgda.yaml'), seeds)

In [41]:
jobs = execute(get_config('configs/multi_fashion/mgda.yaml'), seeds)

In [42]:
jobs = execute(get_config('configs/multi_fashion_mnist/mgda.yaml'), seeds)

#### phn

In [43]:
jobs = execute(get_config('configs/multi_mnist/phn.yaml'), seeds)

In [44]:
jobs = execute(get_config('configs/multi_fashion/phn.yaml'), seeds)

In [45]:
jobs = execute(get_config('configs/multi_fashion_mnist/phn.yaml'), seeds)

In [46]:
jobs = execute(get_config('configs/multi_mnist/phn_orig.yaml'), seeds)

In [47]:
jobs = execute(get_config('configs/multi_fashion/phn_orig.yaml'), seeds)

In [48]:
jobs = execute(get_config('configs/multi_fashion_mnist/phn_orig.yaml'), seeds)

#### pmtl

In [49]:
jobs = execute(get_config('configs/multi_mnist/pmtl.yaml'), seeds)

In [50]:
jobs = execute(get_config('configs/multi_fashion/pmtl.yaml'), seeds)

In [51]:
jobs = execute(get_config('configs/multi_fashion_mnist/pmtl.yaml'), seeds)

#### single task

In [6]:
# Launch the two celeba single-task baselines with different
# CUDA_VISIBLE_DEVICES values.
# NOTE(review): presumably the launched jobs inherit this environment - confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = '5'
jobs = execute(get_config('configs/baselines/celeba/single_task_1.yaml'), seeds)
os.environ["CUDA_VISIBLE_DEVICES"] = '9'
# `jobs + execute(...)` only displayed the combined list; store it so both
# runs stay tracked in `jobs`.
jobs += execute(get_config('configs/baselines/celeba/single_task_2.yaml'), seeds)
jobs

An exception occurred in telemetry logging.Disabling telemetry to prevent further exceptions.
Traceback (most recent call last):
  File "/lfs/turing3/0/nomir/mambaforge/envs/moo-mtl/lib/python3.9/site-packages/iopath/common/file_io.py", line 946, in __log_tmetry_keys
    handler.log_event()
  File "/lfs/turing3/0/nomir/mambaforge/envs/moo-mtl/lib/python3.9/site-packages/iopath/common/event_logger.py", line 97, in log_event
    del self._evt
AttributeError: _evt


[LocalJob<job_id=40188, task_id=0, state="RUNNING">,
 LocalJob<job_id=40190, task_id=0, state="RUNNING">]

In [52]:
# Keep the handles of both single-task runs; reassigning `jobs` dropped the
# task-1 jobs, so they could no longer be monitored or cancelled.
jobs = execute(get_config('configs/multi_mnist/single_task_1.yaml'), seeds)
jobs += execute(get_config('configs/multi_mnist/single_task_2.yaml'), seeds)

In [53]:
# Keep the handles of both single-task runs; reassigning `jobs` dropped the
# task-1 jobs, so they could no longer be monitored or cancelled.
jobs = execute(get_config('configs/multi_fashion/single_task_1.yaml'), seeds)
jobs += execute(get_config('configs/multi_fashion/single_task_2.yaml'), seeds)

In [54]:
# Keep the handles of both single-task runs; reassigning `jobs` dropped the
# task-1 jobs, so they could no longer be monitored or cancelled.
jobs = execute(get_config('configs/multi_fashion_mnist/single_task_1.yaml'), seeds)
jobs += execute(get_config('configs/multi_fashion_mnist/single_task_2.yaml'), seeds)

#### uniform

In [55]:
jobs = execute(get_config('configs/multi_mnist/uniform.yaml'), seeds)

In [56]:
jobs = execute(get_config('configs/multi_fashion/uniform.yaml'), seeds)

In [57]:
jobs = execute(get_config('configs/multi_fashion_mnist/uniform.yaml'), seeds)

#### linear scalarization

In [14]:
jobs = execute(get_config('configs/baselines/celeba/linear_scalarization.yaml'), seeds)

In [20]:
percent_finished(jobs)

0.0

In [13]:
jobs[0].result()

FailedJobError: Job (task=0) failed during processing with trace:
----------------------
Traceback (most recent call last):
  File "/lfs/turing3/0/nomir/mambaforge/envs/moo-mtl/lib/python3.9/site-packages/submitit/core/submission.py", line 53, in process_job
    result = delayed.result()
  File "/lfs/turing3/0/nomir/mambaforge/envs/moo-mtl/lib/python3.9/site-packages/submitit/core/utils.py", line 126, in result
    self._result = self.function(*self.args, **self.kwargs)
  File "/lfs/turing3/0/nomir/moo-mtl/multi_objective/main.py", line 319, in main
    loss = method.step(batch)
  File "/lfs/turing3/0/nomir/moo-mtl/multi_objective/methods/linear_scalarization.py", line 48, in step
    loss_total = torch.sum(task_losses)
TypeError: sum(): argument 'input' (position 1) must be Tensor, not list

----------------------
You can check full logs with 'job.stderr(0)' and 'job.stdout(0)'or at paths:
  - /lfs/turing3/0/nomir/moo-mtl/tmp/submitit/18595_0_log.err
  - /lfs/turing3/0/nomir/moo-mtl/tmp/submitit/18595_0_log.out

## Results
#### Loss

In [5]:
results = load_data(dirname='results')
plot_row(results, prefix='baselines')
generate_table(results, name='tab-baselines_loss')
generate_table_taskwise(results, name='tab-baselines_loss_tw')

loaded data for multi_mnist
loaded data for multi_fashion
loaded data for multi_fashion_mnist
success. See baselines.pdf


#### MCR

In [6]:
results = load_data(dirname='results', custom_metric=True)
plot_row(results, prefix='baselines_mcr')
generate_table(results, name='tab-baselines_mcr')
generate_table_taskwise(results, name='tab-baselines_mcr_tw')

loaded data for multi_mnist
loaded data for multi_fashion
loaded data for multi_fashion_mnist
success. See baselines_mcr.pdf


# Different sizes

In [25]:
executor.update_parameters(timeout_min=400)

#### size 50

In [18]:
jobs = execute(get_config('size_50_configs/multi_fashion/uniform.yaml'), seeds)

In [19]:
# Size-50 single-task runs: accumulate both job lists instead of overwriting
# the task-1 handles with the task-2 ones.
jobs = execute(get_config('size_50_configs/multi_fashion/single_task_1.yaml'), seeds)
jobs += execute(get_config('size_50_configs/multi_fashion/single_task_2.yaml'), seeds)

In [20]:
jobs = execute(get_config('size_50_configs/multi_fashion_mnist/uniform.yaml'), seeds)

In [21]:
# Size-50 single-task runs: accumulate both job lists instead of overwriting
# the task-1 handles with the task-2 ones.
jobs = execute(get_config('size_50_configs/multi_fashion_mnist/single_task_1.yaml'), seeds)
jobs += execute(get_config('size_50_configs/multi_fashion_mnist/single_task_2.yaml'), seeds)

In [26]:
jobs = execute(get_config('size_50_configs/multi_mnist/uniform.yaml'), seeds)

In [27]:
# Size-50 single-task runs: accumulate both job lists instead of overwriting
# the task-1 handles with the task-2 ones.
jobs = execute(get_config('size_50_configs/multi_mnist/single_task_1.yaml'), seeds)
jobs += execute(get_config('size_50_configs/multi_mnist/single_task_2.yaml'), seeds)

#### size 10

In [22]:
jobs = execute(get_config('size_10_configs/multi_fashion/uniform.yaml'), seeds)

In [23]:
# Size-10 single-task runs: accumulate both job lists instead of overwriting
# the task-1 handles with the task-2 ones.
jobs = execute(get_config('size_10_configs/multi_fashion/single_task_1.yaml'), seeds)
jobs += execute(get_config('size_10_configs/multi_fashion/single_task_2.yaml'), seeds)

In [10]:
jobs = execute(get_config('size_10_configs/multi_fashion_mnist/uniform.yaml'), seeds)

In [25]:
# Size-10 single-task runs: accumulate both job lists instead of overwriting
# the task-1 handles with the task-2 ones.
jobs = execute(get_config('size_10_configs/multi_fashion_mnist/single_task_1.yaml'), seeds)
jobs += execute(get_config('size_10_configs/multi_fashion_mnist/single_task_2.yaml'), seeds)

In [26]:
jobs = execute(get_config('size_10_configs/multi_mnist/uniform.yaml'), seeds)

In [27]:
# Size-10 single-task runs: accumulate both job lists instead of overwriting
# the task-1 handles with the task-2 ones.
jobs = execute(get_config('size_10_configs/multi_mnist/single_task_1.yaml'), seeds)
jobs += execute(get_config('size_10_configs/multi_mnist/single_task_2.yaml'), seeds)

#### size 0.5

In [28]:
jobs = execute(get_config('size_0.5_configs/multi_fashion/uniform.yaml'), seeds)

In [29]:
# Size-0.5 single-task runs: accumulate both job lists instead of overwriting
# the task-1 handles with the task-2 ones.
jobs = execute(get_config('size_0.5_configs/multi_fashion/single_task_1.yaml'), seeds)
jobs += execute(get_config('size_0.5_configs/multi_fashion/single_task_2.yaml'), seeds)

In [30]:
jobs = execute(get_config('size_0.5_configs/multi_fashion_mnist/uniform.yaml'), seeds)

In [31]:
# Size-0.5 single-task runs: accumulate both job lists instead of overwriting
# the task-1 handles with the task-2 ones.
jobs = execute(get_config('size_0.5_configs/multi_fashion_mnist/single_task_1.yaml'), seeds)
jobs += execute(get_config('size_0.5_configs/multi_fashion_mnist/single_task_2.yaml'), seeds)

In [32]:
jobs = execute(get_config('size_0.5_configs/multi_mnist/uniform.yaml'), seeds)

In [33]:
# Size-0.5 single-task runs: accumulate both job lists instead of overwriting
# the task-1 handles with the task-2 ones.
jobs = execute(get_config('size_0.5_configs/multi_mnist/single_task_1.yaml'), seeds)
jobs += execute(get_config('size_0.5_configs/multi_mnist/single_task_2.yaml'), seeds)

In [9]:
jobs = execute(get_config('size_10_configs/multi_fashion_mnist/single_task_2.yaml'), seeds)

## Results
#### MCR, size 0.5

In [13]:
results = load_data(dirname='results_size_0.5', custom_metric=True)
plot_row(results, prefix='size_0.5')
generate_table(results, name='tab-size_0.5')

loaded data for multi_mnist
loaded data for multi_fashion
loaded data for multi_fashion_mnist
success. See size_0.5.pdf


#### MCR, size 10

In [14]:
results = load_data(dirname='results_size_10', custom_metric=True)
plot_row(results, prefix='size_10')
generate_table(results, name='tab-size_10')

loaded data for multi_mnist
loaded data for multi_fashion
loaded data for multi_fashion_mnist
success. See size_10.pdf


#### MCR, size 50

In [28]:
results = load_data(dirname='results_size_50', custom_metric=True)
plot_row(results, prefix='size_50')
generate_table(results, name='tab-size_50')

loaded data for multi_mnist
loaded data for multi_fashion
loaded data for multi_fashion_mnist
success. See size_50.pdf


# Grid search

In [11]:
jobs = execute(get_config('grid_configs/multi_fashion/uniform.yaml'), seeds)

In [12]:
# Grid-search single-task runs: accumulate both job lists instead of
# overwriting the task-1 handles with the task-2 ones.
jobs = execute(get_config('grid_configs/multi_fashion/single_task_1.yaml'), seeds)
jobs += execute(get_config('grid_configs/multi_fashion/single_task_2.yaml'), seeds)

In [13]:
jobs = execute(get_config('grid_configs/multi_fashion_mnist/uniform.yaml'), seeds)

In [14]:
# Grid-search single-task runs: accumulate both job lists instead of
# overwriting the task-1 handles with the task-2 ones.
jobs = execute(get_config('grid_configs/multi_fashion_mnist/single_task_1.yaml'), seeds)
jobs += execute(get_config('grid_configs/multi_fashion_mnist/single_task_2.yaml'), seeds)

In [15]:
jobs = execute(get_config('grid_configs/multi_mnist/uniform.yaml'), seeds)

In [16]:
# Grid-search single-task runs: accumulate both job lists instead of
# overwriting the task-1 handles with the task-2 ones.
jobs = execute(get_config('grid_configs/multi_mnist/single_task_1.yaml'), seeds)
jobs += execute(get_config('grid_configs/multi_mnist/single_task_2.yaml'), seeds)

## Results
#### MCR

In [11]:
results = load_data(dirname='results_grid', custom_metric=True)
plot_row(results, prefix='grid-mcr')
generate_table(results, name='tab-grid')

loaded data for multi_mnist
loaded data for multi_fashion
loaded data for multi_fashion_mnist
success. See grid-mcr.pdf
