In [1]:
# set cuda visible devices
def is_notebook() -> bool:
    try:
        shell = get_ipython().__class__.__name__
        if shell == 'ZMQInteractiveShell':
            return True   # Jupyter notebook or qtconsole
        elif shell == 'TerminalInteractiveShell':
            return False  # Terminal running IPython
        else:
            return False  # Other type (?)
    except NameError:
        return False      # Probably standard Python interpreter

import os
if is_notebook():
    os.environ["CUDA_VISIBLE_DEVICES"] = "" #"1"
    # os.environ['CUDA_LAUNCH_BLOCKING']="1"
    # os.environ['TORCH_USE_CUDA_DSA'] = "1"

import matplotlib 
if not is_notebook():
    matplotlib.use('Agg')

In [2]:
import os
import math
import json
import random as rnd
from typing import Optional, Callable
from tqdm import tqdm
from collections import defaultdict
from functools import partial
from datetime import datetime
import itertools
from dataclasses import dataclass
from pathlib import Path
from enum import Enum

from omegaconf import OmegaConf
import submitit
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split, TensorDataset
import matplotlib.pyplot as plt
import pandas as  pd
import torchvision.utils as vision_utils
from PIL import Image
import torchvision
from torchvision import transforms
from matplotlib.ticker import NullFormatter

from losses.divdis import DivDisLoss 
from losses.divdis import DivDisLoss
from losses.ace import ACELoss
from losses.dbat import DBatLoss
from losses.loss_types import LossType

from models.backbone import MultiHeadBackbone
from models.multi_model import MultiNetModel
from models.lenet import LeNet
from utils.utils import conf_to_args


In [3]:
@dataclass
class Experiment:
    seed: int
    loss_type: LossType
    model: str
    mix_rate: float
    mix_rate_lower_bound: float
    epochs: int

In [8]:
seeds = [0]
losses = [LossType.PROB, LossType.DIVDIS, LossType.EXP]
models = ["Resnet50"]
# mix rates and lower bounds (same mix rate and lower bound, fixed lower bound and varying mix rate)
mix_rates = [0.1, 0.25, 0.5, 0.75, 0.9, 1.0]
mix_rate_lower_bounds = [0.1] # [0.1, 0.5, 1.0]
same_mix_rate_and_lower_bounds = [
    (mix_rate, mix_rate) for mix_rate in mix_rates
]
fixed_lower_bounds_and_mix_rates = list(itertools.product(
    mix_rate_lower_bounds, mix_rates
))
mix_rates_and_lower_bounds = same_mix_rate_and_lower_bounds + fixed_lower_bounds_and_mix_rates

# mix_rate_to_epoch: dict[float, int] = {
#     0.1: 200,
#     0.25: 100,
#     0.5: 20,
#     0.75: 20,
#     0.9: 20,
#     1.0: 20
# }
mix_rate_to_epoch = defaultdict(lambda: 5)
experiments: list[Experiment] = []

for seed in seeds:
    for loss in losses:
        for model in models:
            for mix_rate, mix_rate_lower_bound in mix_rates_and_lower_bounds:
                epochs = mix_rate_to_epoch[mix_rate]
                experiments.append(Experiment(seed, loss, model, mix_rate, mix_rate_lower_bound, epochs))
print(len(experiments))

36


In [9]:
out_dir = Path("output_logs/cifar_mnist_sweep")
out_dir.mkdir(exist_ok=True, parents=True)
def get_executor(out_dir: Path):
    executor = submitit.AutoExecutor(folder=out_dir)
    executor.update_parameters(
        timeout_min=60 * 48,
        mem_gb=16,
        gres="gpu:1",
        cpus_per_task=4,
        nodes=1,
        slurm_qos="high",
        slurm_array_parallelism=8
    )
def get_executor_local(out_dir: Path):
    executor = submitit.LocalExecutor(folder=out_dir)
    executor.update_parameters(
        timeout_min=60 * 48,
    )
    return executor

script_name = "cifar_mnist.py"
def run_experiments(executor, experiments: list[Experiment], script_name: str):

    with executor.batch():
        jobs = []
        for exp in experiments:
            function = submitit.helpers.CommandFunction(
                ["python", script_name] + conf_to_args(exp.__dict__)
            )
            jobs.append(executor.submit(function))
    return jobs

executor = get_executor(out_dir)
# executor = get_executor_local(out_dir)
jobs = run_experiments(executor, experiments, script_name)


In [11]:
import signal
def kill_local_jobs(out_dir: str):
    # Initialize the executor to get access to job information
    
    # Get all job IDs
    job_ids = executor.list_jobs()
    
    for job_id in job_ids:
        try:
            job = executor.get_job(job_id)
            pid = job.pid()
            if pid is not None:
                print(f"Killing job {job_id} with PID {pid}")
                os.kill(pid, signal.SIGTERM)
            else:
                print(f"Job {job_id} has no PID (might have already finished)")
        except Exception as e:
            print(f"Error killing job {job_id}: {e}")

    print("Attempted to kill all local jobs")
kill_local_jobs(out_dir)

AttributeError: 'LocalExecutor' object has no attribute 'list_jobs'