In [1]:
import os
import itertools
from attrs import define
# from code.train import train
from codes.train import train
from codes.optimizers import Optimizer
# from code.problems import Problem
from codes import Loss
from codes.datasets import Dataset
from codes.models import Model

# %matplotlib widget
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes so that edits to them are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def zip_dict(**kwargs):
    """Yield one dict per position, pairing every keyword with its value there.

    Each keyword argument must be an iterable.  The iterables are walked in
    lockstep (as with ``zip``), so iteration ends with the shortest one.

    Example: ``zip_dict(a=[1, 2], b=[3, 4])`` yields ``{'a': 1, 'b': 3}``
    followed by ``{'a': 2, 'b': 4}``.
    """
    names = tuple(kwargs)
    yield from (dict(zip(names, row)) for row in zip(*kwargs.values()))


def product_dict(**kwargs):
    """Yield one dict for every combination of the given keyword values.

    Each keyword argument must be an iterable of candidate values.  The
    Cartesian product is enumerated with ``itertools.product``, so the last
    keyword varies fastest.  With no keywords a single empty dict is yielded.

    Example: ``product_dict(a=[1, 2], b=['x'])`` yields ``{'a': 1, 'b': 'x'}``
    followed by ``{'a': 2, 'b': 'x'}``.
    """
    names = tuple(kwargs)
    for combo in itertools.product(*kwargs.values()):
        yield {name: value for name, value in zip(names, combo)}

In [3]:
# Process-wide runtime settings, exported as environment variables.
# NOTE(review): "AMD" does not look like one of the threading layers MKL
# documents for MKL_THREADING_LAYER -- confirm this value is intended.
os.environ.update({
    "MKL_THREADING_LAYER": "AMD",
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "TORCH_DEVICE": "cuda",
    "CUDA_VISIBLE_DEVICES": "4",
})
# CPU alternative kept from the original cell: same MKL_THREADING_LAYER and
# CUDA_DEVICE_ORDER values, with TORCH_DEVICE set to "cpu".

In [4]:
# MLflow-related settings, exported as environment variables.
# The experiment is named after the current working directory.
os.environ.update(
    MLFLOW_VERBOSE='True',
    # The original cell kept 'False' as a commented-out alternative.
    MLFLOW_CHECK_EXIST='True',
    MLFLOW_EXPERIMENT_NAME=os.path.basename(os.getcwd()),
)

# CIFAR10

In [5]:
@define
class BaseConfig:
    """Run configuration for the CIFAR10 experiments (ResNet18, cross-entropy loss).

    Instances are handed to ``train``.  The sweep code builds one from a grid
    entry and then overrides individual fields before each run.
    """

    # Number of epochs and the seed; the seed stays None unless supplied.
    nepochs: int = 50
    seed: int = None

    # Loss / model / dataset selectors.
    loss: Loss = Loss.CrossEntropyLoss
    model: Model = Model.ResNet18
    dataset: Dataset = Dataset.CIFAR10

    # Optimizer selector (None by default) with batch size and learning rate.
    optimizer: Optimizer = None
    batchsize: int = 500
    lr: float = 1e-5

    # Optimizer hyper-parameters.  The sweep resets the ones a given run does
    # not use to None.
    eps: float = 1e-4
    beta1_: float = 0.9
    beta2_: float = 0.999
    eta_: float = None
    
args_grid = dict(
    seed=[0],
    eta_=[1e-3, 1e-1, 0],
    eps=[1e-4, 1e-6, 1e-8, 1e-10],
)

os.environ['MLFLOW_RUN_TAGS'] = str(dict(about=f'full dataset'))

for d in product_dict(**args_grid):

    config = BaseConfig(**d)
    config.optimizer = None
    config.beta1_ = None
    config.beta2_ = None
    config.eta_ = None
    os.environ['MLFLOW_RUN_NAME'] = 'AdaGrad'
    %time train(config)

    config = BaseConfig(**d)
    config.optimizer = Optimizer.ADAM
    config.eta_ = None
    os.environ['MLFLOW_RUN_NAME'] = str(config.optimizer)
    %time train(config)

    config = BaseConfig(**d)
    config.optimizer = Optimizer.KATE
    config.beta1_ = None
    config.beta2_ = None
    os.environ['MLFLOW_RUN_NAME'] = str(config.optimizer)
    %time train(config)

CPU times: user 102 ms, sys: 16.3 ms, total: 118 ms
Wall time: 117 ms
CPU times: user 76.9 ms, sys: 32.2 ms, total: 109 ms
Wall time: 109 ms
CPU times: user 85.1 ms, sys: 26.8 ms, total: 112 ms
Wall time: 112 ms
CPU times: user 101 ms, sys: 12.1 ms, total: 113 ms
Wall time: 113 ms
CPU times: user 73.4 ms, sys: 36.3 ms, total: 110 ms
Wall time: 110 ms
CPU times: user 85.4 ms, sys: 23.5 ms, total: 109 ms
Wall time: 109 ms
CPU times: user 88.4 ms, sys: 20.1 ms, total: 108 ms
Wall time: 109 ms
CPU times: user 88.4 ms, sys: 20.1 ms, total: 109 ms
Wall time: 109 ms
CPU times: user 102 ms, sys: 7.68 ms, total: 110 ms
Wall time: 110 ms
CPU times: user 99.7 ms, sys: 11.9 ms, total: 112 ms
Wall time: 112 ms
CPU times: user 99.6 ms, sys: 11.9 ms, total: 111 ms
Wall time: 112 ms
CPU times: user 87.4 ms, sys: 23.9 ms, total: 111 ms
Wall time: 112 ms
CPU times: user 93.7 ms, sys: 16.3 ms, total: 110 ms
Wall time: 110 ms
CPU times: user 90.6 ms, sys: 19.8 ms, total: 110 ms
Wall time: 111 ms
CPU times

# Emotion

In [6]:
@define
class BaseConfig:
    """Run configuration for the Emotion experiments (BERT, cross-entropy loss).

    Instances are handed to ``train``.  The sweep code builds one from a grid
    entry and then overrides individual fields before each run.
    """

    # Number of epochs and the seed; the seed stays None unless supplied.
    nepochs: int = 20
    seed: int = None

    # Loss / model / dataset selectors.
    loss: Loss = Loss.CrossEntropyLoss
    model: Model = Model.BERT
    dataset: Dataset = Dataset.Emotion

    # Optimizer selector (None by default) with batch size and learning rate.
    optimizer: Optimizer = None
    batchsize: int = 160
    lr: float = 1e-5

    # Optimizer hyper-parameters.  The sweep resets the ones a given run does
    # not use to None.
    eps: float = 1e-4
    beta1_: float = 0.9
    beta2_: float = 0.999
    eta_: float = None
    
    
args_grid = dict(
    seed=[0],
    eta_=[1e-3, 1e-1, 0],
    eps=[1e-4, 1e-6, 1e-8, 1e-10], #for adam and adagrad
    # eps=[1e-3, 1e-4, 1e-5, 1e-6], #for kate
)

os.environ['MLFLOW_RUN_TAGS'] = str(dict(about=f'full dataset'))

for d in product_dict(**args_grid):

    config = BaseConfig(**d)
    config.optimizer = None
    config.beta1_ = None
    config.beta2_ = None
    config.eta_ = None
    os.environ['MLFLOW_RUN_NAME'] = 'AdaGrad'
    %time train(config)

    config = BaseConfig(**d)
    config.optimizer = Optimizer.ADAM
    config.eta_ = None
    os.environ['MLFLOW_RUN_NAME'] = str(config.optimizer)
    %time train(config)

    # config = BaseConfig(**d)
    # config.optimizer = Optimizer.KATE
    # # config.eps = 1e-4
    # config.beta1_ = None
    # config.beta2_ = None
    # os.environ['MLFLOW_RUN_NAME'] = str(config.optimizer)
    # %time train(config)

CPU times: user 90.4 ms, sys: 52 ms, total: 142 ms
Wall time: 141 ms
CPU times: user 89.4 ms, sys: 43.6 ms, total: 133 ms
Wall time: 134 ms
CPU times: user 92.5 ms, sys: 36.3 ms, total: 129 ms
Wall time: 128 ms
CPU times: user 89.3 ms, sys: 36.5 ms, total: 126 ms
Wall time: 127 ms
CPU times: user 107 ms, sys: 19.8 ms, total: 127 ms
Wall time: 127 ms
CPU times: user 97 ms, sys: 35.3 ms, total: 132 ms
Wall time: 134 ms


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


len(train_df)=16000
len(valid_df)=2000
len(test_df)=2000


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Step 2000: train-loss: 0.65920 train-accuracy: 80.44375 test-loss: 0.65817 test-accuracy: 81.25000
CPU times: user 4min 13s, sys: 1h 12min 38s, total: 1h 16min 52s
Wall time: 1h 16min 45s


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step 2000: train-loss: 0.01013 train-accuracy: 99.71875 test-loss: 0.30161 test-accuracy: 92.20000
CPU times: user 20min 14s, sys: 56min 53s, total: 1h 17min 7s
Wall time: 1h 16min 56s
CPU times: user 88.2 ms, sys: 48.2 ms, total: 136 ms
Wall time: 136 ms
CPU times: user 100 ms, sys: 32.1 ms, total: 132 ms
Wall time: 132 ms
CPU times: user 91.6 ms, sys: 37.1 ms, total: 129 ms
Wall time: 129 ms
CPU times: user 105 ms, sys: 26 ms, total: 131 ms
Wall time: 130 ms
CPU times: user 99.9 ms, sys: 30.7 ms, total: 131 ms
Wall time: 131 ms
CPU times: user 112 ms, sys: 20.2 ms, total: 132 ms
Wall time: 132 ms
CPU times: user 96.2 ms, sys: 36.4 ms, total: 133 ms
Wall time: 133 ms
CPU times: user 108 ms, sys: 26.1 ms, total: 134 ms
Wall time: 134 ms
CPU times: user 96.1 ms, sys: 35.6 ms, total: 132 ms
Wall time: 132 ms
CPU times: user 100 ms, sys: 31.4 ms, total: 131 ms
Wall time: 131 ms
CPU times: user 92.1 ms, sys: 40 ms, total: 132 ms
Wall time: 132 ms
CPU times: user 100 ms, sys: 32.3 ms, total