Merged
Changes from all commits
56 commits
0f20678  set max split size (priyakasimbeg, Sep 26, 2023)
6cf192a  tune max split size (priyakasimbeg, Sep 28, 2023)
b2f8ff9  typo (priyakasimbeg, Sep 28, 2023)
70625b0  add back deleted block (priyakasimbeg, Sep 28, 2023)
179abba  undo disable torch compile for conformer (priyakasimbeg, Sep 28, 2023)
a2beafb  remove whitespace (priyakasimbeg, Sep 28, 2023)
255d835  remove trailing whitespace (priyakasimbeg, Sep 28, 2023)
b7f4cbc  isort fix (priyakasimbeg, Sep 28, 2023)
bb29602  formatting (priyakasimbeg, Sep 28, 2023)
3738f35  print step hint (priyakasimbeg, Sep 28, 2023)
7f7891d  minor (pomonam, Oct 2, 2023)
f081dd1  minor (pomonam, Oct 2, 2023)
62d9ad7  minor (pomonam, Oct 2, 2023)
19accc7  Lint fix (pomonam, Oct 2, 2023)
a1844c7  Lint fix (pomonam, Oct 2, 2023)
0e4dd85  make pytorch cuda alloc config specific to conformer (priyakasimbeg, Oct 6, 2023)
da89a8b  tune max split size (priyakasimbeg, Oct 6, 2023)
416b88d  fix (priyakasimbeg, Oct 7, 2023)
4600d78  reduce max split size (priyakasimbeg, Oct 7, 2023)
7a764e1  move env var (priyakasimbeg, Oct 7, 2023)
1dbf3e4  logging (priyakasimbeg, Oct 7, 2023)
04f5c94  debugging (priyakasimbeg, Oct 7, 2023)
b0b9f40  debugging (priyakasimbeg, Oct 7, 2023)
318202e  debug logging (priyakasimbeg, Oct 7, 2023)
3cec8c5  update (priyakasimbeg, Oct 7, 2023)
4fc6e1c  update_logging (priyakasimbeg, Oct 7, 2023)
557bf0d  fix (priyakasimbeg, Oct 7, 2023)
2598d39  fix (priyakasimbeg, Oct 7, 2023)
9418f4f  fix (priyakasimbeg, Oct 7, 2023)
931337d  remove logging (priyakasimbeg, Oct 9, 2023)
aeed475  revert checkpoint utils debugging (priyakasimbeg, Oct 9, 2023)
7098843  extend max_allowed_runtime_sec for conformer (priyakasimbeg, Oct 9, 2023)
cb68dba  Merge branch 'dev' into conformer_oom_debugging_2 (priyakasimbeg, Oct 9, 2023)
e663f8d  update speech targets (priyakasimbeg, Oct 9, 2023)
fe2f560  Initial Commit (pomonam, Oct 10, 2023)
28da089  Merge pull request #538 from mlcommons/speech_targets_update (priyakasimbeg, Oct 10, 2023)
e139cdc  Set criteo test targets based on external runs (#541) (priyakasimbeg, Oct 11, 2023)
7251992  move logging in tuning loop (priyakasimbeg, Oct 11, 2023)
6399058  Merge pull request #543 from mlcommons/logging_fix (priyakasimbeg, Oct 11, 2023)
24edc3b  Merge branch 'dev' into conformer_oom_debugging_2 (priyakasimbeg, Oct 11, 2023)
09ceeec  remove conformer oom fixes from this branch (priyakasimbeg, Oct 11, 2023)
a0b624e  lint (priyakasimbeg, Oct 11, 2023)
39ac8af  Initial Commit (pomonam, Oct 12, 2023)
66af503  Add platform (pomonam, Oct 12, 2023)
b6af8a0  minor (pomonam, Oct 12, 2023)
df2b1fa  minor (pomonam, Oct 12, 2023)
bdce3a6  Lint fix (pomonam, Oct 12, 2023)
59741e0  Lint fix (pomonam, Oct 12, 2023)
3237d65  Replace order (pomonam, Oct 12, 2023)
3706caf  Add discarded workloads (pomonam, Oct 12, 2023)
dd72874  Merge pull request #546 from mlcommons/juhan/warning (priyakasimbeg, Oct 12, 2023)
abecde1  Merge pull request #547 from mlcommons/juhan/target_setting (priyakasimbeg, Oct 12, 2023)
061d5b3  pr feedback (priyakasimbeg, Oct 13, 2023)
a4bb0f0  isort (priyakasimbeg, Oct 13, 2023)
2f5774b  Merge pull request #544 from mlcommons/meta_data_logging_fix (priyakasimbeg, Oct 16, 2023)
45a7730  Merge pull request #525 from mlcommons/juhan/db_cri (priyakasimbeg, Oct 16, 2023)
CONTRIBUTING.md (2 changes: 1 addition & 1 deletion)

@@ -216,7 +216,7 @@ pylint tests

## Unit and integration tests
We run unit tests and integration tests as part of the GitHub Actions as well.
-You can also use `python tests/reference_algorithm_tests.py` to run a single model update and two model evals for each workload using the reference algorithm in `reference_algorithms/development_algorithms/`.
+You can also use `python tests/reference_algorithm_tests.py` to run a single model update and two model evals for each workload using the reference algorithm in `reference_algorithms/target_setting_algorithms/`.

## Regression tests
We also have regression tests available in [.github/workflows/regression_tests.yml](https://github.com/mlcommons/algorithmic-efficiency/tree/main/.github/workflows/regression_tests.yml) that can be run semi-automatically.
algorithmic_efficiency/logger_utils.py (27 changes: 12 additions & 15 deletions)

@@ -9,7 +9,7 @@
import shutil
import subprocess
import sys
-from typing import Any, Optional
+from typing import Any, Dict, Optional

from absl import flags
from clu import metric_writers
@@ -96,14 +96,14 @@ def write_hparams(hparams: spec.Hyperparameters,
return hparams


-def write_json(name: str, log_dict: dict, indent: int = 2) -> None:
+def write_json(name: str, log_dict: Dict, indent: int = 2) -> None:
if RANK == 0:
with open(name, 'w') as f:
f.write(json.dumps(log_dict, indent=indent))


def write_to_csv(
-metrics: dict,
+metrics: Dict,
csv_path: str,
) -> None:
try:
@@ -120,7 +120,7 @@ def write_to_csv(
return


-def _get_utilization() -> dict:
+def _get_utilization() -> Dict:
util_data = {}

# CPU
@@ -180,7 +180,7 @@ def _get_utilization() -> dict:
return util_data


-def _get_system_hardware_info() -> dict:
+def _get_system_hardware_info() -> Dict:
system_hardware_info = {}
try:
system_hardware_info['cpu_model_name'] = _get_cpu_model_name()
@@ -200,7 +200,7 @@ def _get_system_hardware_info() -> dict:
return system_hardware_info


-def _get_system_software_info() -> dict:
+def _get_system_software_info() -> Dict:
system_software_info = {}

system_software_info['os_platform'] = \
@@ -243,7 +243,7 @@ def _is_primitive_type(item: Any) -> bool:
return isinstance(item, primitive)


-def _get_workload_properties(workload: spec.Workload) -> dict:
+def _get_workload_properties(workload: spec.Workload) -> Dict:
workload_properties = {}
skip_list = ['param_shapes', 'model_params_types']
keys = [
@@ -262,7 +262,8 @@ def _get_workload_properties(workload: spec.Workload) -> dict:
return workload_properties


-def get_meta_data(workload: spec.Workload) -> dict:
+def get_meta_data(workload: spec.Workload,
+                  rng_seed: Optional[int] = None) -> Dict:
meta_data = {}
workload_properties = _get_workload_properties(workload)
meta_data.update(workload_properties)
Expand All @@ -272,15 +273,11 @@ def get_meta_data(workload: spec.Workload) -> dict:
meta_data.update(system_software_info)
system_hardware_info = _get_system_hardware_info()
meta_data.update(system_hardware_info)
+if rng_seed is not None:
+  meta_data.update({'rng_seed': rng_seed})
return meta_data


-def save_meta_data(workload: spec.Workload, rng_seed: int, meta_file_name: str):
-  meta_data = get_meta_data(workload)
-  meta_data.update({'rng_seed': rng_seed})
-  write_json(meta_file_name, meta_data)


class MetricLogger(object):
"""Used to log all measurements during training.

@@ -308,7 +305,7 @@ def __init__(self,
wandb.config.update(hyperparameters._asdict())

def append_scalar_metrics(self,
-metrics: dict,
+metrics: Dict,
global_step: int,
preemption_count: Optional[int] = None,
is_eval: bool = False) -> None:
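Taken together, these hunks fold the deleted `save_meta_data` helper into `get_meta_data` via the new optional `rng_seed` argument, leaving the caller to write the JSON itself. A minimal sketch of the resulting call pattern; the `workload` object, seed value, and output path below are illustrative stand-ins, not taken from this PR:

```python
from algorithmic_efficiency import logger_utils

# Before this change: logger_utils.save_meta_data(workload, rng_seed, meta_file_name)
# After: build the metadata dict (rng_seed is optional) and write it explicitly.
meta_data = logger_utils.get_meta_data(workload, rng_seed=1996)
logger_utils.write_json('/tmp/experiment/meta_data.json', meta_data)
```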
algorithmic_efficiency/pytorch_utils.py (3 changes: 1 addition & 2 deletions)

@@ -27,9 +27,8 @@ def pytorch_init(use_pytorch_ddp: bool, rank: int, profiler: Profiler) -> None:
# Make sure no GPU memory is preallocated to Jax.
os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = 'false'
# Only use CPU for Jax to avoid memory issues.
-# Setting the corresponding environment variable here has no effect; it has to be done before jax and tensorflow (!) are imported for the first time.
-jax.config.update('jax_platforms', 'cpu')
+jax.config.update('jax_platform_name', 'cpu')
# From the docs: "(...) causes cuDNN to benchmark multiple convolution
# algorithms and select the fastest."
torch.backends.cudnn.benchmark = True
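For context on why `pytorch_init` touches JAX at all: the PyTorch workloads still import JAX, so it has to be kept off the accelerator or it competes with PyTorch for GPU memory. A rough sketch of the pattern, including the import-ordering caveat the deleted comment warned about; everything beyond the two configuration lines from the diff is illustrative:

```python
import os

# Must be set before JAX is first used, or JAX preallocates GPU memory.
os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = 'false'

import jax
import torch

# Pin JAX to the CPU so PyTorch keeps exclusive use of the GPUs.
jax.config.update('jax_platform_name', 'cpu')

print(jax.devices())              # e.g. [CpuDevice(id=0)]
print(torch.cuda.is_available())  # GPUs remain free for PyTorch
```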
algorithmic_efficiency/workloads/criteo1tb/criteo1tb_jax/workload.py

@@ -6,6 +6,7 @@
from flax import jax_utils
import jax
import jax.numpy as jnp
+import numpy as np

from algorithmic_efficiency import param_utils
from algorithmic_efficiency import spec
@@ -147,7 +148,8 @@ def _eval_batch(self,
batch: Dict[str, spec.Tensor]) -> spec.Tensor:
# We do NOT psum inside of _eval_batch_pmapped, so the returned tensor of
# shape (local_device_count,) will all be different values.
-return self._eval_batch_pmapped(params, batch).sum()
+return np.array(
+    self._eval_batch_pmapped(params, batch).sum(), dtype=np.float64)


class Criteo1TbDlrmSmallTestWorkload(Criteo1TbDlrmSmallWorkload):
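The `np.float64` cast is about accumulation precision: `_eval_batch` returns a per-batch scalar that the eval loop then sums over many batches, and JAX computes in float32 by default. A toy illustration of the drift this avoids (all numbers invented):

```python
import numpy as np

# Pretend per-batch summed losses from a long eval loop.
batch_losses = np.full(100_000, 0.1234567, dtype=np.float32)

total32 = np.float32(0.0)
total64 = np.float64(0.0)
for loss in batch_losses:
  total32 += loss              # float32 rounding error compounds
  total64 += np.float64(loss)  # cast each term, accumulate in float64

print(total32)  # noticeably off from the exact 12345.67
print(total64)  # close to the exact 12345.67
```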
algorithmic_efficiency/workloads/criteo1tb/workload.py (4 changes: 2 additions & 2 deletions)

@@ -35,14 +35,14 @@ def has_reached_validation_target(self, eval_result: Dict[str,

@property
def validation_target_value(self) -> float:
-return 0.123649
+return 0.123735

def has_reached_test_target(self, eval_result: Dict[str, float]) -> bool:
return eval_result['test/loss'] < self.test_target_value

@property
def test_target_value(self) -> float:
-return 0.126060
+return 0.126041

@property
def loss_type(self) -> spec.LossType:
algorithmic_efficiency/workloads/librispeech_conformer/workload.py

@@ -19,14 +19,14 @@ def has_reached_validation_target(self, eval_result: Dict[str,

@property
def validation_target_value(self) -> float:
-return 0.084952
+return 0.085884

def has_reached_test_target(self, eval_result: Dict[str, float]) -> bool:
return eval_result['test/wer'] < self.test_target_value

@property
def test_target_value(self) -> float:
-return 0.053000
+return 0.052981

@property
def loss_type(self) -> spec.LossType:
algorithmic_efficiency/workloads/librispeech_deepspeech/librispeech_jax/workload.py

@@ -53,11 +53,11 @@ def is_output_params(self, param_key: spec.ParameterKey) -> bool:

@property
def validation_target_value(self) -> float:
-return 0.118232
+return 0.119936

@property
def test_target_value(self) -> float:
-return 0.073397
+return 0.074143

@property
def step_hint(self) -> int:
algorithmic_efficiency/workloads/librispeech_deepspeech/librispeech_pytorch/workload.py

@@ -62,11 +62,11 @@ def is_output_params(self, param_key: spec.ParameterKey) -> bool:

@property
def validation_target_value(self) -> float:
-return 0.118232
+return 0.119936

@property
def test_target_value(self) -> float:
-return 0.073397
+return 0.074143

@property
def step_hint(self) -> int:
reference_algorithms/development_algorithms/README.md (5 changes: 0 additions & 5 deletions)

This file was deleted.

This file was deleted.
