From cc8b820af5d698befe4fd19af4152964f1614135 Mon Sep 17 00:00:00 2001
From: Juhan Bae
Date: Mon, 25 Sep 2023 15:44:39 -0400
Subject: [PATCH 1/2] minor

---
 .../workloads/criteo1tb/criteo1tb_pytorch/workload.py  | 5 +++--
 algorithmic_efficiency/workloads/criteo1tb/workload.py | 7 +++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py b/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py
index 993d82c9d..c514d0a9c 100644
--- a/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py
+++ b/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py
@@ -136,6 +136,8 @@ def _build_input_queue(
       cache: Optional[bool] = None,
       repeat_final_dataset: Optional[bool] = None,
       num_batches: Optional[int] = None) -> Iterator[Dict[str, spec.Tensor]]:
+    del num_batches
+
     not_train = split != 'train'
     per_device_batch_size = int(global_batch_size / N_GPUS)
 
@@ -147,7 +149,6 @@ def _build_input_queue(
           split=split,
           data_dir=data_dir,
           global_batch_size=global_batch_size,
-          num_batches=num_batches,
           repeat_final_dataset=repeat_final_dataset)
     weights = None
     while True:
@@ -233,7 +234,7 @@ def _eval_batch(self,
     summed_loss = self.loss_fn(
         label_batch=batch['targets'], logits_batch=logits,
         mask_batch=weights)['summed']
-    return summed_loss
+    return summed_loss.to(dtype=torch.float64)
 
 
 class Criteo1TbDlrmSmallTestWorkload(Criteo1TbDlrmSmallWorkload):
diff --git a/algorithmic_efficiency/workloads/criteo1tb/workload.py b/algorithmic_efficiency/workloads/criteo1tb/workload.py
index 801716de7..b341d1022 100644
--- a/algorithmic_efficiency/workloads/criteo1tb/workload.py
+++ b/algorithmic_efficiency/workloads/criteo1tb/workload.py
@@ -63,11 +63,11 @@ def num_eval_train_examples(self) -> int:
 
   @property
   def num_validation_examples(self) -> int:
-    return 89_000_000
+    return 83_274_637
 
   @property
   def num_test_examples(self) -> int:
-    return 89_274_637
+    return 95_000_000
 
   @property
   def train_mean(self):
@@ -95,13 +95,13 @@ def _build_input_queue(
       repeat_final_dataset: Optional[bool] = None,
       num_batches: Optional[int] = None) -> Iterator[Dict[str, spec.Tensor]]:
     del cache
+    del num_batches
     ds = input_pipeline.get_criteo1tb_dataset(
         split=split,
         shuffle_rng=data_rng,
         data_dir=data_dir,
         num_dense_features=self.num_dense_features,
         global_batch_size=global_batch_size,
-        num_batches=num_batches,
         repeat_final_dataset=repeat_final_dataset)
 
     for batch in iter(ds):
@@ -132,7 +132,6 @@ def _eval_model_on_split(self,
         split=split,
         data_dir=data_dir,
         global_batch_size=global_batch_size,
-        num_batches=num_batches,
         repeat_final_dataset=True)
     loss = 0.0
     for _ in range(num_batches):

From 86ad0af2196741fd813237281c9563612d9f3294 Mon Sep 17 00:00:00 2001
From: Juhan Bae
Date: Mon, 25 Sep 2023 16:34:58 -0400
Subject: [PATCH 2/2] Add num_batch configs

---
 .../workloads/criteo1tb/criteo1tb_pytorch/workload.py  | 3 +--
 algorithmic_efficiency/workloads/criteo1tb/workload.py | 3 ++-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py b/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py
index c514d0a9c..55b68fb2f 100644
--- a/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py
+++ b/algorithmic_efficiency/workloads/criteo1tb/criteo1tb_pytorch/workload.py
@@ -136,8 +136,6 @@ def _build_input_queue(
       cache: Optional[bool] = None,
       repeat_final_dataset: Optional[bool] = None,
       num_batches: Optional[int] = None) -> Iterator[Dict[str, spec.Tensor]]:
-    del num_batches
-
     not_train = split != 'train'
     per_device_batch_size = int(global_batch_size / N_GPUS)
 
@@ -149,6 +147,7 @@ def _build_input_queue(
           split=split,
           data_dir=data_dir,
           global_batch_size=global_batch_size,
+          num_batches=num_batches,
           repeat_final_dataset=repeat_final_dataset)
     weights = None
     while True:
diff --git a/algorithmic_efficiency/workloads/criteo1tb/workload.py b/algorithmic_efficiency/workloads/criteo1tb/workload.py
index b341d1022..ef971bb75 100644
--- a/algorithmic_efficiency/workloads/criteo1tb/workload.py
+++ b/algorithmic_efficiency/workloads/criteo1tb/workload.py
@@ -95,13 +95,13 @@ def _build_input_queue(
       repeat_final_dataset: Optional[bool] = None,
       num_batches: Optional[int] = None) -> Iterator[Dict[str, spec.Tensor]]:
     del cache
-    del num_batches
     ds = input_pipeline.get_criteo1tb_dataset(
         split=split,
         shuffle_rng=data_rng,
         data_dir=data_dir,
         num_dense_features=self.num_dense_features,
         global_batch_size=global_batch_size,
+        num_batches=num_batches,
        repeat_final_dataset=repeat_final_dataset)
 
     for batch in iter(ds):
@@ -132,6 +132,7 @@ def _eval_model_on_split(self,
         split=split,
         data_dir=data_dir,
         global_batch_size=global_batch_size,
+        num_batches=num_batches,
         repeat_final_dataset=True)
     loss = 0.0
     for _ in range(num_batches):
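
Note on the second commit: it forwards num_batches into input_pipeline.get_criteo1tb_dataset, so that pipeline (not shown in this patch) is expected to cap how many batches the evaluation iterator yields, keeping it in step with the `for _ in range(num_batches)` loop in _eval_model_on_split. The sketch below illustrates that pattern with a generic tf.data pipeline; the function name make_eval_dataset and the parsing steps are placeholders, not the repository's actual input_pipeline implementation.

from typing import Optional

import tensorflow as tf


def make_eval_dataset(file_pattern: str,
                      global_batch_size: int,
                      num_batches: Optional[int] = None,
                      repeat_final_dataset: bool = False) -> tf.data.Dataset:
  """Hypothetical pipeline showing how a `num_batches` argument is honored."""
  ds = tf.data.Dataset.list_files(file_pattern, shuffle=False)
  ds = ds.interleave(tf.data.TextLineDataset, cycle_length=4)
  ds = ds.batch(global_batch_size, drop_remainder=False)
  if num_batches is not None:
    # Cap the stream at `num_batches` batches so it matches the caller's
    # evaluation loop.
    ds = ds.take(num_batches)
  if repeat_final_dataset:
    # Repeat indefinitely so a cached eval iterator can be reused across
    # evaluation calls without exhausting the dataset.
    ds = ds.repeat()
  return ds.prefetch(tf.data.AUTOTUNE)

Capping with ds.take(num_batches) before repeating keeps the data stream and the caller's batch count in agreement, which is presumably why the first commit's `del num_batches` was reverted here.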