diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py
index 1c9fff0f6202..c0dbff8ed835 100644
--- a/examples/pytorch/audio-classification/run_audio_classification.py
+++ b/examples/pytorch/audio-classification/run_audio_classification.py
@@ -21,6 +21,7 @@
 from dataclasses import dataclass, field
 from random import randint
 from typing import Optional
+import time
 
 import datasets
 import evaluate
@@ -35,12 +36,15 @@
     HfArgumentParser,
     Trainer,
     TrainingArguments,
+    TrainerCallback,
+    TrainerState,
+    TrainerControl,
     set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
-
+import mlflow
 
 logger = logging.getLogger(__name__)
 
@@ -184,6 +188,35 @@ def __post_init__(self):
                     "Only make use of `--freeze_feature_encoder`."
                 )
 
+class TBTrainerCallback(TrainerCallback):
+    "A callback that logs loss, learning rate, and throughput at each logging step."
+    start_time = time.time()
+    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        # restart the timer on the first step after each logging step
+        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
+            self.start_time = time.time()
+
+    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        if args.logging_strategy == 'steps':
+            logging_step_runtime = time.time() - self.start_time
+            num_samples = args.per_device_train_batch_size * args.logging_steps
+            throughput = num_samples / logging_step_runtime
+            if 'loss' in state.log_history[-1]:
+                state.log_history[-1]["throughput"] = throughput
+                state.log_history[-1]["step"] = state.global_step
+
+                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
+                mlflow.log_metric("throughput", throughput, step=state.global_step)
+                print(f'loss: {state.log_history[-1]["loss"]}, lr: {state.log_history[-1]["learning_rate"]}, throughput: {throughput}, step: {state.global_step}')
+
+# Log number of parameters function
+def get_num_parameters(model):
+    num_params = 0
+    for param in model.parameters():
+        num_params += param.numel()
+    # in million
+    num_params /= 10**6
+    return num_params
 
 def main():
     # See all possible arguments in src/transformers/training_args.py
@@ -351,6 +384,9 @@ def compute_metrics(eval_pred):
         use_auth_token=True if model_args.use_auth_token else None,
         ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
     )
+    # Log number of parameters
+    num_params = get_num_parameters(model)
+    mlflow.log_param('num_params', num_params)
 
     # freeze the convolutional waveform encoder
     if model_args.freeze_feature_encoder:
@@ -381,6 +417,12 @@ def compute_metrics(eval_pred):
         compute_metrics=compute_metrics,
         tokenizer=feature_extractor,
     )
+    trainer.add_callback(TBTrainerCallback)
+    # MLflow initialization
+    # set the OS environment variables for MLflowCallback
+    os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
+    os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "False"
+    os.environ["MLFLOW_FLATTEN_PARAMS"] = "True"
 
     # Training
     if training_args.do_train:
@@ -391,6 +433,7 @@ def compute_metrics(eval_pred):
             checkpoint = last_checkpoint
         train_result = trainer.train(resume_from_checkpoint=checkpoint)
         trainer.save_model()
+        trainer.log_metrics("train", train_result.metrics)
         trainer.save_metrics("train", train_result.metrics)
         trainer.save_state()
 
@@ -408,6 +451,7 @@ def compute_metrics(eval_pred):
         "dataset": data_args.dataset_name,
         "tags": ["audio-classification"],
     }
+    mlflow.end_run()
     if training_args.push_to_hub:
         trainer.push_to_hub(**kwargs)
     else:
diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py
index 4f1ca689b130..c6a5c3e754b5 100644
--- a/examples/pytorch/image-classification/run_image_classification.py
+++ b/examples/pytorch/image-classification/run_image_classification.py
@@ -18,6 +18,7 @@
 import sys
 from dataclasses import dataclass, field
 from typing import Optional
+import time
 
 import evaluate
 import numpy as np
@@ -43,12 +44,15 @@
     HfArgumentParser,
     Trainer,
     TrainingArguments,
+    TrainerCallback,
+    TrainerState,
+    TrainerControl,
     set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
-
+import mlflow
 
 """ Fine-tuning a 🤗 Transformers model for image classification"""
 
@@ -164,12 +168,41 @@ class ModelArguments:
         metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."},
     )
 
+class TBTrainerCallback(TrainerCallback):
+    "A callback that logs loss, learning rate, and throughput at each logging step."
+    start_time = time.time()
+
+    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        # restart the timer on the first step after each logging step
+        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
+            self.start_time = time.time()
+
+    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        if args.logging_strategy == 'steps':
+            logging_step_runtime = time.time() - self.start_time
+            num_samples = args.per_device_train_batch_size * args.logging_steps
+            throughput = num_samples / logging_step_runtime
+            if 'loss' in state.log_history[-1]:
+                state.log_history[-1]["throughput"] = throughput
+                state.log_history[-1]["step"] = state.global_step
+
+                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
+                mlflow.log_metric("throughput", throughput, step=state.global_step)
+                print(f'loss: {state.log_history[-1]["loss"]}, lr: {state.log_history[-1]["learning_rate"]}, throughput: {throughput}, step: {state.global_step}')
 
 def collate_fn(examples):
     pixel_values = torch.stack([example["pixel_values"] for example in examples])
     labels = torch.tensor([example["labels"] for example in examples])
     return {"pixel_values": pixel_values, "labels": labels}
 
+# Log number of parameters function
+def get_num_parameters(model):
+    num_params = 0
+    for param in model.parameters():
+        num_params += param.numel()
+    # in million
+    num_params /= 10**6
+    return num_params
 
 def main():
     # See all possible arguments in src/transformers/training_args.py
@@ -301,6 +334,9 @@ def compute_metrics(p):
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
     )
+    # Log number of parameters
+    num_params = get_num_parameters(model)
+    mlflow.log_param('num_params', num_params)
 
     # Define torchvision transforms to be applied to each image.
if "shortest_edge" in image_processor.size: @@ -367,7 +403,12 @@ def val_transforms(example_batch): tokenizer=image_processor, data_collator=collate_fn, ) - + trainer.add_callback(TBTrainerCallback) + # Mlflow initial + #set the os enviroment for MLflowCallback + os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False" + os.environ["HF_MLFLOW_LOG_ARTIFACTS"]="False" + os.environ["MLFLOW_FLATTEN_PARAMS"]="True" # Training if training_args.do_train: checkpoint = None @@ -378,9 +419,7 @@ def val_transforms(example_batch): train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() metrics = train_result.metrics - metrics['throughput'] = metrics['train_samples_per_second'] - metrics['loss']= metrics['train_loss'] - metrics['lr'] = training_args.learning_rate + trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() @@ -399,6 +438,7 @@ def val_transforms(example_batch): "dataset": data_args.dataset_name, "tags": ["image-classification", "vision"], } + mlflow.end_run() if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index c39862be1b3d..b3e96544ff89 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -18,6 +18,7 @@ import sys from dataclasses import dataclass, field from typing import Optional +import time import torch from datasets import load_dataset @@ -32,11 +33,14 @@ ViTImageProcessor, ViTMAEConfig, ViTMAEForPreTraining, + TrainerCallback, + TrainerState, + TrainerControl, ) from transformers.trainer_utils import get_last_checkpoint from transformers.utils import check_min_version, send_example_telemetry from transformers.utils.versions import require_version - +import mlflow """ Pre-training a 🤗 ViT model as an MAE (masked autoencoder), as proposed in https://arxiv.org/abs/2111.06377.""" @@ -156,11 +160,40 @@ class CustomTrainingArguments(TrainingArguments): default=1e-3, metadata={"help": "Base learning rate: absolute_lr = base_lr * total_batch_size / 256."} ) +class TBTrainerCallback(TrainerCallback): + "A callback log loss, learning rate, and throughput each logging step" + start_time = time.time() + + def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + # count the time after the logging step + if state.global_step == 0 or state.global_step % args.logging_steps == 1: + self.start_time = time.time() + + def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl,**kwargs): + if args.logging_strategy == 'steps': + logging_step_runtime = time.time() - self.start_time + num_samples = args.per_device_train_batch_size * args.logging_steps + throughput = num_samples / logging_step_runtime + if 'loss' in state.log_history[-1]: + state.log_history[-1]["throughput"] = throughput + state.log_history[-1]["step"] = state.global_step + + mlflow.log_metric("lr", state.log_history[-1]["learning_rate"] , step=state.global_step) + mlflow.log_metric("throughput", throughput , step=state.global_step) + print(f'loss: {state.log_history[-1]["loss"]}, lr: {state.log_history[-1]["learning_rate"]}, throughput: {throughput}, step: {state.global_step}') def collate_fn(examples): pixel_values = torch.stack([example["pixel_values"] for example in examples]) return {"pixel_values": pixel_values} +# Log number of parameters function +def get_num_parameters(model): + num_params = 0 + for param 
+    for param in model.parameters():
+        num_params += param.numel()
+    # in million
+    num_params /= 10**6
+    return num_params
 
 def main():
     # See all possible arguments in src/transformers/training_args.py
@@ -290,6 +324,10 @@ def main():
         logger.info("Training new model from scratch")
         model = ViTMAEForPreTraining(config)
 
+    # Log number of parameters
+    num_params = get_num_parameters(model)
+    mlflow.log_param('num_params', num_params)
+
     if training_args.do_train:
         column_names = ds["train"].column_names
     else:
@@ -360,6 +398,12 @@ def preprocess_images(examples):
         tokenizer=image_processor,
         data_collator=collate_fn,
     )
+    trainer.add_callback(TBTrainerCallback)
+    # MLflow initialization
+    # set the OS environment variables for MLflowCallback
+    os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
+    os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "False"
+    os.environ["MLFLOW_FLATTEN_PARAMS"] = "True"
 
     # Training
     if training_args.do_train:
@@ -371,9 +415,7 @@ def preprocess_images(examples):
         train_result = trainer.train(resume_from_checkpoint=checkpoint)
         trainer.save_model()
         metrics = train_result.metrics
-        metrics['throughput'] = metrics['train_samples_per_second']
-        metrics['loss']= metrics['train_loss']
-        metrics['lr'] = training_args.learning_rate
+        trainer.log_metrics("train", metrics)
         trainer.save_metrics("train", metrics)
         trainer.save_state()
 
@@ -390,6 +432,7 @@ def preprocess_images(examples):
         "dataset": data_args.dataset_name,
         "tags": ["masked-auto-encoding"],
     }
+    mlflow.end_run()
     if training_args.push_to_hub:
         trainer.push_to_hub(**kwargs)
     else:
diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py
index bfba4dfcd061..855bc51164d4 100644
--- a/examples/pytorch/image-pretraining/run_mim.py
+++ b/examples/pytorch/image-pretraining/run_mim.py
@@ -18,6 +18,7 @@
 import sys
 from dataclasses import dataclass, field
 from typing import Optional
+import time
 
 import numpy as np
 import torch
@@ -35,11 +36,14 @@
     HfArgumentParser,
     Trainer,
     TrainingArguments,
+    TrainerCallback,
+    TrainerState,
+    TrainerControl,
 )
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
-
+import mlflow
 
 """ Pre-training a 🤗 Transformers model for simple masked image modeling (SimMIM).
 Any model supported by the AutoModelForMaskedImageModeling API can be used.
@@ -50,7 +54,8 @@
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 check_min_version("4.29.0")
 
-require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")
+require_version("datasets>=1.8.0",
+                "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")
 
 MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING.keys())
 MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
@@ -72,14 +77,18 @@ class DataTrainingArguments:
     )
     image_column_name: Optional[str] = field(
         default=None,
-        metadata={"help": "The column name of the images in the files. If not set, will try to use 'image' or 'img'."},
+        metadata={
+            "help": "The column name of the images in the files. If not set, will try to use 'image' or 'img'."},
     )
-    train_dir: Optional[str] = field(default=None, metadata={"help": "A folder containing the training data."})
-    validation_dir: Optional[str] = field(default=None, metadata={"help": "A folder containing the validation data."})
+    train_dir: Optional[str] = field(
+        default=None, metadata={"help": "A folder containing the training data."})
+    validation_dir: Optional[str] = field(
+        default=None, metadata={"help": "A folder containing the validation data."})
     train_val_split: Optional[float] = field(
         default=0.15, metadata={"help": "Percent to split off of train for validation."}
     )
-    mask_patch_size: int = field(default=32, metadata={"help": "The size of the square patches to use for masking."})
+    mask_patch_size: int = field(default=32, metadata={
+        "help": "The size of the square patches to use for masking."})
     mask_ratio: float = field(
         default=0.6,
         metadata={"help": "Percentage of patches to mask."},
@@ -130,7 +139,8 @@ class ModelArguments:
     )
     model_type: Optional[str] = field(
         default=None,
-        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+        metadata={
+            "help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
     )
     config_name_or_path: Optional[str] = field(
         default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
@@ -146,13 +156,16 @@ class ModelArguments:
     )
     cache_dir: Optional[str] = field(
         default=None,
-        metadata={"help": "Where do you want to store (cache) the pretrained models/datasets downloaded from the hub"},
+        metadata={
+            "help": "Where do you want to store (cache) the pretrained models/datasets downloaded from the hub"},
     )
     model_revision: str = field(
         default="main",
-        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+        metadata={
+            "help": "The specific model version to use (can be a branch name, tag name or commit id)."},
     )
-    image_processor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
+    image_processor_name: str = field(
+        default=None, metadata={"help": "Name or path of preprocessor config."})
     use_auth_token: bool = field(
         default=False,
         metadata={
@@ -201,7 +214,8 @@ def __init__(self, input_size=192, mask_patch_size=32, model_patch_size=4, mask_
         if self.input_size % self.mask_patch_size != 0:
             raise ValueError("Input size must be divisible by mask patch size")
         if self.mask_patch_size % self.model_patch_size != 0:
-            raise ValueError("Mask patch size must be divisible by model patch size")
+            raise ValueError(
+                "Mask patch size must be divisible by model patch size")
 
         self.rand_size = self.input_size // self.mask_patch_size
         self.scale = self.mask_patch_size // self.model_patch_size
@@ -219,23 +233,60 @@ def __call__(self):
         return torch.tensor(mask.flatten())
 
 
+class TBTrainerCallback(TrainerCallback):
+    "A callback that logs loss, learning rate, and throughput at each logging step."
+    start_time = time.time()
+
+    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        # restart the timer on the first step after each logging step
+        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
+            self.start_time = time.time()
+
+    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        if args.logging_strategy == 'steps':
+            logging_step_runtime = time.time() - self.start_time
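+            # throughput below is per-device samples/sec over the logging window; gradient accumulation and world size are not folded in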
+            num_samples = args.per_device_train_batch_size * args.logging_steps
+            throughput = num_samples / logging_step_runtime
+            if 'loss' in state.log_history[-1]:
+                state.log_history[-1]["throughput"] = throughput
+                state.log_history[-1]["step"] = state.global_step
+
+                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
+                mlflow.log_metric("throughput", throughput, step=state.global_step)
+                print(f'loss: {state.log_history[-1]["loss"]}, lr: {state.log_history[-1]["learning_rate"]}, throughput: {throughput}, step: {state.global_step}')
+
 
 def collate_fn(examples):
-    pixel_values = torch.stack([example["pixel_values"] for example in examples])
+    pixel_values = torch.stack([example["pixel_values"]
+                                for example in examples])
     mask = torch.stack([example["mask"] for example in examples])
 
     return {"pixel_values": pixel_values, "bool_masked_pos": mask}
 
 
+# Log number of parameters function
+
+
+def get_num_parameters(model):
+    num_params = 0
+    for param in model.parameters():
+        num_params += param.numel()
+    # in million
+    num_params /= 10**6
+    return num_params
+
 
 def main():
     # See all possible arguments in src/transformers/training_args.py
     # or by passing the --help flag to this script.
     # We now keep distinct sets of args, for a cleaner separation of concerns.
 
-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    parser = HfArgumentParser(
+        (ModelArguments, DataTrainingArguments, TrainingArguments))
     if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
         # If we pass only one argument to the script and it's the path to a json file,
         # let's parse it to get our arguments.
-        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+        model_args, data_args, training_args = parser.parse_json_file(
+            json_file=os.path.abspath(sys.argv[1]))
     else:
         model_args, data_args, training_args = parser.parse_args_into_dataclasses()
 
@@ -292,7 +343,8 @@ def main():
     )
 
     # If we don't have a validation split, split off a percentage of train as validation.
-    data_args.train_val_split = None if "validation" in ds.keys() else data_args.train_val_split
+    data_args.train_val_split = None if "validation" in ds.keys(
+    ) else data_args.train_val_split
     if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0:
         split = ds["train"].train_test_split(data_args.train_val_split)
         ds["train"] = split["train"]
@@ -308,12 +360,15 @@ def main():
         "use_auth_token": True if model_args.use_auth_token else None,
     }
     if model_args.config_name_or_path:
-        config = AutoConfig.from_pretrained(model_args.config_name_or_path, **config_kwargs)
+        config = AutoConfig.from_pretrained(
+            model_args.config_name_or_path, **config_kwargs)
     elif model_args.model_name_or_path:
-        config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
+        config = AutoConfig.from_pretrained(
+            model_args.model_name_or_path, **config_kwargs)
     else:
         config = CONFIG_MAPPING[model_args.model_type]()
-        logger.warning("You are instantiating a new config instance from scratch.")
+        logger.warning(
+            "You are instantiating a new config instance from scratch.")
     if model_args.config_overrides is not None:
         logger.info(f"Overriding config: {model_args.config_overrides}")
         config.update_from_string(model_args.config_overrides)
@@ -340,9 +395,11 @@ def main():
 
     # create image processor
     if model_args.image_processor_name:
-        image_processor = AutoImageProcessor.from_pretrained(model_args.image_processor_name, **config_kwargs)
+        image_processor = AutoImageProcessor.from_pretrained(
+            model_args.image_processor_name, **config_kwargs)
     elif model_args.model_name_or_path:
-        image_processor = AutoImageProcessor.from_pretrained(model_args.model_name_or_path, **config_kwargs)
+        image_processor = AutoImageProcessor.from_pretrained(
+            model_args.model_name_or_path, **config_kwargs)
     else:
         IMAGE_PROCESSOR_TYPES = {
             conf.model_type: image_processor_class for conf, image_processor_class in IMAGE_PROCESSOR_MAPPING.items()
@@ -362,6 +419,9 @@ def main():
     else:
         logger.info("Training new model from scratch")
         model = AutoModelForMaskedImageModeling.from_config(config)
+    # Log number of parameters
+    num_params = get_num_parameters(model)
+    mlflow.log_param('num_params', num_params)
 
     if training_args.do_train:
         column_names = ds["train"].column_names
@@ -381,11 +441,14 @@ def main():
     # source: https://github.com/microsoft/SimMIM/blob/main/data/data_simmim.py
     transforms = Compose(
         [
-            Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
-            RandomResizedCrop(model_args.image_size, scale=(0.67, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0)),
+            Lambda(lambda img: img.convert("RGB")
+                   if img.mode != "RGB" else img),
+            RandomResizedCrop(model_args.image_size, scale=(
+                0.67, 1.0), ratio=(3.0 / 4.0, 4.0 / 3.0)),
             RandomHorizontalFlip(),
             ToTensor(),
-            Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
+            Normalize(mean=image_processor.image_mean,
+                      std=image_processor.image_std),
         ]
     )
 
@@ -401,8 +464,10 @@ def preprocess_images(examples):
         """Preprocess a batch of images by applying transforms + creating a corresponding mask, indicating
         which patches to mask."""
 
-        examples["pixel_values"] = [transforms(image) for image in examples[image_column_name]]
-        examples["mask"] = [mask_generator() for i in range(len(examples[image_column_name]))]
+        examples["pixel_values"] = [transforms(
+            image) for image in examples[image_column_name]]
+        examples["mask"] = [mask_generator()
+                            for i in range(len(examples[image_column_name]))]
 
         return examples
 
@@ -410,7 +475,8 @@ def preprocess_images(examples):
     if training_args.do_train:
         if "train" not in ds:
             raise ValueError("--do_train requires a train dataset")
         if data_args.max_train_samples is not None:
-            ds["train"] = ds["train"].shuffle(seed=training_args.seed).select(range(data_args.max_train_samples))
+            ds["train"] = ds["train"].shuffle(seed=training_args.seed).select(
+                range(data_args.max_train_samples))
         # Set the training transforms
         ds["train"].set_transform(preprocess_images)
 
@@ -419,7 +485,8 @@ def preprocess_images(examples):
             raise ValueError("--do_eval requires a validation dataset")
         if data_args.max_eval_samples is not None:
             ds["validation"] = (
-                ds["validation"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples))
+                ds["validation"].shuffle(seed=training_args.seed).select(
+                    range(data_args.max_eval_samples))
             )
         # Set the validation transforms
         ds["validation"].set_transform(preprocess_images)
@@ -433,6 +500,12 @@ def preprocess_images(examples):
         tokenizer=image_processor,
         data_collator=collate_fn,
     )
+    trainer.add_callback(TBTrainerCallback)
+    # MLflow initialization
+    # set the OS environment variables for MLflowCallback
+    os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
+    os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "False"
+    os.environ["MLFLOW_FLATTEN_PARAMS"] = "True"
 
     # Training
     if training_args.do_train:
@@ -444,9 +517,7 @@ def preprocess_images(examples):
         train_result = trainer.train(resume_from_checkpoint=checkpoint)
         trainer.save_model()
         metrics = train_result.metrics
-        metrics['throughput'] = metrics['train_samples_per_second']
-        metrics['loss']= metrics['train_loss']
-        metrics['lr'] = training_args.learning_rate
+        trainer.log_metrics("train", metrics)
         trainer.save_metrics("train", metrics)
         trainer.save_state()
 
@@ -464,6 +535,7 @@ def preprocess_images(examples):
         "dataset": data_args.dataset_name,
         "tags": ["masked-image-modeling"],
     }
+    mlflow.end_run()
     if training_args.push_to_hub:
         trainer.push_to_hub(**kwargs)
     else:
diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py
index ea1fab6d9998..43daebd62de2 100755
--- a/examples/pytorch/language-modeling/run_clm.py
+++ b/examples/pytorch/language-modeling/run_clm.py
@@ -28,6 +28,7 @@
 from dataclasses import dataclass, field
 from itertools import chain
 from typing import Optional
+import time
 
 import datasets
 import evaluate
@@ -44,6 +45,9 @@
     HfArgumentParser,
     Trainer,
     TrainingArguments,
+    TrainerCallback,
+    TrainerState,
+    TrainerControl,
     default_data_collator,
     is_torch_tpu_available,
     set_seed,
@@ -225,6 +229,28 @@ def __post_init__(self):
                 extension = self.validation_file.split(".")[-1]
                 assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
 
+class TBTrainerCallback(TrainerCallback):
+    "A callback that logs loss, learning rate, and throughput at each logging step."
+    start_time = time.time()
+
+    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        # restart the timer on the first step after each logging step
+        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
+            self.start_time = time.time()
+
+    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        if args.logging_strategy == 'steps':
+            logging_step_runtime = time.time() - self.start_time
+            num_samples = args.per_device_train_batch_size * args.logging_steps
+            throughput = num_samples / logging_step_runtime
+            if 'loss' in state.log_history[-1]:
+                state.log_history[-1]["throughput"] = throughput
+                state.log_history[-1]["step"] = state.global_step
+
+                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
+                mlflow.log_metric("throughput", throughput, step=state.global_step)
+                print(f'loss: {state.log_history[-1]["loss"]}, lr: {state.log_history[-1]["learning_rate"]}, throughput: {throughput}, step: {state.global_step}')
+
 # Log number of parameters function
 def get_num_parameters(model):
     num_params = 0
@@ -584,7 +610,7 @@ def compute_metrics(eval_preds):
         if training_args.do_eval and not is_torch_tpu_available()
         else None,
     )
-
+    trainer.add_callback(TBTrainerCallback)
     # Mlflow initial
     #set the os enviroment for MLflowCallback
     os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
@@ -607,9 +633,7 @@ def compute_metrics(eval_preds):
             data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
         )
         metrics["train_samples"] = min(max_train_samples, len(train_dataset))
-        metrics['throughput'] = metrics['train_samples_per_second']
-        metrics['loss']= metrics['train_loss']
-        metrics['lr'] = training_args.learning_rate
+        trainer.log_metrics("train", metrics)
         trainer.save_metrics("train", metrics)
         trainer.save_state()
 
diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py
index 0edb28abfb7f..fb549b905e8a 100755
--- a/examples/pytorch/language-modeling/run_mlm.py
+++ b/examples/pytorch/language-modeling/run_mlm.py
@@ -28,6 +28,7 @@
 from dataclasses import dataclass, field
 from itertools import chain
 from typing import Optional
+import time
 
 import datasets
 import evaluate
@@ -44,6 +45,9 @@
     HfArgumentParser,
     Trainer,
     TrainingArguments,
+    TrainerCallback,
+    TrainerState,
+    TrainerControl,
     is_torch_tpu_available,
     set_seed,
 )
@@ -225,6 +229,29 @@ def __post_init__(self):
             if extension not in ["csv", "json", "txt"]:
                 raise ValueError("`validation_file` should be a csv, a json or a txt file.")
 
+class TBTrainerCallback(TrainerCallback):
+    "A callback that logs loss, learning rate, and throughput at each logging step."
+    start_time = time.time()
+
+    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        # restart the timer on the first step after each logging step
+        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
+            self.start_time = time.time()
+
+    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        if args.logging_strategy == 'steps':
+            logging_step_runtime = time.time() - self.start_time
+            num_samples = args.per_device_train_batch_size * args.logging_steps
+            throughput = num_samples / logging_step_runtime
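+            # evaluation log entries carry no "loss"/"learning_rate" keys, so guard before reading them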
+            if 'loss' in state.log_history[-1]:
+                state.log_history[-1]["throughput"] = throughput
+                state.log_history[-1]["step"] = state.global_step
+
+                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
+                mlflow.log_metric("throughput", throughput, step=state.global_step)
+                print(f'loss: {state.log_history[-1]["loss"]}, lr: {state.log_history[-1]["learning_rate"]}, throughput: {throughput}, step: {state.global_step}')
+
 # Log number of parameters function
 def get_num_parameters(model):
     num_params = 0
@@ -610,7 +637,7 @@ def compute_metrics(eval_preds):
         if training_args.do_eval and not is_torch_tpu_available()
         else None,
     )
-
+    trainer.add_callback(TBTrainerCallback)
     # Mlflow initial
     #set the os enviroment for MLflowCallback
     os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
@@ -632,9 +659,7 @@ def compute_metrics(eval_preds):
             data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
         )
         metrics["train_samples"] = min(max_train_samples, len(train_dataset))
-        metrics['throughput'] = metrics['train_samples_per_second']
-        metrics['loss']= metrics['train_loss']
-        metrics['lr'] = training_args.learning_rate
+        trainer.log_metrics("train", metrics)
         trainer.save_metrics("train", metrics)
         trainer.save_state()
 
diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py
index 79fa1df0b498..d10c522c21c0 100755
--- a/examples/pytorch/language-modeling/run_plm.py
+++ b/examples/pytorch/language-modeling/run_plm.py
@@ -25,6 +25,7 @@
 from dataclasses import dataclass, field
 from itertools import chain
 from typing import Optional
+import time
 
 import datasets
 from datasets import load_dataset
@@ -37,6 +38,9 @@
     HfArgumentParser,
     Trainer,
     TrainingArguments,
+    TrainerCallback,
+    TrainerState,
+    TrainerControl,
     XLNetConfig,
     XLNetLMHeadModel,
     set_seed,
@@ -216,6 +220,28 @@ def __post_init__(self):
                 extension = self.validation_file.split(".")[-1]
                 assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
 
+class TBTrainerCallback(TrainerCallback):
+    "A callback that logs loss, learning rate, and throughput at each logging step."
+    start_time = time.time()
+
+    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        # restart the timer on the first step after each logging step
+        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
+            self.start_time = time.time()
+
+    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        if args.logging_strategy == 'steps':
+            logging_step_runtime = time.time() - self.start_time
+            num_samples = args.per_device_train_batch_size * args.logging_steps
+            throughput = num_samples / logging_step_runtime
+            if 'loss' in state.log_history[-1]:
+                state.log_history[-1]["throughput"] = throughput
+                state.log_history[-1]["step"] = state.global_step
+
+                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
+                mlflow.log_metric("throughput", throughput, step=state.global_step)
+                print(f'loss: {state.log_history[-1]["loss"]}, lr: {state.log_history[-1]["learning_rate"]}, throughput: {throughput}, step: {state.global_step}')
+
 # Log number of parameters function
 def get_num_parameters(model):
     num_params = 0
@@ -522,7 +548,7 @@ def group_texts(examples):
         tokenizer=tokenizer,
         data_collator=data_collator,
     )
-
+    trainer.add_callback(TBTrainerCallback)
     # Mlflow initial
     #set the os enviroment for MLflowCallback
     os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
@@ -544,9 +570,7 @@ def group_texts(examples):
             data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
         )
         metrics["train_samples"] = min(max_train_samples, len(train_dataset))
-        metrics['throughput'] = metrics['train_samples_per_second']
-        metrics['loss']= metrics['train_loss']
-        metrics['lr'] = training_args.learning_rate
+        trainer.log_metrics("train", metrics)
         trainer.save_metrics("train", metrics)
         trainer.save_state()
 
diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py
index 299e270b30e6..e4e8fbabe3d5 100755
--- a/examples/pytorch/multiple-choice/run_swag.py
+++ b/examples/pytorch/multiple-choice/run_swag.py
@@ -24,6 +24,7 @@
 from dataclasses import dataclass, field
 from itertools import chain
 from typing import Optional, Union
+import time
 
 import datasets
 import numpy as np
@@ -38,6 +39,9 @@
     HfArgumentParser,
     Trainer,
     TrainingArguments,
+    TrainerCallback,
+    TrainerState,
+    TrainerControl,
     default_data_collator,
     set_seed,
 )
@@ -211,6 +215,29 @@ def __call__(self, features):
         batch["labels"] = torch.tensor(labels, dtype=torch.int64)
         return batch
 
+class TBTrainerCallback(TrainerCallback):
+    "A callback that logs loss, learning rate, and throughput at each logging step."
+    start_time = time.time()
+
+    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        # restart the timer on the first step after each logging step
+        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
+            self.start_time = time.time()
+
+    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        if args.logging_strategy == 'steps':
+            logging_step_runtime = time.time() - self.start_time
+            num_samples = args.per_device_train_batch_size * args.logging_steps
+            throughput = num_samples / logging_step_runtime
+            if 'loss' in state.log_history[-1]:
+                state.log_history[-1]["throughput"] = throughput
+                state.log_history[-1]["step"] = state.global_step
+
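+                # mirror the learning rate and throughput to MLflow, keyed by global step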
+                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
+                mlflow.log_metric("throughput", throughput, step=state.global_step)
+                print(f'loss: {state.log_history[-1]["loss"]}, lr: {state.log_history[-1]["learning_rate"]}, throughput: {throughput}, step: {state.global_step}')
+
 # Log number of parameters function
 def get_num_parameters(model):
     num_params = 0
@@ -441,7 +468,7 @@ def compute_metrics(eval_predictions):
         data_collator=data_collator,
         compute_metrics=compute_metrics,
     )
-
+    trainer.add_callback(TBTrainerCallback)
     # Mlflow initial
     #set the os enviroment for MLflowCallback
     os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
@@ -463,9 +490,7 @@ def compute_metrics(eval_predictions):
             data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
         )
         metrics["train_samples"] = min(max_train_samples, len(train_dataset))
-        metrics['throughput'] = metrics['train_samples_per_second']
-        metrics['loss']= metrics['train_loss']
-        metrics['lr'] = training_args.learning_rate
+        trainer.log_metrics("train", metrics)
         trainer.save_metrics("train", metrics)
         trainer.save_state()
 
diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py
index 32436e0524d9..5fb9446a9ba3 100755
--- a/examples/pytorch/question-answering/run_qa.py
+++ b/examples/pytorch/question-answering/run_qa.py
@@ -23,6 +23,7 @@
 import sys
 from dataclasses import dataclass, field
 from typing import Optional
+import time
 
 import datasets
 import evaluate
@@ -40,13 +41,16 @@
     HfArgumentParser,
     PreTrainedTokenizerFast,
     TrainingArguments,
+    TrainerCallback,
+    TrainerState,
+    TrainerControl,
     default_data_collator,
     set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
-
+import mlflow
 
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 check_min_version("4.29.0")
@@ -213,6 +217,35 @@ def __post_init__(self):
                 extension = self.test_file.split(".")[-1]
                 assert extension in ["csv", "json"], "`test_file` should be a csv or a json file."
 
+class TBTrainerCallback(TrainerCallback):
+    "A callback that logs loss, learning rate, and throughput at each logging step."
+    start_time = time.time()
+    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        # restart the timer on the first step after each logging step
+        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
+            self.start_time = time.time()
+
+    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        if args.logging_strategy == 'steps':
+            logging_step_runtime = time.time() - self.start_time
+            num_samples = args.per_device_train_batch_size * args.logging_steps
+            throughput = num_samples / logging_step_runtime
+            if 'loss' in state.log_history[-1]:
+                state.log_history[-1]["throughput"] = throughput
+                state.log_history[-1]["step"] = state.global_step
+
+                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
+                mlflow.log_metric("throughput", throughput, step=state.global_step)
+                print(f'loss: {state.log_history[-1]["loss"]}, lr: {state.log_history[-1]["learning_rate"]}, throughput: {throughput}, step: {state.global_step}')
+
+# Log number of parameters function
+def get_num_parameters(model):
+    num_params = 0
+    for param in model.parameters():
+        num_params += param.numel()
+    # in million
+    num_params /= 10**6
+    return num_params
 
 def main():
     # See all possible arguments in src/transformers/training_args.py
@@ -339,6 +372,9 @@ def main():
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
     )
+    # Log number of parameters
+    num_params = get_num_parameters(model)
+    mlflow.log_param('num_params', num_params)
 
     # Tokenizer check: this script requires a fast tokenizer.
     if not isinstance(tokenizer, PreTrainedTokenizerFast):
@@ -615,6 +651,12 @@ def compute_metrics(p: EvalPrediction):
         post_process_function=post_processing_function,
         compute_metrics=compute_metrics,
     )
+    trainer.add_callback(TBTrainerCallback)
+    # MLflow initialization
+    # set the OS environment variables for MLflowCallback
+    os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
+    os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "False"
+    os.environ["MLFLOW_FLATTEN_PARAMS"] = "True"
 
     # Training
     if training_args.do_train:
@@ -631,9 +673,7 @@ def compute_metrics(p: EvalPrediction):
             data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
         )
         metrics["train_samples"] = min(max_train_samples, len(train_dataset))
-        metrics['throughput'] = metrics['train_samples_per_second']
-        metrics['loss']= metrics['train_loss']
-        metrics['lr'] = training_args.learning_rate
+        trainer.log_metrics("train", metrics)
         trainer.save_metrics("train", metrics)
         trainer.save_state()
 
@@ -671,7 +711,7 @@ def compute_metrics(p: EvalPrediction):
         kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
     else:
         kwargs["dataset"] = data_args.dataset_name
-
+    mlflow.end_run()
     if training_args.push_to_hub:
         trainer.push_to_hub(**kwargs)
     else:
diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py
index 9b9eca9c7743..4ff181339db5 100755
--- a/examples/pytorch/question-answering/run_qa_beam_search.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search.py
@@ -23,6 +23,7 @@
 import sys
 from dataclasses import dataclass, field
 from typing import Optional
+import time
 
 import datasets
 import evaluate
@@ -36,6 +37,9 @@
     EvalPrediction,
     HfArgumentParser,
     TrainingArguments,
+    TrainerCallback,
+    TrainerState,
+    TrainerControl,
     XLNetConfig,
     XLNetForQuestionAnswering,
     XLNetTokenizerFast,
@@ -45,7 +49,7 @@
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
-
+import mlflow
 
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 check_min_version("4.29.0")
@@ -212,6 +216,35 @@ def __post_init__(self):
                 extension = self.test_file.split(".")[-1]
                 assert extension in ["csv", "json"], "`test_file` should be a csv or a json file."
 
+class TBTrainerCallback(TrainerCallback):
+    "A callback that logs loss, learning rate, and throughput at each logging step."
+    start_time = time.time()
+    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        # restart the timer on the first step after each logging step
+        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
+            self.start_time = time.time()
+
+    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        if args.logging_strategy == 'steps':
+            logging_step_runtime = time.time() - self.start_time
+            num_samples = args.per_device_train_batch_size * args.logging_steps
+            throughput = num_samples / logging_step_runtime
+            if 'loss' in state.log_history[-1]:
+                state.log_history[-1]["throughput"] = throughput
+                state.log_history[-1]["step"] = state.global_step
+
+                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
+                mlflow.log_metric("throughput", throughput, step=state.global_step)
+                print(f'loss: {state.log_history[-1]["loss"]}, lr: {state.log_history[-1]["learning_rate"]}, throughput: {throughput}, step: {state.global_step}')
+
+# Log number of parameters function
+def get_num_parameters(model):
+    num_params = 0
+    for param in model.parameters():
+        num_params += param.numel()
+    # in million
+    num_params /= 10**6
+    return num_params
 
 def main():
     # See all possible arguments in src/transformers/training_args.py
@@ -336,6 +369,9 @@ def main():
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
     )
+    # Log number of parameters
+    num_params = get_num_parameters(model)
+    mlflow.log_param('num_params', num_params)
 
     # Preprocessing the datasets.
     # Preprocessing is slighlty different for training and evaluation.
@@ -648,6 +684,12 @@ def compute_metrics(p: EvalPrediction):
         post_process_function=post_processing_function,
         compute_metrics=compute_metrics,
     )
+    trainer.add_callback(TBTrainerCallback)
+    # MLflow initialization
+    # set the OS environment variables for MLflowCallback
+    os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
+    os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "False"
+    os.environ["MLFLOW_FLATTEN_PARAMS"] = "True"
 
     # Training
     if training_args.do_train:
@@ -665,9 +707,7 @@ def compute_metrics(p: EvalPrediction):
             data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
         )
         metrics["train_samples"] = min(max_train_samples, len(train_dataset))
-        metrics['throughput'] = metrics['train_samples_per_second']
-        metrics['loss']= metrics['train_loss']
-        metrics['lr'] = training_args.learning_rate
+        trainer.log_metrics("train", metrics)
         trainer.save_metrics("train", metrics)
         trainer.save_state()
 
@@ -705,7 +745,7 @@ def compute_metrics(p: EvalPrediction):
         kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
     else:
         kwargs["dataset"] = data_args.dataset_name
-
+    mlflow.end_run()
     if training_args.push_to_hub:
         trainer.push_to_hub(**kwargs)
     else:
diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py
index 06cdd2044525..618248e6d9d4 100644
--- a/examples/pytorch/question-answering/run_seq2seq_qa.py
+++ b/examples/pytorch/question-answering/run_seq2seq_qa.py
@@ -23,6 +23,7 @@
 import sys
 from dataclasses import dataclass, field
 from typing import List, Optional, Tuple
+import time
 
 import datasets
 import evaluate
@@ -38,12 +39,15 @@
     DataCollatorForSeq2Seq,
     HfArgumentParser,
     Seq2SeqTrainingArguments,
+    TrainerCallback,
+    TrainerState,
+    TrainerControl,
     set_seed,
 )
 from transformers.trainer_utils import EvalLoopOutput, EvalPrediction, get_last_checkpoint
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
-
+import mlflow
 
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.29.0") @@ -259,6 +263,35 @@ def __post_init__(self): "squad_v2": ("question", "context", "answer"), } +class TBTrainerCallback(TrainerCallback): + "A callback log loss, learning rate, and throughput each logging step" + start_time = time.time() + def on_step_begin(self, args: Seq2SeqTrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + # count the time after the logging step + if state.global_step == 0 or state.global_step % args.logging_steps == 1: + self.start_time = time.time() + + def on_log(self, args: Seq2SeqTrainingArguments, state: TrainerState, control: TrainerControl,**kwargs): + if args.logging_strategy == 'steps': + logging_step_runtime = time.time() - self.start_time + num_samples = args.per_device_train_batch_size * args.logging_steps + throughput = num_samples / logging_step_runtime + if 'loss' in state.log_history[-1]: + state.log_history[-1]["throughput"] = throughput + state.log_history[-1]["step"] = state.global_step + + mlflow.log_metric("lr", state.log_history[-1]["learning_rate"] , step=state.global_step) + mlflow.log_metric("throughput", throughput , step=state.global_step) + print(f'loss: {state.log_history[-1]["loss"]}, lr: {state.log_history[-1]["learning_rate"]}, throughput: {throughput}, step: {state.global_step}') + +# Log number of parameters function +def get_num_parameters(model): + num_params = 0 + for param in model.parameters(): + num_params += param.numel() + # in million + num_params /= 10**6 + return num_params def main(): # See all possible arguments in src/transformers/training_args.py @@ -384,6 +417,9 @@ def main(): revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) + # Log number of parameters + num_params = get_num_parameters(model) + mlflow.log_param('num_params', num_params) # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch # on a small vocab and want a smaller embedding size, remove this test. 
@@ -652,7 +688,12 @@ def post_processing_function(
         compute_metrics=compute_metrics if training_args.predict_with_generate else None,
         post_process_function=post_processing_function,
     )
-
+    trainer.add_callback(TBTrainerCallback)
+    # MLflow initialization
+    # set the OS environment variables for MLflowCallback
+    os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
+    os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "False"
+    os.environ["MLFLOW_FLATTEN_PARAMS"] = "True"
 
     # Training
     if training_args.do_train:
@@ -669,9 +710,7 @@ def post_processing_function(
             data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
         )
         metrics["train_samples"] = min(max_train_samples, len(train_dataset))
-        metrics['throughput'] = metrics['train_samples_per_second']
-        metrics['loss']= metrics['train_loss']
-        metrics['lr'] = training_args.learning_rate
+        trainer.log_metrics("train", metrics)
         trainer.save_metrics("train", metrics)
         trainer.save_state()
 
@@ -706,7 +745,7 @@ def post_processing_function(
         trainer.log_metrics("predict", metrics)
         trainer.save_metrics("predict", metrics)
-
+    mlflow.end_run()
     if training_args.push_to_hub:
         kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"}
         if data_args.dataset_name is not None:
diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
index c31656ba543e..8db12aa6a306 100644
--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
@@ -20,6 +20,7 @@
 import sys
 from dataclasses import dataclass, field
 from typing import Optional
+import time
 
 import evaluate
 import numpy as np
@@ -39,12 +40,15 @@
     HfArgumentParser,
     Trainer,
     TrainingArguments,
+    TrainerCallback,
+    TrainerState,
+    TrainerControl,
     default_data_collator,
 )
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
-
+import mlflow
 
 """ Finetuning any 🤗 Transformers model supported by AutoModelForSemanticSegmentation for semantic segmentation leveraging the Trainer API."""
 
@@ -251,6 +255,37 @@ class ModelArguments:
         },
     )
 
+class TBTrainerCallback(TrainerCallback):
+    "A callback that logs loss, learning rate, and throughput at each logging step."
+    start_time = time.time()
+
+    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        # restart the timer on the first step after each logging step
+        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
+            self.start_time = time.time()
+
+    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        if args.logging_strategy == 'steps':
+            logging_step_runtime = time.time() - self.start_time
+            num_samples = args.per_device_train_batch_size * args.logging_steps
+            throughput = num_samples / logging_step_runtime
+            if 'loss' in state.log_history[-1]:
+                state.log_history[-1]["throughput"] = throughput
+                state.log_history[-1]["step"] = state.global_step
+
+                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
+                mlflow.log_metric("throughput", throughput, step=state.global_step)
+                print(f'loss: {state.log_history[-1]["loss"]}, lr: {state.log_history[-1]["learning_rate"]}, throughput: {throughput}, step: {state.global_step}')
+
+# Log number of parameters function
+def get_num_parameters(model):
+    num_params = 0
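+    # accumulates the element count of every parameter tensor; converted to millions below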
+    for param in model.parameters():
+        num_params += param.numel()
+    # in million
+    num_params /= 10**6
+    return num_params
 
 def main():
     # See all possible arguments in src/transformers/training_args.py
@@ -390,6 +425,10 @@ def compute_metrics(eval_pred):
         use_auth_token=True if model_args.use_auth_token else None,
         ignore_mismatched_sizes=True,
     )
+    # Log number of parameters
+    num_params = get_num_parameters(model)
+    mlflow.log_param('num_params', num_params)
+
     image_processor = AutoImageProcessor.from_pretrained(
         model_args.image_processor_name or model_args.model_name_or_path,
         cache_dir=model_args.cache_dir,
@@ -485,6 +524,12 @@ def preprocess_val(example_batch):
         tokenizer=image_processor,
         data_collator=default_data_collator,
     )
+    trainer.add_callback(TBTrainerCallback)
+    # MLflow initialization
+    # set the OS environment variables for MLflowCallback
+    os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
+    os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "False"
+    os.environ["MLFLOW_FLATTEN_PARAMS"] = "True"
 
     # Training
     if training_args.do_train:
@@ -495,6 +540,7 @@ def preprocess_val(example_batch):
             checkpoint = last_checkpoint
         train_result = trainer.train(resume_from_checkpoint=checkpoint)
         metrics = train_result.metrics
-        metrics['throughput'] = metrics['train_samples_per_second']
-        metrics['loss']= metrics['train_loss']
-        metrics['lr'] = training_args.learning_rate
+        trainer.log_metrics("train", metrics)
         trainer.save_metrics("train", metrics)
         trainer.save_state()
 
@@ -514,6 +557,7 @@ def preprocess_val(example_batch):
         "dataset": data_args.dataset_name,
         "tags": ["image-segmentation", "vision"],
     }
+    mlflow.end_run()
     if training_args.push_to_hub:
         trainer.push_to_hub(**kwargs)
     else:
diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py
index 65d9ff93b07f..32891971ec4b 100755
--- a/examples/pytorch/summarization/run_summarization.py
+++ b/examples/pytorch/summarization/run_summarization.py
@@ -23,6 +23,7 @@
 import sys
 from dataclasses import dataclass, field
 from typing import Optional
+import time
 
 import datasets
 import evaluate
@@ -44,6 +45,9 @@
     MBartTokenizerFast,
     Seq2SeqTrainer,
     Seq2SeqTrainingArguments,
+    TrainerCallback,
+    TrainerState,
+    TrainerControl,
     set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint
@@ -299,6 +303,28 @@ def __post_init__(self):
     "multi_news": ("document", "summary"),
 }
 
+class TBTrainerCallback(TrainerCallback):
+    "A callback that logs loss, learning rate, and throughput at each logging step."
+    start_time = time.time()
+
+    def on_step_begin(self, args: Seq2SeqTrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        # restart the timer on the first step after each logging step
+        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
+            self.start_time = time.time()
+
+    def on_log(self, args: Seq2SeqTrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        if args.logging_strategy == 'steps':
+            logging_step_runtime = time.time() - self.start_time
+            num_samples = args.per_device_train_batch_size * args.logging_steps
+            throughput = num_samples / logging_step_runtime
+            if 'loss' in state.log_history[-1]:
+                state.log_history[-1]["throughput"] = throughput
+                state.log_history[-1]["step"] = state.global_step
+
+                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
+                mlflow.log_metric("throughput", throughput, step=state.global_step)
+                print(f'loss: {state.log_history[-1]["loss"]}, lr: {state.log_history[-1]["learning_rate"]}, throughput: {throughput}, step: {state.global_step}')
+
 # Log number of parameters function
 def get_num_parameters(model):
     num_params = 0
@@ -679,6 +705,7 @@ def compute_metrics(eval_preds):
         data_collator=data_collator,
         compute_metrics=compute_metrics if training_args.predict_with_generate else None,
     )
+    trainer.add_callback(TBTrainerCallback)
     # Mlflow initial
     #set the os enviroment for MLflowCallback
     os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
@@ -700,9 +727,7 @@ def compute_metrics(eval_preds):
             data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
         )
         metrics["train_samples"] = min(max_train_samples, len(train_dataset))
-        metrics['throughput'] = metrics['train_samples_per_second']
-        metrics['loss']= metrics['train_loss']
-        metrics['lr'] = training_args.learning_rate
+        trainer.log_metrics("train", metrics)
         trainer.save_metrics("train", metrics)
         trainer.save_state()
 
diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py
index 7c7cd4b4a494..85f6161feaf1 100755
--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@@ -22,6 +22,7 @@
 import sys
 from dataclasses import dataclass, field
 from typing import Optional
+import time
 
 import datasets
 import evaluate
@@ -39,13 +40,16 @@
     PretrainedConfig,
     Trainer,
     TrainingArguments,
+    TrainerCallback,
+    TrainerState,
+    TrainerControl,
     default_data_collator,
     set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
-
+import mlflow
 
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 check_min_version("4.29.0")
@@ -202,6 +206,35 @@ class ModelArguments:
         metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."},
     )
 
+class TBTrainerCallback(TrainerCallback):
+    "A callback that logs loss, learning rate, and throughput at each logging step."
+    start_time = time.time()
+    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        # restart the timer on the first step after each logging step
+        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
+            self.start_time = time.time()
+
+    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        if args.logging_strategy == 'steps':
+            logging_step_runtime = time.time() - self.start_time
+            num_samples = args.per_device_train_batch_size * args.logging_steps
+            throughput = num_samples / logging_step_runtime
+            if 'loss' in state.log_history[-1]:
+                state.log_history[-1]["throughput"] = throughput
+                state.log_history[-1]["step"] = state.global_step
+
+                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
+                mlflow.log_metric("throughput", throughput, step=state.global_step)
+                print(f'loss: {state.log_history[-1]["loss"]}, lr: {state.log_history[-1]["learning_rate"]}, throughput: {throughput}, step: {state.global_step}')
+
+# Log number of parameters function
+def get_num_parameters(model):
+    num_params = 0
+    for param in model.parameters():
+        num_params += param.numel()
+    # in million
+    num_params /= 10**6
+    return num_params
 
 def main():
     # See all possible arguments in src/transformers/training_args.py
@@ -379,6 +412,10 @@ def main():
         use_auth_token=True if model_args.use_auth_token else None,
         ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
     )
+    # Log number of parameters
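+    # logged once per run: MLflow params are immutable, unlike metrics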
diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py
index 7c7cd4b4a494..85f6161feaf1 100755
--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@@ -22,6 +22,7 @@
 import sys
 from dataclasses import dataclass, field
 from typing import Optional
+import time

 import datasets
 import evaluate
@@ -39,13 +40,16 @@
     PretrainedConfig,
     Trainer,
     TrainingArguments,
+    TrainerCallback,
+    TrainerState,
+    TrainerControl,
     default_data_collator,
     set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
-
+import mlflow

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 check_min_version("4.29.0")
@@ -202,6 +206,35 @@ class ModelArguments:
         metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."},
     )

+class TBTrainerCallback(TrainerCallback):
+    """A callback that logs loss, learning rate, and throughput at each logging step."""
+
+    start_time = time.time()
+
+    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        # restart the timer on the first step after each logging step
+        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
+            self.start_time = time.time()
+
+    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        if args.logging_strategy == 'steps':
+            logging_step_runtime = time.time() - self.start_time
+            num_samples = args.per_device_train_batch_size * args.logging_steps
+            throughput = num_samples / logging_step_runtime
+            if 'loss' in state.log_history[-1]:
+                state.log_history[-1]["throughput"] = throughput
+                state.log_history[-1]["step"] = state.global_step
+                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
+                mlflow.log_metric("throughput", throughput, step=state.global_step)
+                print(f'loss: {state.log_history[-1]["loss"]}, lr: {state.log_history[-1]["learning_rate"]}, throughput: {throughput}, step: {state.global_step}')
+
+# Helper to count model parameters (in millions)
+def get_num_parameters(model):
+    num_params = 0
+    for param in model.parameters():
+        num_params += param.numel()
+    # in millions
+    num_params /= 10**6
+    return num_params

 def main():
     # See all possible arguments in src/transformers/training_args.py
@@ -379,6 +412,9 @@ def main():
         use_auth_token=True if model_args.use_auth_token else None,
         ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
     )
+    # Log the number of model parameters
+    num_params = get_num_parameters(model)
+    mlflow.log_param('num_params', num_params)

     # Preprocessing the raw_datasets
     if data_args.task_name is not None:
@@ -521,6 +557,12 @@ def compute_metrics(p: EvalPrediction):
         data_collator=data_collator,
     )
+    trainer.add_callback(TBTrainerCallback)
+    # MLflow initialization:
+    # set the environment variables for MLflowCallback
+    os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
+    os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "False"
+    os.environ["MLFLOW_FLATTEN_PARAMS"] = "True"

     # Training
     if training_args.do_train:
@@ -535,9 +577,6 @@ def compute_metrics(p: EvalPrediction):
             data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
         )
         metrics["train_samples"] = min(max_train_samples, len(train_dataset))
-        metrics['throughput'] = metrics['train_samples_per_second']
-        metrics['loss']= metrics['train_loss']
-        metrics['lr'] = training_args.learning_rate

         trainer.save_model()  # Saves the tokenizer too for easy upload
         trainer.log_metrics("train", metrics)
@@ -610,7 +649,7 @@ def compute_metrics(p: EvalPrediction):
             kwargs["dataset_tags"] = "glue"
             kwargs["dataset_args"] = data_args.task_name
             kwargs["dataset"] = f"GLUE {data_args.task_name.upper()}"
-
+    mlflow.end_run()
     if training_args.push_to_hub:
         trainer.push_to_hub(**kwargs)
     else:
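The three environment variables set before training are read by transformers' built-in `MLflowCallback` when training begins. If runs should go to a tracking server rather than the default local `./mlruns` directory, standard MLflow variables can be set the same way; a minimal sketch, where both values are placeholders:

```python
import os

# MLFLOW_TRACKING_URI is standard MLflow config; MLFLOW_EXPERIMENT_NAME is
# picked up by transformers' MLflowCallback at the start of training.
os.environ["MLFLOW_TRACKING_URI"] = "http://localhost:5000"
os.environ["MLFLOW_EXPERIMENT_NAME"] = "glue-finetuning"
```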
check_min_version("4.29.0") @@ -166,6 +170,35 @@ class ModelArguments: metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."}, ) +class TBTrainerCallback(TrainerCallback): + "A callback log loss, learning rate, and throughput each logging step" + start_time = time.time() + def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + # count the time after the logging step + if state.global_step == 0 or state.global_step % args.logging_steps == 1: + self.start_time = time.time() + + def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl,**kwargs): + if args.logging_strategy == 'steps': + logging_step_runtime = time.time() - self.start_time + num_samples = args.per_device_train_batch_size * args.logging_steps + throughput = num_samples / logging_step_runtime + if 'loss' in state.log_history[-1]: + state.log_history[-1]["throughput"] = throughput + state.log_history[-1]["step"] = state.global_step + + mlflow.log_metric("lr", state.log_history[-1]["learning_rate"] , step=state.global_step) + mlflow.log_metric("throughput", throughput , step=state.global_step) + print(f'loss: {state.log_history[-1]["loss"]}, lr: {state.log_history[-1]["learning_rate"]}, throughput: {throughput}, step: {state.global_step}') + +# Log number of parameters function +def get_num_parameters(model): + num_params = 0 + for param in model.parameters(): + num_params += param.numel() + # in million + num_params /= 10**6 + return num_params def main(): # See all possible arguments in src/transformers/training_args.py @@ -297,6 +330,9 @@ def main(): use_auth_token=True if model_args.use_auth_token else None, ignore_mismatched_sizes=model_args.ignore_mismatched_sizes, ) + # Log number of parameters + num_params = get_num_parameters(model) + mlflow.log_param('num_params', num_params) # Preprocessing the datasets # Padding strategy @@ -383,6 +419,12 @@ def compute_metrics(p: EvalPrediction): tokenizer=tokenizer, data_collator=data_collator, ) + trainer.add_callback(TBTrainerCallback) + # Mlflow initial + #set the os enviroment for MLflowCallback + os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False" + os.environ["HF_MLFLOW_LOG_ARTIFACTS"]="False" + os.environ["MLFLOW_FLATTEN_PARAMS"]="True" # Training if training_args.do_train: @@ -397,9 +439,6 @@ def compute_metrics(p: EvalPrediction): data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) ) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - metrics['throughput'] = metrics['train_samples_per_second'] - metrics['loss']= metrics['train_loss'] - metrics['lr'] = training_args.learning_rate trainer.save_model() # Saves the tokenizer too for easy upload trainer.log_metrics("train", metrics) @@ -438,6 +477,7 @@ def compute_metrics(p: EvalPrediction): for index, item in enumerate(predictions): item = label_list[item] writer.write(f"{index}\t{item}\n") + mlflow.end_run() if __name__ == "__main__": diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index 251fd4874f11..edcca1be4f00 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -24,6 +24,7 @@ import sys from dataclasses import dataclass, field from typing import Optional +import time import datasets import evaluate @@ -41,11 +42,17 @@ PreTrainedTokenizerFast, Trainer, TrainingArguments, + TrainerCallback, + TrainerState, + 
diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py
index 251fd4874f11..edcca1be4f00 100755
--- a/examples/pytorch/token-classification/run_ner.py
+++ b/examples/pytorch/token-classification/run_ner.py
@@ -24,6 +24,7 @@
 import sys
 from dataclasses import dataclass, field
 from typing import Optional
+import time

 import datasets
 import evaluate
@@ -41,11 +42,17 @@
     PreTrainedTokenizerFast,
     Trainer,
     TrainingArguments,
+    TrainerCallback,
+    TrainerState,
+    TrainerControl,
     set_seed,
 )
-from transformers.trainer_utils import get_last_checkpoint
+from transformers.trainer_callback import TrainerControl, TrainerState
+from transformers.trainer_utils import get_last_checkpoint, speed_metrics
+from transformers.training_args import TrainingArguments
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
+import mlflow

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 check_min_version("4.29.0")
@@ -202,6 +209,34 @@ def __post_init__(self):
             assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
         self.task_name = self.task_name.lower()

+class TBTrainerCallback(TrainerCallback):
+    """A callback that logs loss, learning rate, and throughput at each logging step."""
+
+    start_time = time.time()
+
+    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        # restart the timer on the first step after each logging step
+        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
+            self.start_time = time.time()
+
+    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
+        if args.logging_strategy == 'steps':
+            logging_step_runtime = time.time() - self.start_time
+            num_samples = args.per_device_train_batch_size * args.logging_steps
+            throughput = num_samples / logging_step_runtime
+            if 'loss' in state.log_history[-1]:
+                state.log_history[-1]["throughput"] = throughput
+                state.log_history[-1]["step"] = state.global_step
+                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
+                mlflow.log_metric("throughput", throughput, step=state.global_step)
+                print(f'loss: {state.log_history[-1]["loss"]}, lr: {state.log_history[-1]["learning_rate"]}, throughput: {throughput}, step: {state.global_step}')
+
+# Helper to count model parameters (in millions)
+def get_num_parameters(model):
+    num_params = 0
+    for param in model.parameters():
+        num_params += param.numel()
+    # in millions
+    num_params /= 10**6
+    return num_params

 def main():
     # See all possible arguments in src/transformers/training_args.py
@@ -378,6 +413,9 @@ def get_label_list(labels):
         use_auth_token=True if model_args.use_auth_token else None,
         ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
     )
+    # Log the number of model parameters
+    num_params = get_num_parameters(model)
+    mlflow.log_param('num_params', num_params)

     # Tokenizer check: this script requires a fast tokenizer.
     if not isinstance(tokenizer, PreTrainedTokenizerFast):
@@ -553,7 +591,12 @@ def compute_metrics(p):
         data_collator=data_collator,
         compute_metrics=compute_metrics,
     )
-
+    trainer.add_callback(TBTrainerCallback)
+    # MLflow initialization:
+    # set the environment variables for MLflowCallback
+    os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
+    os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "False"
+    os.environ["MLFLOW_FLATTEN_PARAMS"] = "True"

     # Training
     if training_args.do_train:
@@ -570,9 +613,6 @@ def compute_metrics(p):
             data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
         )
         metrics["train_samples"] = min(max_train_samples, len(train_dataset))
-        metrics['throughput'] = metrics['train_samples_per_second']
-        metrics['loss']= metrics['train_loss']
-        metrics['lr'] = training_args.learning_rate

         trainer.log_metrics("train", metrics)
         trainer.save_metrics("train", metrics)
         trainer.save_state()
@@ -620,7 +660,7 @@ def compute_metrics(p):
         kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
     else:
         kwargs["dataset"] = data_args.dataset_name
-
+    mlflow.end_run()
     if training_args.push_to_hub:
         trainer.push_to_hub(**kwargs)
     else:
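Note that the run_ner hunk imports `speed_metrics` from `transformers.trainer_utils` but never calls it (the `TrainerControl`/`TrainerState`/`TrainingArguments` re-imports are likewise redundant with the block above them). A hedged sketch of how that helper could replace the hand-rolled arithmetic; the sample and step counts here are illustrative:

```python
import time
from transformers.trainer_utils import speed_metrics

start = time.time()
time.sleep(0.5)  # stand-in for the timed window (e.g., logging_steps optimizer steps)
# returns {"window_runtime": ..., "window_samples_per_second": ..., "window_steps_per_second": ...}
window = speed_metrics("window", start, num_samples=128, num_steps=16)
print(window)
```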
diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py
index d7eefc655861..ea252bd39f72 100755
--- a/examples/pytorch/translation/run_translation.py
+++ b/examples/pytorch/translation/run_translation.py
@@ -23,6 +23,7 @@
 import sys
 from dataclasses import dataclass, field
 from typing import Optional
+import time

 import datasets
 import evaluate
@@ -43,12 +44,16 @@
     MBartTokenizerFast,
     Seq2SeqTrainer,
     Seq2SeqTrainingArguments,
+    TrainerCallback,
+    TrainerState,
+    TrainerControl,
     default_data_collator,
     set_seed,
 )
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import check_min_version, send_example_telemetry
 from transformers.utils.versions import require_version
+import mlflow

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
 check_min_version("4.29.0")
check_min_version("4.29.0") @@ -246,6 +251,36 @@ def __post_init__(self): if self.val_max_target_length is None: self.val_max_target_length = self.max_target_length +class TBTrainerCallback(TrainerCallback): + "A callback log loss, learning rate, and throughput each logging step" + start_time = time.time() + + def on_step_begin(self, args: Seq2SeqTrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + # count the time after the logging step + if state.global_step == 0 or state.global_step % args.logging_steps == 1: + self.start_time = time.time() + + def on_log(self, args: Seq2SeqTrainingArguments, state: TrainerState, control: TrainerControl,**kwargs): + if args.logging_strategy == 'steps': + logging_step_runtime = time.time() - self.start_time + num_samples = args.per_device_train_batch_size * args.logging_steps + throughput = num_samples / logging_step_runtime + if 'loss' in state.log_history[-1]: + state.log_history[-1]["throughput"] = throughput + state.log_history[-1]["step"] = state.global_step + + mlflow.log_metric("lr", state.log_history[-1]["learning_rate"] , step=state.global_step) + mlflow.log_metric("throughput", throughput , step=state.global_step) + print(f'loss: {state.log_history[-1]["loss"]}, lr: {state.log_history[-1]["learning_rate"]}, throughput: {throughput}, step: {state.global_step}') + +# Log number of parameters function +def get_num_parameters(model): + num_params = 0 + for param in model.parameters(): + num_params += param.numel() + # in million + num_params /= 10**6 + return num_params def main(): # See all possible arguments in src/transformers/training_args.py @@ -382,6 +417,9 @@ def main(): revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) + # Log number of parameters + num_params = get_num_parameters(model) + mlflow.log_param('num_params', num_params) # We resize the embeddings only when necessary to avoid index errors. If you are creating a model from scratch # on a small vocab and want a smaller embedding size, remove this test. @@ -569,7 +607,12 @@ def compute_metrics(eval_preds): data_collator=data_collator, compute_metrics=compute_metrics if training_args.predict_with_generate else None, ) - + trainer.add_callback(TBTrainerCallback) + # Mlflow initial + #set the os enviroment for MLflowCallback + os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False" + os.environ["HF_MLFLOW_LOG_ARTIFACTS"]="False" + os.environ["MLFLOW_FLATTEN_PARAMS"]="True" # Training if training_args.do_train: @@ -605,9 +648,6 @@ def compute_metrics(eval_preds): metrics = trainer.evaluate(max_length=max_length, num_beams=num_beams, metric_key_prefix="eval") max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset) metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset)) - metrics['throughput'] = metrics['train_samples_per_second'] - metrics['loss']= metrics['train_loss'] - metrics['lr'] = training_args.learning_rate trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", metrics) @@ -650,7 +690,7 @@ def compute_metrics(eval_preds): languages = [l for l in [data_args.source_lang, data_args.target_lang] if l is not None] if len(languages) > 0: kwargs["language"] = languages - + mlflow.end_run() if training_args.push_to_hub: trainer.push_to_hub(**kwargs) else: