@@ -21,6 +21,7 @@
from dataclasses import dataclass, field
from random import randint
from typing import Optional
import time

import datasets
import evaluate
@@ -35,12 +36,15 @@
HfArgumentParser,
Trainer,
TrainingArguments,
TrainerCallback,
TrainerState,
TrainerControl,
set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

import mlflow

logger = logging.getLogger(__name__)

@@ -184,6 +188,35 @@ def __post_init__(self):
"Only make use of `--freeze_feature_encoder`."
)

class TBTrainerCallback(TrainerCallback):
    """A callback that logs loss, learning rate, and throughput at each logging step."""

    # Fallback start time; reset for each logging window in on_step_begin.
    start_time = time.time()

    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        # Restart the timer on the first step after each logging step.
        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
            self.start_time = time.time()

    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        if args.logging_strategy == "steps":
            logging_step_runtime = time.time() - self.start_time
            # Samples processed on this device since the last logging step.
            num_samples = args.per_device_train_batch_size * args.logging_steps
            throughput = num_samples / logging_step_runtime
            if "loss" in state.log_history[-1]:
                state.log_history[-1]["throughput"] = throughput
                state.log_history[-1]["step"] = state.global_step

                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
                mlflow.log_metric("throughput", throughput, step=state.global_step)
                print(
                    f'loss: {state.log_history[-1]["loss"]}, '
                    f'lr: {state.log_history[-1]["learning_rate"]}, '
                    f'throughput: {throughput}, step: {state.global_step}'
                )
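
# Sketch (not part of the original change): the throughput above counts only
# samples seen by one device and ignores gradient accumulation. A fuller
# estimate could use gradient_accumulation_steps and world_size, both standard
# TrainingArguments attributes:
def estimate_global_throughput(args: TrainingArguments, elapsed: float) -> float:
    # Samples processed across all processes since the last logging window began.
    num_samples = (
        args.per_device_train_batch_size
        * args.gradient_accumulation_steps
        * args.world_size
        * args.logging_steps
    )
    return num_samples / elapsed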

# Return the model's parameter count, in millions.
def get_num_parameters(model):
    num_params = sum(param.numel() for param in model.parameters())
    # Convert to millions.
    return num_params / 10**6
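
# Variant (a sketch, not in the original change): count only trainable
# parameters, which is what matters when the feature encoder is frozen via
# --freeze_feature_encoder.
def get_num_trainable_parameters(model):
    # Only parameters with requires_grad=True receive gradient updates.
    return sum(p.numel() for p in model.parameters() if p.requires_grad) / 10**6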

def main():
# See all possible arguments in src/transformers/training_args.py
@@ -351,6 +384,9 @@ def compute_metrics(eval_pred):
use_auth_token=True if model_args.use_auth_token else None,
ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
)
# Log number of parameters
num_params = get_num_parameters(model)
mlflow.log_param('num_params', num_params)

# freeze the convolutional waveform encoder
if model_args.freeze_feature_encoder:
@@ -381,6 +417,12 @@ def compute_metrics(eval_pred):
compute_metrics=compute_metrics,
tokenizer=feature_extractor,
)
trainer.add_callback(TBTrainerCallback)
# MLflow setup: environment variables read by transformers' MLflowCallback.
os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "False"
os.environ["MLFLOW_FLATTEN_PARAMS"] = "True"

# Training
if training_args.do_train:
@@ -391,6 +433,7 @@ def compute_metrics(eval_pred):
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model()

trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()
@@ -408,6 +451,7 @@ def compute_metrics(eval_pred):
"dataset": data_args.dataset_name,
"tags": ["audio-classification"],
}
mlflow.end_run()
if training_args.push_to_hub:
trainer.push_to_hub(**kwargs)
else:
examples/pytorch/image-classification/run_image_classification.py (45 additions & 5 deletions)
@@ -18,6 +18,7 @@
import sys
from dataclasses import dataclass, field
from typing import Optional
import time

import evaluate
import numpy as np
@@ -43,12 +44,15 @@
HfArgumentParser,
Trainer,
TrainingArguments,
TrainerCallback,
TrainerState,
TrainerControl,
set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

import mlflow

""" Fine-tuning a 🤗 Transformers model for image classification"""

@@ -164,12 +168,41 @@ class ModelArguments:
metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."},
)

class TBTrainerCallback(TrainerCallback):
    """A callback that logs loss, learning rate, and throughput at each logging step."""

    # Fallback start time; reset for each logging window in on_step_begin.
    start_time = time.time()

    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        # Restart the timer on the first step after each logging step.
        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
            self.start_time = time.time()

    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        if args.logging_strategy == "steps":
            logging_step_runtime = time.time() - self.start_time
            # Samples processed on this device since the last logging step.
            num_samples = args.per_device_train_batch_size * args.logging_steps
            throughput = num_samples / logging_step_runtime
            if "loss" in state.log_history[-1]:
                state.log_history[-1]["throughput"] = throughput
                state.log_history[-1]["step"] = state.global_step

                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
                mlflow.log_metric("throughput", throughput, step=state.global_step)
                print(
                    f'loss: {state.log_history[-1]["loss"]}, '
                    f'lr: {state.log_history[-1]["learning_rate"]}, '
                    f'throughput: {throughput}, step: {state.global_step}'
                )

def collate_fn(examples):
pixel_values = torch.stack([example["pixel_values"] for example in examples])
labels = torch.tensor([example["labels"] for example in examples])
return {"pixel_values": pixel_values, "labels": labels}

# Return the model's parameter count, in millions.
def get_num_parameters(model):
    num_params = sum(param.numel() for param in model.parameters())
    # Convert to millions.
    return num_params / 10**6

def main():
# See all possible arguments in src/transformers/training_args.py
@@ -301,6 +334,9 @@ def compute_metrics(p):
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
# Log number of parameters
num_params = get_num_parameters(model)
mlflow.log_param('num_params', num_params)

# Define torchvision transforms to be applied to each image.
if "shortest_edge" in image_processor.size:
@@ -367,7 +403,12 @@ def val_transforms(example_batch):
tokenizer=image_processor,
data_collator=collate_fn,
)

trainer.add_callback(TBTrainerCallback)
# MLflow setup: environment variables read by transformers' MLflowCallback.
os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "False"
os.environ["MLFLOW_FLATTEN_PARAMS"] = "True"
# Training
if training_args.do_train:
checkpoint = None
@@ -378,9 +419,7 @@ def val_transforms(example_batch):
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model()
metrics = train_result.metrics
metrics['throughput'] = metrics['train_samples_per_second']
metrics['loss'] = metrics['train_loss']
metrics['lr'] = training_args.learning_rate

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
@@ -399,6 +438,7 @@ def val_transforms(example_batch):
"dataset": data_args.dataset_name,
"tags": ["image-classification", "vision"],
}
mlflow.end_run()
if training_args.push_to_hub:
trainer.push_to_hub(**kwargs)
else:
examples/pytorch/image-pretraining/run_mae.py (46 additions & 4 deletions)
@@ -18,6 +18,7 @@
import sys
from dataclasses import dataclass, field
from typing import Optional
import time

import torch
from datasets import load_dataset
@@ -32,11 +33,14 @@
ViTImageProcessor,
ViTMAEConfig,
ViTMAEForPreTraining,
TrainerCallback,
TrainerState,
TrainerControl,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

import mlflow

""" Pre-training a 🤗 ViT model as an MAE (masked autoencoder), as proposed in https://arxiv.org/abs/2111.06377."""

@@ -156,11 +160,40 @@ class CustomTrainingArguments(TrainingArguments):
default=1e-3, metadata={"help": "Base learning rate: absolute_lr = base_lr * total_batch_size / 256."}
)

class TBTrainerCallback(TrainerCallback):
    """A callback that logs loss, learning rate, and throughput at each logging step."""

    # Fallback start time; reset for each logging window in on_step_begin.
    start_time = time.time()

    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        # Restart the timer on the first step after each logging step.
        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
            self.start_time = time.time()

    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        if args.logging_strategy == "steps":
            logging_step_runtime = time.time() - self.start_time
            # Samples processed on this device since the last logging step.
            num_samples = args.per_device_train_batch_size * args.logging_steps
            throughput = num_samples / logging_step_runtime
            if "loss" in state.log_history[-1]:
                state.log_history[-1]["throughput"] = throughput
                state.log_history[-1]["step"] = state.global_step

                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
                mlflow.log_metric("throughput", throughput, step=state.global_step)
                print(
                    f'loss: {state.log_history[-1]["loss"]}, '
                    f'lr: {state.log_history[-1]["learning_rate"]}, '
                    f'throughput: {throughput}, step: {state.global_step}'
                )

def collate_fn(examples):
pixel_values = torch.stack([example["pixel_values"] for example in examples])
return {"pixel_values": pixel_values}

# Return the model's parameter count, in millions.
def get_num_parameters(model):
    num_params = sum(param.numel() for param in model.parameters())
    # Convert to millions.
    return num_params / 10**6

def main():
# See all possible arguments in src/transformers/training_args.py
@@ -290,6 +323,10 @@ def main():
logger.info("Training new model from scratch")
model = ViTMAEForPreTraining(config)

# Log number of parameters
num_params = get_num_parameters(model)
mlflow.log_param('num_params', num_params)
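# Note: for ViTMAEForPreTraining this count includes the MAE decoder. A sketch
# for logging the encoder alone (assumes the model exposes a `vit` submodule,
# as ViTMAEForPreTraining does):
#
#     mlflow.log_param('num_encoder_params', get_num_parameters(model.vit))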

if training_args.do_train:
column_names = ds["train"].column_names
else:
@@ -360,6 +397,12 @@ def preprocess_images(examples):
tokenizer=image_processor,
data_collator=collate_fn,
)
trainer.add_callback(TBTrainerCallback)
# MLflow setup: environment variables read by transformers' MLflowCallback.
os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "False"
os.environ["MLFLOW_FLATTEN_PARAMS"] = "True"

# Training
if training_args.do_train:
@@ -371,9 +414,7 @@ def preprocess_images(examples):
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model()
metrics = train_result.metrics
metrics['throughput'] = metrics['train_samples_per_second']
metrics['loss'] = metrics['train_loss']
metrics['lr'] = training_args.learning_rate

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
@@ -390,6 +431,7 @@ def preprocess_images(examples):
"dataset": data_args.dataset_name,
"tags": ["masked-auto-encoding"],
}
mlflow.end_run()
if training_args.push_to_hub:
trainer.push_to_hub(**kwargs)
else: