@@ -21,6 +21,7 @@
from dataclasses import dataclass, field
from random import randint
from typing import Optional
import time

import datasets
import evaluate
@@ -35,12 +36,15 @@
HfArgumentParser,
Trainer,
TrainingArguments,
TrainerCallback,
TrainerState,
TrainerControl,
set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

import mlflow

logger = logging.getLogger(__name__)

@@ -184,6 +188,35 @@ def __post_init__(self):
"Only make use of `--freeze_feature_encoder`."
)

class TBTrainerCallback(TrainerCallback):
    """A callback that logs loss, learning rate, and throughput at each logging step."""

    # Fallback start time; reset for each logging window in on_step_begin.
    start_time = time.time()

    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        # Restart the timer on the first step after each logging step.
        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
            self.start_time = time.time()

    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        if args.logging_strategy == "steps":
            logging_step_runtime = time.time() - self.start_time
            # Samples processed on this device since the last logging step.
            num_samples = args.per_device_train_batch_size * args.logging_steps
            throughput = num_samples / logging_step_runtime
            if "loss" in state.log_history[-1]:
                state.log_history[-1]["throughput"] = throughput
                state.log_history[-1]["step"] = state.global_step

                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
                mlflow.log_metric("throughput", throughput, step=state.global_step)
                print(
                    f'loss: {state.log_history[-1]["loss"]}, '
                    f'lr: {state.log_history[-1]["learning_rate"]}, '
                    f'throughput: {throughput}, step: {state.global_step}'
                )
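
# Sketch (not part of the original change): the throughput above counts only
# samples seen by one device and ignores gradient accumulation. A fuller
# estimate could use gradient_accumulation_steps and world_size, both standard
# TrainingArguments attributes:
def estimate_global_throughput(args: TrainingArguments, elapsed: float) -> float:
    # Samples processed across all processes since the last logging window began.
    num_samples = (
        args.per_device_train_batch_size
        * args.gradient_accumulation_steps
        * args.world_size
        * args.logging_steps
    )
    return num_samples / elapsed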

# Return the model's parameter count, in millions.
def get_num_parameters(model):
    num_params = sum(param.numel() for param in model.parameters())
    # Convert to millions.
    return num_params / 10**6
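
# Variant (a sketch, not in the original change): count only trainable
# parameters, which is what matters when the feature encoder is frozen via
# --freeze_feature_encoder.
def get_num_trainable_parameters(model):
    # Only parameters with requires_grad=True receive gradient updates.
    return sum(p.numel() for p in model.parameters() if p.requires_grad) / 10**6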

def main():
# See all possible arguments in src/transformers/training_args.py
@@ -351,6 +384,9 @@ def compute_metrics(eval_pred):
use_auth_token=True if model_args.use_auth_token else None,
ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
)
# Log number of parameters
num_params = get_num_parameters(model)
mlflow.log_param('num_params', num_params)

# freeze the convolutional waveform encoder
if model_args.freeze_feature_encoder:
@@ -381,6 +417,12 @@ def compute_metrics(eval_pred):
compute_metrics=compute_metrics,
tokenizer=feature_extractor,
)
trainer.add_callback(TBTrainerCallback)
# MLflow setup: environment variables read by transformers' MLflowCallback.
os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "False"
os.environ["MLFLOW_FLATTEN_PARAMS"] = "True"

# Training
if training_args.do_train:
@@ -391,6 +433,7 @@ def compute_metrics(eval_pred):
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model()

trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()
@@ -408,6 +451,7 @@ def compute_metrics(eval_pred):
"dataset": data_args.dataset_name,
"tags": ["audio-classification"],
}
mlflow.end_run()
if training_args.push_to_hub:
trainer.push_to_hub(**kwargs)
else:
examples/pytorch/image-classification/run_image_classification.py (45 additions & 5 deletions)
@@ -18,6 +18,7 @@
import sys
from dataclasses import dataclass, field
from typing import Optional
import time

import evaluate
import numpy as np
@@ -43,12 +44,15 @@
HfArgumentParser,
Trainer,
TrainingArguments,
TrainerCallback,
TrainerState,
TrainerControl,
set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

import mlflow

""" Fine-tuning a 🤗 Transformers model for image classification"""

@@ -164,12 +168,41 @@ class ModelArguments:
metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."},
)

class TBTrainerCallback(TrainerCallback):
    """A callback that logs loss, learning rate, and throughput at each logging step."""

    # Fallback start time; reset for each logging window in on_step_begin.
    start_time = time.time()

    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        # Restart the timer on the first step after each logging step.
        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
            self.start_time = time.time()

    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        if args.logging_strategy == "steps":
            logging_step_runtime = time.time() - self.start_time
            # Samples processed on this device since the last logging step.
            num_samples = args.per_device_train_batch_size * args.logging_steps
            throughput = num_samples / logging_step_runtime
            if "loss" in state.log_history[-1]:
                state.log_history[-1]["throughput"] = throughput
                state.log_history[-1]["step"] = state.global_step

                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
                mlflow.log_metric("throughput", throughput, step=state.global_step)
                print(
                    f'loss: {state.log_history[-1]["loss"]}, '
                    f'lr: {state.log_history[-1]["learning_rate"]}, '
                    f'throughput: {throughput}, step: {state.global_step}'
                )

def collate_fn(examples):
pixel_values = torch.stack([example["pixel_values"] for example in examples])
labels = torch.tensor([example["labels"] for example in examples])
return {"pixel_values": pixel_values, "labels": labels}

# Return the model's parameter count, in millions.
def get_num_parameters(model):
    num_params = sum(param.numel() for param in model.parameters())
    # Convert to millions.
    return num_params / 10**6

def main():
# See all possible arguments in src/transformers/training_args.py
@@ -301,6 +334,9 @@ def compute_metrics(p):
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
# Log number of parameters
num_params = get_num_parameters(model)
mlflow.log_param('num_params', num_params)

# Define torchvision transforms to be applied to each image.
if "shortest_edge" in image_processor.size:
@@ -367,7 +403,12 @@ def val_transforms(example_batch):
tokenizer=image_processor,
data_collator=collate_fn,
)

trainer.add_callback(TBTrainerCallback)
# MLflow setup: environment variables read by transformers' MLflowCallback.
os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "False"
os.environ["MLFLOW_FLATTEN_PARAMS"] = "True"
# Training
if training_args.do_train:
checkpoint = None
@@ -378,9 +419,7 @@ def val_transforms(example_batch):
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model()
metrics = train_result.metrics
metrics['throughput'] = metrics['train_samples_per_second']
metrics['loss'] = metrics['train_loss']
metrics['lr'] = training_args.learning_rate

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
@@ -399,6 +438,7 @@ def val_transforms(example_batch):
"dataset": data_args.dataset_name,
"tags": ["image-classification", "vision"],
}
mlflow.end_run()
if training_args.push_to_hub:
trainer.push_to_hub(**kwargs)
else:
examples/pytorch/image-pretraining/run_mae.py (46 additions & 4 deletions)
@@ -18,6 +18,7 @@
import sys
from dataclasses import dataclass, field
from typing import Optional
import time

import torch
from datasets import load_dataset
@@ -32,11 +33,14 @@
ViTImageProcessor,
ViTMAEConfig,
ViTMAEForPreTraining,
TrainerCallback,
TrainerState,
TrainerControl,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

import mlflow

""" Pre-training a 🤗 ViT model as an MAE (masked autoencoder), as proposed in https://arxiv.org/abs/2111.06377."""

@@ -156,11 +160,40 @@ class CustomTrainingArguments(TrainingArguments):
default=1e-3, metadata={"help": "Base learning rate: absolute_lr = base_lr * total_batch_size / 256."}
)

class TBTrainerCallback(TrainerCallback):
    """A callback that logs loss, learning rate, and throughput at each logging step."""

    # Fallback start time; reset for each logging window in on_step_begin.
    start_time = time.time()

    def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        # Restart the timer on the first step after each logging step.
        if state.global_step == 0 or state.global_step % args.logging_steps == 1:
            self.start_time = time.time()

    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        if args.logging_strategy == "steps":
            logging_step_runtime = time.time() - self.start_time
            # Samples processed on this device since the last logging step.
            num_samples = args.per_device_train_batch_size * args.logging_steps
            throughput = num_samples / logging_step_runtime
            if "loss" in state.log_history[-1]:
                state.log_history[-1]["throughput"] = throughput
                state.log_history[-1]["step"] = state.global_step

                mlflow.log_metric("lr", state.log_history[-1]["learning_rate"], step=state.global_step)
                mlflow.log_metric("throughput", throughput, step=state.global_step)
                print(
                    f'loss: {state.log_history[-1]["loss"]}, '
                    f'lr: {state.log_history[-1]["learning_rate"]}, '
                    f'throughput: {throughput}, step: {state.global_step}'
                )

def collate_fn(examples):
pixel_values = torch.stack([example["pixel_values"] for example in examples])
return {"pixel_values": pixel_values}

# Return the model's parameter count, in millions.
def get_num_parameters(model):
    num_params = sum(param.numel() for param in model.parameters())
    # Convert to millions.
    return num_params / 10**6

def main():
# See all possible arguments in src/transformers/training_args.py
@@ -290,6 +323,10 @@ def main():
logger.info("Training new model from scratch")
model = ViTMAEForPreTraining(config)

# Log number of parameters
num_params = get_num_parameters(model)
mlflow.log_param('num_params', num_params)
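# Note: for ViTMAEForPreTraining this count includes the MAE decoder. A sketch
# for logging the encoder alone (assumes the model exposes a `vit` submodule,
# as ViTMAEForPreTraining does):
#
#     mlflow.log_param('num_encoder_params', get_num_parameters(model.vit))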

if training_args.do_train:
column_names = ds["train"].column_names
else:
@@ -360,6 +397,12 @@ def preprocess_images(examples):
tokenizer=image_processor,
data_collator=collate_fn,
)
trainer.add_callback(TBTrainerCallback)
# MLflow setup: environment variables read by transformers' MLflowCallback.
os.environ["DISABLE_MLFLOW_INTEGRATION"] = "False"
os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "False"
os.environ["MLFLOW_FLATTEN_PARAMS"] = "True"

# Training
if training_args.do_train:
@@ -371,9 +414,7 @@ def preprocess_images(examples):
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model()
metrics = train_result.metrics
metrics['throughput'] = metrics['train_samples_per_second']
metrics['loss'] = metrics['train_loss']
metrics['lr'] = training_args.learning_rate

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()
@@ -390,6 +431,7 @@ def preprocess_images(examples):
"dataset": data_args.dataset_name,
"tags": ["masked-auto-encoding"],
}
mlflow.end_run()
if training_args.push_to_hub:
trainer.push_to_hub(**kwargs)
else: