In [33]:
#!pip3 install -I git+https://github.com/cloudera/cmlextensions.git
#!pip3 install -I ray
#!pip3 install -I modin[ray]
#!pip3 install transformers datasets scipy scikit-learn torch
#!pip3 install transformers[torch]
#!pip3 install accelerate -U
#!pip3 install ray[client]
#!pip3 install ray[tune]

In [1]:
# Run the nvidia-smi command-line tool from the notebook session's shell.
!nvidia-smi

/bin/bash: nvidia-smi: command not found


In [2]:
import cmlextensions.ray_cluster as rc
import cmlapi
import os
import json
from pprint import pprint
import ray

  from .autonotebook import tqdm as notebook_tqdm
2023-09-28 16:39:01,045	INFO util.py:159 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [3]:
# Connection settings for CML APIv2, all read from the session's CDSW_* environment variables.
# HOST is the scheme of CDSW_API_URL (the text before its first ":") followed by "://" and CDSW_DOMAIN.
HOST = "://".join(
    [os.environ.get("CDSW_API_URL").partition(":")[0], os.environ.get("CDSW_DOMAIN")]
)
# Seventh "/"-separated piece of CDSW_PROJECT_URL — presumably the project owner's username (verify).
USERNAME = os.environ.get("CDSW_PROJECT_URL").split("/")[6]
API_KEY = os.environ.get("CDSW_APIV2_KEY")
PROJECT_NAME = os.environ.get("CDSW_PROJECT")
PROJECT_ID = os.environ.get("CDSW_PROJECT_ID")

# API client object; it is handed to set_environ later in the notebook.
cml = cmlapi.default_client(cml_api_key=API_KEY, url=HOST)

def set_environ(Cml,Item,Value):
    """Set one entry in the current project's environment mapping and save it.

    The project is looked up by the CDSW_PROJECT_ID environment variable. Its
    ``environment`` attribute is handled as a JSON object stored in a string;
    an unset or empty value (``None`` or ``''``) is treated as an empty object.

    Args:
        Cml: client object exposing ``get_project(project_id)`` and
            ``update_project(project, project_id=...)``.
        Item: key to create or overwrite.
        Value: value stored under ``Item``; must be JSON-serializable.
    """
    project_id = os.getenv("CDSW_PROJECT_ID")
    project = Cml.get_project(project_id)
    # A falsy environment ('' or None) means nothing has been stored yet;
    # json.loads would raise on either, so start from an empty mapping instead.
    environment = json.loads(project.environment) if project.environment else {}
    environment[Item] = Value
    project.environment = json.dumps(environment)
    Cml.update_project(project, project_id=project_id)

def get_environ(Cml,Item):
    """Return one entry from the current project's environment mapping.

    The project is looked up by the CDSW_PROJECT_ID environment variable. Its
    ``environment`` attribute is handled as a JSON object stored in a string;
    an unset or empty value (``None`` or ``''``) is treated as an empty object,
    matching how ``set_environ`` reads it.

    Args:
        Cml: client object exposing ``get_project(project_id)``.
        Item: key to look up.

    Returns:
        The value stored under ``Item``.

    Raises:
        KeyError: if ``Item`` is not present (including when the project has
            no environment stored at all).
    """
    project = Cml.get_project(os.getenv("CDSW_PROJECT_ID"))
    # Avoid json.loads('') / json.loads(None): an empty environment simply has no keys.
    environment = json.loads(project.environment) if project.environment else {}
    return environment[Item]

In [None]:
# Describe the Ray cluster: a head with no GPU plus two workers that each request two GPUs.
cluster = rc.RayCluster(
    head_cpu=2,
    head_memory=4,
    head_nvidia_gpu=0,
    num_workers=2,
    worker_cpu=2,
    worker_memory=4,
    worker_nvidia_gpu=2,
)
cluster.init()

# Save the cluster's client URL in the project environment under RAY_ADDRESS.
set_environ(cml, "RAY_ADDRESS", cluster.get_client_url())

Starting ray head...
Starting 2 ray workers...


In [38]:
# Task names accepted for the `task` setting, as a list of strings.
GLUE_TASKS = "cola mnli mnli-mm mrpc qnli qqp rte sst2 stsb wnli".split()

In [39]:
# Run configuration used throughout the rest of the notebook.
task: str = "cola"  # key into task_to_keys; selects the GLUE data to load
model_checkpoint: str = "distilbert-base-uncased"  # checkpoint for tokenizer and model
batch_size: int = 16  # per-device train/eval batch size

In [40]:
from datasets import load_dataset

# "mnli-mm" is loaded from the "mnli" GLUE configuration; every other task maps to itself.
if task == "mnli-mm":
    actual_task = "mnli"
else:
    actual_task = task

datasets = load_dataset("glue", actual_task)

In [41]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to `model_checkpoint`, passing use_fast=True.
# collate_fn reads this module-level `tokenizer` when it tokenizes a batch.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [42]:
# For each task: the names of the one or two text columns to tokenize.
# The second entry is None when the task has a single text column.
task_to_keys = {
    glue_task: text_columns
    for glue_task, text_columns in (
        ("cola", ("sentence", None)),
        ("mnli", ("premise", "hypothesis")),
        ("mnli-mm", ("premise", "hypothesis")),
        ("mrpc", ("sentence1", "sentence2")),
        ("qnli", ("question", "sentence")),
        ("qqp", ("question1", "question2")),
        ("rte", ("sentence1", "sentence2")),
        ("sst2", ("sentence", None)),
        ("stsb", ("sentence1", "sentence2")),
        ("wnli", ("sentence1", "sentence2")),
    )
}

In [43]:
# Runtime environment sent along with ray.init: extra pip packages plus environment variables.
# NOTE(review): CUDA_VISIBLE_DEVICES is fixed to "0" here while the cluster above is created
# with worker_nvidia_gpu=2 — confirm that restricting visibility to one device is intended.
runtime_env = dict(
    pip=["pytz", "python-dateutil"],
    env_vars={"CUDA_VISIBLE_DEVICES": "0"},
)

ray.init(runtime_env=runtime_env, address=cluster.get_client_url())

0,1
Python version:,3.9.11
Ray version:,2.7.0
Dashboard:,http://127.0.0.1:8090


In [44]:
import ray.data

# Convert the "train", "validation" and "test" splits to Ray datasets, keyed by split name.
ray_datasets = {
    split: ray.data.from_huggingface(datasets[split])
    for split in ("train", "validation", "test")
}
ray_datasets

{'train': MaterializedDataset(
    num_blocks=1,
    num_rows=8551,
    schema={sentence: string, label: int64, idx: int32}
 ),
 'validation': MaterializedDataset(
    num_blocks=1,
    num_rows=1043,
    schema={sentence: string, label: int64, idx: int32}
 ),
 'test': MaterializedDataset(
    num_blocks=1,
    num_rows=1063,
    schema={sentence: string, label: int64, idx: int32}
 )}

In [45]:
import numpy as np
from typing import Dict


# Tokenize input sentences
def collate_fn(examples: Dict[str, np.ndarray]):
    """Tokenize one batch of examples and attach its labels.

    Reads the notebook globals ``task``, ``task_to_keys`` and ``tokenizer``.
    ``torch`` is imported in a later cell, so that cell must have run before
    this function is first called.

    Args:
        examples: one batch as a mapping from column name to an array of
            values. Must contain "label" and the text column(s) named by
            ``task_to_keys[task]``.

    Returns:
        The tokenizer output (truncated, padded to the longest sequence in the
        batch, as PyTorch tensors) with an added "labels" entry built from
        ``examples["label"]``. Every tensor is moved with ``.cuda()`` when CUDA
        is available; otherwise the tensors are returned as created.
    """
    sentence1_key, sentence2_key = task_to_keys[task]

    # One text column for single-sentence tasks, two for sentence-pair tasks.
    text_columns = [list(examples[sentence1_key])]
    if sentence2_key is not None:
        text_columns.append(list(examples[sentence2_key]))

    outputs = tokenizer(
        *text_columns,
        truncation=True,
        padding="longest",
        return_tensors="pt",
    )

    outputs["labels"] = torch.LongTensor(examples["label"])

    # Only move tensors to the GPU when one exists: calling .cuda() in a
    # process without CUDA raises instead of falling back to the CPU.
    if torch.cuda.is_available():
        for key, value in list(outputs.items()):
            outputs[key] = value.cuda()

    return outputs

In [46]:
import torch
import numpy as np

from datasets import load_metric
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

import ray.train
from ray.train.huggingface.transformers import prepare_trainer, RayTrainReportCallback

# Number of model outputs: 3 for the MNLI variants, 1 for STS-B, 2 for every other task.
if task.startswith("mnli"):
    num_labels = 3
elif task == "stsb":
    num_labels = 1
else:
    num_labels = 2

# Metric name chosen per task, with "accuracy" as the fallback.
metric_name = {
    "stsb": "pearson",
    "cola": "matthews_correlation",
}.get(task, "accuracy")

# Last "/"-separated component of the checkpoint identifier.
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Validation split name chosen per task, with "validation" as the fallback.
validation_key = {
    "mnli-mm": "validation_mismatched",
    "mnli": "validation_matched",
}.get(task, "validation")

# Run name; train_func passes it to TrainingArguments as its first argument.
name = "{}-finetuned-{}".format(model_name, task)

# Steps per epoch = training rows divided (integer division) by the global batch size,
# i.e. the per-device batch size multiplied by the number of training workers.
max_steps_per_epoch = (
    ray_datasets["train"].count()
    // (batch_size * cluster.num_workers)
)


def train_func(config):
    """Training loop handed to TorchTrainer: fine-tune the checkpoint on the GLUE task.

    Relies on notebook globals (``actual_task``, ``model_checkpoint``,
    ``num_labels``, ``batch_size``, ``max_steps_per_epoch``, ``name``, ``task``,
    ``collate_fn``) in addition to ``config``.

    Args:
        config: mapping of hyperparameters. Reads "learning_rate"
            (default 2e-5), "epochs" (default 2) and "weight_decay"
            (default 0.01).
    """
    print(f"Is CUDA available: {torch.cuda.is_available()}")

    glue_metric = load_metric("glue", actual_task)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=num_labels
    )

    # Fetch the dataset shards registered under "train" and "eval", then batch
    # both the same way through collate_fn.
    shards = {split: ray.train.get_dataset_shard(split) for split in ("train", "eval")}
    batching = dict(batch_size=batch_size, collate_fn=collate_fn)
    train_batches = shards["train"].iter_torch_batches(**batching)
    eval_batches = shards["eval"].iter_torch_batches(**batching)

    print("max_steps_per_epoch: ", max_steps_per_epoch)

    num_epochs = config.get("epochs", 2)
    training_args = TrainingArguments(
        output_dir=name,
        # Evaluate, save and log once per epoch.
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=config.get("learning_rate", 2e-5),
        num_train_epochs=num_epochs,
        weight_decay=config.get("weight_decay", 0.01),
        push_to_hub=False,
        # Total step budget: steps per epoch times the number of epochs.
        max_steps=max_steps_per_epoch * num_epochs,
        disable_tqdm=True,  # keep progress bars out of the output
        use_cpu=False,  # do not force CPU execution
        report_to="none",
    )

    def compute_metrics(eval_pred):
        """Turn raw model outputs into predictions and score them with the GLUE metric."""
        raw_outputs, references = eval_pred
        if task == "stsb":
            # STS-B: take the first output column as the prediction.
            predicted = raw_outputs[:, 0]
        else:
            # Other tasks: pick the index of the largest output per row.
            predicted = np.argmax(raw_outputs, axis=1)
        return glue_metric.compute(predictions=predicted, references=references)

    hf_trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_batches,
        eval_dataset=eval_batches,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Register the Ray Train report callback, then pass the trainer through prepare_trainer.
    hf_trainer.add_callback(RayTrainReportCallback())
    hf_trainer = prepare_trainer(hf_trainer)

    print("Starting training")
    hf_trainer.train()

In [47]:
from ray.train.torch import TorchTrainer
from ray.train import RunConfig, ScalingConfig, CheckpointConfig

# Build the distributed trainer: one GPU-enabled training worker per cluster worker,
# the "train"/"eval" datasets that train_func fetches as shards, and a checkpoint
# policy that keeps only the single checkpoint with the lowest eval_loss.
trainer = TorchTrainer(
    train_loop_per_worker=train_func,
    datasets={
        "train": ray_datasets["train"],
        "eval": ray_datasets["validation"],
    },
    scaling_config=ScalingConfig(
        use_gpu=True,
        num_workers=cluster.num_workers,
    ),
    run_config=RunConfig(
        checkpoint_config=CheckpointConfig(
            checkpoint_score_attribute="eval_loss",
            checkpoint_score_order="min",
            num_to_keep=1,
        ),
    ),
)

In [48]:
# Display the cluster's worker details (a bare last expression, so the notebook renders it).
# NOTE(review): the rendered value includes project/owner names and workspace URLs —
# consider clearing this output before sharing the notebook.
cluster.ray_worker_details

{'workers': [{'id': 't5qfwn00tfinvecn',
   'name': 'Ray Worker',
   'project': {'id': 1626,
    'name': 'Using Ray CML',
    'slug': 'pauldefusco/using-ray-cml',
    'html_url': 'https://ml-b74f8940-b97.go01-dem.ylcu-atmi.cloudera.site/pauldefusco/using-ray-cml',
    'url': 'https://ml-b74f8940-b97.go01-dem.ylcu-atmi.cloudera.site/api/v1/projects/pauldefusco/using-ray-cml',
    'default_engine_type': 'ml_runtime'},
   'owner': {'id': 16,
    'username': 'pauldefusco',
    'name': 'Paul de Fusco',
    'html_url': 'https://ml-b74f8940-b97.go01-dem.ylcu-atmi.cloudera.site/pauldefusco',
    'url': 'https://ml-b74f8940-b97.go01-dem.ylcu-atmi.cloudera.site/api/v1/users/pauldefusco',
    'is_team': False},
   'creator': {'id': 16,
    'username': 'pauldefusco',
    'name': 'Paul de Fusco',
    'html_url': 'https://ml-b74f8940-b97.go01-dem.ylcu-atmi.cloudera.site/pauldefusco',
    'url': 'https://ml-b74f8940-b97.go01-dem.ylcu-atmi.cloudera.site/api/v1/users/pauldefusco'},
   'biller': {'id': 1

In [49]:
# Report CUDA availability and device count as seen by the process running this cell.
# train_func prints its own availability check when it runs.
print("Cuda support:", torch.cuda.is_available(),":", torch.cuda.device_count(), "devices")

Cuda support: False : 0 devices


In [51]:
# Run the TorchTrainer configured above and keep whatever fit() returns in `result`.
result = trainer.fit()

[2m[36m(TunerInternal pid=793)[0m [output] This will use the new output engine with verbosity 1. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949
[2m[36m(TunerInternal pid=793)[0m AIR_VERBOSITY is set, ignoring passed-in ProgressReporter for now.


[2m[36m(TunerInternal pid=793)[0m 
[2m[36m(TunerInternal pid=793)[0m View detailed results here: /home/cdsw/ray_results/TorchTrainer_2023-09-28_06-40-16
[2m[36m(TunerInternal pid=793)[0m To visualize your results with TensorBoard, run: `tensorboard --logdir /home/cdsw/ray_results/TorchTrainer_2023-09-28_06-40-16`
[2m[36m(TunerInternal pid=793)[0m 
[2m[36m(TunerInternal pid=793)[0m Training started without custom configuration.


[2m[36m(TorchTrainer pid=240, ip=100.100.133.8)[0m Starting distributed worker processes: ['275 (100.100.133.8)', '268 (100.100.111.72)']
[2m[36m(RayTrainWorker pid=275, ip=100.100.133.8)[0m Setting up process group for: env:// [rank=0, world_size=2]


[2m[36m(TunerInternal pid=793)[0m 
[2m[36m(TunerInternal pid=793)[0m Training errored after 0 iterations at 2023-09-28 06:40:58. Total running time: 41s
[2m[36m(TunerInternal pid=793)[0m Error file: /home/cdsw/ray_results/TorchTrainer_2023-09-28_06-40-16/TorchTrainer_e2fba_00000_0_2023-09-28_06-40-17/error.txt


[2m[36m(TunerInternal pid=793)[0m Trial task failed for trial TorchTrainer_e2fba_00000
[2m[36m(TunerInternal pid=793)[0m Traceback (most recent call last):
[2m[36m(TunerInternal pid=793)[0m   File "/home/cdsw/.local/lib/python3.9/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
[2m[36m(TunerInternal pid=793)[0m     result = ray.get(future)
[2m[36m(TunerInternal pid=793)[0m   File "/home/cdsw/.local/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
[2m[36m(TunerInternal pid=793)[0m     return fn(*args, **kwargs)
[2m[36m(TunerInternal pid=793)[0m   File "/home/cdsw/.local/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
[2m[36m(TunerInternal pid=793)[0m     return func(*args, **kwargs)
[2m[36m(TunerInternal pid=793)[0m   File "/home/cdsw/.local/lib/python3.9/site-packages/ray/_private/worker.py", line 2547, in get
[2m[36m(TunerInternal pid=793)

TrainingFailedError: The Ray Train run failed. Please inspect the previous error messages for a cause. After fixing the issue (assuming that the error is not caused by your own application logic, but rather an error such as OOM), you can restart the run from scratch or continue this run.
To continue this run, you can use: `trainer = TorchTrainer.restore("/home/cdsw/ray_results/TorchTrainer_2023-09-28_06-40-08")`.
To start a new run that will retry on training failures, set `train.RunConfig(failure_config=train.FailureConfig(max_failures))` in the Trainer's `run_config` with `max_failures > 0`, or `max_failures = -1` for unlimited retries.

In [52]:
# Shut down the Ray session that was started with ray.init earlier in the notebook.
ray.shutdown()

In [53]:
# Terminate the RayCluster object created earlier — presumably stops its head and worker sessions (verify).
cluster.terminate()

In [None]:
import os

# Debug aid: print the name of every environment variable visible to this session.
# (The former `elist={}` initialisation was a dead store, overwritten on the next line.)
elist = os.environ
for e in elist:
    print(e)

[2m[36m(TunerInternal pid=793)[0m Trials did not complete: [TorchTrainer_e2fba_00000]


[2m[36m(TunerInternal pid=793)[0m 
