In [1]:
!pip install "ray==2.7.0"

Collecting ray==2.7.0
  Obtaining dependency information for ray==2.7.0 from https://files.pythonhosted.org/packages/82/e9/d7d85bdc8b1b3101c760d42a63493b8b4092c9ade9dce9f8240b328e488a/ray-2.7.0-cp310-cp310-manylinux2014_x86_64.whl.metadata
  Downloading ray-2.7.0-cp310-cp310-manylinux2014_x86_64.whl.metadata (13 kB)
Collecting jsonschema (from ray==2.7.0)
  Obtaining dependency information for jsonschema from https://files.pythonhosted.org/packages/0f/bf/a84bc75f069f4f156e1c0d9892fb7325945106c6ecaad9f29d24360872af/jsonschema-4.19.1-py3-none-any.whl.metadata
  Downloading jsonschema-4.19.1-py3-none-any.whl.metadata (7.9 kB)
Collecting msgpack<2.0.0,>=1.0.0 (from ray==2.7.0)
  Obtaining dependency information for msgpack<2.0.0,>=1.0.0 from https://files.pythonhosted.org/packages/92/cb/fb176f840b8ead860fd7ac2060dbc26f2ccc551d5a08e590ea979de4b63a/msgpack-1.0.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading msgpack-1.0.6-cp310-cp310-manylinux_2_17_x86_64.m

In [2]:
from axolotl.utils.data import prepare_dataset
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault
from axolotl.utils.models import load_tokenizer
import yaml

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
config = "configs/relora.yaml"

with open(config, encoding="utf-8") as file:
    cfg: DictDefault = DictDefault(yaml.safe_load(file))

validate_config(cfg)

normalize_config(cfg)
tokenizer = load_tokenizer(cfg)
train_dataset, eval_dataset, total_num_steps = prepare_dataset(cfg, tokenizer)

`pad_to_sequence_len: true` is recommended when using sample_packing
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map (num_proc=4): 100%|██████████| 31650/31650 [02:50<00:00, 185.73 examples/s]
Saving the dataset (3/3 shards): 100%|██████████| 31650/31650 [00:01<00:00, 20174.33 examples/s]
Filter (num_proc=4): 100%|██████████| 31333/31333 [00:54<00:00, 576.17 examples/s]
Filter (num_proc=4): 100%|██████████| 317/317 [00:00<00:00, 467.72 examples/s]
Map (num_proc=4): 100%|██████████| 31123/31123 [00:30<00:00, 1004.30 examples/s]
Map (num_proc=4): 100%|██████████| 317/317 [00:00<00:00, 687.39 examples/s]


In [6]:
print(total_num_steps)

109970


In [3]:
import sys
import time
from collections import Counter

import ray

In [None]:
ray.shutdown()

In [4]:
@ray.remote
def gethostname(x):
    import platform
    import time

    time.sleep(0.01)
    return x + (platform.node(),)


def wait_for_nodes(expected):
    # Wait for all nodes to join the cluster.
    while True:
        resources = ray.cluster_resources()
        node_keys = [key for key in resources if "node" in key]
        num_nodes = sum(resources[node_key] for node_key in node_keys)
        if num_nodes < expected:
            print(
                "{} nodes have joined so far, waiting for {} more.".format(
                    num_nodes, expected - num_nodes
                )
            )
            sys.stdout.flush()
            time.sleep(1)
        else:
            break


def main():
    wait_for_nodes(2)

    # Check that objects can be transferred from each node to each other node.
    for i in range(10):
        print("Iteration {}".format(i))
        results = [gethostname.remote(gethostname.remote(())) for _ in range(100)]
        print(Counter(ray.get(results)))
        sys.stdout.flush()

    print("Success!")
    sys.stdout.flush()

In [5]:
runtime_env = {
    "pip": [
            "accelerate>=0.16.0",
            "transformers>=4.26.0",
            "tokenizers>=0.13.3",
            "numpy<1.24",  # remove when mlflow updates beyond 2.2
            "torch",
    ],
    "env_vars": {"HUGGING_FACE_HUB_TOKEN": "key"}
}

In [6]:
ray.init("ray://10.3.5.35:10001",runtime_env=runtime_env)


0,1
Python version:,3.10.8
Ray version:,3.0.0.dev0
Dashboard:,http://10.3.5.35:8265


In [8]:
from ray import serve

In [7]:
main()

Iteration 0
Counter({('raycluster-kuberay-worker-workergroup-8kcp4', 'raycluster-kuberay-worker-workergroup-8kcp4'): 100})
Iteration 1
Counter({('raycluster-kuberay-worker-workergroup-8kcp4', 'raycluster-kuberay-worker-workergroup-8kcp4'): 100})
Iteration 2
Counter({('raycluster-kuberay-worker-workergroup-8kcp4', 'raycluster-kuberay-worker-workergroup-8kcp4'): 75, ('raycluster-kuberay-worker-workergroup-8kcp4', 'raycluster-kuberay-head-bkw8t'): 15, ('raycluster-kuberay-head-bkw8t', 'raycluster-kuberay-worker-workergroup-8kcp4'): 9, ('raycluster-kuberay-head-bkw8t', 'raycluster-kuberay-head-bkw8t'): 1})
Iteration 3
Counter({('raycluster-kuberay-worker-workergroup-8kcp4', 'raycluster-kuberay-worker-workergroup-8kcp4'): 58, ('raycluster-kuberay-worker-workergroup-8kcp4', 'raycluster-kuberay-head-bkw8t'): 29, ('raycluster-kuberay-head-bkw8t', 'raycluster-kuberay-worker-workergroup-8kcp4'): 12, ('raycluster-kuberay-head-bkw8t', 'raycluster-kuberay-head-bkw8t'): 1})
Iteration 4
Counter({('ra

In [9]:
import pandas as pd


from starlette.requests import Request


@serve.deployment(ray_actor_options={"num_gpus": 8,"num_cpus":32})
class PredictDeployment:
    def __init__(self, model_id: str, revision: str = None):
        from transformers import AutoModelForCausalLM, AutoTokenizer
        import torch

        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            revision=revision,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map="auto",  # automatically makes use of all GPUs available to the Actor
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)

    def generate(self, text: str) -> pd.DataFrame:
        input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(
            self.model.device
        )

        gen_tokens = self.model.generate(
            input_ids,
            do_sample=True,
            temperature=0.9,
            max_length=100,
        )
        return pd.DataFrame(
            self.tokenizer.batch_decode(gen_tokens), columns=["responses"]
        )

    async def __call__(self, http_request: Request) -> str:
        json_request: str = await http_request.json()
        prompts = []
        for prompt in json_request:
            text = prompt["text"]
            if isinstance(text, list):
                prompts.extend(text)
            else:
                prompts.append(text)
        return self.generate(prompts)

In [10]:
model_id = "WizardLM/WizardCoder-Python-34B-V1.0"
revision = "float16"  # use float16 weights to fit in 16GB GPUs
prompt = (
    "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
    "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
    "researchers was the fact that the unicorns spoke perfect English."
)

In [29]:
ray.serve.start(http_options={
"host":"0.0.0.0"
})

[2m[36m(HTTPProxyActor pid=4344)[0m INFO 2023-09-26 12:54:25,843 http_proxy 10.3.5.35 http_proxy.py:1418 - Proxy actor 56dab9230021cb885d2b3ec207000000 starting on node 3bf9415e44d2577899f62db907c9953a08a2dfa0202a744f4f90f0d1.
[2m[36m(HTTPProxyActor pid=4344)[0m INFO 2023-09-26 12:54:25,855 http_proxy 10.3.5.35 http_proxy.py:1603 - Starting HTTP server on node: 3bf9415e44d2577899f62db907c9953a08a2dfa0202a744f4f90f0d1 listening on port 8000
[2m[36m(HTTPProxyActor pid=4344)[0m INFO:     Started server process [4344]


In [23]:
deployment = PredictDeployment.bind(model_id=model_id)
handle = serve.run(deployment)
serve

The new client HTTP config differs from the existing one in the following fields: ['host', 'location']. The new HTTP config is ignored.
[2m[36m(ServeController pid=4095)[0m INFO 2023-09-26 12:51:02,846 controller 4095 deployment_state.py:1389 - Deploying new version of deployment PredictDeployment in application 'default'.
[2m[36m(ServeController pid=4095)[0m INFO 2023-09-26 12:51:02,950 controller 4095 deployment_state.py:1678 - Adding 1 replica to deployment PredictDeployment in application 'default'.
[2m[36m(ServeReplica:default:PredictDeployment pid=21154, ip=10.3.0.3)[0m 2023-09-26 12:51:09.108708: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
[2m[36m(ServeReplica:default:PredictDeployment pid=21154, ip=10.3.0.3)[0m 2023-09-26 12:51:09.108806: 

<module 'ray.serve' from '/opt/conda/lib/python3.10/site-packages/ray/serve/__init__.py'>

In [24]:
serve.status()

ServeStatus(proxies={'3bf9415e44d2577899f62db907c9953a08a2dfa0202a744f4f90f0d1': <ProxyStatus.HEALTHY: 'HEALTHY'>}, applications={'default': ApplicationStatusOverview(status=<ApplicationStatus.RUNNING: 'RUNNING'>, message='', last_deployed_time_s=1695757862.7654545, deployments={'PredictDeployment': DeploymentStatusOverview(status=<DeploymentStatus.HEALTHY: 'HEALTHY'>, replica_states={'RUNNING': 1}, message='')})})

In [26]:
import requests
prompt = "Once upon a time, there was a horse."

sample_input = {"text": prompt}
output = requests.post("http://10.3.5.35:8000/", json=[sample_input]).json()
print(output)

[2m[36m(HTTPProxyActor pid=4128)[0m ERROR 2023-09-26 12:53:01,544 http_proxy 10.3.5.35 2ab48c8d-69e7-400a-a42e-ce19318458cc / default http_proxy.py:1345 - [36mray::ServeReplica:default:PredictDeployment.handle_request_streaming()[39m (pid=21154, ip=10.3.0.3, actor_id=c946dea35cf662f4c0a1153d07000000, repr=<ray.serve._private.replica.ServeReplica:default:PredictDeployment object at 0x7f71bb45cee0>)
[2m[36m(HTTPProxyActor pid=4128)[0m     async for result in generator:
[2m[36m(HTTPProxyActor pid=4128)[0m   File "/home/ray/anaconda3/lib/python3.10/site-packages/ray/serve/_private/replica.py", line 326, in _handle_http_request_generator
[2m[36m(HTTPProxyActor pid=4128)[0m     raise e from None
[2m[36m(HTTPProxyActor pid=4128)[0m   File "/home/ray/anaconda3/lib/python3.10/site-packages/ray/serve/_private/replica.py", line 875, in call_user_method
[2m[36m(HTTPProxyActor pid=4128)[0m     raise e from None
[2m[36m(HTTPProxyActor pid=4128)[0m ray.exceptions.RayTaskError: 

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [33]:
 serve.shutdown()

2023-09-26 20:07:49,299	INFO router.py:473 -- Got updated replicas for deployment 'MostBasicIngress' in application 'default': set().
[2m[36m(ServeController pid=4311)[0m INFO 2023-09-26 13:07:49,257 controller 4311 deployment_state.py:1706 - Removing 1 replica from deployment 'MostBasicIngress' in application 'default'.
[2m[36m(ServeController pid=4311)[0m INFO 2023-09-26 13:07:51,432 controller 4311 deployment_state.py:2025 - Replica default#MostBasicIngress#emYJdw is stopped.


In [28]:
 serve.list_deployments()

RayServeException: There is no Serve instance running on this Ray cluster.

In [34]:
ray.shutdown()

In [30]:
@serve.deployment
class MostBasicIngress:
    async def __call__(self, request: Request) -> str:
        name = (await request.json())["name"]
        return f"Hello {name}!"


app = MostBasicIngress.bind()
serve.run(app)

The new client HTTP config differs from the existing one in the following fields: ['host', 'location']. The new HTTP config is ignored.
[2m[36m(ServeController pid=4311)[0m INFO 2023-09-26 12:54:40,327 controller 4311 deployment_state.py:1389 - Deploying new version of deployment MostBasicIngress in application 'default'.
[2m[36m(ServeController pid=4311)[0m INFO 2023-09-26 12:54:40,430 controller 4311 deployment_state.py:1678 - Adding 1 replica to deployment MostBasicIngress in application 'default'.
2023-09-26 19:54:44,366	INFO router.py:1132 -- Using router <class 'ray.serve._private.router.PowerOfTwoChoicesReplicaScheduler'>.
2023-09-26 19:54:44,383	INFO router.py:473 -- Got updated replicas for deployment 'MostBasicIngress' in application 'default': {'default#MostBasicIngress#emYJdw'}.


RayServeSyncHandle(deployment='MostBasicIngress')

In [31]:
assert (
    requests.get("http://10.3.5.35:8000/", json={"name": "Corey"}).text
    == "Hello Corey!"
)

[2m[36m(ServeReplica:default:MostBasicIngress pid=4388)[0m INFO 2023-09-26 12:54:50,816 MostBasicIngress default#MostBasicIngress#emYJdw 433c08f9-eeca-42ae-8fc9-26ccbf03ac96 / default replica.py:746 - __CALL__ OK 5.8ms


In [32]:
requests.get("http://10.3.5.35:8000/", json={"name": "Corey"}).text

'Hello Corey!'

[2m[36m(ServeReplica:default:MostBasicIngress pid=4388)[0m INFO 2023-09-26 12:55:06,193 MostBasicIngress default#MostBasicIngress#emYJdw 133e4be2-8e86-4521-ae4e-38a79a647ec5 / default replica.py:746 - __CALL__ OK 2.4ms
