### Ray AI Libraries

In [3]:
import asyncio
import fastapi
import pandas as pd
import requests
import xgboost
from pydantic import BaseModel
from sklearn.model_selection import train_test_split

import ray
import ray.tune
import ray.train
from ray.train.xgboost import XGBoostTrainer as RayTrainXGBoostTrainer
from ray.train import ScalingConfig, RunConfig
import ray.data
import ray.serve


### 3 Steps -> 0: Cloud, 1: Ray Core (low-level distributed computing framework), 2: Ray AI Libraries (Data, Train, Tune, Serve)

In [8]:
# Columns read from the taxi parquet files and used as model inputs.
features = ["passenger_count", "trip_distance", "fare_amount", "tolls_amount"]

# Column the model learns to predict.
label_column = "tip_amount"

In [5]:
def load_data():
    """Read the March 2021 NYC yellow-taxi trips and build XGBoost matrices.

    Only the columns listed in ``features`` plus ``label_column`` are read from
    the parquet file. The rows are split 80/20 into train and test sets with a
    fixed ``random_state`` so the split is reproducible.

    Returns:
        A ``(dtrain, dtest)`` tuple of ``xgboost.DMatrix`` objects.
    """
    source = "s3://anyscale-public-materials/nyc-taxi-cab/yellow_tripdata_2021-03.parquet"
    trips = pd.read_parquet(source, columns=features + [label_column])
    inputs, target = trips[features], trips[label_column]
    train_x, test_x, train_y, test_y = train_test_split(
        inputs, target, test_size=0.2, random_state=42
    )
    return (
        xgboost.DMatrix(train_x, label=train_y),
        xgboost.DMatrix(test_x, label=test_y),
    )

In [6]:
# Local directory used as the Tune `storage_path` and as the parent folder of the saved model file.
storage_folder = "/tmp/ray_train_xgboost"

In [10]:
from pathlib import Path
# File the trained booster is written to (and later loaded from), inside `storage_folder`.
model_path = Path(storage_folder) / "model.ubj"

def my_xgboost_func(params):
    """Train an XGBoost model for 10 boosting rounds and save it to ``model_path``.

    Args:
        params: XGBoost training parameters, passed unchanged to ``xgboost.train``.

    Returns:
        A dict ``{"eval-rmse": value}`` where ``value`` is the last RMSE recorded
        on the "eval" dataset.

    Note:
        Every invocation writes to the same module-level ``model_path``, so a
        later run overwrites the model saved by an earlier one.
    """
    dtrain, dtest = load_data()
    evals_result = {}  # populated by xgboost.train with the per-round metrics
    booster = xgboost.train(
        params,
        dtrain,
        num_boost_round=10,
        evals=[(dtest, "eval")],
        evals_result=evals_result,
    )

    # Make sure the target directory exists before writing the model file.
    target_dir = model_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    booster.save_model(model_path)

    print(f"{evals_result['eval']}")
    final_rmse = evals_result["eval"]["rmse"][-1]
    return {"eval-rmse": final_rmse}

# Baseline XGBoost parameters: squared-error regression evaluated with RMSE.
params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "tree_method": "hist",
    "max_depth": 6,
    "eta": 0.1,
}
# Direct call with the baseline parameters; the returned metrics dict is the cell output.
my_xgboost_func(params)

[0]	eval-rmse:2.18114
[1]	eval-rmse:2.13805
[2]	eval-rmse:2.10221
[3]	eval-rmse:2.07294
[4]	eval-rmse:2.04855
[5]	eval-rmse:2.02852
[6]	eval-rmse:2.01225
[7]	eval-rmse:1.99868
[8]	eval-rmse:1.98771
[9]	eval-rmse:1.97872
OrderedDict([('rmse', [2.18113709207776, 2.1380522744942168, 2.1022143627953036, 2.072936825276888, 2.0485457212693987, 2.028522863406997, 2.0122461934067273, 1.99868078532301, 1.9877117047436583, 1.9787180742813582])])


{'eval-rmse': 1.9787180742813582}

### Hyperparameter Tuning

In [12]:
# Search space: every parameter is fixed except the learning rate `eta`,
# which is sampled uniformly between 0.01 and 0.3.
search_space = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "tree_method": "hist",
    "max_depth": 6,
    "eta": ray.tune.uniform(0.01, 0.3),
}

# Run 10 samples and keep the one with the lowest reported "eval-rmse".
tuner = ray.tune.Tuner(
    my_xgboost_func,
    param_space=search_space,
    run_config=RunConfig(storage_path=storage_folder),
    tune_config=ray.tune.TuneConfig(metric="eval-rmse", mode="min", num_samples=10),
)

results = tuner.fit()
best_result = results.get_best_result()
print("Best hyperparameters found were: ", best_result.config)

0,1
Current time:,2025-07-20 18:45:47
Running for:,00:01:45.69
Memory:,9.4/15.5 GiB

Trial name,status,loc,eta,iter,total time (s),eval-rmse
my_xgboost_func_5a13b_00000,TERMINATED,192.168.1.113:1284644,0.017217,1,42.6304,2.15222
my_xgboost_func_5a13b_00001,TERMINATED,192.168.1.113:1284638,0.292783,1,40.4477,1.93439
my_xgboost_func_5a13b_00002,TERMINATED,192.168.1.113:1284641,0.251578,1,47.927,1.93593
my_xgboost_func_5a13b_00003,TERMINATED,192.168.1.113:1284640,0.0576946,1,43.0324,2.03533
my_xgboost_func_5a13b_00004,TERMINATED,192.168.1.113:1284639,0.276954,1,46.1864,1.93446
my_xgboost_func_5a13b_00005,TERMINATED,192.168.1.113:1284642,0.0974319,1,44.6243,1.98107
my_xgboost_func_5a13b_00006,TERMINATED,192.168.1.113:1284643,0.0365516,1,38.7625,2.08585
my_xgboost_func_5a13b_00007,TERMINATED,192.168.1.113:1284645,0.0743513,1,42.1051,2.00769
my_xgboost_func_5a13b_00008,TERMINATED,192.168.1.113:1286303,0.232245,1,34.6192,1.93742
my_xgboost_func_5a13b_00009,TERMINATED,192.168.1.113:1286302,0.24758,1,35.8037,1.93715


[36m(my_xgboost_func pid=1284643)[0m [0]	eval-rmse:2.21359
[36m(my_xgboost_func pid=1284643)[0m [1]	eval-rmse:2.19536
[36m(my_xgboost_func pid=1284644)[0m [0]	eval-rmse:2.22381
[36m(my_xgboost_func pid=1284643)[0m [2]	eval-rmse:2.17829
[36m(my_xgboost_func pid=1284643)[0m [3]	eval-rmse:2.16228
[36m(my_xgboost_func pid=1284640)[0m [0]	eval-rmse:2.20258
[36m(my_xgboost_func pid=1284644)[0m [1]	eval-rmse:2.21484
[36m(my_xgboost_func pid=1284643)[0m [4]	eval-rmse:2.14727
[36m(my_xgboost_func pid=1284638)[0m [0]	eval-rmse:2.09373
[36m(my_xgboost_func pid=1284645)[0m [0]	eval-rmse:2.19405
[36m(my_xgboost_func pid=1284644)[0m [2]	eval-rmse:2.20616
[36m(my_xgboost_func pid=1284640)[0m [1]	eval-rmse:2.17513
[36m(my_xgboost_func pid=1284643)[0m [5]	eval-rmse:2.13324
[36m(my_xgboost_func pid=1284638)[0m [1]	eval-rmse:2.01945
[36m(my_xgboost_func pid=1284645)[0m [1]	eval-rmse:2.16007
[36m(my_xgboost_func pid=1284638)[0m [2]	eval-rmse:1.98012
[36m(my_xgboost_func pi

2025-07-20 18:45:47,682	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/tmp/ray_train_xgboost/my_xgboost_func_2025-07-20_18-43-39' in 0.0051s.


[36m(my_xgboost_func pid=1286302)[0m [7]	eval-rmse:1.94069
[36m(my_xgboost_func pid=1286302)[0m [8]	eval-rmse:1.93851


2025-07-20 18:45:47,688	INFO tune.py:1041 -- Total run time: 108.71 seconds (105.68 seconds for the tuning loop).


Best hyperparameters found were:  {'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'tree_method': 'hist', 'max_depth': 6, 'eta': 0.29278341909657435}


[36m(my_xgboost_func pid=1286302)[0m [9]	eval-rmse:1.93715
[36m(my_xgboost_func pid=1286302)[0m OrderedDict([('rmse', [2.1126452943309664, 2.040883181990997, 1.9981238969050643, 1.973108252858935, 1.9581758572110184, 1.9491914806407895, 1.944029461096515, 1.9406905108043413, 1.9385105151831652, 1.9371533592860497])])


[36m(XGBoostTrainer pid=1290088)[0m GPUs are detected in your Ray cluster, but GPU training is not enabled for this trainer. To enable GPU training, make sure to set `use_gpu` to True in your scaling config.
[36m(XGBoostTrainer pid=1290088)[0m Started distributed worker processes: 
[36m(XGBoostTrainer pid=1290088)[0m - (node_id=4c7337921f554d6ab59799e8b0edaf84fa9762427105064d8962142a, ip=192.168.1.113, pid=1290199) world_rank=0, local_rank=0, node_rank=0
[36m(XGBoostTrainer pid=1290088)[0m - (node_id=4c7337921f554d6ab59799e8b0edaf84fa9762427105064d8962142a, ip=192.168.1.113, pid=1290200) world_rank=1, local_rank=1, node_rank=0
[36m(RayTrainWorker pid=1290199)[0m [18:48:10] Task [xgboost.ray-rank=00000000]:5da89a72b469abc7bd37770801000000 got rank 0


[36m(RayTrainWorker pid=1290199)[0m Loading data for worker 0...


[36m(XGBoostTrainer pid=1290088)[0m [18:48:42] [0]	eval-rmse:2.28346
[36m(RayTrainWorker pid=1290200)[0m [18:48:10] Task [xgboost.ray-rank=00000001]:82b040ded768be2a2de39fd401000000 got rank 1
[36m(XGBoostTrainer pid=1290088)[0m [18:48:42] [1]	eval-rmse:2.25069
[36m(XGBoostTrainer pid=1290088)[0m [18:48:43] [2]	eval-rmse:2.22460
[36m(XGBoostTrainer pid=1290088)[0m [18:48:43] [3]	eval-rmse:2.20430
[36m(XGBoostTrainer pid=1290088)[0m [18:48:43] [4]	eval-rmse:2.18836
[36m(XGBoostTrainer pid=1290088)[0m [18:48:43] [5]	eval-rmse:2.17259
[36m(XGBoostTrainer pid=1290088)[0m [18:48:44] [6]	eval-rmse:2.15961
[36m(XGBoostTrainer pid=1290088)[0m [18:48:44] [7]	eval-rmse:2.14910
[36m(XGBoostTrainer pid=1290088)[0m [18:48:44] [8]	eval-rmse:2.14038
[36m(XGBoostTrainer pid=1290088)[0m [18:48:44] [9]	eval-rmse:2.13346


[36m(RayTrainWorker pid=1290199)[0m OrderedDict([('rmse', [np.float64(2.283455977036048), np.float64(2.2506874095991365), np.float64(2.2245956490092267), np.float64(2.204302370137811), np.float64(2.1883569504768654), np.float64(2.1725859682897077), np.float64(2.1596106275006597), np.float64(2.149095348540642), np.float64(2.140382007941743), np.float64(2.133460611893429)])])
[36m(RayTrainWorker pid=1290200)[0m Loading data for worker 1...


[36m(ProxyActor pid=1291965)[0m INFO 2025-07-20 18:49:22,097 proxy 192.168.1.113 -- Proxy starting on node 4c7337921f554d6ab59799e8b0edaf84fa9762427105064d8962142a (HTTP port: 8000).
[36m(ProxyActor pid=1291965)[0m INFO 2025-07-20 18:49:22,194 proxy 192.168.1.113 -- Got updated endpoints: {}.
[36m(ServeController pid=1286209)[0m INFO 2025-07-20 18:49:22,436 controller 1286209 -- Deploying new version of Deployment(name='Model', app='default') (initial target replicas: 1).
[36m(ServeController pid=1286209)[0m INFO 2025-07-20 18:49:22,437 controller 1286209 -- Deploying new version of Deployment(name='Model_1', app='default') (initial target replicas: 1).
[36m(ServeController pid=1286209)[0m INFO 2025-07-20 18:49:22,439 controller 1286209 -- Deploying new version of Deployment(name='Ensemble', app='default') (initial target replicas: 1).
[36m(ProxyActor pid=1291965)[0m INFO 2025-07-20 18:49:22,444 proxy 192.168.1.113 -- Got updated endpoints: {Deployment(name='Ensemble', app=

### Distributed Training

In [13]:
def load_data():
    """Load one month of NYC yellow-taxi data for the current Ray Train worker.

    Each worker reads a different monthly file, chosen from its world rank:
    rank 0 reads 2021-01, rank 1 reads 2021-02, ..., rank 11 reads 2021-12,
    rank 12 reads 2022-01, and so on.

    Relies on ``ray.train.get_context()``, so it is meant to be called from
    inside a Ray Train worker.

    Returns:
        A ``(dtrain, dtest)`` tuple of ``xgboost.DMatrix`` objects built from an
        80/20 train/test split (fixed ``random_state``) of that worker's month.
    """
    train_ctx = ray.train.get_context()
    worker_rank = train_ctx.get_world_rank()
    print(f"Loading data for worker {worker_rank}...")

    # Map the rank onto (year, month) with months in 1..12.
    # The previous `(worker_rank + 1) % 12` produced month 0 for rank 11,
    # i.e. a non-existent "..._2022-00.parquet" file, and never selected December.
    year_offset, month_index = divmod(worker_rank, 12)
    month = month_index + 1
    year = 2021 + year_offset
    path = f"s3://anyscale-public-materials/nyc-taxi-cab/yellow_tripdata_{year}-{month:02}.parquet"

    df = pd.read_parquet(path, columns=features + [label_column])
    X_train, X_test, y_train, y_test = train_test_split(
        df[features], df[label_column], test_size=0.2, random_state=42
    )
    dtrain = xgboost.DMatrix(X_train, label=y_train)
    dtest = xgboost.DMatrix(X_test, label=y_test)
    return dtrain, dtest

# Define how many training workers: two, CPU only.
scaling = ScalingConfig(num_workers=2, use_gpu=False)

trainer = RayTrainXGBoostTrainer(
    my_xgboost_func,
    train_loop_config=params,
    scaling_config=scaling,
)

trainer.fit()  # Run; the returned Result is displayed as the cell output

2025-07-20 18:47:56,002	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949
2025-07-20 18:47:56,018	INFO data_parallel_trainer.py:339 -- GPUs are detected in your Ray cluster, but GPU training is not enabled for this trainer. To enable GPU training, make sure to set `use_gpu` to True in your scaling config.


== Status ==
Current time: 2025-07-20 18:47:56 (running for 00:00:00.11)
Using FIFO scheduling algorithm.
Logical resource usage: 3.0/8 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-07-20_18-43-39_350810_1079375/artifacts/2025-07-20_18-47-56/XGBoostTrainer_2025-07-20_18-47-55/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-07-20 18:48:01 (running for 00:00:05.14)
Using FIFO scheduling algorithm.
Logical resource usage: 3.0/8 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-07-20_18-43-39_350810_1079375/artifacts/2025-07-20_18-47-56/XGBoostTrainer_2025-07-20_18-47-55/driver_artifacts
Number of trials: 1/1 (1 PENDING)


== Status ==
Current time: 2025-07-20 18:48:06 (running for 00:00:10.14)
Using FIFO scheduling algorithm.
Logical resource usage: 3.0/8 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-07-20_18-43-39_350810_1079375/artifacts/2025-07-20_18

2025-07-20 18:48:45,767	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/ozkan/ray_results/XGBoostTrainer_2025-07-20_18-47-55' in 0.0040s.
2025-07-20 18:48:45,771	INFO tune.py:1041 -- Total run time: 49.77 seconds (49.75 seconds for the tuning loop).


Trial XGBoostTrainer_e758e_00000 completed. Last result: 
== Status ==
Current time: 2025-07-20 18:48:45 (running for 00:00:49.75)
Using FIFO scheduling algorithm.
Logical resource usage: 3.0/8 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-07-20_18-43-39_350810_1079375/artifacts/2025-07-20_18-47-56/XGBoostTrainer_2025-07-20_18-47-55/driver_artifacts
Number of trials: 1/1 (1 TERMINATED)




Result(
  metrics={},
  path='/home/ozkan/ray_results/XGBoostTrainer_2025-07-20_18-47-55/XGBoostTrainer_e758e_00000_0_2025-07-20_18-47-56',
  filesystem='local',
  checkpoint=None
)

### Serve

In [14]:
# FastAPI application whose routes are attached to a Serve deployment via `ray.serve.ingress(app)`.
app = fastapi.FastAPI()

# Request body schema for the prediction endpoint.
# The field names match the entries of the `features` list.
class Payload(BaseModel):
    passenger_count: int
    trip_distance: float
    fare_amount: float
    tolls_amount: float


@ray.serve.deployment
@ray.serve.ingress(app)
class Ensemble:
    """HTTP-facing deployment that averages the predictions of two models."""

    def __init__(self, model1, model2):
        # Handles to the two downstream model deployments; each must expose
        # a `predict` method callable through `.remote(...)`.
        self.model1 = model1
        self.model2 = model2

    @app.post("/predict")
    async def predict(self, data: Payload) -> dict:
        """Return the mean of both models' predictions for a single trip."""
        # Serialize the payload once and send the same single-row batch to both models.
        rows = [data.model_dump()]
        model1_prediction, model2_prediction = await asyncio.gather(
            self.model1.predict.remote(rows),
            self.model2.predict.remote(rows),
        )
        # Each model returns one prediction per input row. Pick the single
        # element explicitly: calling float() on a 1-element array is
        # deprecated in recent NumPy versions.
        out = {"prediction": float(model1_prediction[0] + model2_prediction[0]) / 2}
        return out


@ray.serve.deployment
class Model:
    """Serve deployment wrapping one XGBoost booster loaded from a model file."""

    def __init__(self, path: str):
        # Load the booster once per replica so requests only pay for inference.
        booster = xgboost.Booster()
        booster.load_model(path)
        self._model = booster

    def predict(self, data: list[dict]) -> list[float]:
        """Score the given rows (one dict per row) with the loaded booster.

        The value returned is whatever ``Booster.predict`` produces for the
        rows; it is passed back unchanged.
        """
        frame = pd.DataFrame(data)
        return self._model.predict(xgboost.DMatrix(frame))


# Run the deployment
# Both ensemble members are bound separately but load the same saved model file.
ensemble_app = Ensemble.bind(
    model1=Model.bind(model_path),
    model2=Model.bind(model_path),
)
handle = ray.serve.run(ensemble_app, route_prefix="/ensemble")

INFO 2025-07-20 18:49:22,322 serve 1079375 -- Started Serve in namespace "serve".
INFO 2025-07-20 18:49:28,467 serve 1079375 -- Application 'default' is ready at http://127.0.0.1:8000/ensemble.


### Request

In [15]:
# Example trip to score; the keys match the fields of the `Payload` schema.
sample_trip = {
    "passenger_count": 1,
    "trip_distance": 2.5,
    "fare_amount": 10.0,
    "tolls_amount": 0.5,
}

response = requests.post(
    "http://localhost:8000/ensemble/predict",
    json=sample_trip,
    timeout=30,  # without a timeout the cell can block forever if the server never answers
)
# Surface HTTP errors (4xx/5xx) directly instead of failing later while decoding the body.
response.raise_for_status()
response.json()

{'prediction': 2.0076115131378174}

### Batch inference

In [16]:
class OfflinePredictor:
    """Callable that scores batches with the XGBoost model saved at ``model_path``."""

    def __init__(self):
        # Load expensive state once per instance rather than once per batch.
        booster = xgboost.Booster()
        booster.load_model(model_path)
        self._model = booster

    def predict(self, data: list[dict]) -> list[float]:
        """Build a DataFrame from ``data`` and return the booster's predictions for it."""
        frame = pd.DataFrame(data)
        return self._model.predict(xgboost.DMatrix(frame))

    def __call__(self, batch: dict) -> dict:
        """Add a "predictions" entry to ``batch`` (modified in place) and return it."""
        scores = self.predict(batch)
        batch["predictions"] = scores
        return batch


# Apply the predictor to the March 2021 trips: read the parquet file, keep only
# the feature columns, then score each batch with OfflinePredictor
# (concurrency between 2 and 10).
taxi_trips = ray.data.read_parquet(
    "s3://anyscale-public-materials/nyc-taxi-cab/yellow_tripdata_2021-03.parquet"
)
prediction_pipeline = taxi_trips.select_columns(features).map_batches(
    OfflinePredictor,
    concurrency=(2, 10),
)


Parquet Files Sample 0:   0%|          | 0.00/1.00 [00:00<?, ? file/s]