In [5]:
pip install ray

Note: you may need to restart the kernel to use updated packages.


In [7]:
import ray
from ray.air.config import ScalingConfig
from ray.train.xgboost import XGBoostTrainer
from ray import tune
from ray.tune.tuner import Tuner, TuneConfig
from ray.air.config import RunConfig

# Load the CSV as a Ray Dataset and hold out 20% of the rows for validation.
dataset = ray.data.read_csv("C:\\Users\\ADMIN\\Downloads\\breast_cancer.csv")
train_dataset, valid_dataset = dataset.train_test_split(test_size=0.2)

# Copy of the hold-out rows with the label column removed.
test_dataset = valid_dataset.drop_columns(cols=["diagnosis"])

# NOTE: the local CPU may not have enough resources for this example as
# configured. If it does not, try num_workers=1 with
# resources_per_worker={"CPU": 1, "GPU": 0} in the ScalingConfig below.
scaling_config = ScalingConfig(num_workers=3, use_gpu=False)

# Booster settings: binary classification, reporting log-loss and error rate.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": ["logloss", "error"],
}

trainer = XGBoostTrainer(
    scaling_config=scaling_config,
    label_column="diagnosis",
    num_boost_round=30,
    params=xgb_params,
    datasets={"train": train_dataset, "valid": valid_dataset},
)

# Run one baseline training job and show the metrics it reported.
result = trainer.fit()
print(result.metrics)

import os

# Directory used as the Tune storage_path further down in this cell.
custom_log_dir = "C:/Users/ADMIN/ray_logs"

# Create it up front; exist_ok=True keeps re-running the cell harmless.
os.makedirs(custom_log_dir, exist_ok=True)

# Custom trial directory naming, handed to TuneConfig as trial_dirname_creator.
def trial_dirname_creator(trial):
    """Return a compact directory name of the form "<trainable_name>_<trial_id>".

    Shorter names help avoid path length issues in the trial directories.
    """
    return "{}_{}".format(trial.trainable_name, trial.trial_id)

# Search space: only the tree depth is tuned; it is merged into the trainer's
# existing booster params.
# NOTE: tune.choice samples with replacement, so 5 samples over 4 candidate
# depths will always repeat at least one value.
param_space = {"params": {"max_depth": tune.choice([1, 5, 50, 100])}}

# Rank trials on the held-out split. Optimising "train-logloss" would favour
# trees that simply fit the training rows best, regardless of how they
# generalise. "valid-logloss" is reported because the trainer is given a
# "valid" dataset.
metric = "valid-logloss"

tuner = Tuner(
    trainer,
    param_space=param_space,
    run_config=RunConfig(
        verbose=1,
        # No user callbacks are registered.
        # NOTE(review): Ray may still attach its default loggers - confirm.
        callbacks=[],
        storage_path=custom_log_dir,
    ),
    tune_config=TuneConfig(
        num_samples=5,
        metric=metric,
        mode="min",  # lower log-loss is better
        trial_dirname_creator=trial_dirname_creator,
    ),
)
result_grid = tuner.fit()

# With no arguments, get_best_result() uses the metric/mode from TuneConfig.
best_result = result_grid.get_best_result()
print("Best Result:", best_result)



0,1
Current time:,2024-12-10 19:34:23
Running for:,00:02:35.12
Memory:,6.8/7.7 GiB

Trial name,status,loc,params/max_depth,iter,total time (s),train-logloss,train-error,valid-logloss
XGBoostTrainer_4ee9d_00000,TERMINATED,127.0.0.1:20008,50,31,33.672,0.0108854,0.0,0.047455
XGBoostTrainer_4ee9d_00001,TERMINATED,127.0.0.1:20624,5,31,22.8771,0.0103782,0.0,0.0403975
XGBoostTrainer_4ee9d_00002,TERMINATED,127.0.0.1:15468,1,31,18.6195,0.0705789,0.0154525,0.0848822
XGBoostTrainer_4ee9d_00003,TERMINATED,127.0.0.1:10428,1,31,16.2519,0.0705789,0.0154525,0.0848822
XGBoostTrainer_4ee9d_00004,TERMINATED,127.0.0.1:9996,100,31,17.2518,0.0108854,0.0,0.047455


[36m(XGBoostTrainer pid=20008)[0m Started distributed worker processes: 
[36m(XGBoostTrainer pid=20008)[0m - (node_id=e3ad6dd743f38fb6ad65f352ab024ac371bddd111b7b321bd0527f89, ip=127.0.0.1, pid=12188) world_rank=0, local_rank=0, node_rank=0
[36m(XGBoostTrainer pid=20008)[0m - (node_id=e3ad6dd743f38fb6ad65f352ab024ac371bddd111b7b321bd0527f89, ip=127.0.0.1, pid=9596) world_rank=1, local_rank=1, node_rank=0
[36m(XGBoostTrainer pid=20008)[0m - (node_id=e3ad6dd743f38fb6ad65f352ab024ac371bddd111b7b321bd0527f89, ip=127.0.0.1, pid=20052) world_rank=2, local_rank=2, node_rank=0
[36m(RayTrainWorker pid=12188)[0m [19:32:18] Task [xgboost.ray-rank=00000000]:85808862c297aec2d371660f01000000 got rank 0
[36m(SplitCoordinator pid=17700)[0m Starting execution of Dataset. Full logs are in C:\Users\ADMIN\AppData\Local\Temp\ray\session_2024-12-10_19-25-32_918088_1352\logs\ray-data
[36m(SplitCoordinator pid=17700)[0m Execution plan of Dataset: InputDataBuffer[Input] -> OutputSplitter[split(3,

(pid=17700) Running 0: 0.00 row [00:00, ? row/s]

(pid=17700) - split(3, equal=True) 1: 0.00 row [00:00, ? row/s]

[36m(RayTrainWorker pid=20052)[0m [19:32:18] Task [xgboost.ray-rank=00000002]:0eb478b5c91dbe5c7b3226e801000000 got rank 2[32m [repeated 2x across cluster][0m
[36m(SplitCoordinator pid=12720)[0m Starting execution of Dataset. Full logs are in C:\Users\ADMIN\AppData\Local\Temp\ray\session_2024-12-10_19-25-32_918088_1352\logs\ray-data
[36m(SplitCoordinator pid=12720)[0m Execution plan of Dataset: InputDataBuffer[Input] -> OutputSplitter[split(3, equal=True)]


(pid=12720) Running 0: 0.00 row [00:00, ? row/s]

(pid=12720) - split(3, equal=True) 1: 0.00 row [00:00, ? row/s]

[36m(XGBoostTrainer pid=20008)[0m [19:32:30] [0]	train-logloss:0.44963	train-error:0.02428	valid-logloss:0.45128	valid-error:0.07895
[36m(XGBoostTrainer pid=20008)[0m [19:32:30] [1]	train-logloss:0.32296	train-error:0.02208	valid-logloss:0.35640	valid-error:0.07018
[36m(XGBoostTrainer pid=20008)[0m [19:32:30] [2]	train-logloss:0.23951	train-error:0.01104	valid-logloss:0.28115	valid-error:0.07895
[36m(XGBoostTrainer pid=20008)[0m [19:32:31] [3]	train-logloss:0.18233	train-error:0.01104	valid-logloss:0.23415	valid-error:0.04386
[36m(XGBoostTrainer pid=20008)[0m [19:32:31] [4]	train-logloss:0.14165	train-error:0.00883	valid-logloss:0.19498	valid-error:0.04386
[36m(XGBoostTrainer pid=20008)[0m [19:32:31] [5]	train-logloss:0.11227	train-error:0.00442	valid-logloss:0.17539	valid-error:0.03509
[36m(XGBoostTrainer pid=20008)[0m [19:32:31] [6]	train-logloss:0.09077	train-error:0.00662	valid-logloss:0.15537	valid-error:0.03509
[36m(XGBoostTrainer pid=20008)[0m [19:32:31] [7]	trai

(pid=15140) Running 0: 0.00 row [00:00, ? row/s]

(pid=15140) - split(3, equal=True) 1: 0.00 row [00:00, ? row/s]

[36m(RayTrainWorker pid=18636)[0m [19:32:55] Task [xgboost.ray-rank=00000002]:034d0beee8a6a329c8515a4401000000 got rank 2[32m [repeated 2x across cluster][0m
[36m(SplitCoordinator pid=16184)[0m Starting execution of Dataset. Full logs are in C:\Users\ADMIN\AppData\Local\Temp\ray\session_2024-12-10_19-25-32_918088_1352\logs\ray-data
[36m(SplitCoordinator pid=16184)[0m Execution plan of Dataset: InputDataBuffer[Input] -> OutputSplitter[split(3, equal=True)]


(pid=16184) Running 0: 0.00 row [00:00, ? row/s]

(pid=16184) - split(3, equal=True) 1: 0.00 row [00:00, ? row/s]

[36m(XGBoostTrainer pid=20624)[0m [19:33:05] [0]	train-logloss:0.45103	train-error:0.02428	valid-logloss:0.44948	valid-error:0.07895
[36m(XGBoostTrainer pid=20624)[0m [19:33:05] [1]	train-logloss:0.32500	train-error:0.02208	valid-logloss:0.35593	valid-error:0.07018
[36m(XGBoostTrainer pid=20624)[0m [19:33:05] [2]	train-logloss:0.24130	train-error:0.01325	valid-logloss:0.28084	valid-error:0.07895
[36m(XGBoostTrainer pid=20624)[0m [19:33:05] [3]	train-logloss:0.18365	train-error:0.01104	valid-logloss:0.23421	valid-error:0.05263
[36m(XGBoostTrainer pid=20624)[0m [19:33:05] [4]	train-logloss:0.14339	train-error:0.01104	valid-logloss:0.20066	valid-error:0.05263
[36m(XGBoostTrainer pid=20624)[0m [19:33:05] [5]	train-logloss:0.11374	train-error:0.01104	valid-logloss:0.16625	valid-error:0.04386
[36m(XGBoostTrainer pid=20624)[0m [19:33:05] [6]	train-logloss:0.09190	train-error:0.00662	valid-logloss:0.14392	valid-error:0.00877
[36m(XGBoostTrainer pid=20624)[0m [19:33:05] [7]	trai

(pid=13972) Running 0: 0.00 row [00:00, ? row/s]

(pid=13972) - split(3, equal=True) 1: 0.00 row [00:00, ? row/s]

[36m(RayTrainWorker pid=8540)[0m [19:33:28] Task [xgboost.ray-rank=00000002]:1a596762a06c5e942909f19d01000000 got rank 2[32m [repeated 2x across cluster][0m


(pid=14164) Running 0: 0.00 row [00:00, ? row/s]

(pid=14164) - split(3, equal=True) 1: 0.00 row [00:00, ? row/s]

[36m(XGBoostTrainer pid=15468)[0m [19:33:35] [0]	train-logloss:0.49784	train-error:0.07506	valid-logloss:0.47143	valid-error:0.10526
[36m(XGBoostTrainer pid=15468)[0m [19:33:35] [1]	train-logloss:0.38954	train-error:0.07285	valid-logloss:0.37672	valid-error:0.05263
[36m(XGBoostTrainer pid=15468)[0m [19:33:35] [2]	train-logloss:0.32230	train-error:0.06181	valid-logloss:0.30711	valid-error:0.04386
[36m(XGBoostTrainer pid=15468)[0m [19:33:35] [3]	train-logloss:0.27139	train-error:0.07285	valid-logloss:0.25914	valid-error:0.05263
[36m(XGBoostTrainer pid=15468)[0m [19:33:35] [4]	train-logloss:0.23683	train-error:0.05298	valid-logloss:0.23169	valid-error:0.05263
[36m(XGBoostTrainer pid=15468)[0m [19:33:35] [5]	train-logloss:0.21244	train-error:0.05519	valid-logloss:0.20918	valid-error:0.05263
[36m(XGBoostTrainer pid=15468)[0m [19:33:35] [6]	train-logloss:0.19241	train-error:0.05077	valid-logloss:0.18667	valid-error:0.04386
[36m(XGBoostTrainer pid=15468)[0m [19:33:35] [7]	trai

(pid=2032) Running 0: 0.00 row [00:00, ? row/s]

(pid=2032) - split(3, equal=True) 1: 0.00 row [00:00, ? row/s]

[36m(RayTrainWorker pid=14996)[0m [19:33:52] Task [xgboost.ray-rank=00000001]:5095e6f43220cdc99d7e50e901000000 got rank 1[32m [repeated 2x across cluster][0m


(pid=14092) Running 0: 0.00 row [00:00, ? row/s]

(pid=14092) - split(3, equal=True) 1: 0.00 row [00:00, ? row/s]

[36m(XGBoostTrainer pid=10428)[0m [19:33:58] [0]	train-logloss:0.49784	train-error:0.07506	valid-logloss:0.47143	valid-error:0.10526
[36m(XGBoostTrainer pid=10428)[0m [19:33:58] [1]	train-logloss:0.38954	train-error:0.07285	valid-logloss:0.37672	valid-error:0.05263
[36m(XGBoostTrainer pid=10428)[0m [19:33:58] [2]	train-logloss:0.32230	train-error:0.06181	valid-logloss:0.30711	valid-error:0.04386
[36m(XGBoostTrainer pid=10428)[0m [19:33:58] [3]	train-logloss:0.27139	train-error:0.07285	valid-logloss:0.25914	valid-error:0.05263
[36m(XGBoostTrainer pid=10428)[0m [19:33:58] [4]	train-logloss:0.23683	train-error:0.05298	valid-logloss:0.23169	valid-error:0.05263
[36m(XGBoostTrainer pid=10428)[0m [19:33:58] [5]	train-logloss:0.21244	train-error:0.05519	valid-logloss:0.20918	valid-error:0.05263
[36m(XGBoostTrainer pid=10428)[0m [19:33:58] [6]	train-logloss:0.19241	train-error:0.05077	valid-logloss:0.18667	valid-error:0.04386
[36m(XGBoostTrainer pid=10428)[0m [19:33:58] [7]	trai

(pid=9324) Running 0: 0.00 row [00:00, ? row/s]

(pid=9324) - split(3, equal=True) 1: 0.00 row [00:00, ? row/s]

[36m(RayTrainWorker pid=12608)[0m [19:34:15] Task [xgboost.ray-rank=00000001]:af1c330dcfcc11bdd641addb01000000 got rank 1[32m [repeated 2x across cluster][0m


(pid=12928) Running 0: 0.00 row [00:00, ? row/s]

(pid=12928) - split(3, equal=True) 1: 0.00 row [00:00, ? row/s]

[36m(XGBoostTrainer pid=9996)[0m [19:34:22] [0]	train-logloss:0.44963	train-error:0.02428	valid-logloss:0.45128	valid-error:0.07895
[36m(XGBoostTrainer pid=9996)[0m [19:34:22] [1]	train-logloss:0.32296	train-error:0.02208	valid-logloss:0.35640	valid-error:0.07018
[36m(XGBoostTrainer pid=9996)[0m [19:34:22] [2]	train-logloss:0.23951	train-error:0.01104	valid-logloss:0.28115	valid-error:0.07895
[36m(XGBoostTrainer pid=9996)[0m [19:34:22] [3]	train-logloss:0.18233	train-error:0.01104	valid-logloss:0.23415	valid-error:0.04386
[36m(XGBoostTrainer pid=9996)[0m [19:34:22] [4]	train-logloss:0.14165	train-error:0.00883	valid-logloss:0.19498	valid-error:0.04386
[36m(XGBoostTrainer pid=9996)[0m [19:34:22] [5]	train-logloss:0.11227	train-error:0.00442	valid-logloss:0.17539	valid-error:0.03509
[36m(XGBoostTrainer pid=9996)[0m [19:34:22] [6]	train-logloss:0.09077	train-error:0.00662	valid-logloss:0.15537	valid-error:0.03509
[36m(XGBoostTrainer pid=9996)[0m [19:34:22] [7]	train-loglos

Best Result: Result(
  metrics={'train-logloss': 0.01037820598838345, 'train-error': 0.0, 'valid-logloss': 0.04039745780908944, 'valid-error': 0.0},
  path='C:/Users/ADMIN/ray_logs/XGBoostTrainer_2024-12-10_19-31-48/XGBoostTrainer_4ee9d_00001',
  filesystem='local',
  checkpoint=Checkpoint(filesystem=local, path=C:/Users/ADMIN/ray_logs/XGBoostTrainer_2024-12-10_19-31-48/XGBoostTrainer_4ee9d_00001/checkpoint_000000)
)


In [13]:
# Pull out the pieces of the winning trial that are worth inspecting.
best_logdir = best_result.path
best_metrics = best_result.metrics
best_config = best_result.config

# Show the path, metrics and config in that order, separated by two blank lines.
print(best_logdir, best_metrics, best_config, sep="\n\n\n")

C:/Users/ADMIN/ray_logs/XGBoostTrainer_2024-12-10_19-31-48/XGBoostTrainer_4ee9d_00001


OrderedDict([('train-logloss', 0.01037820598838345), ('train-error', 0.0), ('valid-logloss', 0.04039745780908944), ('valid-error', 0.0), ('timestamp', 1733877186), ('checkpoint_dir_name', 'checkpoint_000000'), ('should_checkpoint', True), ('done', True), ('training_iteration', 31), ('trial_id', '4ee9d_00001'), ('date', '2024-12-10_19-33-06'), ('time_this_iter_s', 0.033522605895996094), ('time_total_s', 22.87714171409607), ('pid', 20624), ('hostname', 'DESKTOP-2UG9FJV'), ('node_ip', '127.0.0.1'), ('config', {'params': {'max_depth': 5}}), ('time_since_restore', 22.87714171409607), ('iterations_since_restore', 31), ('experiment_tag', '1_max_depth=5')])


{'params': {'max_depth': 5}}


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import log_loss, accuracy_score
import time

# Load the dataset
data = pd.read_csv("C:\\Users\\ADMIN\\Downloads\\breast_cancer.csv")

# Split the frame into the feature matrix and the label column.
# NOTE(review): assumes "diagnosis" is already numeric 0/1, as XGBClassifier
# with objective='binary:logistic' expects - confirm against the CSV.
X_all_features = data.drop('diagnosis', axis=1)
y = data['diagnosis']

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train_all_features, X_test_all_features, y_train, y_test = train_test_split(
    X_all_features, y, test_size=0.2, random_state=42
)

# Define depths to test
depths = [40, 10, 75]

# One summary dict per depth, filled in by the loop below.
results = []

for max_depth in depths:
    # Create the model with different max_depth values
    model = XGBClassifier(
        max_depth=max_depth,
        n_estimators=100,
        learning_rate=0.3,
        objective='binary:logistic',
        eval_metric=['logloss', 'error']
    )

    # Time only the fit call. perf_counter is a monotonic clock, so the
    # measured interval is unaffected by system clock adjustments.
    start_time = time.perf_counter()

    # Train the model, reporting the eval metrics on the held-out split.
    model.fit(
        X_train_all_features,
        y_train,
        eval_set=[(X_test_all_features, y_test)],
        verbose=True,
    )

    end_time = time.perf_counter()
    training_time = end_time - start_time

    # Predictions on the held-out split
    y_pred_prob = model.predict_proba(X_test_all_features)[:, 1]  # Predicted probabilities
    y_pred = model.predict(X_test_all_features)  # Binary predictions

    # Score against the labels of the same held-out split.
    logloss = log_loss(y_test, y_pred_prob)
    accuracy = accuracy_score(y_test, y_pred)

    # Store results
    results.append({
        'max_depth': max_depth,
        'training_time': training_time,
        'log_loss': logloss,
        'accuracy': accuracy
    })

    print(f"Completed training with max_depth = {max_depth}")
    print(f"Training Time: {training_time:.2f} seconds")
    print(f"Log Loss: {logloss:.4f}")
    print(f"Accuracy: {accuracy:.4f}")

# Output all results. The loop variable is not called `result`, so it does not
# overwrite the `result` assigned by trainer.fit() in the Ray cell.
for run_summary in results:
    print(run_summary)