# Grid Search with dask ml

```cmd
conda install dask-ml -y
```

In [7]:
conda install -c conda-forge graphviz ipycytoscape

Retrieving notices: ...working... done
Channels:
 - conda-forge
 - rapidsai
 - pytorch
 - nvidia
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done


    current version: 24.3.0
    latest version: 24.4.0

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - graphviz
    - ipycytoscape


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    atk-1.0-2.38.0             |       hd4edc92_1         539 KB  conda-forge
    fribidi-1.0.10             |       h36c2ea0_0         112 KB  conda-forge
    gdk-pixbuf-2.42.10         |       h829c605_5         560 KB  conda-forge
    graphviz-9.0.0             |       h78e8752_1         2.2 MB  conda-forge
    gtk2-2.24.33               |       h280cfa0_4         6.2 MB  conda-forge
    gts-0.7.6   

In [23]:
conda install dask-ml -y

Channels:
 - rapidsai
 - pytorch
 - conda-forge
 - nvidia
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done


    current version: 24.3.0
    latest version: 24.4.0

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - dask-ml


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    dask-ml-2024.3.20          |     pyhd8ed1ab_0         110 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         110 KB

The following packages will be UPDATED:

  dask-ml                            2023.3.24-pyhd8ed1ab_1 --> 2024.3.20-pyhd8ed1ab_0 



Downloading and Extracting Packages:
                                                                                
Prepar

In [3]:
import time

import cupy as cp

import cudf
import dask
import dask_cudf
import dask.dataframe as dd
dask.config.set({"dataframe.backend": "cudf"})

from dask_ml.model_selection import  GridSearchCV as GSCV
from cuml.model_selection import train_test_split, GridSearchCV
from cuml.metrics import accuracy_score

import xgboost as xgb
from xgboost import XGBClassifier

In [4]:
class Timer:
    def __init__(self, name="cpu"):
        self.name = name
        
    def __enter__(self):
        self.start = time.perf_counter()
        
    def __exit__(self, type, value, trackback):
        self.end = time.perf_counter()
        self.execute_time = self.end - self.start
        print(f"{self.name} execute time : {self.execute_time:.4f} seconds")

In [5]:
with Timer(name="dask read parquet") as dask_time:
    train_dask = dd.read_parquet("./data/train.parquet")
    test_dask  = dd.read_parquet("./data/test.parquet")

dask read parquet execute time : 0.1484 seconds


In [6]:
def preprocess_data(data):
    # Convert categorical variables into numerical
    data = dd.reshape.get_dummies(data.categorize(), columns=["Sex", "Embarked"])
    # Fill missing values in Age and Fare with median
    data["Age"] = data["Age"].fillna(data["Age"].median())
    data["Fare"] = data["Fare"].fillna(data["Fare"].median())
    # Drop unnecessary columns
    data = data.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)
    return data


In [7]:
train_dask = train_dask.persist()
test_dask  = test_dask.persist()

  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()


In [8]:
with Timer(name="dask preprocessed") as dask_time:
    train_dask = preprocess_data(train_dask)
    test_dask  = preprocess_data(test_dask)

dask preprocessed execute time : 0.7610 seconds


In [9]:
train_dask.visualize()

CytoscapeWidget(cytoscape_layout={'name': 'dagre', 'rankDir': 'BT', 'nodeSep': 10, 'edgeSep': 10, 'spacingFact…

In [19]:
train_dask = train_dask.persist()
test_dask  = test_dask.persist()

In [11]:
with Timer(name="Train Valid Split") as dask_time:
    X_train = train_dask.drop("Survived", axis=1).compute()
    y_train = train_dask["Survived"].compute()
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

Train Valid Split execute time : 0.0649 seconds


In [12]:
model = XGBClassifier(tree_method = "hist", device = "cuda")

param_grid = {
    "max_depth": [10,30,50],
    "min_child_weight" : [1,3,6,10],
    "n_estimators": [200,300,500,1000],
    "learning_rate": [0.1, 0.01, 0.001],
}

In [14]:
with Timer(name="grid search with xgb") as dask_time:
    # Perform grid search to find the best hyperparameters
    grid_search = GSCV(model, param_grid, cv=5, scoring='accuracy')
    X_train = X_train.astype(int)
    try:
        grid_search.fit(X_train, y_train.to_numpy())
    except KeyboardInterrupt:
        print("Grid search interrupted by user.")

    # Get the best model
    best_model = grid_search.best_estimator_

  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()


grid search with xgb execute time : 3324.0267 seconds


grid search with xgb execute time : 3324.0267 seconds

In [30]:
test_dask

Dask Series Structure:
npartitions=1
    bool
     ...
dtype: bool
Dask Name: apply, 2 graph layers

In [15]:
with Timer(name="xgb gpu inference") as dask_time:
    X_val = X_val.astype(int)
    y_pred_train = best_model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_pred_train)
    print("Training Accuracy:", train_accuracy)
    
    y_pred_val = best_model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_pred_val)
    print("Validation Accuracy:", val_accuracy)

  feature_names = data.columns.format()
  feature_names = data.columns.format()


Training Accuracy: 0.8639551401138306
Validation Accuracy: 0.7921348214149475
xgb gpu inference execute time : 0.2536 seconds


In [25]:
def fix_bool(x):
    if x == -1:
        return False
    else:
        return bool(x)

test_dask = test_dask.apply(fix_bool, meta=bool, axis=1)



In [27]:
test_predictions = best_model.predict(test_dask.compute())

# Prepare submission file
submission_df = cudf.DataFrame({
    "PassengerId": range(892, 892 + len(test_predictions)),
    "Survived": test_predictions
})

# Save submission file
submission_df.to_csv("submission_xgb.csv", index=False)

ValueError: user defined function compilation failed.

# CPU Example

In [27]:
model = XGBClassifier()

# Define hyperparameters to tune
param_grid = {
    "max_depth": [10,30,50],
    "min_child_weight" : [1,3,6,10],
    "n_estimators": [200,300,500,1000],
    "learning_rate": [0.1, 0.01, 0.001],
}

In [28]:
with Timer(name="grid search with xgb") as dask_time:
    # Perform grid search to find the best hyperparameters
    grid_search = GridSearchCV(XGBClassifier(), param_grid, cv=5, scoring='accuracy', verbose=1)
    X_train = X_train.astype(int)
    grid_search.fit(X_train, y_train.to_numpy())

    # Get the best model
    best_model = grid_search.best_estimator_

Fitting 5 folds for each of 144 candidates, totalling 720 fits


  feature_names = data.columns.format()
  feature_names = data.columns.format()
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  feature_names = data.columns.format()
  fea

grid search with xgb execute time : 37076.7734 seconds


In [1]:
Fitting 5 folds for each of 144 candidates, totalling 720 fits
...
grid search with xgb execute time : 37076.7734 seconds

SyntaxError: invalid syntax (3543418266.py, line 1)

In [30]:
with Timer(name="xgb cpu inference") as dask_time:
    X_val = X_val.astype(int)
    y_pred_train = best_model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_pred_train)
    print("Training Accuracy:", train_accuracy)
    
    y_pred_val = best_model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_pred_val)
    print("Validation Accuracy:", val_accuracy)

  feature_names = data.columns.format()
  feature_names = data.columns.format()


Training Accuracy: 0.8681626915931702
Validation Accuracy: 0.7921348214149475
xgb cpu inference execute time : 0.7041 seconds
