<a href="https://colab.research.google.com/github/nolantphillips/xG_model/blob/main/notebooks/04_xgboost_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
# Refresh the apt package index before installing anything.
!sudo apt-get update
# Install the OpenCL ICD loader, the OpenCL headers and the `clinfo` diagnostic tool.
!sudo apt-get install -y ocl-icd-libopencl1 opencl-headers clinfo
# Install the NVIDIA compute libraries from the 550 driver series.
# NOTE(review): the series is hard-pinned; presumably it has to match the driver
# installed on the runtime — confirm before re-running on a different image.
!sudo apt-get install -y libnvidia-compute-550

0% [Working]            Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building depe

In [23]:
# List the OpenCL platforms/devices visible to the runtime (sanity check after the installs above).
!clinfo

Number of platforms                               1
  Platform Name                                   NVIDIA CUDA
  Platform Vendor                                 NVIDIA Corporation
  Platform Version                                OpenCL 3.0 CUDA 12.4.89
  Platform Profile                                FULL_PROFILE
  Platform Extensions                             cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 cl_khr_3d_image_writes cl_khr_byte_addressable_store cl_khr_icd cl_nv_compiler_options cl_nv_device_attribute_query cl_nv_pragma_unroll cl_nv_copy_opts cl_khr_gl_event cl_nv_create_buffer cl_khr_int64_base_atomics cl_khr_int64_extended_atomics cl_nv_kernel_attribute cl_khr_device_uuid cl_khr_pci_bus_info cl_khr_external_semaphore cl_khr_external_memory cl_khr_external_semaphore_opaque_fd cl_khr_external_memory_opaque_fd
  Platform Extensions with Version                cl_khr

In [24]:
import pandas as pd
import pyarrow.parquet as pq
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import numpy as np

In [25]:
# Load the processed shot tables (2023, 2024 and 2025 files), stack them and
# build the train/test split used by the rest of the notebook.

# Single place to change when the data moves; the three files live side by side.
PROCESSED_DIR = '/content/drive/MyDrive/projects/nhl/data/processed'

processed2023 = pd.read_parquet(f'{PROCESSED_DIR}/processed_shots_2023.parquet', engine='pyarrow')
processed2024 = pd.read_parquet(f'{PROCESSED_DIR}/processed_shots_2024.parquet', engine='pyarrow')
processed2025 = pd.read_parquet(f'{PROCESSED_DIR}/processed_shots_2025.parquet', engine='pyarrow')

full_df = pd.concat([processed2023, processed2024, processed2025])

# Drop every row that has a missing value in any column.
cleaned_df = full_df.dropna()

# 'target' is the label; 'shot_class' is removed from the feature matrix too.
X = cleaned_df.drop(columns=['target', 'shot_class'])
y = cleaned_df['target']

# Stratified 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [26]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer

# Feature preprocessing:
#   * object-dtype columns -> one-hot encoding (categories unseen at fit time are ignored)
#   * every other column   -> standardised, then rescaled to the [-1, 1] range
one_hot = OneHotEncoder(handle_unknown="ignore")
cat_pipeline = make_pipeline(one_hot)

standardise = StandardScaler()
rescale = MinMaxScaler(feature_range=(-1, 1))
num_pipeline = make_pipeline(standardise, rescale)

# Columns are routed by dtype; whatever the selector does not pick up falls
# through to the numeric pipeline via `remainder`.
object_columns = make_column_selector(dtype_include=object)
preprocessing = ColumnTransformer(
    transformers=[("cat", cat_pipeline, object_columns)],
    remainder=num_pipeline,
)

In [27]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search for the XGBoost shot classifier.
#
# The preprocessing + classifier pipeline is placed INSIDE GridSearchCV so the
# encoder and scalers are re-fitted on the training part of every CV fold.
# Wrapping the search as the last pipeline step instead would fit them on the
# whole training set first and leak validation-fold statistics into the scores.

# Settings shared by both boosters. Keys carry the `xgbclassifier__` prefix
# because make_pipeline names each step after its lower-cased class name.
shared_grid = {
    'xgbclassifier__n_estimators': [100, 250, 375, 500],
    'xgbclassifier__learning_rate': [0.01, 0.05, 0.1],
    'xgbclassifier__reg_alpha': [0, 0.5, 1],
    'xgbclassifier__reg_lambda': [0.5, 1, 5],
}

# `gamma` is a tree-booster parameter, so it is searched only for gbtree;
# crossing it with gblinear would refit identical linear models three times.
parameters = [
    {**shared_grid, 'xgbclassifier__booster': ['gbtree'], 'xgbclassifier__gamma': [0, 0.5, 1]},
    {**shared_grid, 'xgbclassifier__booster': ['gblinear']},
]

# NOTE(review): n_jobs=-1 starts one worker per CPU core and every worker trains
# on the same CUDA device; lower n_jobs if the GPU runs out of memory.
xgb_search = GridSearchCV(
    make_pipeline(preprocessing, XGBClassifier(device='cuda')),
    param_grid=parameters,
    scoring='roc_auc',
    n_jobs=-1,
    cv=3,
)
xgb_search.fit(X_train, y_train)

# `xgb` is the best pipeline refitted on the full training set; it exposes
# predict / predict_proba and is what gets evaluated and saved further down.
xgb = xgb_search.best_estimator_
xgb

In [28]:
# Score the held-out test set with the fitted model.
# Class probabilities; column 1 is what the AUC computation below consumes.
probabilities_xgb = xgb.predict_proba(X_test)
# Hard class labels, used by the classification report and the confusion matrix.
pred_test_xgb = xgb.predict(X_test)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [29]:
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

In [31]:
# Compare ROC AUC on the training and test sets (the gap between them indicates
# over-fitting), then print the per-class report for the hard test predictions.
train_scores = xgb.predict_proba(X_train)[:, 1]
test_scores = probabilities_xgb[:, 1]

auc_train = roc_auc_score(y_train, train_scores)
auc_test = roc_auc_score(y_test, test_scores)

print(f"AUC train: {auc_train}")
print(f"AUC test: {auc_test}")

report = classification_report(y_test, pred_test_xgb)
print(report)

AUC train: 0.8444115791143407
AUC test: 0.7979009563883557
              precision    recall  f1-score   support

           0       0.93      1.00      0.97    127516
           1       0.78      0.01      0.03      9154

    accuracy                           0.93    136670
   macro avg       0.86      0.51      0.50    136670
weighted avg       0.92      0.93      0.90    136670



In [32]:
# Confusion matrix of the hard test predictions (rows: true class, columns: predicted class).
confusion_matrix(y_test, pred_test_xgb)

array([[127483,     33],
       [  9036,    118]])

In [35]:
import joblib

# Persist the fitted model to Drive; joblib.dump returns the list of files it
# wrote, which becomes this cell's output.
model_path = '/content/drive/MyDrive/projects/nhl/models/xgb_v1.pkl'
joblib.dump(xgb, model_path)

['/content/drive/MyDrive/projects/nhl/models/xgb_v1.pkl']