In [None]:
import subprocess


def parse_gpu_models(smi_output):
    """Extract GPU model names from the text printed by ``nvidia-smi -L``.

    Each device line is expected to look like
    ``GPU 0: Tesla T4 (UUID: GPU-...)``; the model name is the text between
    the first ``": "`` and the trailing ``" (UUID"`` section. Lines that do
    not start with ``GPU `` (blank lines, MIG sub-devices) are ignored.

    Args:
        smi_output: Decoded stdout of ``nvidia-smi -L``; may be empty.

    Returns:
        A list with one model-name string per listed GPU (``[]`` if none).
    """
    models = []
    for raw_line in smi_output.splitlines():
        line = raw_line.strip()
        if not line.startswith("GPU "):
            continue
        _, sep, description = line.partition(": ")
        if not sep:
            # No "index: model" separator; fall back to the whole line.
            description = line
        models.append(description.split(" (UUID", 1)[0].strip())
    return models


def read_nvidia_smi_listing():
    """Return the decoded output of ``nvidia-smi -L``, or ``""`` on failure.

    A missing executable (no NVIDIA driver installed) or a non-zero exit code
    is treated as "no GPUs" instead of aborting the notebook.
    """
    try:
        raw = subprocess.check_output(["nvidia-smi", "-L"])
    except (OSError, subprocess.CalledProcessError):
        return ""
    return raw.decode(errors="replace")


# list GPUs
smi_output = read_nvidia_smi_listing()
gpu_list = [ln.strip() for ln in smi_output.splitlines() if ln.strip().startswith("GPU ")]
gpu_models = parse_gpu_models(smi_output)
# Count parsed devices so that empty output yields 0 rather than 1.
num_gpus = len(gpu_models)
print(f"✅ Detected {num_gpus} GPU(s): {gpu_models}")

# Report the XGBoost build that is importable in this kernel right now.
# NOTE(review): this import runs before the `pip install --upgrade xgboost`
# cell that follows, so the kernel presumably keeps using (and this prints)
# the pre-installed version until it is restarted — confirm that version is
# recent enough for the `device='cuda'` parameter used later.
import xgboost as xgb
print("🔧 XGBoost version:", xgb.__version__)

In [None]:
!pip install --quiet --upgrade xgboost pandas scikit-learn matplotlib seaborn optuna skl2onnx onnxruntime

In [None]:
!pip install optuna skl2onnx onnxruntime --quiet

In [None]:
import pandas as pd

# Read the labeled syslog CSV from the Kaggle input mount.
path = "/kaggle/input/datasetos/linux_syslog_labeled.csv"
df = pd.read_csv(path)

# Quick sanity summary: frame dimensions and number of rows per label.
label_counts = df.label.value_counts()
print("▶️ Shape:", df.shape)
print("▶️ Label distribution:\n", label_counts)

# Show the first three rows as the cell output.
df.head(3)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Remove the columns below when they are present; the membership check makes
# this cell safe to re-run after they have already been dropped.
drop_cols = ['timestamp','pid','name']
present_drop_cols = [col for col in drop_cols if col in df.columns]
df = df.drop(columns=present_drop_cols)

# Print the per-column null counts, then replace every missing value with 0.
null_counts = df.isnull().sum()
print("▶️ Nulls per col:\n", null_counts)
df = df.fillna(0)

# Feature matrix: every column except 'label', cast to float32.
feature_frame = df.drop(columns=['label'])
X = feature_frame.astype('float32')
# Target: the raw 'label' column.
y = df['label']

# Map the raw labels to integer class ids; `le` keeps the mapping for later.
le = LabelEncoder()
y_enc = le.fit_transform(y)
print("▶️ Classes:", le.classes_)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the encoded labels and
# fixing random_state keeps the split class-proportional and repeatable.
split_options = dict(test_size=0.2, stratify=y_enc, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, **split_options)
print("▶️ Train/Test sizes:", X_train.shape, X_test.shape)

In [None]:
import xgboost as xgb

# Wrap each partition, together with its encoded labels, in a DMatrix for the
# native xgb.train() API.
dtrain, dtest = (
    xgb.DMatrix(features, label=targets)
    for features, targets in ((X_train, y_train), (X_test, y_test))
)

In [None]:
# Booster configuration for the multi-class model trained with xgb.train().
params = {
    'objective': 'multi:softprob',   # predict one probability per class
    'num_class': len(le.classes_),
    'tree_method': 'hist',           # used together with the `device` setting
    # Fall back to CPU when no GPU was detected instead of failing at train time.
    'device': 'cuda' if num_gpus > 0 else 'cpu',
    'eval_metric': 'mlogloss',
    'max_depth': 6,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}


# Multi-GPU note: `n_gpus` is not a recognised parameter in XGBoost releases
# that accept `device`, so setting it had no effect and the old
# "Enabled multi-GPU" message was misleading. A single xgb.train() call runs
# on one device; multi-GPU training goes through the distributed interfaces.
if num_gpus > 1:
    print(f"▶️ {num_gpus} GPUs detected; single-process training uses one device "
          "(multi-GPU requires the Dask/Spark interface)")

In [None]:
# Replace the column names of both partitions with positional names f0, f1, ...
# NOTE(review): presumably required by the ONNX conversion further down —
# confirm. dtrain/dtest were built before this rename and keep the old names.
for frame in (X_train, X_test):
    frame.columns = [f"f{i}" for i in range(frame.shape[1])]


In [None]:
# Datasets whose metric is logged while boosting, each with a display name.
# NOTE(review): the held-out test set is also the early-stopping set here, so
# test metrics reported afterwards may be optimistic — consider a separate
# validation split.
evals = [(dtrain, 'train'), (dtest, 'eval')]

# Up to 200 boosting rounds, progress printed every 10 rounds, early stopping
# with a patience of 10 rounds.
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=200,
    evals=evals,
    early_stopping_rounds=10,
    verbose_eval=10,
)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Class probabilities for the test rows; the predicted class is the column
# with the highest probability.
y_pred_prob = bst.predict(dtest)
y_pred_labels = y_pred_prob.argmax(axis=1)

# Text report, labeled with the original class names.
report = classification_report(y_test, y_pred_labels, target_names=le.classes_)
print(report)

# Confusion matrix drawn as an annotated heatmap (rows: actual, columns: predicted).
cm = confusion_matrix(y_test, y_pred_labels)
fig, ax = plt.subplots(figsize=(6,5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    cmap='Blues',
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# 1. K-fold accuracy
# `xgb_final` is only created in the ONNX export cell further down, so using
# it here raised NameError on a fresh top-to-bottom run. Build a local,
# unfitted estimator with the same settings instead; cross_val_score fits a
# clone of it on every fold.
cv_model = xgb.XGBClassifier(
    objective='multi:softprob',
    num_class=len(le.classes_),
    tree_method='hist',
    device='cuda',
    eval_metric='mlogloss',
    seed=42
)
scores = cross_val_score(cv_model, X, y_enc, cv=5, scoring='accuracy')
print("CV accuracy (5-fold):", scores, "mean =", scores.mean())

# 2. Feature importance
# Plot from the booster that has already been trained above (`bst`), which
# exists at this point of the notebook.
xgb.plot_importance(bst, max_num_features=10)
plt.show()


In [None]:
import joblib

# Persist the trained booster (JSON) and the fitted label encoder (pickle)
# into the current working directory.
model_file = 'xgb_os_scheduler_gpu.json'
encoder_file = 'label_encoder.pkl'
bst.save_model(model_file)
joblib.dump(le, encoder_file)
print("✅ Models saved: xgb_os_scheduler_gpu.json, label_encoder.pkl")

In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: mean 3-fold CV accuracy of an XGBClassifier.

    Samples max_depth, eta, subsample and colsample_bytree from the trial,
    builds a classifier with those values plus the fixed settings below, and
    scores it with cross_val_score on the notebook-level ``X`` / ``y_enc``.

    Args:
        trial: The optuna trial used to draw hyperparameter values.

    Returns:
        The mean accuracy across the three folds (to be maximized).
    """
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'eta': trial.suggest_float('eta', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        # 'gpu_hist' is deprecated; use the same hist + device='cuda'
        # combination as the other models in this notebook.
        'tree_method': 'hist',
        'device': 'cuda',
        'eval_metric': 'mlogloss',
        'objective': 'multi:softprob',
        'num_class': len(le.classes_),
        'seed': 42
    }
    model = XGBClassifier(**param)
    score = cross_val_score(model, X, y_enc, cv=3, scoring='accuracy')
    return score.mean()

# Seed the sampler so repeated runs propose the same sequence of trials
# (the default sampler is otherwise randomly initialised).
# NOTE(review): study.best_params is only printed; the final-model cell below
# does not consume it — confirm whether that is intended.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)
print("✅ Best Optuna params:", study.best_params)

In [None]:
!pip install onnxmltools --quiet


In [None]:
from xgboost import XGBClassifier
from onnxmltools.convert import convert_xgboost
from onnxmltools.convert.common.data_types import FloatTensorType

# Assumes X_train and y_train already exist (created by the split cell).
final_model_config = dict(
    objective='multi:softprob',
    num_class=len(le.classes_),
    tree_method='hist',
    device='cuda',
    eval_metric='mlogloss',
    seed=42,
)
xgb_final = XGBClassifier(**final_model_config)
xgb_final.fit(X_train, y_train)

# Convert to ONNX: a single float tensor input named 'input' with one column
# per training feature; None leaves the row count unspecified.
n_features = X_train.shape[1]
initial_type = [('input', FloatTensorType([None, n_features]))]
onnx_model = convert_xgboost(xgb_final, initial_types=initial_type)

# Write the serialized model to a file.
onnx_bytes = onnx_model.SerializeToString()
with open("xgb_os_sched.onnx", "wb") as onnx_file:
    onnx_file.write(onnx_bytes)

print("✅ Saved ONNX model with onnxmltools")


In [None]:
import joblib
import zipfile
from pathlib import Path

# Single place to change the output location and the bundle contents.
OUTPUT_DIR = Path("/kaggle/working")
BUNDLE_FILES = ["xgb_os_scheduler_gpu.json", "label_encoder.pkl", "xgb_os_sched.onnx"]

# Make sure the target directory exists before writing into it.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# 1. Save the native XGBoost model (JSON)
bst.save_model(str(OUTPUT_DIR / "xgb_os_scheduler_gpu.json"))

# 2. Save the LabelEncoder
joblib.dump(le, OUTPUT_DIR / "label_encoder.pkl")

# 3. Save the ONNX model (in case it has not been saved yet)
(OUTPUT_DIR / "xgb_os_sched.onnx").write_bytes(onnx_model.SerializeToString())

print("✅ Saved files in /kaggle/working:")
for fname in BUNDLE_FILES:
    print(f"   - {fname}")

# 4. (Optional) Compress everything into one ZIP file.
# ZipFile defaults to ZIP_STORED (no compression); request DEFLATE explicitly
# so the archive is actually compressed.
with zipfile.ZipFile(OUTPUT_DIR / "model_bundle.zip", "w",
                     compression=zipfile.ZIP_DEFLATED) as zf:
    for fname in BUNDLE_FILES:
        zf.write(OUTPUT_DIR / fname, arcname=fname)

print("✅ Packed model_bundle.zip")


In [None]:
from IPython.display import FileLink

# Create a download link for the ZIP bundle and show it as the cell output.
bundle_link = FileLink('/kaggle/working/model_bundle.zip')
bundle_link
