In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor


# Silence every Python warning for the rest of the kernel session: no category
# or module filter is given, so this also hides deprecation and library
# warnings, not just cosmetic ones.
# NOTE(review): consider narrowing this to specific categories/modules.
warnings.filterwarnings("ignore")

In [2]:
# Project root that contains the `data/` directory read and written below.
# The ROAD_ACCIDENT_RISK_DIR environment variable overrides the location so the
# notebook can run on another machine; when it is unset, the original path is
# used unchanged.
MAIN_FOLDER = os.environ.get(
    "ROAD_ACCIDENT_RISK_DIR",
    "/home/arman/it/AI_work/machine/road_accident_risk",
)

In [3]:
# Column the model is trained to predict.
target_col = "accident_risk"

# Read the prepared train and test CSVs (in that order) from the project's
# data folder.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        f"{MAIN_FOLDER}/data/prepared_train.csv",
        f"{MAIN_FOLDER}/data/prepared_test.csv",
    )
)

In [4]:
# Wrap the training features (everything except the target column) and the
# labels in XGBoost's native DMatrix container.
dtrain = xgb.DMatrix(
    train_data.drop(columns=target_col),
    label=train_data[target_col],
    enable_categorical=True,
)

# Booster hyperparameters. The same dict is unpacked into the final
# XGBRegressor further down, so keys here affect both CV and the final fit.
xgb_params = {
    'tree_method': 'hist',
    'device': 'cuda',
    'eval_metric': 'rmse',
    'random_state': 42,
    'max_bin': 512,
    'min_child_weight': 3,
    'max_delta_step': 1,
    'max_depth': 11,
    'learning_rate': 0.010453775390437146,
    'subsample': 0.8162196077561874,
    'colsample_bytree': 0.8057453252225478,
    'gamma': 0.011515371568909936,
    'reg_alpha': 0.1153674139991063,
    'reg_lambda': 0.4029264986439234,
    'colsample_bylevel': 0.8675078626084138,
    'colsample_bynode': 0.8804930677965951,
    # NOTE(review): scale_pos_weight is normally a classification knob;
    # confirm it is intended for this regression target.
    'scale_pos_weight': 0.3615894752587659,
}

# 5-fold cross-validation for up to 2000 rounds, logging every 100 rounds and
# stopping once the test RMSE has not improved for 50 rounds.
cv_results = xgb.cv(
    xgb_params,
    dtrain,
    num_boost_round=2000,
    nfold=5,
    metrics='rmse',
    early_stopping_rounds=50,
    verbose_eval=100,
)

# Show the last rows of the per-round CV table.
print(cv_results.tail())

# Find the row with the lowest mean test RMSE and report it.
test_rmse_mean = cv_results['test-rmse-mean']
best_round = test_rmse_mean.idxmin()
best_rmse = test_rmse_mean.loc[best_round]
print(f"Best round: {best_round}, Best CV RMSE: {best_rmse:.7f}")

[0]	train-rmse:0.16585+0.00009	test-rmse:0.16586+0.00035
[100]	train-rmse:0.08276+0.00049	test-rmse:0.08296+0.00048
[200]	train-rmse:0.06180+0.00027	test-rmse:0.06220+0.00017
[300]	train-rmse:0.05786+0.00009	test-rmse:0.05839+0.00013
[400]	train-rmse:0.05713+0.00004	test-rmse:0.05773+0.00013
[500]	train-rmse:0.05697+0.00003	test-rmse:0.05760+0.00014
[600]	train-rmse:0.05693+0.00002	test-rmse:0.05758+0.00014
[700]	train-rmse:0.05691+0.00002	test-rmse:0.05757+0.00014
[800]	train-rmse:0.05691+0.00002	test-rmse:0.05757+0.00014
[900]	train-rmse:0.05690+0.00002	test-rmse:0.05757+0.00014
[1000]	train-rmse:0.05690+0.00002	test-rmse:0.05757+0.00014
[1100]	train-rmse:0.05690+0.00002	test-rmse:0.05757+0.00014
[1200]	train-rmse:0.05690+0.00002	test-rmse:0.05757+0.00014
[1300]	train-rmse:0.05690+0.00002	test-rmse:0.05757+0.00014
[1400]	train-rmse:0.05690+0.00002	test-rmse:0.05757+0.00014
[1500]	train-rmse:0.05690+0.00002	test-rmse:0.05757+0.00014
[1600]	train-rmse:0.05690+0.00002	test-rmse:0.05757+

In [5]:
# Split the training frame into features and target.
X_train = train_data.drop(columns=target_col)
y_train = train_data[target_col]

# Train for the number of boosting rounds selected by cross-validation.
# `best_round` is the 0-based row label of the best CV iteration, hence the +1.
# Without this, XGBRegressor falls back to its default number of trees
# (100 in current XGBoost releases), which with this small learning rate stops
# far too early: the CV log shows test RMSE of about 0.083 at round 100 versus
# about 0.0576 once converged.
n_estimators = int(best_round) + 1

# Fit the final model on the full training set with the CV-tuned parameters.
model = XGBRegressor(
    **xgb_params,
    n_estimators=n_estimators,
    enable_categorical=True,
)
model.fit(X_train, y_train)

# Predict on the test set; "id" is an identifier column, not a feature.
X_test = test_data.drop(columns="id")
pred = model.predict(X_test)

# Assemble the submission: one row per test id with its predicted risk.
sub = pd.DataFrame({
    "id": test_data["id"],
    target_col: pred,
})

# Write the submission file next to the input data.
sub.to_csv(f"{MAIN_FOLDER}/data/submission.csv", index=False)