# XGBoost explorations for horizon scanning

Creator: Simon Reynaert  
Date: 02/01/2026

In [None]:
# Example from the documentation website: train a tiny XGBoost classifier on
# the Iris data set and predict the classes of a held-out test split.

from xgboost import XGBClassifier

# read data
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
data = load_iris()
# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and therefore every number derived from it, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['data'], data['target'], test_size=.2, random_state=42
)

# create model instance
# Iris is a three-class problem, so a multiclass objective is stated
# explicitly instead of 'binary:logistic'; 'multi:softprob' yields one
# probability per class from predict_proba.
bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='multi:softprob')

# fit model
bst.fit(X_train, y_train)

# make predictions (integer class codes, one per test row)
preds = bst.predict(X_test)

In [None]:
import pandas as pd
import numpy as np

# Build a per-sample overview of the test set: measurements, true label,
# predicted label, and the predicted probability of every class.


def _codes_to_names(codes):
    """Translate integer class codes into their species names."""
    return [data['target_names'][code] for code in codes]


# Per-class probabilities for every row of the test set.
probs = bst.predict_proba(X_test)

# Start from the raw flower measurements, one column per feature.
df_results = pd.DataFrame(X_test, columns=data['feature_names'])

# Human-readable true and predicted labels.
df_results['Actual'] = _codes_to_names(y_test)
df_results['Predicted'] = _codes_to_names(preds)

# One "<species>_prob" column per class; transposing probs lets each
# class name be paired directly with its column of probabilities.
for class_name, class_column in zip(data['target_names'], probs.T):
    df_results[f'{class_name}_prob'] = class_column

# Show the first few rows as the cell output.
df_results.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Actual,Predicted,setosa_prob,versicolor_prob,virginica_prob
0,5.6,3.0,4.1,1.3,versicolor,versicolor,0.044543,0.910009,0.045447
1,4.7,3.2,1.6,0.2,setosa,setosa,0.925457,0.037738,0.036805
2,5.5,2.4,3.8,1.1,versicolor,versicolor,0.044543,0.910009,0.045447
3,6.1,2.8,4.7,1.2,versicolor,versicolor,0.044543,0.910009,0.045447
4,6.3,2.7,4.9,1.8,virginica,virginica,0.053064,0.164316,0.782621
