# XGBoost explorations for horizon scanning

Creator: Simon Reynaert  
Date: 02/01/2026

In [None]:
from xgboost import XGBClassifier
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Fixed seed so the train/test split, the fitted model and the tables below
# are identical on every re-run of this cell.
RANDOM_STATE = 42

# 1. Load data
data = load_iris()
feature_names = data['feature_names']
target_names = data['target_names']  # array of species names, indexed by class id

# 2. Create the human-readable table (the "master" table)
# pd.DataFrame turns the raw numeric array into a labelled table; indexing
# target_names with the integer targets maps every class id to its name.
df_all = pd.DataFrame(data['data'], columns=feature_names)
df_all['species'] = target_names[data['target']]

# Show the input table
print("--- Your Input Data ---")
display(df_all.head())

# 3. Split the data for the model
# The original numeric arrays are used so the model gets the numbers it needs.
# stratify keeps the three species equally represented in the small test set.
X_train, X_test, y_train, y_test = train_test_split(
    data['data'],
    data['target'],
    test_size=.2,
    random_state=RANDOM_STATE,
    stratify=data['target'],
)

# 4. Train the model (deliberately tiny: 2 trees of depth 2)
bst = XGBClassifier(
    n_estimators=2,
    max_depth=2,
    learning_rate=1,
    objective='multi:softprob',
    random_state=RANDOM_STATE,
)
bst.fit(X_train, y_train)

# 5. Make predictions (predicted class ids)
preds = bst.predict(X_test)

# 6. Get probabilities for the whole test set (one column per class)
probs = bst.predict_proba(X_test)

# 7. Create a DataFrame with the flower measurements
df_results = pd.DataFrame(X_test, columns=feature_names)

# 8. Add the actual and predicted labels as species names
df_results['Actual'] = target_names[y_test]
df_results['Predicted'] = target_names[preds]

# 9. Add the probabilities for each class
# This creates one '<species>_prob' column per species name
for i, class_name in enumerate(target_names):
    df_results[f'{class_name}_prob'] = probs[:, i]

# Display the first few rows
print("\n--- Model Performance ---")
display(df_results.head())

# Show only the flowers the model got WRONG
print("\n--- Mistakes Made by the Model ---")
mistakes = df_results[df_results['Actual'] != df_results['Predicted']]
mistakes


--- Your Input Data ---


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa



--- Model Performance ---


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Actual,Predicted,setosa_prob,versicolor_prob,virginica_prob
0,6.5,3.0,5.5,1.8,virginica,virginica,0.037052,0.053551,0.909397
1,6.8,2.8,4.8,1.4,versicolor,virginica,0.108166,0.389836,0.501997
2,5.7,3.8,1.7,0.3,setosa,setosa,0.926459,0.038408,0.035133
3,5.8,2.7,5.1,1.9,virginica,virginica,0.037052,0.053551,0.909397
4,4.6,3.2,1.4,0.2,setosa,setosa,0.926459,0.038408,0.035133



--- Mistakes Made by the Model ---


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Actual,Predicted,setosa_prob,versicolor_prob,virginica_prob
1,6.8,2.8,4.8,1.4,versicolor,virginica,0.108166,0.389836,0.501997
10,6.3,2.5,4.9,1.5,versicolor,virginica,0.11156,0.161236,0.727204
18,4.9,2.5,4.5,1.7,virginica,versicolor,0.035558,0.929653,0.03479
24,5.1,2.5,3.0,1.1,versicolor,setosa,0.926459,0.038408,0.035133
27,6.9,3.1,4.9,1.5,versicolor,virginica,0.11156,0.161236,0.727204
