# Investigating probabilistic predictions

Also see `probabilistic-predictions-log-loss.jpg`

In [1]:
import pandas as pd

from sklearn.datasets import make_classification
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Single seed shared by the data generator and the classifiers below.
random_state = 1

# Scorer names handed to cross_validate.
score_metrics = [
    "roc_auc",
    "average_precision",
    "accuracy",
    "recall",
    "f1",
    "neg_log_loss",
]

# Synthetic classification data: 500 features, only 2 of them informative,
# generated with a small class separation (class_sep=0.1).
X, y = make_classification(
    n_features=500,
    n_informative=2,
    class_sep=0.1,
    random_state=random_state,
)

In [3]:
# Two boosting set-ups sharing the same seed:
#   gbm  - 10 estimators, learning rate 0.5
#   gbm2 - 1000 estimators, learning rate 0.01
gbm = GradientBoostingClassifier(
    random_state=random_state,
    learning_rate=0.5,
    n_estimators=10,
)
gbm2 = GradientBoostingClassifier(
    random_state=random_state,
    learning_rate=0.01,
    n_estimators=1000,
)

In [4]:
# 5-fold cross-validation of `gbm`, scored on every metric in score_metrics.
cv_results = cross_validate(
    gbm,
    X,
    y,
    cv=5,
    scoring=score_metrics,
)

In [5]:
# One {name: (mean, std)} entry per cv_results key containing 'test',
# both rounded to 2 decimals.
[
    {metric: (fold_scores.mean().round(2), fold_scores.std().round(2))}
    for metric, fold_scores in cv_results.items()
    if 'test' in metric
]

[{'test_roc_auc': (0.43, 0.07)},
 {'test_average_precision': (0.46, 0.02)},
 {'test_accuracy': (0.44, 0.04)},
 {'test_recall': (0.36, 0.15)},
 {'test_f1': (0.37, 0.12)},
 {'test_neg_log_loss': (-1.36, 0.21)}]

In [6]:
# Cross-validated predict_proba output for `gbm`; keep only column 1.
cv_preds_probs = cross_val_predict(
    gbm,
    X,
    y,
    method='predict_proba',
    cv=5,
)[:, 1]
cv_preds_probs.round(3)

array([0.582, 0.936, 0.043, 0.541, 0.686, 0.92 , 0.557, 0.491, 0.752,
       0.024, 0.257, 0.224, 0.014, 0.305, 0.673, 0.653, 0.581, 0.499,
       0.7  , 0.817, 0.097, 0.038, 0.079, 0.42 , 0.838, 0.869, 0.91 ,
       0.589, 0.775, 0.013, 0.528, 0.011, 0.096, 0.326, 0.207, 0.659,
       0.987, 0.004, 0.107, 0.098, 0.385, 0.054, 0.995, 0.9  , 0.145,
       0.108, 0.381, 0.071, 0.951, 0.023, 0.671, 0.234, 0.989, 0.682,
       0.868, 0.779, 0.972, 0.964, 0.02 , 0.903, 0.046, 0.023, 0.844,
       0.063, 0.702, 0.095, 0.234, 0.482, 0.494, 0.58 , 0.515, 0.06 ,
       0.032, 0.914, 0.21 , 0.912, 0.954, 0.02 , 0.892, 0.267, 0.107,
       0.946, 0.199, 0.148, 0.427, 0.942, 0.114, 0.106, 0.165, 0.172,
       0.208, 0.705, 0.054, 0.274, 0.121, 0.235, 0.085, 0.191, 0.666,
       0.11 ])

In [135]:
# Display the target labels `y` (second value returned by make_classification).
y

array([1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0])

In [39]:
# NOTE(review): this repeats the previous `y` display. The saved output under
# this cell has 10 labels while the previous cell's output has 100, so it was
# presumably produced from a different `y` in an earlier kernel state;
# re-run or drop this cell.
y

array([0, 0, 1, 0, 1, 0, 1, 1, 1, 0])