# 3.2.5 Test de permutación de un score con permutation_test_score

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold, permutation_test_score
from sklearn.svm import SVC

import warnings

# Display settings: never truncate the rows or columns of a printed DataFrame.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# Silence every warning for the remainder of the session.
warnings.filterwarnings("ignore")

- Evalúa la significancia de un score computado con validación cruzada usando permutaciones.
- Se permuta la variable de salida para generar datos aleatorios y computar un valor p empírico contra la hipótesis nula de que las variables de entrada y la variable de salida son independientes.

In [None]:
# Load the iris dataset: X holds the feature matrix, y the class labels.
iris = load_iris()
X, y = iris.data, iris.target

## 3.2.5.1 Experimento 1

In [None]:
# Linear SVM evaluated with a shuffled 2-fold stratified cross-validation.
# Both seeds are fixed so the experiment is repeatable.
clf = SVC(kernel="linear", random_state=7)
cv = StratifiedKFold(2, shuffle=True, random_state=0)

# permutation_test_score returns, in order: the cross-validated score on the
# true labels, the scores obtained for each permutation of y, and the
# empirical p-value.
score, permutation_scores, pvalue = permutation_test_score(
    # -------------------------------------------------------------------------
    # The object to use to fit the data.
    estimator=clf,
    # -------------------------------------------------------------------------
    # The data to fit.
    X=X,
    # -------------------------------------------------------------------------
    # The target variable to try to predict in the case of supervised learning.
    y=y,
    # -------------------------------------------------------------------------
    # Labels to constrain permutation within groups, i.e. y values are permuted
    # among samples with the same group identifier. When not specified,
    # y values are permuted among all samples.
    groups=None,
    # -------------------------------------------------------------------------
    # Metric
    scoring="accuracy",
    # -------------------------------------------------------------------------
    # Determines the cross-validation splitting strategy.
    cv=cv,
    # -------------------------------------------------------------------------
    # Number of times to permute y.
    n_permutations=1000,
    # -------------------------------------------------------------------------
    # Pass an int for reproducible output for permutation of y values among
    # samples.
    random_state=0,
    # -------------------------------------------------------------------------
    # The verbosity level.
    verbose=0,
    # -------------------------------------------------------------------------
    # NOTE: `fit_params` is deliberately not passed. It is deprecated since
    # scikit-learn 1.6 (replaced by `params`) and scheduled for removal, so
    # passing it explicitly would break on newer releases. Omitting it is
    # equivalent to the previous `fit_params=None`, which is the default.
)

# Cross-validated accuracy obtained with the original (non-permuted) labels.
score

In [None]:
# Peek at the first ten scores obtained on permuted labels.
permutation_scores[:10]

In [None]:
# Empirical p-value returned by the permutation test.
pvalue

In [None]:
fig, ax = plt.subplots()

# Distribution of the scores obtained on permuted labels. density=True
# normalizes the histogram so that its area integrates to 1.
ax.hist(permutation_scores, bins=20, density=True)
# Dashed red line: score obtained with the original (non-permuted) labels.
ax.axvline(score, ls="--", color="r")
score_label = f"Score on original\ndata: {score:.2f}\n(p-value: {pvalue:.3f})"
# Annotation positioned in data coordinates (x = accuracy, y = density).
ax.text(0.7, 10, score_label, fontsize=12)
ax.set_xlabel("Accuracy score")
# With density=True the bar heights are densities, not probabilities.
ax.set_ylabel("Probability density")
plt.show()

## 3.2.5.2 Experimento 2

In [None]:
# Build a feature matrix of pure Gaussian noise with one row per sample of X,
# so that the inputs are generated independently of y.
n_uncorrelated_features = 20
rng = np.random.RandomState(0)  # fixed seed: the noise is reproducible
X_rand = rng.normal(size=(len(X), n_uncorrelated_features))

In [None]:
# Same permutation test as before, but fitted on the noise features X_rand.
# Note that this overwrites score, permutation_scores and pvalue.
score, permutation_scores, pvalue = permutation_test_score(
    estimator=clf,
    X=X_rand,
    y=y,
    scoring="accuracy",
    cv=cv,
    n_permutations=1000,
    # Explicitly spelled-out library default, so the seed is visible here.
    random_state=0,
)

# Cross-validated accuracy on the noise features with the original labels.
score

In [None]:
# Peek at the first ten scores obtained on permuted labels (noise features).
permutation_scores[:10]

In [None]:
# Empirical p-value returned by the permutation test on the noise features.
pvalue

In [None]:
fig, ax = plt.subplots()

# Distribution of the scores obtained on permuted labels. density=True
# normalizes the histogram so that its area integrates to 1.
ax.hist(permutation_scores, bins=20, density=True)
# Dashed red line: score obtained with the original (non-permuted) labels.
ax.axvline(score, ls="--", color="r")
score_label = f"Score on original\ndata: {score:.2f}\n(p-value: {pvalue:.3f})"
# Anchor the annotation to the top-right corner in axes coordinates. The
# previous hard-coded data position (0.7, 10) was copied from the first
# experiment; with noise features the scores cluster near chance level, so
# x = 0.7 lies outside the plotted range and the label ends up off the axes.
ax.text(
    0.98,
    0.95,
    score_label,
    fontsize=12,
    transform=ax.transAxes,
    ha="right",
    va="top",
)
ax.set_xlabel("Accuracy score")
# With density=True the bar heights are densities, not probabilities.
ax.set_ylabel("Probability density")
plt.show()

In [None]:
# Sentinel output: reaching this cell means the notebook ran to the end.
print("ok_")