# P-values in ML

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sc
import sklearn as sk
import sys
import os

from scipy.stats import chi2
from sklearn.ensemble import RandomForestClassifier
from statsmodels.discrete.discrete_model import Logit

### Create dataset

In [None]:
def logit(h):
    """Draw random 0/1 outcomes from the logistic (sigmoid) of activation h.

    Despite its name, this does not compute the log-odds: it maps each
    activation to a probability with the sigmoid and then samples one
    Bernoulli outcome per element.

    Args:
        h: Array of activations (must support unary minus and `len`).

    Returns:
        Float array of the same length as `h`, holding 0.0 or 1.0.
    """
    prob = 1.0 / (1.0 + np.exp(-h))
    # One uniform draw per element; the outcome is 1 when the draw falls below
    # the element's probability.
    draws = np.random.rand(len(prob))
    return (draws < prob).astype('float')


def rand_date():
    """Return a random timestamp string between the Unix epoch and now.

    Returns:
        A local-time timestamp formatted as 'YYYY-MM-DD HH:MM:SS'.
    """
    # Imported locally: the module-level imports do not provide `random`
    # or `time`, so the function would otherwise fail with NameError.
    import random
    import time

    max_time = int(time.time())
    # randint is inclusive on both ends: any second from the epoch up to now.
    t = random.randint(0, max_time)
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(t))


def rand_unif(num, mean, std, na_prob=0):
    """Draw `num` uniform random values with the given mean and standard deviation.

    Args:
        num: Number of samples to draw.
        mean: Centre of the uniform distribution.
        std: Standard deviation of the uniform distribution.
        na_prob: Probability that each sample is replaced by NaN
            (no values are replaced when this is 0 or negative).

    Returns:
        Float array of length `num`.
    """
    # A uniform distribution on [mean - w, mean + w] has standard deviation
    # w / sqrt(3), so the half-width matching `std` is sqrt(3) * std.
    half_width = np.sqrt(3.0) * std
    xi = (mean - half_width) + 2.0 * half_width * np.random.rand(num)
    if na_prob > 0:
        # Independent uniform draw per sample decides which entries go missing.
        xi_na = (np.random.rand(num) <= na_prob)
        xi[xi_na] = np.nan
    return xi


def rand_norm(num, mean=0.0, std=1.0, na_prob=0):
    """Draw `num` normally distributed values, optionally with missing entries.

    Args:
        num: Number of samples to draw.
        mean: Mean of the normal distribution.
        std: Standard deviation of the normal distribution.
        na_prob: Probability that each sample is replaced by NaN
            (no values are replaced when this is 0 or negative).

    Returns:
        Float array of length `num`.
    """
    samples = np.random.normal(loc=mean, scale=std, size=num)
    if na_prob > 0:
        # A separate uniform draw per sample selects the entries to blank out.
        samples[np.random.rand(num) <= na_prob] = np.nan
    return samples


def create_dataset_01(num=1000, n_rands=5, save=False):
    """Build a synthetic binary-classification dataset.

    The output `y` is sampled from a logistic model of three informative
    inputs plus a small noise term:
    2.0 * x1 - 1.0 * x2 + 0.5 * x3 + 0.1 * noise.
    The `rand_*` columns are independent normal draws that do not enter
    the formula for `y`.

    Args:
        num: Number of rows.
        n_rands: Number of uninformative `rand_<i>` columns to add.
        save: When True, also write the dataset to 'zzz.csv'.

    Returns:
        DataFrame with columns x1, x2, x3, y, rand_0 .. rand_<n_rands - 1>.
    """
    # Drawn in a fixed order (x1, x2, x3, noise) so that seeded runs reproduce.
    x1, x2, x3, noise = (rand_norm(num) for _ in range(4))
    activation = 2.0 * x1 - 1.0 * x2 + 0.5 * x3 + 0.1 * noise
    columns = {'x1': x1, 'x2': x2, 'x3': x3, 'y': logit(activation)}
    columns.update({f"rand_{i}": rand_norm(num) for i in range(n_rands)})
    df = pd.DataFrame(columns)
    if save:
        out_path = 'zzz.csv'
        print(f"Saving dataset to file '{out_path}'")
        df.to_csv(out_path, index=False)
    return df

# P-values and feature importance

In [None]:
def wilks_model_fit(x, y):
    """Fit a logistic regression of `y` on `x` with statsmodels' `Logit`.

    Args:
        x: Design matrix, passed to `Logit` unchanged.
            NOTE(review): no constant column is added here; confirm whether
            an intercept is wanted.
        y: Binary response.

    Returns:
        Tuple `(model, results)`: the `Logit` instance and its fit results.
    """
    model = Logit(y, x)
    # disp=0 silences the optimizer's convergence printout.
    return model, model.fit(disp=0)

def wilks_p_value(df, var_output, vars_null):
    """Compute a likelihood-ratio (Wilks) p-value for every non-null column.

    A null logistic model is fitted on `vars_null` only. For each remaining
    column, an alternative model is fitted on `vars_null` plus that column,
    and twice the log-likelihood gain is compared against a chi-squared
    distribution with one degree of freedom (one added column).

    Args:
        df: DataFrame holding the output and all input columns.
        var_output: Name of the output column.
        vars_null: Names of the columns that make up the null model.

    Returns:
        pandas Series indexed by column name with the p-value of each
        column that is neither the output nor part of the null model.
    """
    _, null_res = wilks_model_fit(df[vars_null], df[var_output])
    pvalues = dict()
    for c in df.columns:
        if c == var_output or c in vars_null:
            continue
        xnames = list(vars_null) + [c]
        model_alt, alt_res = wilks_model_fit(df[xnames], df[var_output])
        if model_alt is None or alt_res is None:
            # This is a plain function, so there is no `self` to report
            # through; write the message to stderr and fall back to a
            # non-significant p-value.
            print(f"Could not fit alt model for column/s {c}", file=sys.stderr)
            pval = 1.0
        else:
            # Likelihood-ratio statistic: 2 * (llf_alt - llf_null).
            d = 2.0 * (alt_res.llf - null_res.llf)
            pval = chi2.sf(d, 1)
        pvalues[c] = pval
    return pd.Series(pvalues)

def feature_importance(model, x, y, var):
    """Measure the relative score drop when one column is shuffled.

    Args:
        model: Fitted object exposing `score(x, y)`.
        x: DataFrame of inputs; it is copied, not modified.
        y: Target values passed to `model.score`.
        var: Name of the column to shuffle.

    Returns:
        `(baseline - shuffled) / baseline`, where both terms are values
        returned by `model.score`.
    """
    baseline = model.score(x, y)
    permuted = x.copy()
    # `.values` drops the index so the shuffled order is kept on assignment
    # instead of being re-aligned to the original rows.
    permuted[var] = permuted[var].sample(frac=1).values
    shuffled = model.score(permuted, y)
    return (baseline - shuffled) / baseline

def feature_importance_multiple_shuffle(model, x_validate, y_validate, vars_input, num_iter=10, scores=None):
    """Repeat the shuffle-based importance measurement for several columns.

    Args:
        model: Fitted object exposing `score(x, y)`.
        x_validate: DataFrame of validation inputs.
        y_validate: Validation targets.
        vars_input: Column names to evaluate.
        num_iter: Number of independent shuffles per column.
        scores: Optional dict to add the results to; it is updated in
            place. A new dict is created when None.

    Returns:
        Dict mapping each column name to an array of `num_iter` relative
        score drops.
    """
    if scores is None:
        scores = dict()
    for var in vars_input:
        scores[var] = np.array([
            feature_importance(model, x_validate, y_validate, var)
            for _ in range(num_iter)
        ])
    return scores

def split_df(df, val_perc=0.2):
    """Split `df` into a leading training part and a trailing validation part.

    Rows are not shuffled: the first `int(len(df) * (1.0 - val_perc))`
    items go to training and the rest to validation.

    Args:
        df: Sliceable collection (for example a DataFrame).
        val_perc: Fraction of items assigned to validation.

    Returns:
        Tuple `(train, validate)`.
    """
    cut = int(len(df) * (1.0 - val_perc))
    train_part = df[:cut]
    validate_part = df[cut:]
    return train_part, validate_part

# One model, one data split

In [None]:

# Shuffle-based p-values for a single model and a single train/validation split
def p_values_shuffle(num_samples=100, n_rands=5, num_shuffle=10):
    """Estimate p-values for the informative inputs from shuffle importances.

    A synthetic dataset is generated, a random forest is trained on the
    leading 80% of the rows, and every input column's shuffle importance is
    measured `num_shuffle` times on the remaining 20%. The importances of
    the `rand_*` columns are pooled into a null sample, and each other
    input is compared against it with a one-sided Mann-Whitney U test.

    Args:
        num_samples: Number of rows in the generated dataset.
        n_rands: Number of uninformative `rand_*` columns (at least 1).
        num_shuffle: Number of shuffles per column.

    Returns:
        Dict mapping each non-`rand_*` input column to its p-value.

    Raises:
        ValueError: If the dataset contains no `rand_*` columns, since the
            null sample would be empty.
    """
    df = create_dataset_01(num=num_samples, n_rands=n_rands)
    # Variables
    var_output = 'y'
    vars_input = [c for c in df.columns if c != var_output]
    vars_null = [c for c in vars_input if c.startswith('rand_')]
    if not vars_null:
        raise ValueError("n_rands must be >= 1: the null distribution is built from the rand_* columns")
    # Split dataset
    df_train, df_validate = split_df(df, val_perc=0.2)
    x_train, y_train = df_train[vars_input], df_train[var_output]
    x_validate, y_validate = df_validate[vars_input], df_validate[var_output]
    # Create and fit the model (fit trains the estimator in place)
    model = RandomForestClassifier(n_estimators=100)
    model.fit(x_train, y_train)
    # Calculate scores (shuffle)
    scores = feature_importance_multiple_shuffle(model, x_validate, y_validate, vars_input, num_iter=num_shuffle)
    # Pool the importances of all uninformative columns into one null sample
    null_scores = np.array([scores[c] for c in vars_null]).flatten()
    # Element [1] of the test result is the p-value; 'greater' tests whether
    # the column's importances tend to exceed the null importances.
    pvals = {
        c: sc.stats.mannwhitneyu(scores[c], null_scores, alternative='greater')[1]
        for c in vars_input
        if c not in vars_null
    }
    return pvals
    

In [None]:
# Sweep the dataset size (outer loop) and the number of shuffles per column
# (inner loop), printing the shuffle-based p-values for every combination.
for num_samples in (50, 100, 200, 300, 400, 500, 1000, 2000, 10000):
    for num_shuffle in (3, 5, 10, 20, 50, 100):
        pvs = p_values_shuffle(
            num_samples=num_samples,
            n_rands=5,
            num_shuffle=num_shuffle,
        )
        print(f"num_samples:{num_samples}\tnum_shuffle:{num_shuffle}\t{pvs}")
    # Blank line between the groups for each dataset size
    print()