# P-values in ML

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sc
import sklearn as sk
import sys
import os

from scipy.stats import chi2
from sklearn.ensemble import RandomForestClassifier
from statsmodels.discrete.discrete_model import Logit

### Create dataset

In [2]:
def logit(h):
    """Draw Bernoulli labels whose success probability is the logistic sigmoid of ``h``.

    Args:
        h: Array of activations (log-odds); it must support ``len``.

    Returns:
        Array of 0.0 / 1.0 floats, one per activation.
    """
    prob = 1.0 / (1.0 + np.exp(-h))
    # One uniform draw per element; the label is 1.0 when the draw falls below its probability.
    draws = np.random.rand(len(prob))
    return (draws < prob).astype('float')


def rand_date():
    """Return a random local timestamp between the Unix epoch and the current time.

    Returns:
        The timestamp as a string formatted with ``'%Y-%m-%d %H:%M:%S'``.
    """
    # Imported locally: the notebook's import cell does not bring in these
    # modules, so the function used to fail with NameError when called.
    import random
    import time

    max_time = int(time.time())
    # randint is inclusive on both ends, so "now" itself can be drawn.
    t = random.randint(0, max_time)
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(t))


def rand_unif(num, mean=0.0, std=1.0, na_prob=0):
    """Draw ``num`` uniform samples with the requested mean and standard deviation.

    The samples are uniform on ``[mean - sqrt(3) * std, mean + sqrt(3) * std)``,
    the uniform interval that has exactly that mean and standard deviation.
    (Previously ``mean`` and ``std`` were accepted but ignored.)

    Args:
        num: Number of samples to draw.
        mean: Mean of the uniform distribution.
        std: Standard deviation of the uniform distribution.
        na_prob: Probability with which each sample is independently replaced
            by NaN; no values are replaced when it is not positive.

    Returns:
        Float array of length ``num``.
    """
    # U[0, 1) has mean 1/2 and variance 1/12: centre it, then stretch by
    # sqrt(12) * std so the result has the requested moments.
    xi = mean + std * 12.0 ** 0.5 * (np.random.rand(num) - 0.5)
    if na_prob > 0:
        xi_na = (np.random.rand(num) <= na_prob)
        xi[xi_na] = np.nan
    return xi


def rand_norm(num, mean=0.0, std=1.0, na_prob=0):
    """Draw ``num`` normal samples, optionally blanking some of them with NaN.

    Args:
        num: Number of samples to draw.
        mean: Mean of the normal distribution.
        std: Standard deviation of the normal distribution.
        na_prob: Probability with which each sample is independently replaced
            by NaN; no values are replaced when it is not positive.

    Returns:
        Float array of length ``num``.
    """
    samples = np.random.normal(loc=mean, scale=std, size=num)
    if not na_prob > 0:
        return samples
    # A second, independent uniform draw decides which entries go missing.
    missing = np.random.rand(num) <= na_prob
    samples[missing] = np.nan
    return samples


def create_dataset_01(num=1000, n_rands=5, save=False):
    """Build a synthetic binary-classification dataset.

    The label ``y`` is drawn from a logistic model of ``x1``, ``x2`` and ``x3``
    plus a noise term; the ``rand_*`` columns are independent draws that do not
    enter the label.

    Args:
        num: Number of rows.
        n_rands: Number of ``rand_*`` columns to add.
        save: When true, also write the frame to 'zzz.csv' (without the index).

    Returns:
        DataFrame with columns ``x1``, ``x2``, ``x3``, ``y`` and ``rand_0`` .. ``rand_{n_rands-1}``.
    """
    # Draw order is kept fixed (inputs, noise, labels, null columns) so a seeded
    # run produces the same data as before.
    x1, x2, x3, noise = (rand_norm(num) for _ in range(4))
    activation = 3.0 * x1 - 2.0 * x2 + 1.0 * x3 + 0.5 * noise
    columns = {'x1': x1, 'x2': x2, 'x3': x3, 'y': logit(activation)}
    columns.update({f"rand_{i}": rand_norm(num) for i in range(n_rands)})
    df = pd.DataFrame(columns)
    if not save:
        return df
    file = 'zzz.csv'
    print(f"Saving dataset to file '{file}'")
    df.to_csv(file, index=False)
    return df

# P-values and feature importance

In [3]:
def wilks_model_fit(x, y):
    """Fit a logistic regression of ``y`` on the columns of ``x``.

    ``x`` is passed to the model as given; no constant column is added here.

    Args:
        x: Explanatory variables.
        y: Binary response.

    Returns:
        Tuple ``(model, results)``: the ``Logit`` model object and the result
        of fitting it with convergence output suppressed.
    """
    model = Logit(y, x)
    return model, model.fit(disp=0)

def wilks_p_value(df, var_output, vars_null):
    """Likelihood-ratio (Wilks) p-value for adding each remaining column to a null model.

    A null logistic model is fitted on ``vars_null``. For every other column
    (except ``var_output``) an alternative model with that one extra column is
    fitted, and twice the log-likelihood gain is compared with a chi-squared
    distribution with one degree of freedom.

    Args:
        df: DataFrame holding the response and all candidate columns.
        var_output: Name of the response column.
        vars_null: Names of the columns that make up the null model.

    Returns:
        ``pd.Series`` of p-values indexed by the tested column names. A column
        whose alternative model cannot be fitted gets a p-value of 1.0 and a
        warning is issued.
    """
    import warnings

    _, model_null_results = wilks_model_fit(df[vars_null], df[var_output])
    pvalues = dict()
    for c in df.columns:
        if c == var_output or c in vars_null:
            continue
        xnames = list(vars_null) + [c]
        try:
            _, model_alt_res = wilks_model_fit(df[xnames], df[var_output])
        except Exception as exc:
            # Best effort: a column whose model cannot be fitted is reported as
            # not significant instead of aborting the whole scan. (The previous
            # failure branch called ``self._error`` in a free function and was
            # never reached, because a failed fit raises rather than returning None.)
            warnings.warn(f"Could not fit alt model for column/s {c}: {exc}")
            pvalues[c] = 1.0
            continue
        # One extra parameter in the alternative model -> one degree of freedom.
        d = 2.0 * (model_alt_res.llf - model_null_results.llf)
        pvalues[c] = chi2.sf(d, 1)
    return pd.Series(pvalues)

def feature_importance(model, x, y, var):
    """Relative drop in the model's score when column ``var`` is randomly permuted.

    Args:
        model: Fitted estimator exposing ``score(x, y)``.
        x: DataFrame of inputs; it is not modified.
        y: Targets matching ``x``.
        var: Name of the column to permute.

    Returns:
        ``(baseline - permuted) / baseline``, where both terms are values of
        ``model.score``.
    """
    baseline = model.score(x, y)
    permuted = x.copy()
    # ``.values`` drops the shuffled index so the assignment is positional;
    # assigning the Series itself would align on the index and undo the shuffle.
    permuted[var] = x[var].sample(frac=1).values
    drop = baseline - model.score(permuted, y)
    return drop / baseline

def feature_importance_multiple_shuffle(model, x_validate, y_validate, vars_input, num_iter=10, scores=None):
    """Collect repeated shuffle-importance samples for each input column.

    Args:
        model: Fitted estimator passed on to ``feature_importance``.
        x_validate: Validation inputs.
        y_validate: Validation targets.
        vars_input: Names of the columns to evaluate.
        num_iter: Number of shuffles per column.
        scores: Optional dict of column name -> array of earlier samples; it is
            updated in place so that repeated calls accumulate samples.

    Returns:
        Dict mapping each column name to a 1-D array of importance samples
        (the same object as ``scores`` when one was supplied).
    """
    if scores is None:
        scores = dict()
    for var in vars_input:
        fresh = np.array([
            feature_importance(model, x_validate, y_validate, var)
            for _ in range(num_iter)
        ])
        if var in scores:
            # Extend the samples gathered for this column by earlier calls.
            scores[var] = np.append(scores[var], fresh).flatten()
        else:
            scores[var] = fresh
    return scores

def split_df(df, var_inputs, var_output, train_index, val_index):
    """Split ``df`` by row position into training and validation inputs and targets.

    Args:
        df: Full dataset.
        var_inputs: Names of the input columns.
        var_output: Name of the target column.
        train_index: Positional row indices of the training part.
        val_index: Positional row indices of the validation part.

    Returns:
        Tuple ``(x_train, y_train, x_val, y_val)``.
    """
    train_rows = df.iloc[train_index]
    val_rows = df.iloc[val_index]
    x_train, y_train = train_rows[var_inputs], train_rows[var_output]
    x_val, y_val = val_rows[var_inputs], val_rows[var_output]
    return x_train, y_train, x_val, y_val


# One model, one data split

In [4]:
from sklearn.model_selection import KFold

# Factory for the classifier that is trained on each data split
def new_model():
    """Return a new, unfitted random forest classifier with ``n_estimators=100``."""
    return RandomForestClassifier(n_estimators=100)

# Shuffle-importance p-values for one freshly generated dataset
def p_values_shuffle(num_samples=100, n_rands=5, num_cv=1, num_shuffle=10, show_plot=False, pvals_df=None):
    """Estimate a p-value per input column from shuffle importances versus the null columns.

    A synthetic dataset is generated, one model is trained per data split, and
    every input column is shuffled ``num_shuffle`` times on the validation part
    of each split. The importance samples of a column are then compared with
    the pooled samples of the ``rand_*`` columns (leaving the column itself out
    of the pool) using a one-sided Mann-Whitney U test.

    Args:
        num_samples: Number of rows in the generated dataset.
        n_rands: Number of ``rand_*`` null columns. A ``rand_*`` column is left
            out of its own null pool, so with fewer than two null columns that
            pool is empty.
        num_cv: Number of K-fold splits. Values of 1 or less use a single split:
            the first 80% of the rows for training, the rest for validation.
        num_shuffle: Number of shuffles per column and split.
        show_plot: When true, draw a seaborn pair plot of the dataset.
        pvals_df: Optional DataFrame of earlier results to extend.

    Returns:
        DataFrame with one row per input column: its name, the run settings,
        the count / mean / standard deviation of the column's scores and of the
        null scores, and ``p_value``. Rows of ``pvals_df``, when given, come
        first; the index of the new rows continues from ``len(pvals_df)``.
    """
    df = create_dataset_01(num=num_samples, n_rands=n_rands)
    if show_plot:
        sns.pairplot(df, kind='scatter', diag_kind='kde')
    # Variables
    var_output = 'y'
    vars_input = [c for c in df.columns if c != var_output]
    vars_null = [c for c in vars_input if c.startswith('rand_')]
    # Data splits
    if num_cv > 1:
        cv_iter = KFold(n_splits=num_cv).split(df)
    else:
        # No cross validation, split 80% / 20%
        idx = int(0.8 * len(df))
        cv_iter = [(range(0, idx), range(idx, len(df)))]
    # One model per split; importance samples accumulate across splits
    scores = dict()
    for train_index, val_index in cv_iter:
        x_train, y_train, x_validate, y_validate = split_df(df, vars_input, var_output, train_index, val_index)
        model = new_model()
        model.fit(x_train, y_train)
        scores = feature_importance_multiple_shuffle(model, x_validate, y_validate, vars_input, num_iter=num_shuffle, scores=scores)
    # Calculate p-values: collect all rows first and build the frame once,
    # instead of growing it row by row with the removed DataFrame.append.
    first_index = 0 if pvals_df is None else len(pvals_df)
    rows = []
    for c in vars_input:
        # A null column must not be compared against its own samples.
        null_scores = np.array([scores[cn] for cn in vars_null if cn != c]).flatten()
        pval = sc.stats.mannwhitneyu(scores[c], null_scores, alternative='greater')[1]
        rows.append({'name': c,
                     'num_samples': num_samples, 'n_rands': n_rands,
                     'num_cv': num_cv, 'num_shuffle': num_shuffle,
                     'count_alt': len(scores[c]),
                     'mean_alt': scores[c].mean(), 'std_alt': scores[c].std(),
                     'count_null': len(null_scores), 'mean_null': null_scores.mean(),
                     'std_null': null_scores.std(),
                     'p_value': pval})
    new_rows = pd.DataFrame(rows, index=range(first_index, first_index + len(rows)))
    if pvals_df is None or len(pvals_df) == 0:
        # Nothing to extend; also avoids concatenating with an empty frame.
        return new_rows
    return pd.concat([pvals_df, new_rows])

In [5]:
# Sweep every combination of CV folds, dataset size and shuffle count,
# accumulating the p-value rows of each run into a single frame.
# Seed NumPy's global generator first so that Restart & Run All reproduces the
# same table: the dataset helpers draw from np.random directly, and the column
# shuffles and the random forest are left at their default random_state
# (presumably falling back to that same global state; confirm for the
# installed pandas / scikit-learn versions).
np.random.seed(0)
pvals_df = None
for num_cv in [1, 3, 5, 10, 20]:
    for num_samples in [50, 100, 200, 300, 400, 500, 1000, 2000, 10000]:
        for num_shuffle in [3, 5, 10, 20, 50, 100]:
            pvals_df = p_values_shuffle(num_samples=num_samples, n_rands=5, num_cv=num_cv, num_shuffle=num_shuffle, pvals_df=pvals_df)
            print(f"num_cv: {num_cv}\tnum_samples:{num_samples}\tnum_shuffle:{num_shuffle}")

pvals_df.to_csv('p_values_shuffle.csv')

num_cv: 1	num_samples:50	num_shuffle:3
num_cv: 1	num_samples:50	num_shuffle:5
num_cv: 1	num_samples:50	num_shuffle:10
num_cv: 1	num_samples:50	num_shuffle:20
num_cv: 1	num_samples:50	num_shuffle:50
num_cv: 1	num_samples:50	num_shuffle:100
num_cv: 1	num_samples:100	num_shuffle:3
num_cv: 1	num_samples:100	num_shuffle:5
num_cv: 1	num_samples:100	num_shuffle:10
num_cv: 1	num_samples:100	num_shuffle:20
num_cv: 1	num_samples:100	num_shuffle:50
num_cv: 1	num_samples:100	num_shuffle:100
num_cv: 1	num_samples:200	num_shuffle:3
num_cv: 1	num_samples:200	num_shuffle:5
num_cv: 1	num_samples:200	num_shuffle:10
num_cv: 1	num_samples:200	num_shuffle:20
num_cv: 1	num_samples:200	num_shuffle:50
num_cv: 1	num_samples:200	num_shuffle:100
num_cv: 1	num_samples:300	num_shuffle:3
num_cv: 1	num_samples:300	num_shuffle:5
num_cv: 1	num_samples:300	num_shuffle:10
num_cv: 1	num_samples:300	num_shuffle:20
num_cv: 1	num_samples:300	num_shuffle:50
num_cv: 1	num_samples:300	num_shuffle:100
num_cv: 1	num_samples:400	

num_cv: 10	num_samples:1000	num_shuffle:5
num_cv: 10	num_samples:1000	num_shuffle:10
num_cv: 10	num_samples:1000	num_shuffle:20
num_cv: 10	num_samples:1000	num_shuffle:50
num_cv: 10	num_samples:1000	num_shuffle:100
num_cv: 10	num_samples:2000	num_shuffle:3
num_cv: 10	num_samples:2000	num_shuffle:5
num_cv: 10	num_samples:2000	num_shuffle:10
num_cv: 10	num_samples:2000	num_shuffle:20
num_cv: 10	num_samples:2000	num_shuffle:50
num_cv: 10	num_samples:2000	num_shuffle:100
num_cv: 10	num_samples:10000	num_shuffle:3
num_cv: 10	num_samples:10000	num_shuffle:5
num_cv: 10	num_samples:10000	num_shuffle:10
num_cv: 10	num_samples:10000	num_shuffle:20
num_cv: 10	num_samples:10000	num_shuffle:50
num_cv: 10	num_samples:10000	num_shuffle:100




num_cv: 20	num_samples:50	num_shuffle:3




num_cv: 20	num_samples:50	num_shuffle:5
num_cv: 20	num_samples:50	num_shuffle:10
num_cv: 20	num_samples:50	num_shuffle:20
num_cv: 20	num_samples:50	num_shuffle:50


























num_cv: 20	num_samples:50	num_shuffle:100
num_cv: 20	num_samples:100	num_shuffle:3
num_cv: 20	num_samples:100	num_shuffle:5
num_cv: 20	num_samples:100	num_shuffle:10
num_cv: 20	num_samples:100	num_shuffle:20
num_cv: 20	num_samples:100	num_shuffle:50
num_cv: 20	num_samples:100	num_shuffle:100
num_cv: 20	num_samples:200	num_shuffle:3
num_cv: 20	num_samples:200	num_shuffle:5
num_cv: 20	num_samples:200	num_shuffle:10
num_cv: 20	num_samples:200	num_shuffle:20
num_cv: 20	num_samples:200	num_shuffle:50
num_cv: 20	num_samples:200	num_shuffle:100
num_cv: 20	num_samples:300	num_shuffle:3
num_cv: 20	num_samples:300	num_shuffle:5
num_cv: 20	num_samples:300	num_shuffle:10
num_cv: 20	num_samples:300	num_shuffle:20
num_cv: 20	num_samples:300	num_shuffle:50
num_cv: 20	num_samples:300	num_shuffle:100
num_cv: 20	num_samples:400	num_shuffle:3
num_cv: 20	num_samples:400	num_shuffle:5
num_cv: 20	num_samples:400	num_shuffle:10
num_cv: 20	num_samples:400	num_shuffle:20
num_cv: 20	num_samples:400	num_shuffle:

In [6]:
pvals_df.head(20)

Unnamed: 0,name,num_samples,n_rands,num_cv,num_shuffle,count_alt,mean_alt,std_alt,count_null,mean_null,std_null,p_value
0,x1,50,5,1,3,3,0.142857,0.349927,15,-0.104762,0.110246,0.145737
1,x2,50,5,1,3,3,-0.142857,0.0,15,-0.104762,0.110246,0.798016
2,x3,50,5,1,3,3,-0.190476,0.067344,15,-0.104762,0.110246,0.908324
3,rand_0,50,5,1,3,3,-0.238095,0.067344,12,-0.071429,0.092214,0.988108
4,rand_1,50,5,1,3,3,-0.095238,0.067344,12,-0.107143,0.118451,0.531067
5,rand_2,50,5,1,3,3,0.0,0.0,12,-0.130952,0.108458,0.036496
6,rand_3,50,5,1,3,3,-0.190476,0.067344,12,-0.083333,0.108458,0.949182
7,rand_4,50,5,1,3,3,0.0,0.0,12,-0.130952,0.108458,0.036496
8,x1,50,5,1,5,5,0.425,0.1,25,0.065,0.06245,9.5e-05
9,x2,50,5,1,5,5,0.0,0.0,25,0.065,0.06245,0.983758


In [7]:
pvals_df.tail(20)

Unnamed: 0,name,num_samples,n_rands,num_cv,num_shuffle,count_alt,mean_alt,std_alt,count_null,mean_null,std_null,p_value
2140,rand_1,10000,5,20,20,400,-0.001422,0.007008,1600,-0.001491,0.007471,0.615263
2141,rand_2,10000,5,20,20,400,-0.001815,0.007167,1600,-0.001393,0.007431,0.9127225
2142,rand_3,10000,5,20,20,400,-0.001764,0.007933,1600,-0.001406,0.007234,0.6157999
2143,rand_4,10000,5,20,20,400,-0.000868,0.007651,1600,-0.001629,0.007304,0.08133016
2144,x1,10000,5,20,50,1000,0.297958,0.026008,5000,0.000323,0.007932,0.0
2145,x2,10000,5,20,50,1000,0.156519,0.025454,5000,0.000323,0.007932,0.0
2146,x3,10000,5,20,50,1000,0.046406,0.018606,5000,0.000323,0.007932,0.0
2147,rand_0,10000,5,20,50,1000,0.000859,0.008396,4000,0.000189,0.007806,0.07424026
2148,rand_1,10000,5,20,50,1000,0.000347,0.008771,4000,0.000317,0.007708,0.3766838
2149,rand_2,10000,5,20,50,1000,0.0004,0.007007,4000,0.000304,0.008147,0.2313997
