# P-values in ML 02

In [1]:
# IPython magics: enable the autoreload extension in mode 2 so imported
# modules are re-imported before each cell execution.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
import scipy as sc
import sklearn as sk
import sys
import os

from scipy.stats import chi2
from sklearn.ensemble import RandomForestClassifier
from statsmodels.discrete.discrete_model import Logit

## Create dataset

In [2]:
def logit(h):
    """Sample 0/1 labels from the logistic of the activation ``h``.

    The activation is squashed to a probability ``1 / (1 + exp(-h))`` and
    each element is compared with an independent uniform draw from
    ``np.random.rand``.

    Args:
        h: Array-like of activations; must support ``len()`` after the
            logistic transform.

    Returns:
        Float array with 1.0 where the uniform draw fell below the
        probability and 0.0 elsewhere.
    """
    prob = 1.0 / (1.0 + np.exp(-h))
    draws = np.random.rand(len(prob))
    return np.less(draws, prob).astype('float')


def rand_date():
    """Return a random local-time timestamp formatted 'YYYY-MM-DD HH:MM:SS'.

    The timestamp is drawn uniformly (in whole seconds) between the Unix
    epoch and the current time.

    Returns:
        str: The formatted timestamp.
    """
    # Imported locally: the notebook's import cell does not bring in these
    # standard-library modules, so the original body raised NameError.
    import random
    import time

    max_time = int(time.time())
    t = random.randint(0, max_time)
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(t))


def rand_unif(num, mean, std, na_prob=0):
    """Draw ``num`` uniform samples from [0, 1), optionally with missing values.

    Args:
        num: Number of samples to draw.
        mean: Accepted but not used by the current implementation.
        std: Accepted but not used by the current implementation.
        na_prob: When greater than zero, each sample is independently
            replaced by NaN if a second uniform draw is <= ``na_prob``.

    Returns:
        1-D float array of length ``num``.
    """
    samples = np.random.rand(num)
    if not na_prob > 0:
        return samples
    # A second, independent uniform draw decides which entries go missing.
    missing = np.random.rand(num) <= na_prob
    samples[missing] = np.nan
    return samples


def rand_norm(num, mean=0.0, std=1.0, na_prob=0):
    """Draw ``num`` normal samples, optionally with missing values.

    Args:
        num: Number of samples to draw.
        mean: Mean passed to ``np.random.normal``.
        std: Standard deviation passed to ``np.random.normal``.
        na_prob: When greater than zero, each sample is independently
            replaced by NaN if a uniform draw is <= ``na_prob``.

    Returns:
        1-D float array of length ``num``.
    """
    samples = np.random.normal(mean, std, num)
    if not na_prob > 0:
        return samples
    # An independent uniform draw decides which entries go missing.
    missing = np.random.rand(num) <= na_prob
    samples[missing] = np.nan
    return samples


def rbf(x, mu=0, sigma=1):
    """Radial basis function ``exp(-((x - mu) / sigma) ** 2)``.

    Args:
        x: Scalar or array of input values.
        mu: Centre of the bump.
        sigma: Width used to scale the distance from ``mu``.

    Returns:
        Value(s) in (0, 1], equal to 1 where ``x == mu``.
    """
    scaled = (x - mu) / sigma
    return np.exp(-np.square(scaled))


def split_df(df, var_inputs, var_output, train_index, val_index):
    """Split a dataframe into train/validation inputs and outputs.

    Args:
        df: Source dataframe.
        var_inputs: Column name(s) selected as model inputs.
        var_output: Column name selected as model output.
        train_index: Positional row indices of the training rows.
        val_index: Positional row indices of the validation rows.

    Returns:
        Tuple ``(x_train, y_train, x_val, y_val)``.
    """
    train_rows = df.iloc[train_index]
    val_rows = df.iloc[val_index]
    x_train, y_train = train_rows[var_inputs], train_rows[var_output]
    x_val, y_val = val_rows[var_inputs], val_rows[var_output]
    return x_train, y_train, x_val, y_val

In [3]:
def create_dataset_01(num=1000, n_rands=5, save=False):
    """Create a synthetic binary dataset with a linear activation.

    The label ``y`` is sampled by ``logit`` from the activation
    ``3*x1 - 2*x2 + x3 + 0.5*noise``; ``x1``, ``x2``, ``x3`` and the noise
    term are each drawn with ``rand_norm(num)``. The ``rand_*`` columns are
    further ``rand_norm`` draws that do not enter the activation.

    Args:
        num: Number of rows.
        n_rands: Number of extra ``rand_<i>`` columns to add.
        save: When true, also write the frame to 'zzz.csv' without the index.

    Returns:
        DataFrame with columns x1, x2, x3, y and rand_0 .. rand_{n_rands-1}.
    """
    # Draw order (x1, x2, x3, noise) is kept so seeded runs stay reproducible.
    x1, x2, x3, noise = [rand_norm(num) for _ in range(4)]
    activation = 3.0 * x1 - 2.0 * x2 + 1.0 * x3 + 0.5 * noise
    columns = {'x1': x1, 'x2': x2, 'x3': x3, 'y': logit(activation)}
    columns.update((f"rand_{i}", rand_norm(num)) for i in range(n_rands))
    df = pd.DataFrame(columns)
    if not save:
        return df
    file = 'zzz.csv'
    print(f"Saving dataset to file '{file}'")
    df.to_csv(file, index=False)
    return df

In [4]:
def create_dataset_02(num=1000, n_rands=0, save=False):
    """Create a synthetic binary dataset with a non-linear activation.

    The label ``y`` is sampled by ``logit`` from the activation
    ``5*rbf(x1, 1, 1) - 3*rbf(x2, -1, 1) + x3 + 0.5*noise``; ``x1``, ``x2``,
    ``x3`` and the noise term are each drawn with ``rand_norm(num)``. The
    ``rand_*`` columns are further ``rand_norm`` draws that do not enter the
    activation.

    Args:
        num: Number of rows.
        n_rands: Number of extra ``rand_<i>`` columns (none when <= 0).
        save: When true, also write the frame to 'zzz.csv' without the index.

    Returns:
        DataFrame with columns x1, x2, x3, y and any rand_<i> columns.
    """
    # Draw order (x1, x2, x3, noise) is kept so seeded runs stay reproducible.
    x1, x2, x3, noise = [rand_norm(num) for _ in range(4)]
    activation = 5.0 * rbf(x1, 1, 1) - 3.0 * rbf(x2, -1, 1) + 1.0 * x3 + 0.5 * noise
    columns = {'x1': x1, 'x2': x2, 'x3': x3, 'y': logit(activation)}
    extra_ids = range(n_rands) if n_rands > 0 else ()
    for i in extra_ids:
        columns[f"rand_{i}"] = rand_norm(num)
    df = pd.DataFrame(columns)
    if not save:
        return df
    file = 'zzz.csv'
    print(f"Saving dataset to file '{file}'")
    df.to_csv(file, index=False)
    return df

## Feature importance: Logistic regression

In [5]:
def wilks_model_fit(x, y):
    """Fit a ``Logit`` model of ``y`` on ``x`` with fit output silenced.

    Args:
        x: Explanatory variables passed as the model's exogenous data.
        y: Response passed as the model's endogenous data.

    Returns:
        Tuple ``(model, results)``: the ``Logit`` instance and the object
        returned by its ``fit(disp=0)`` call.
    """
    model = Logit(y, x)
    return model, model.fit(disp=0)

def wilks_p_value(df, var_output, vars_null):
    """Likelihood-ratio p-value for adding each column to a null logistic model.

    A null model is fitted on ``vars_null``. For every other column ``c``
    (except ``var_output``) an alternative model is fitted on
    ``vars_null + [c]`` and the statistic
    ``2 * (llf_alt - llf_null)`` is converted to a p-value with the
    chi-squared survival function with 1 degree of freedom (one added column).

    Args:
        df: Dataframe holding the output and all candidate columns.
        var_output: Name of the response column.
        vars_null: Names of the columns used by the null model.

    Returns:
        pd.Series of p-values indexed by the tested column name.
    """
    # Imported locally: `warnings` is not among the notebook's imports.
    import warnings

    y = df[var_output]
    _, model_null_results = wilks_model_fit(df[vars_null], y)
    pvalues = dict()
    for c in df.columns:
        if c == var_output or c in vars_null:
            continue
        xnames = list(vars_null) + [c]
        model_alt, model_alt_res = wilks_model_fit(df[xnames], y)
        if model_alt is None:
            # The original called `self._error` here, but this is a plain
            # function with no `self`; emit a warning instead.
            warnings.warn(f"Could not fit alt model for column/s {c}")
            pval = 1.0
        else:
            d = 2.0 * (model_alt_res.llf - model_null_results.llf)
            pval = chi2.sf(d, 1)
        pvalues[c] = pval
    return pd.Series(pvalues)

## Feature importance: ML

### Cross-validations & models

In [8]:
from sklearn.model_selection import KFold


def new_model(x_train, y_train):
    """Create and fit a 100-tree random forest classifier.

    The original body returned `self.new_model_function(...)` although this
    is a module-level function with no `self`, leaving the model-building
    code unreachable; the function now builds and fits the model directly.

    Args:
        x_train: Training inputs.
        y_train: Training labels.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    model = RandomForestClassifier(n_estimators=100)
    model.fit(x_train, y_train)
    return model


class FeatureImportanceShuffle:
    """ Feature importance by shuffling input values

    For each input variable, the validation column is shuffled and the
    fitted model is re-scored on the shuffled validation set; the scores
    are collected per variable.
    """
    def __init__(self, model, x_train, y_train, x_val, y_val, num_iter=10):
        """
        Args:
            model: Fitted model exposing ``score(x, y)``.
            x_train, y_train: Training inputs / outputs (stored, not scored).
            x_val, y_val: Validation inputs / outputs used for scoring.
            num_iter: Number of shuffles performed per variable.
        """
        self.model = model
        self.x_train, self.y_train, self.x_val, self.y_val = x_train, y_train, x_val, y_val
        self.num_iter = num_iter
        self.scores_null = dict()  # Null score by variable name (one score per variable)
        self.scores_alt = dict()  # Alt scores by variable name (multiple scores for each variable)

    def feature_importance_variable(self, var):
        """ Shuffle variable 'var' in the validation set and return the model score.

        On the first call for 'var', the un-shuffled validation score is
        stored in ``scores_null[var]``.
        """
        if var not in self.scores_null:
            self.scores_null[var] = self.model.score(self.x_val, self.y_val)
        # Shuffle 'var' in a copy of the validation inputs so the rows still
        # line up with y_val (the original shuffled x_train by mistake).
        x_shuf = self.x_val.copy()
        x_shuf[var] = x_shuf[var].sample(frac=1).values
        # Calculate 'raw' score
        return self.model.score(x_shuf, self.y_val)

    def feature_importance(self):
        """ Collect 'num_iter' shuffle scores for every non 'rand_' variable.

        Scores are appended to ``scores_alt[var]`` and a one-line summary
        (count, mean, std) is printed per variable.
        """
        for var in self.x_val.columns:
            if var.startswith('rand_'):
                continue
            # Add scores list
            var_scores = self.scores_alt.setdefault(var, list())
            # Perform shuffling 'num_iter' times
            for _ in range(self.num_iter):
                var_scores.append(self.feature_importance_variable(var))
            scores = np.array(var_scores)
            print(f"\t{var}:\tcount: {len(scores)}\tmean: {scores.mean()}\tstd: {scores.std()}")

class FeatureImportance:
    """
    Calculate feature importance.

    For every train/validation split produced by ``cv_iter`` a model is
    built with ``new_model_function`` and shuffle scores are collected per
    input variable. Each variable's scores are then compared with the pooled
    scores of the null variables using a one-sided Mann-Whitney U test.
    """
    def __init__(self, df, var_output, vars_input=None, vars_null=None, new_model_function=new_model):
        """
        Args:
            df: Dataframe holding inputs and output.
            var_output: Name of the output column.
            vars_input: Input column names; defaults to every column
                except ``var_output``.
            vars_null: Null (reference) column names; defaults to the input
                columns whose name starts with 'rand_'.
            new_model_function: Callable ``(x_train, y_train) -> fitted model``.
        """
        self.df = df
        self.var_output, self.vars_input, self.vars_null = var_output, vars_input, vars_null
        if self.vars_input is None:
            self.vars_input = [c for c in df.columns if c != var_output]
        if self.vars_null is None:
            # Use the resolved input list: the raw `vars_input` argument may be None.
            self.vars_null = [c for c in self.vars_input if c.startswith('rand_')]
        self.new_model_function = new_model_function
        self.pvals_df = None
        self.num_samples = 100
        self.n_rands=5
        self.num_cv = 1
        self.num_shuffle = 10
        self.cvs = list()  # List of cross validation items
        self.models = list()  # Models by cross-validation
        self.scores = list()  # Models scores by cross-validation
        self.scores_null = list()  # Null models scores by cross-validation
        # Current split, set by __call__ before each model is built
        self.x_train = self.y_train = self.x_val = self.y_val = None

    def new_model(self):
        """ Build a model for the current split using 'new_model_function' """
        return self.new_model_function(self.x_train, self.y_train)

    def __call__(self):
        """ Fit models, collect shuffle scores and compute p-values.

        Returns:
            DataFrame with one row per input variable (appended to any rows
            from previous calls); it is also stored in ``self.pvals_df``.
        """
        # Create several models...
        scores = dict()
        for train_index, val_index in self.cv_iter():
            self.x_train, self.y_train, self.x_val, self.y_val = split_df(
                self.df, self.vars_input, self.var_output, train_index, val_index)
            model = self.new_model()
            # Calculate scores (shuffle)
            # NOTE(review): 'feature_importance_multiple_shuffle' is not defined in the
            # visible part of this file; it is presumably expected to return a dict
            # mapping variable name -> sequence of scores -- confirm before running.
            scores = feature_importance_multiple_shuffle(
                model, self.x_val, self.y_val, self.vars_input,
                num_iter=self.num_shuffle, scores=scores)
        # Calculate p-values
        frames = [] if self.pvals_df is None else [self.pvals_df]
        next_index = 0 if self.pvals_df is None else len(self.pvals_df)
        for c in self.vars_input:
            alt_scores = np.asarray(scores[c])
            # Null distribution: scores of all null variables except 'c' itself
            null_parts = [np.asarray(scores[cn]).ravel() for cn in self.vars_null if cn != c]
            null_scores = np.concatenate(null_parts) if null_parts else np.array([], dtype=float)
            if null_scores.size > 0:
                pval = sc.stats.mannwhitneyu(alt_scores, null_scores, alternative='greater')[1]
                mean_null, std_null = null_scores.mean(), null_scores.std()
            else:
                # No null scores to compare against: the test is undefined
                pval = mean_null = std_null = np.nan
            df_row = pd.DataFrame({'name': c,
                                   'num_samples': self.num_samples, 'n_rands': self.n_rands,
                                   'num_cv': self.num_cv, 'num_shuffle': self.num_shuffle,
                                   'count_alt': len(alt_scores),
                                   'mean_alt': alt_scores.mean(), 'std_alt': alt_scores.std(),
                                   'count_null': len(null_scores), 'mean_null': mean_null,
                                   'std_null': std_null,
                                   'p_value': pval}, index=[next_index])
            next_index += 1
            frames.append(df_row)
        # DataFrame.append was removed in pandas 2.0; concatenate once instead
        self.pvals_df = pd.concat(frames) if frames else pd.DataFrame()
        return self.pvals_df

    def cv_iter(self):
        """ Create a cross-validation iterator """
        if self.num_cv > 1:
            return KFold(n_splits=self.num_cv).split(self.df)
        else:
            # No cross validation, split 80% / 20%
            idx = int(0.8 * len(self.df))
            idx_train = range(0, idx)
            idx_val = range(idx, len(self.df))
            return [(idx_train, idx_val)]



### P-values analysis

In [None]:
# def pvalues_shuffle_analysis():
#     pvals_df = None
#     for num_cv in [1, 3, 5, 10, 20]:
#         for num_samples in [50, 100, 200, 300, 400, 500, 1000, 2000, 10000]:
#             for num_shuffle in [3, 5, 10, 20, 50, 100]:
#                 pvals_df = p_values_shuffle(num_samples=num_samples, n_rands=5, num_cv=num_cv, num_shuffle=num_shuffle, pvals_df=pvals_df)
#                 print(f"num_cv: {num_cv}\tnum_samples:{num_samples}\tnum_shuffle:{num_shuffle}")
#     pvals_df.to_csv('p_values_shuffle.csv')
#     return pvals_df

In [None]:
# Build the example dataset; the original passed the undefined name
# `num_samples` instead of `num`.
num = 1000
df = create_dataset_02(num)