In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import DataFrame, read_csv

In [3]:
## Load data; missing values in the main table are replaced with 0.
mars = read_csv('data/mars.csv').fillna(0)
reject = read_csv('data/reject.csv')

## Keep only rows from subjects whose `reject` flag equals 0.
keep = reject.loc[reject['reject'] == 0, 'subject']
mars = mars.loc[mars['subject'].isin(keep)].reset_index(drop=True)


In [9]:
from statsmodels.api import Logit

## Define useful functions.
def inv_logit(x):
    """Inverse logit (logistic sigmoid), 1 / (1 + exp(-x)).

    Evaluated as exp(-log(1 + exp(-x))) via np.logaddexp, so that
    large-magnitude negative inputs do not overflow inside np.exp (the
    naive form emits a RuntimeWarning there). np.negative is used instead
    of unary minus so plain Python sequences are accepted as well.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) on the log-odds scale.

    Returns
    -------
    Value(s) in [0, 1], same shape as `x`.
    """
    return np.exp(-np.logaddexp(0, np.negative(x)))

def zscore(x):
    """Standardize `x`: subtract its mean, then divide by its standard
    deviation as computed by np.std with default arguments."""
    centered = x - np.mean(x)
    spread = np.std(x)
    return centered / spread

## Define regressors.
##   x0: constant column (intercept).
##   x1: per-subject total of `accuracy` over all of that subject's rows
##       (every row of a subject carries the same total).
##   x2, x3: 0/1 indicators for shape_set == 2 and shape_set == 3.
mars['x0'] = 1
## The string 'sum' selects the same grouped sum that passing np.sum maps to,
## without the FutureWarning recent pandas versions raise for numpy callables.
mars['x1'] = mars.groupby('subject')['accuracy'].transform('sum')
mars['x2'] = (mars['shape_set'] == 2).astype(int)
mars['x3'] = (mars['shape_set'] == 3).astype(int)

## Prepare data.
## Columns are selected by name (an unanchored regex such as 'x[0-9]' would
## also match any unrelated column containing that pattern). The design
## matrix is cast to float so that later in-place assignment of z-scored
## values into a column cannot be silently truncated to integers.
Y = mars['accuracy'].values
X = mars[['x0', 'x1', 'x2', 'x3']].values.astype(float)
K = mars['item'].values

In [11]:

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
### Compute observed effects.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

## Unique item identifiers (sorted by np.unique), computed once.
items = np.unique(K)

## Output columns: four logistic regression coefficients, then the
## F statistic and p-value of the joint shape-set contrast.
columns = ['b0','b1','b2','b3','f1','p1']

## Joint test that both shape-set coefficients (columns 2 and 3) are zero.
shape_contrast = [[0,0,1,0],[0,0,0,1]]

## Preallocate space.
## Coefficients and p-value default to NaN so that an item whose fit does
## not converge is not mistaken for a real estimate (a default p-value of 0
## would read as maximally significant). The F statistic keeps a default
## of 0, i.e. "no evidence", so that comparing it against a max-statistic
## permutation null stays conservative for such items.
params = np.full((items.size, len(columns)), np.nan)
params[:,4] = 0

for i, k in enumerate(items):

    ## Restrict to current item. The float cast (which also copies) ensures
    ## the z-scored column below is not truncated if X is integer-typed.
    mask = K == k
    y = Y[mask]
    x = X[mask].astype(float)

    ## Normalize sum scores. The current response is subtracted from the
    ## subject total first, so the regressor excludes the row being modelled
    ## (presumably one row per subject per item -- TODO confirm).
    x[:,1] = zscore(x[:,1] - y)

    ## Perform logistic regression.
    fit = Logit(y, x).fit(disp=0)

    ## Check convergence. Non-converged items keep the defaults set above.
    if not fit.mle_retvals['converged']:
        continue

    ## Compute coefficients.
    params[i,:4] = fit.params

    ## Compute contrast (shape set).
    f_test = fit.f_test(shape_contrast)
    params[i,4] = f_test.fvalue.squeeze()
    params[i,5] = f_test.pvalue.squeeze()

## Convert to DataFrame.
params = DataFrame(params, columns=columns)
params.insert(0, 'item', items)


In [15]:
from tqdm import tqdm

## Simulation parameters.
n_sim = 5000

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
### Compute null effects.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
np.random.seed(47404)

## Joint test that both shape-set coefficients (columns 2 and 3) are zero.
shape_contrast = [[0,0,1,0],[0,0,0,1]]

## Per-item responses and unpermuted design matrices are identical in every
## simulation, so build them once instead of once per permutation. No random
## numbers are drawn here, so the permutation stream below is unchanged.
item_data = []
for k in np.unique(K):

    ## Restrict to current item. The float cast (which also copies) ensures
    ## the z-scored column is not truncated if X is integer-typed.
    mask = K == k
    y = Y[mask]
    x = X[mask].astype(float)

    ## Normalize sum scores (subject total minus the current response).
    x[:,1] = zscore(x[:,1] - y)

    item_data.append((y, x))

## Preallocate space.
f_null = np.zeros(n_sim)

for m in tqdm(range(n_sim)):

    ## Preallocate space. Items whose permuted fit does not converge keep 0.
    fvals = np.zeros(len(item_data))

    for i, (y, x_obs) in enumerate(item_data):

        ## Permute item feature labels.
        ## NOTE(review): the slice -3: covers the normalized sum score as well
        ## as the two shape-set indicators, so all non-constant regressors are
        ## shuffled together across rows. Confirm this is intended rather
        ## than -2: (shape-set indicators only).
        order = np.random.permutation(np.arange(y.size))
        x = x_obs.copy()
        x[:,-3:] = x_obs[order,-3:]

        ## Perform logistic regression.
        fit = Logit(y, x).fit(disp=0)

        ## Check convergence.
        if not fit.mle_retvals['converged']:
            continue

        ## Compute test statistic.
        fvals[i] = fit.f_test(shape_contrast).fvalue.squeeze()

    ## Store maximum statistic.
    f_null[m] = fvals.max(axis=0)

## Compute family-wise error p-value: (number of null maxima strictly
## greater than the observed statistic + 1) / (n_sim + 1). A NaN observed
## statistic yields a NaN p-value rather than a spuriously small one.
f_obs = params['f1'].values
n_exceed = np.sum(f_null[np.newaxis,:] > f_obs[:,np.newaxis], axis=1)
params['fwe1'] = np.where(np.isnan(f_obs), np.nan, (n_exceed + 1) / (n_sim + 1))

100%|██████████| 5000/5000 [02:39<00:00, 31.27it/s]


In [17]:
## Display the per-item results table: coefficients (b0-b3), shape-set
## contrast statistic and p-value (f1, p1), and family-wise error p-value (fwe1).
params

Unnamed: 0,item,b0,b1,b2,b3,f1,p1,fwe1
0,11,0.304401,0.511507,0.104714,0.450941,1.107456,0.331782,0.992801
1,16,1.177465,0.774342,-0.073922,-0.556069,1.674995,0.189104,0.914417
2,17,0.663881,0.606124,-0.722211,-1.073591,5.911938,0.003042,0.026795
3,23,1.306061,0.578515,-0.685842,-0.924573,4.043785,0.018522,0.168966
4,29,1.742403,1.038205,-0.898992,-1.232711,5.632756,0.003979,0.036793
5,30,1.053741,0.853579,0.553578,0.390391,1.228818,0.294147,0.984003
6,35,0.872825,0.590123,-0.482142,-1.337632,8.80568,0.000193,0.0012
7,45,1.461334,0.766035,0.482663,-0.397954,2.626257,0.074057,0.584483
8,55,2.368685,0.587537,-0.933595,-0.757379,2.402956,0.092237,0.671866
9,58,0.903729,0.917327,-0.16361,-0.587629,1.715011,0.181769,0.905619
