In [8]:
                            """ Pseudocode for Fast FSR algorithm """

""" Date: 4/24/15 
    Modified: Improve missing value adjustments (code to test this comes after this cell) """


""" Data type check """
def df_type(dat):
    
    ### Input params:
    #   dat = dataset whose type is to be checked / transformed
    
    ### Output:
    #   error msg or True boolean
    
    import numpy as np
    import pandas as pd
    
    if isinstance(dat,pd.DataFrame)==False and isinstance(dat,np.ndarray)==False:
        raise Exception("Data must be pandas DataFrame")
    else:
        return True


    
""" p-value computation function """
def pval_comp(max_size=None):
    
    ### Input params:
    #   max_size = integer max no. of vars in final model (largest model size desired)
    
    ### Output:
    # array of p-values of each covariate at its given entry step
    
    ### NOTE: fwd should be a global-env R object (requires running 'forward' fcn prior to this fcn) ###
    
    import numpy as np
    import scipy.stats as st
    import rpy2.robjects as ro
    
    # Pull RSS values & num_obs from fwd_proc object
    rss = np.array(ro.r('fwd$rss'))
    N = np.array(ro.r('fwd$nn'))
    
    if max_size==None:
        max_size = len(rss)-1
    
    # vector of model sizes
    sizes = np.arange(max_size)+1
    
    # compute the F stats as defined above where p_f - p_r = 1 for each iteration
    fstats = (rss[0:max_size] - rss[1:(max_size+1)]) / (rss[1:(max_size+1)] / (N - (sizes+1)))
    
    # return the p-values by comparing these stats to the F distn: F(1, n - p_f)
    return 1 - st.f.cdf(fstats, 1, N-(sizes+1))



""" Covariate model entry order """
def cov_order(xcolnames,max_size=None,col_incl=None):
    
    # Input params:
    #   xcolnames = array of names of covariates (same order as columns in original dataset)
    #   max_size  = integer max no. of vars in final model (largest model size desired)
    #   col_incl  = array vector of columns to forcefully include in all models
    
    ### Output:
    # array of covariate names sorted according to order of entry into the model
    
    ### NOTE: fwd should be a global-env R object (requires running 'forward' fcn prior to this fcn) ###
    
    import numpy as np
    import rpy2.robjects as ro
    
    if max_size==None:
        max_size = len(xcolnames)
        
    ### Pull the cov entry order
    vorder = ro.r('fwd$vorder[-1]') # remove intercept
    vorder = vorder[0:max_size] # keep only the max model size number of covs
    
    ### Shift these values down by two (one to exclude intercept, one to make python indices)
    vorderinds = np.array(vorder)-2
    
    ### Rearrange the var order st forced vars are at start of list
    if col_incl==None:
        col_incl = np.arange(max_size)+1
    keep = xcolnames[[col_incl-1]] # pull var names of those vars forced into model (this is an array)
    poss = [x for x in xcolnames if x not in keep] # pull var names of those not forced in (this is a list)
    col_names = np.array(list(keep)+poss) # = rearranged array of varnames w/forced-in vars at start of list
    
    ### Sort the columns of X in order to obtain the var names in the entry order
    return col_names[vorderinds[::]]

    

""" Forward selection function """
def forward(x,y,max_size=None,col_incl=None):
    
    ### Input params:
    #   x        = python dataframe of original p covariates, n x p
    #   y        = python outcome dataframe, n x 1
    #   max_size = integer max no. of vars in final model (largest model size desired)
    #   col_incl = array vector of columns to forcefully include in all models
    
    ### Output:
    # regsubsets R object -- the raw full output of the forward selection proc
    
    ### Load python packages to call R functions
    import rpy2.robjects as ro
    import pandas.rpy.common as com
    
    ### Convert x and y to R matrices <-- MAKE SURE x,y input == DATAFRAMES (or else change them to df's)!!!
    ### and declare as R objects in global environment
    ro.globalenv['x2'] = com.convert_to_r_matrix(x)
    ro.globalenv['y2'] = com.convert_to_r_matrix(y)
    if max_size==None:
        max_size = x.shape[1]
    ro.globalenv['maxv'] = ro.Vector(max_size)
    if col_incl==None:
        ro.r('coli=NULL')
    else:
        ro.globalenv['coli'] = ro.FloatVector(col_incl[:])
    
    ### Perform forward selection with regsubsets function
    ro.globalenv['fwd'] = ro.r('leaps::regsubsets(x=x2,y=y2,method="forward",nvmax=maxv,force.in=coli)')
    
    
    
""" Gamma computation """
def gamma_F(pvs, ncov, max_size=None):
    
    ### Input params:
    #   pvs      = vector of p-values (monotonically increasing) from forward sel procedure
    #   ncov     = integer total number of covariates in data
    #   max_size = integer max no. of vars in final model (largest model size desired)
    
    ### Output:
    # array of gamma_F values
    
    import numpy as np
    
    if max_size==None:
        max_size = ncov
        
    # Create indices == model size at given step, call this S
    S = np.arange(max_size)+1
    
    # gamma_F_i = p_s_i * (ncov - S_i) / (1 + S_i)
    g_F = pvs * (ncov - S) / (1 + S)
    
    # Check for duplicate p-values
    dups = list(set([x for x in list(pvs) if list(pvs).count(x) > 1]))
    for i in range(len(dups)): g_F[pvs==dups[i]] = min(g_F[pvs==dups[i]])
    
    # if table run on all vars, the last gamma = 0,
    #  instead set equal to the last pv_mono == final rate of unimp var inclusion
    if(g_F[-1]==0): 
        g_F[-1]=pvs[-1]
    
    return g_F

    
    
""" Alpha computation for model selection """
def alpha_F(g0, ncov, max_size=None):
    
    ### Input params:
    #   g0       = float pre-specified FSR (gamma0)
    #   ncov     = integer total number of covariates in data
    #   max_size = integer max no. of vars in final model (largest model size desired)
    
    ### Output:
    # array of alpha_F values
    
    import numpy as np
    
    if max_size==None:
        max_size = ncov
        
    # Create indices == model size at given step, call this S
    S = np.arange(max_size)+1
    
    # alpha_F_i = gamma_0 * (1 + S_i) / (ncov - S_i)
    alpha_F = g0 * (1 + S) / (ncov - S)
    
    # if table run on all vars, the last alpha = inf
    #  instead set equal to 1 == include all vars
    alpha_F[np.isinf(alpha_F)] = 1.
    
    return alpha_F        
    
    
    
""" Alpha computation for specific gamma """
def alpha_F_g(g, gf, ncov):
    
    ### Input params:
    #   g    = float or vector (length k) of specified FSR at which to compute alpha
    #   gf   = vector gamma_F's computed from gamma0, pv_sorted
    #          used to compute largest size model (S) for which gamma_F < g
    #   ncov = integer of total number covariates in data
    
    ### Output:
    # integer alpha_F value
    
    import numpy as np
    
    ### Compute model size for gf closest to (but still <) g
    #S = np.array([max(np.which(x<=y)) for x in gf y in g])+1
    if isinstance(g,np.ndarray): # if g is a vector
        s_s = [np.where(gf>y) for y in g]
        S = np.array([min(x[0]) for x in s_s])
        return g * (1 + S) / (ncov - S)
    else: # if g is a number
        S = min(np.where(gf>g)[0])
        return g * (1 + S) / (ncov - S)


    
""" Beta-hat computation for specific gamma """
def beta_est(x, y, g, gf, vname):
    
    ### Input params:
    #   x      = python dataframe of original p covariates, n x p
    #   y      = python outcome dataframe, n x 1
    #   g      = float of specified FSR at which to compute alpha
    #   gf     = vector gamma_F's computed from gamma0, pv_mono
    #            used to compute largest size model (S) for which gamma_F < g
    #   vname  = ordered vector of names of vars entered into model under forward selection
    
    ### Output:
    # array of estimated parameters
    
    import numpy as np
    import statsmodels.api as sm
    
    ### Compute model size corresponding to g
    S = min(np.where(gf>g)[0])

    ### Pull the cov names of those vars included in the above size model
    modvars = vname[:S]

    ### Fit the linear model using the selected model vars
    fit = sm.OLS(y,x.loc[:,list(modvars)]).fit()
    betaout = pd.DataFrame([fit.params,fit.bse]).T
    betaout.columns = ['beta','beta_se']
    
    return betaout

    
    
""" FSR Results Table """
def fsrtable(size, vname, p_orig, p_mono, alphaf, gammaf, prec_f='.4f'):
    
    ### Input params:
    #   size   = model size at each step of forward sel proc                   [S]
    #   vname  = variable name that entered at each step (num vars = p)        [Var]
    #   p_orig = p-values at each step                                         [p]
    #   p_mono = ascending p-values                                            [p_s]
    #   alphaf = alpha-to-enter (p-value cutoff) for model entry at each step  [alpha_F]
    #   gammaf = FSR at each step                                              [gamma_F]
    #   prec_f = string of precision (num digits) desired in FSR output table
    
    ### Output:
    # table of [S   Var   p   p_s   alpha_F   gamma_F], dim = num_steps(== p) x 6
    
    import numpy as np
    import pandas as pd
    
    ### Round all arrays
    p_od = [format(x,prec_f) for x in p_orig]
    p_md = [format(x,prec_f) for x in p_mono]
    ad = [format(x,prec_f) for x in alphaf]
    gd = [format(x,prec_f) for x in gammaf]
    
    ### Combine the arrays
    tab = pd.DataFrame([size,vname,p_od,p_md,ad,gd]).T
    tab.columns = ['S','Var','p','p_m','alpha_F','gamma_F']
    
    return tab
    
    
    
""" FastFSR function """
def ffsr(dat,g0=0.05,betaout=False,gs=None,max_size=None,var_incl=None,bag=False,prec_f='.4f'):
    
    ### Input params:
    #   dat      = python dataframe of original p covariates, 1 outcome (in first col.): n x p+1
    #   g0       = float pre-specified FSR of interest ("gamma0")
    #   betaout  = boolean of whether to include estimated betahats from final selected model
    #   gs       = float or vector of gamma's at which to specifically compute alpha_F
    #   max_size = integer of largest model size == max num vars to incl in final model (default = num covs in dataset)
    #   var_incl = array of cols corresponding to those vars to force into model
    #   bag      = boolean of whether to output FSR table (non-bagging results) or reduced output for bagging purposes
    #   prec_f   = string of precision (num digits) desired in FSR output table (string to be given to 'format' python fcn)
    
    ### Output: 
    #      (note: gamma = FSR, gamma_0 = pre-specified/desired FSR)
    # Table of [S   Var   p   p_s   alpha_F   gamma_F], dim = num_steps(== p) x 6
    #   S:       model size at given step
    #   Var:     name of var that entered at given step
    #   p:       p-value of var that entered at given step
    #   p_m:     mono. inc. p-value (vector or original p-values arranged to be monotonically increasing)
    #   alpha_F: cutoff value for model entry given gamma_0 and current p_s value
    #   gamma_F: FSR given current alpha_F and model size (== step num)
    #       and
    #   vector of alpha_F's for specified gamma's (g)
    #       and
    #   vector of estimated beta param's for final model (based on g0)

    import numpy as np
    import pandas as pd
    
    ### Clean and check data - make sure dat = pandas dataframes or else convert them
    if bag==False:
        if df_type(dat)==True:
            if isinstance(dat,pd.DataFrame):
                d = dat.copy()
            else:
                if isinstance(dat,np.ndarray):
                    d = pd.DataFrame(dat)
                    vnum = list(np.arange(d.shape[1])+1)
                    vchr = list(np.repeat("V",d.shape[1]))
                    d.columns = [a + str(b) for a,b in zip(vchr,vnum)]
        else:
            return df_type(dat)
        
        ### Remove missing values
        d.dropna(inplace=True)
        
        ### Check that p < n to ensure regression solutions
        if (d.shape[1]-1) >= d.shape[0]:
            raise Exception("N must be > p for valid regression solutions")
    else:
        d = dat.copy()
    
    ### If max model size not specified, select all possible cov.s
    if max_size==None:
        max_size = d.shape[1]-1
        
    ### Perform forward selection
    fwd_sel = forward(d.iloc[:,1:], pd.DataFrame(d.iloc[:,0]), max_size, var_incl)
    
    ### Save order of covariate entry into model
    cov_entry_order = cov_order(d.columns.values[1:], max_size, var_incl)
    
    ### Compute p-value of each covariate entering the model
    p_orig = pval_comp(max_size)
    
    ### Arrange p-values in mono. inc. order
    p_mono = np.array([max(p_orig[:(i+1)]) for i in range(len(p_orig))])
        
    ### Gamma_F computation
    g_F = gamma_F(p_mono, d.shape[1], max_size)
    
    ### Check if betaout desired, if so compute beta_hat of model corresponding to specific gamma0
    if betaout==True or bag==True:
        betahats = beta_est(d.iloc[:,1:], d.iloc[:,0], g0, g_F, cov_entry_order)
        
    ### Check if bagging desired
    if bag==False: 
        ### Alpha_F computation for all steps in fwd sel proc
        a_F = alpha_F(g0, d.shape[1]-1, max_size)
        
        ### Model size
        S = np.arange(max_size)+1
        
        ### Combine S, Cov_names, p-vals, sorted p-vals, alpha_F, gamma_F into table
        fsr_results = fsrtable(S, cov_entry_order, p_orig, p_mono, a_F, g_F)
        
        ### Return selected output: FSR table (+ betahat) (+ alpha_specific)
        if gs!=None: 
            ### Compute alpha_F for specific gammas (gs)
            if betaout==True:
                return fsr_results, betahats, alpha_F_g(gs, g_F, d.shape[1]-1)
            else:
                return fsr_results, alpha_F_g(gs, g_F, d.shape[1]-1)
        else:
            if betaout==True:
                return fsr_results, betahats
            else:
                return fsr_results
    else:
        return betahats, alpha_F_g(g0, g_F, d.shape[1]-1), len(betahats)

    
    
def bagfsr(dat,g0,B=200,max_s=None,v_incl=None,prec=4):
    
    ### Input params:
    #   dat    = python dataframe of original p covariates, 1 outcome (in first col.): n x p+1
    #   g0     = float pre-specified FSR of interest ("gamma0")
    #   B      = integer of number of bagged samples
    #   max_s  = integer of largest model size == max num vars to incl in final model (default = num covs in dataset)
    #   v_incl = array of cols corresponding to those vars to force into model
    #   prec   = integer of precision (num digits) desired in beta-hat parameter estimates of final model
    
    ### Output: 
    #   Mean of betahats
    #   SEs of betahats
    #   Avg alpha-to-enter
    #   Avg model size
    #   Prop of times each var included in model
    
    import numpy as np
    import pandas as pd
    
    ### Clean and check data - make sure X, Y = pandas dataframes or else convert them
    if df_type(dat)==True:
        if isinstance(dat,pd.DataFrame):
            d = dat.copy()
        else:
            if isinstance(dat,np.ndarray):
                d = pd.DataFrame(dat)
                vnum = list(np.arange(d.shape[1])+1)
                vchr = list(np.repeat("V",d.shape[1]))
                d.columns = [a + str(b) for a,b in zip(vchr,vnum)]
    else:
        return df_type(dat)
    
    ### Remove missing values
    d.dropna(inplace=True)
    
    ### check that p < n to ensure regression solutions
    if (d.shape[1]-1) >= d.shape[0]:
        raise Exception("N must be > p for valid regression solutions")
    
    ### Create array to keep track of number of times vars enter model
    nentries = pd.DataFrame(np.zeros(d.shape[1]-1),index=d.columns.values[1:])
    
    ### Create array to store all estimated coefficients, ses, alphas, sizes
    allbetas = pd.DataFrame(np.zeros([B,(d.shape[1]-1)]),columns=d.columns.values[1:])
    allses = allbetas.copy()
    alphas = []
    sizes = []
    np.random.seed(1234)
    
    ### Bagging loops
    for i in range(B):

        # Draw with replacement from rows of data
        n_row = d.shape[0]
        rand_row = np.random.randint(0,n_row,n_row)
        newdat = d.iloc[rand_row,:]
        newdat.index = np.arange(n_row)+1
        
        ### Obtain FSR results
        fsrout = ffsr(newdat.iloc[:,1:],pd.DataFrame(newdat.iloc[:,0]),g0,bag=True,max_size=max_s,var_incl=v_incl)
        allbetas.loc[i,fsrout[0].index.values] = fsrout[0].iloc[:,0]
        allses.loc[i,fsrout[0].index.values] = fsrout[0].iloc[:,1]
        alphas.append(fsrout[1])
        sizes.append(fsrout[2])

        ### Update counts num times var included
        nentries.loc[fsrout[0].index[np.abs(np.around(fsrout[0].iloc[:,0],prec))>0]] += 1
        
    ### Compute averages
    avgbeta = np.around(allbetas.mean(axis=0),prec) # mean across rows / colmeans == mean of each cov's betahat
    avgse = np.around(allses.mean(axis=0),prec)
    avgalpha = np.mean(alphas)
    avgsize = np.mean(sizes)
    var_props = nentries/float(B)
    cov_res = pd.concat([avgbeta,avgse,var_props],axis=1)
    cov_res.columns = ['betahat','betase','prop_incl']
    
    return cov_res, avgalpha, avgsize
    
    

# Notes: 
# 1. appropriate transformations are expected to have been applied prior to utilization of FSR algorithm

# To-do:
# 1. adjust betaest fcn and ffsr to allow for specification of intercept and whether data should be normalized in estimation

In [2]:
###########################################################
### Code to test / build functions:

In [3]:
import numpy as np
import pandas as pd

np.random.seed(1234)

X = np.random.multivariate_normal(np.zeros(15),np.eye(15),(100))
beta = np.array([0,0,5,6,0,0,4,0,0,0,5,0,0,0,0]).reshape(15,1) # signif betas: 3,4,7,11
Y = X.dot(beta)

In [4]:
Y2 = pd.DataFrame(Y)
X2 = pd.DataFrame(X)
X2.columns = ["V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15"]

In [4]:
# Test potentially faster removal of missing data rows
tx = np.random.multivariate_normal(np.zeros(15),np.eye(15),(100))
tb = np.array([0,0,5,6,0,0,4,0,0,0,5,0,0,0,0]).reshape(15,1) # signif betas: 3,4,7,11
ty= tx.dot(tb)
ty[[1,3,13]] = 'NaN'
tx[0,10], tx[8,0], tx[40,5], tx[33,14] = 'NaN', 'NaN', 'NaN', 'NaN'
tty = pd.DataFrame(ty)
ttx = pd.DataFrame(tx)
ttx.columns = ["V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15"]

d = pd.concat([tty,ttx],axis=1)

print d.shape
d.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)
print d.shape

(100, 16)
(93, 16)


In [8]:
txna = np.isnan(ttx).any(axis=1)
ttx.shape

(100, 15)

In [16]:
# New handling of missing values
def rm_miss(dat):
    # NOTE: will override original dat file
    dat.dropna(inplace=True)
    return dat

# Current handling of missing values
def rm_miss2(x,y):
    yna = np.isnan(y).any(axis=1)
    xna = np.isnan(x).any(axis=1).reshape(x.shape[0],1)
    anyna = np.array([int(max(a,b)) for a,b in zip(xna,yna)])
    missrow = np.where(anyna==1)[0]
    y = y.drop(y.index[missrow])
    x = x.drop(x.index[missrow])
    return(x,y)

In [17]:
%timeit rm_miss(d)

1000 loops, best of 3: 517 µs per loop


In [18]:
%timeit rm_miss2(ttx,tty)

1000 loops, best of 3: 1.51 ms per loop


In [None]:
# Alternate version (output)
def rm_miss(dat):
    # NOTE: will override original dat file
    dat.dropna(inplace=True)
    y = dat.iloc[:,0]
    x = dat.iloc[:,1:]
    return (x,y)

# Current handling of missing values
def rm_miss2(x,y):
    yna = np.isnan(y).any(axis=1)
    xna = np.isnan(x).any(axis=1).reshape(x.shape[0],1)
    anyna = np.array([int(max(a,b)) for a,b in zip(xna,yna)])
    missrow = np.where(anyna==1)[0]
    y = y.drop(y.index[missrow])
    x = x.drop(x.index[missrow])
    return(x,y)

In [14]:
%timeit rm_miss(d)

1000 loops, best of 3: 803 µs per loop


In [15]:
%timeit rm_miss2(ttx,tty)

1000 loops, best of 3: 1.51 ms per loop


In [19]:
# Alternate version (input, output)
def rm_miss(x,y):
    # NOTE: will override original dat file
    dat = pd.concat([y,x],axis=1)
    dat.dropna(inplace=True)
    y = dat.iloc[:,0]
    x = dat.iloc[:,1:]
    return (x,y)

# Current handling of missing values
def rm_miss2(x,y):
    yna = np.isnan(y).any(axis=1)
    xna = np.isnan(x).any(axis=1).reshape(x.shape[0],1)
    anyna = np.array([int(max(a,b)) for a,b in zip(xna,yna)])
    missrow = np.where(anyna==1)[0]
    y = y.drop(y.index[missrow])
    x = x.drop(x.index[missrow])
    return(x,y)

In [20]:
%timeit rm_miss(ttx,tty)

100 loops, best of 3: 2.15 ms per loop


In [21]:
%timeit rm_miss2(ttx,tty)

1000 loops, best of 3: 1.52 ms per loop


In [23]:
###########################################################
### Test functions:

In [26]:
fwd_r = forward(X2,Y2)

In [28]:
codnames = cov_order(X2.columns.values)

print codnames

#print ro.r('fwd$vorder')

['V4' 'V7' 'V3' 'V11' 'V2' 'V10' 'V1' 'V5' 'V15' 'V12' 'V14' 'V8' 'V9' 'V6'
 'V13']


In [29]:
po = pval_comp(X2.shape[1])

In [30]:
gg00 = 0.05
af = alpha_F(gg00, X2.shape[1])

In [31]:
gf = gamma_F(po, X2.shape[1])

In [32]:
sss = np.arange(X2.shape[1])+1

In [54]:
fsr_results = fsrtable(sss, codnames, po, np.sort(po), af, gf)
fsr_results

Unnamed: 0,S,Var,p,p_m,alpha_F,gamma_F
0,1,V4,0.0,0.0,0.0071,0.0
1,2,V7,0.0,0.0,0.0115,0.0
2,3,V3,0.0,0.0,0.0167,0.0
3,4,V11,0.0,0.0,0.0227,0.0
4,5,V2,0.0003,0.0003,0.03,0.0004
5,6,V10,0.0078,0.0078,0.0389,0.01
6,7,V1,0.0116,0.0116,0.05,0.0116
7,8,V5,0.0973,0.08,0.0643,0.0756
8,9,V15,0.08,0.0973,0.0833,0.048
9,10,V12,0.1259,0.1259,0.11,0.0572


In [55]:
# Test code possibly more efficient at building output table
format(po[6],'.4f')
[format(x,'.4f') for x in po]
a = '.4f'
[format(x,a) for x in po]
np.array([sss,codnames,[format(x,'.4f') for x in po],[format(x,'.4f') for x in af]]).T.reshape(15,4)
pd.DataFrame([sss,codnames,[format(x,'.4f') for x in po],[format(x,'.4f') for x in af]]).T#.reshape(15,4)

['0.0000',
 '0.0000',
 '0.0000',
 '0.0000',
 '0.0003',
 '0.0078',
 '0.0116',
 '0.0973',
 '0.0800',
 '0.1259',
 '0.2040',
 '0.2480',
 '0.3679',
 '0.6474',
 '0.7110']

In [56]:
# New function to create output table
def fsrtable2(size, vname, p_orig, p_mono, alphaf, gammaf, prec_f='.4f'):
    
    ### Input params:
    #   size   = model size at each step of forward sel proc                   [S]
    #   vname  = variable name that entered at each step (num vars = p)        [Var]
    #   p_orig = p-values at each step                                         [p]
    #   p_mono = ascending p-values                                            [p_s]
    #   alphaf = alpha-to-enter (p-value cutoff) for model entry at each step  [alpha_F]
    #   gammaf = FSR at each step                                              [gamma_F]
    #   prec_f   = integer of precision (num digits) desired in FSR output table
    
    ### Output:
    # table of [S   Var   p   p_s   alpha_F   gamma_F], dim = num_steps(== p) x 6
    
    import numpy as np
    import pandas as pd
    
    ### Round all arrays
    p_od = [format(x,prec_f) for x in p_orig]
    p_md = [format(x,prec_f) for x in p_mono]
    ad = [format(x,prec_f) for x in alphaf]
    gd = [format(x,prec_f) for x in gammaf]
    
    ### Combine the arrays
    tab = pd.DataFrame([size,vname,p_od,p_md,ad,gd]).T
    tab.columns = ['S','Var','p','p_m','alpha_F','gamma_F']
    
    return tab

In [61]:
# Compare speed of old and new table functions
%timeit fsrtable(sss, codnames, po, np.sort(po), af, gf)
%timeit fsrtable2(sss, codnames, po, np.sort(po), af, gf)

100 loops, best of 3: 2.39 ms per loop
1000 loops, best of 3: 1.72 ms per loop


In [63]:
# Confirm identical output
print fsrtable(sss, codnames, po, np.sort(po), af, gf)
print fsrtable2(sss, codnames, po, np.sort(po), af, gf)

     S  Var       p     p_m  alpha_F  gamma_F
0    1   V4  0.0000  0.0000   0.0071   0.0000
1    2   V7  0.0000  0.0000   0.0115   0.0000
2    3   V3  0.0000  0.0000   0.0167   0.0000
3    4  V11  0.0000  0.0000   0.0227   0.0000
4    5   V2  0.0003  0.0003   0.0300   0.0004
5    6  V10  0.0078  0.0078   0.0389   0.0100
6    7   V1  0.0116  0.0116   0.0500   0.0116
7    8   V5  0.0973  0.0800   0.0643   0.0756
8    9  V15  0.0800  0.0973   0.0833   0.0480
9   10  V12  0.1259  0.1259   0.1100   0.0572
10  11  V14  0.2040  0.2040   0.1500   0.0680
11  12   V8  0.2480  0.2480   0.2167   0.0572
12  13   V9  0.3679  0.3679   0.3500   0.0526
13  14   V6  0.6474  0.6474   0.7500   0.0432
14  15  V13  0.7110  0.7110   1.0000   0.7110
     S  Var       p     p_m alpha_F gamma_F
0    1   V4  0.0000  0.0000  0.0071  0.0000
1    2   V7  0.0000  0.0000  0.0115  0.0000
2    3   V3  0.0000  0.0000  0.0167  0.0000
3    4  V11  0.0000  0.0000  0.0227  0.0000
4    5   V2  0.0003  0.0003  0.0300  0.0004


In [10]:
d = pd.concat([Y2,X2],axis=1)
ffsr(d,0.05)

Unnamed: 0,S,Var,p,p_m,alpha_F,gamma_F
0,1,V4,0.0,0.0,0.0071,0.0
1,2,V7,0.0,0.0,0.0115,0.0
2,3,V3,0.0,0.0,0.0167,0.0
3,4,V11,0.0,0.0,0.0227,0.0
4,5,V2,0.0003,0.0003,0.03,0.0005
5,6,V10,0.0078,0.0078,0.0389,0.0112
6,7,V1,0.0116,0.0116,0.05,0.013
7,8,V5,0.0973,0.0973,0.0643,0.0681
8,9,V15,0.08,0.0973,0.0833,0.0681
9,10,V12,0.1259,0.1259,0.11,0.0687


In [5]:
%load_ext line_profiler

In [11]:
lstats = %lprun -r -f ffsr ffsr(d, 0.05)
lstats.print_stats()

Timer unit: 1e-06 s

Total time: 0.033678 s
File: <ipython-input-8-e22c52bb1641>
Function: ffsr at line 286

Line #      Hits         Time  Per Hit   % Time  Line Contents
   286                                           def ffsr(dat,g0=0.05,betaout=False,gs=None,max_size=None,var_incl=None,bag=False,prec_f='.4f'):
   287                                               
   288                                               ### Input params:
   289                                               #   dat      = python dataframe of original p covariates, 1 outcome (in first col.): n x p+1
   290                                               #   g0       = float pre-specified FSR of interest ("gamma0")
   291                                               #   betaout  = boolean of whether to include estimated betahats from final selected model
   292                                               #   gs       = float or vector of gamma's at which to specifically compute alpha_F
   293            

In [18]:
%timeit -n100 ffsr(d,0.05)

100 loops, best of 3: 18.8 ms per loop


In [16]:
import ffsr2_d10 as f2

In [19]:
%timeit -n100 f2.ffsr(X2,Y2,0.05)

100 loops, best of 3: 19.4 ms per loop


In [None]:
# Marginal improvement in speed: -0.6ms

# d.copy (730, 2.2%)                             [+0.3% b/c copying both X and Y now]
# Clean missing values (all of it; 2130, 6.3%)   [-3.8%]
# Forward sel proc (25216, 74.9%)                [+7.5%]
# Cov_order (531, 1.6%)                          [-0.5%]
# Pval_comp (1287, 3.8%)                         [+0.7%]
# Gamma_F (145, 0.4%)                            [0%]
# Fsrtable (3463, 10.3%)                         [-3.5%]

In [28]:
### Alternate version:

""" FastFSR function """
def ffsr2(dat,g0=0.05,betaout=False,gs=None,max_size=None,var_incl=None,bag=False,prec_f='.4f'):
    
    ### Input params:
    #   dat      = python dataframe of original p covariates, 1 outcome (in first col.): n x p+1
    #   g0       = float pre-specified FSR of interest ("gamma0")
    #   betaout  = boolean of whether to include estimated betahats from final selected model
    #   gs       = float or vector of gamma's at which to specifically compute alpha_F
    #   max_size = integer of largest model size == max num vars to incl in final model (default = num covs in dataset)
    #   var_incl = array of cols corresponding to those vars to force into model
    #   bag      = boolean of whether to output FSR table (non-bagging results) or reduced output for bagging purposes
    #   prec_f   = string of precision (num digits) desired in FSR output table (string to be given to 'format' python fcn)
    
    ### Output: 
    #      (note: gamma = FSR, gamma_0 = pre-specified/desired FSR)
    # Table of [S   Var   p   p_s   alpha_F   gamma_F], dim = num_steps(== p) x 6
    #   S:       model size at given step
    #   Var:     name of var that entered at given step
    #   p:       p-value of var that entered at given step
    #   p_m:     mono. inc. p-value (vector or original p-values arranged to be monotonically increasing)
    #   alpha_F: cutoff value for model entry given gamma_0 and current p_s value
    #   gamma_F: FSR given current alpha_F and model size (== step num)
    #       and
    #   vector of alpha_F's for specified gamma's (g)
    #       and
    #   vector of estimated beta param's for final model (based on g0)

    import numpy as np
    import pandas as pd
    
    ### Clean and check data - make sure dat = pandas dataframes or else convert them
    if bag==False:
        if df_type(dat)==True:
            if isinstance(dat,pd.DataFrame):
                d = dat.copy()
            else:
                if isinstance(dat,np.ndarray):
                    d = pd.DataFrame(dat)
                    vnum = list(np.arange(d.shape[1])+1)
                    vchr = list(np.repeat("V",d.shape[1]))
                    d.columns = [a + str(b) for a,b in zip(vchr,vnum)]
        else:
            return df_type(dat)
        
        ### Remove missing values
        d.dropna(inplace=True)
        
        ### Check that p < n to ensure regression solutions
        if (d.shape[1]-1) >= d.shape[0]:
            raise Exception("N must be > p for valid regression solutions")
    else:
        d = dat.copy()
    
    ### If max model size not specified, select all possible cov.s
    if max_size==None:
        max_size = d.shape[1]-1
        
    y, x = pd.DataFrame(d.iloc[:,0]), d.iloc[:,1:]
        
    ### Perform forward selection
    fwd_sel = forward(x, y, max_size, var_incl)
    
    ### Save order of covariate entry into model
    cov_entry_order = cov_order(d.columns.values[1:], max_size, var_incl)
    
    ### Compute p-value of each covariate entering the model
    p_orig = pval_comp(max_size)
    
    ### Arrange p-values in mono. inc. order
    p_mono = np.array([max(p_orig[:(i+1)]) for i in range(len(p_orig))])
        
    ### Gamma_F computation
    g_F = gamma_F(p_mono, d.shape[1], max_size)
    
    ### Check if betaout desired, if so compute beta_hat of model corresponding to specific gamma0
    if betaout==True or bag==True:
        betahats = beta_est(x, y, g0, g_F, cov_entry_order)
        
    ### Check if bagging desired
    if bag==False: 
        ### Alpha_F computation for all steps in fwd sel proc
        a_F = alpha_F(g0, d.shape[1]-1, max_size)
        
        ### Model size
        S = np.arange(max_size)+1
        
        ### Combine S, Cov_names, p-vals, sorted p-vals, alpha_F, gamma_F into table
        fsr_results = fsrtable(S, cov_entry_order, p_orig, p_mono, a_F, g_F)
        
        ### Return selected output: FSR table (+ betahat) (+ alpha_specific)
        if gs!=None: 
            ### Compute alpha_F for specific gammas (gs)
            if betaout==True:
                return fsr_results, betahats, alpha_F_g(gs, g_F, d.shape[1]-1)
            else:
                return fsr_results, alpha_F_g(gs, g_F, d.shape[1]-1)
        else:
            if betaout==True:
                return fsr_results, betahats
            else:
                return fsr_results
    else:
        return betahats, alpha_F_g(g0, g_F, d.shape[1]-1), len(betahats)

In [31]:
%timeit -n100 ffsr(d,0.05)
%timeit -n100 f2.ffsr(X2,Y2,0.05)
%timeit -n100 ffsr2(d,0.05)       # KEEP this version

100 loops, best of 3: 19.4 ms per loop
100 loops, best of 3: 20.7 ms per loop
100 loops, best of 3: 19.1 ms per loop


In [32]:
%load_ext rpy2.ipython

In [33]:
%%R -i X2,Y2

fsr.fast<-function(x,y,gam0=.05,digits=4,print=T,plot=F){
# estimated alpha for forward selection using Fast FSR (no simulation)
# typical call: fsr.fast(x=ncaa2[,1:19],y=ncaa2[,20])->out
# for use inside simulation loops, set print=F and plot=F
# version 7 circa Nov. 2009, modified to handle partially blank colnames
require(leaps)
ok<-complete.cases(x,y)
x<-x[ok,]                            # get rid of na's
y<-y[ok]                             # since regsubsets can't handle na's
m<-ncol(x)
n<-nrow(x)
if(m >= n) m1 <- n-5  else m1<-m     # to get rid of NA's in pv
vm<-1:m1
as.matrix(x)->x                      # in case x is a data frame
if(any(colnames(x)==""))colnames(x)<-NULL       # if only partially named columns
colnames(x)<-colnames(x,do.NULL=F,prefix="")    # corrects for no colnames
pvm<-rep(0,m1)                       # to create pvm below
regsubsets(x,y,method="forward")->out.x
pv.orig<-1-pf((out.x$rss[vm]-out.x$rss[vm+1])*(out.x$nn-(vm+1))/out.x$rss[vm+1],1,out.x$nn-(vm+1))
for (i in 1:m1){pvm[i]<-max(pv.orig[1:i])}  # sequential max of pvalues
alpha<-c(0,pvm)
ng<-length(alpha)
S<-rep(0,ng)                         # will contain num. of true entering in orig.
real.seq<-data.frame(var=(out.x$vorder-1)[2:(m1+1)],pval=pv.orig,
         pvmax=pvm,Rsq=round(1-out.x$rss[2:(m1+1)]/out.x$rss[1],4))
for (ia in 2:ng){                    # loop through alpha values for S=size
S[ia] <- sum(pvm<=alpha[ia])         # size of models at alpha[ia], S[1]=0
}
ghat<-(m-S)*alpha/(1+S)              # gammahat_ER
# add additional points to make jumps
alpha2<-alpha[2:ng]-.0000001
ghat2<-(m-S[1:(ng-1)])*alpha2/(1+S[1:(ng-1)])
zp<-data.frame(a=c(alpha,alpha2),g=c(ghat,ghat2))
zp<-zp[order(zp$a),]
gmax<-max(zp$g)
index.max<-which.max(zp$g)           # index of largest ghat
alphamax<-zp$a[index.max]            # alpha with largest ghat
# gmax<-max(ghat)
# index.max<-which.max(ghat)           # index of largest ghat
# alphamax<-alpha[index.max]           # alpha with largest ghat
ind<-(ghat <= gam0 & alpha<=alphamax)*1
Sind<-S[max(which(ind > 0))]           # model size with ghat just below gam0
alphahat.fast<-(1+Sind)*gam0/(m-Sind)  # ER est.
size1<-sum(pvm<=alphahat.fast)+1       # size of model including intercept
x<-x[,colnames(x)[(out.x$vorder-1)[2:size1]]]
if(size1>1) x.ind<-(out.x$vorder-1)[2:size1]  else x.ind<-0
if (size1==1) {mod <- lm(y~1)} else {mod <- lm(y~x)}
# ghat3<-(m-size1+1)*alpha/(1+S)         # uses final ku est.
ghat4<-(m-size1+1)*alpha/(1+0:m)
#res<-data.frame(real.seq,ghigh=ghat2,glow=ghat[2:ng])
res<-data.frame(real.seq,g=ghat[2:ng])
if(print)print(round(res,digits))
#if(plot){
#plot(zp$a,zp$g,type="b",xlab="Alpha",ylab="Estimated Gamma",xlim=c(0,alphamax))
#points(alphahat.fast,gam0,pch=19)
#lines(c(-1,alphahat.fast),c(gam0,gam0))
#lines(c(alphahat.fast,alphahat.fast),c(-1,gam0))
#}  # ends plot
return(list(mod=mod,size=size1-1,x.ind=x.ind,alphahat.ER=alphahat.fast))
}

x2 = as.matrix(X2)
y2 = as.matrix(Y2)

#system.time(fsr.fast(x=x2,y=y2))
start = proc.time()
out <- fsr.fast(x=x2,y=y2)
print(out)
print(proc.time()-start)

Loading required package: leaps
   var   pval  pvmax    Rsq      g
1    4 0.0000 0.0000 0.3778 0.0000
2    7 0.0000 0.0000 0.6402 0.0000
3    3 0.0000 0.0000 0.8519 0.0000
4   11 0.0000 0.0000 1.0000 0.0000
5    2 0.0003 0.0003 1.0000 0.0004
6   10 0.0078 0.0078 1.0000 0.0100
7    1 0.0116 0.0116 1.0000 0.0116
8    5 0.0973 0.0973 1.0000 0.0584
9   15 0.0800 0.0973 1.0000 0.0584
10  12 0.1259 0.1259 1.0000 0.0572
11  14 0.2040 0.2040 1.0000 0.0680
12   8 0.2480 0.2480 1.0000 0.0572
13   9 0.3679 0.3679 1.0000 0.0526
14   6 0.6474 0.6474 1.0000 0.0432
15  13 0.7110 0.7110 1.0000 0.0000
$mod

Call:
lm(formula = y ~ x)

Coefficients:
(Intercept)          xV4          xV7          xV3         xV11          xV2  
  4.441e-16    6.000e+00    4.000e+00    5.000e+00    5.000e+00    2.758e-16  
       xV10          xV1  
 -7.575e-16    2.434e-17  


$size
[1] 7

$x.ind
[1]  4  7  3 11  2 10  1

$alphahat.ER
[1] 0.05

   user  system elapsed 
  0.028   0.008   0.042 


In [39]:
%timeit -n1 -r1 f = ffsr2(d,g0=0.05,betaout=True,gs=0.05)
f
# Faster by ~5ms

1 loops, best of 1: 23.3 ms per loop


(     S  Var       p     p_m alpha_F gamma_F
 0    1   V4  0.0000  0.0000  0.0071  0.0000
 1    2   V7  0.0000  0.0000  0.0115  0.0000
 2    3   V3  0.0000  0.0000  0.0167  0.0000
 3    4  V11  0.0000  0.0000  0.0227  0.0000
 4    5   V2  0.0003  0.0003  0.0300  0.0005
 5    6  V10  0.0078  0.0078  0.0389  0.0112
 6    7   V1  0.0116  0.0116  0.0500  0.0130
 7    8   V5  0.0973  0.0973  0.0643  0.0681
 8    9  V15  0.0800  0.0973  0.0833  0.0681
 9   10  V12  0.1259  0.1259  0.1100  0.0687
 10  11  V14  0.2040  0.2040  0.1500  0.0850
 11  12   V8  0.2480  0.2480  0.2167  0.0763
 12  13   V9  0.3679  0.3679  0.3500  0.0788
 13  14   V6  0.6474  0.6474  0.7500  0.0863
 14  15  V13  0.7110  0.7110  1.0000  0.0444,              beta       beta_se
 V4   6.000000e+00  1.672676e-15
 V7   4.000000e+00  1.676875e-15
 V3   5.000000e+00  1.668047e-15
 V11  5.000000e+00  2.016266e-15
 V2   1.137979e-14  1.855839e-15
 V10  5.481726e-15  1.817640e-15
 V1  -1.110223e-15  1.727931e-15, 0.05)

In [None]:
###########################################################
# Compare Python & R on NCSU's dataset
###########################################################

In [53]:
ncaa2 = pd.read_csv("ncaa_data2.txt", delim_whitespace=True, skipinitialspace=True)
type(ncaa2)

pandas.core.frame.DataFrame

In [54]:
%%R -i ncaa2

ncaa2 = as.matrix(ncaa2)

system.time(fsr.fast(x=ncaa2[,1:19],y=ncaa2[,20]))

   var   pval  pvmax    Rsq      g
1    2 0.0000 0.0000 0.7069 0.0000
2    3 0.0001 0.0001 0.7539 0.0004
3    5 0.0116 0.0116 0.7708 0.0270
4    4 0.0053 0.0116 0.7901 0.0270
5    7 0.0025 0.0116 0.8110 0.0270
6   17 0.0433 0.0433 0.8197 0.0804
7   15 0.0527 0.0527 0.8274 0.0791
8    6 0.1056 0.1056 0.8327 0.0864
9    9 0.0826 0.1056 0.8386 0.0864
10   8 0.0536 0.1056 0.8457 0.0864
11  12 0.2350 0.2350 0.8484 0.1566
12  10 0.2864 0.2864 0.8505 0.1542
13  13 0.3163 0.3163 0.8524 0.1054
14  18 0.2697 0.3163 0.8546 0.1054
15  11 0.4953 0.4953 0.8555 0.1238
16   1 0.6326 0.6326 0.8559 0.1116
17  14 0.7056 0.7056 0.8562 0.0784
18  19 0.8605 0.8605 0.8563 0.0453
19  16 0.9032 0.9032 0.8563 0.0000
   user  system elapsed 
  0.007   0.000   0.006 


In [65]:
cols = ncaa2.columns.tolist()
cols = cols[-1:] + cols[:-1]
print cols
nca2 = pd.DataFrame(ncaa2[cols],dtype='float')
#nca2

['y', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19']


In [66]:
%timeit -n1 -r1 ffsr2(nca2,0.05)

1 loops, best of 1: 23.8 ms per loop


In [67]:
%%file ffsr3_d11.py
""" Pseudocode for Fast FSR algorithm """

""" Date: 4/24/15 
    Modified: Improve missing value adjustments (code to test this comes after this cell) """


""" Data type check """
def df_type(dat):
    
    ### Input params:
    #   dat = dataset whose type is to be checked / transformed
    
    ### Output:
    #   error msg or True boolean
    
    import numpy as np
    import pandas as pd
    
    if isinstance(dat,pd.DataFrame)==False and isinstance(dat,np.ndarray)==False:
        raise Exception("Data must be pandas DataFrame")
    else:
        return True


    
""" p-value computation function """
def pval_comp(max_size=None):
    
    ### Input params:
    #   max_size = integer max no. of vars in final model (largest model size desired)
    
    ### Output:
    # array of p-values of each covariate at its given entry step
    
    ### NOTE: fwd should be a global-env R object (requires running 'forward' fcn prior to this fcn) ###
    
    import numpy as np
    import scipy.stats as st
    import rpy2.robjects as ro
    
    # Pull RSS values & num_obs from fwd_proc object
    rss = np.array(ro.r('fwd$rss'))
    N = np.array(ro.r('fwd$nn'))
    
    if max_size==None:
        max_size = len(rss)-1
    
    # vector of model sizes
    sizes = np.arange(max_size)+1
    
    # compute the F stats as defined above where p_f - p_r = 1 for each iteration
    fstats = (rss[0:max_size] - rss[1:(max_size+1)]) / (rss[1:(max_size+1)] / (N - (sizes+1)))
    
    # return the p-values by comparing these stats to the F distn: F(1, n - p_f)
    return 1 - st.f.cdf(fstats, 1, N-(sizes+1))



""" Covariate model entry order """
def cov_order(xcolnames,max_size=None,col_incl=None):
    
    # Input params:
    #   xcolnames = array of names of covariates (same order as columns in original dataset)
    #   max_size  = integer max no. of vars in final model (largest model size desired)
    #   col_incl  = array vector of columns to forcefully include in all models
    
    ### Output:
    # array of covariate names sorted according to order of entry into the model
    
    ### NOTE: fwd should be a global-env R object (requires running 'forward' fcn prior to this fcn) ###
    
    import numpy as np
    import rpy2.robjects as ro
    
    if max_size==None:
        max_size = len(xcolnames)
        
    ### Pull the cov entry order
    vorder = ro.r('fwd$vorder[-1]') # remove intercept
    vorder = vorder[0:max_size] # keep only the max model size number of covs
    
    ### Shift these values down by two (one to exclude intercept, one to make python indices)
    vorderinds = np.array(vorder)-2
    
    ### Rearrange the var order st forced vars are at start of list
    if col_incl==None:
        col_incl = np.arange(max_size)+1
    keep = xcolnames[[col_incl-1]] # pull var names of those vars forced into model (this is an array)
    poss = [x for x in xcolnames if x not in keep] # pull var names of those not forced in (this is a list)
    col_names = np.array(list(keep)+poss) # = rearranged array of varnames w/forced-in vars at start of list
    
    ### Sort the columns of X in order to obtain the var names in the entry order
    return col_names[vorderinds[::]]

    

""" Forward selection function """
def forward(x,y,max_size=None,col_incl=None):
    
    ### Input params:
    #   x        = python dataframe of original p covariates, n x p
    #   y        = python outcome dataframe, n x 1
    #   max_size = integer max no. of vars in final model (largest model size desired)
    #   col_incl = array vector of columns to forcefully include in all models
    
    ### Output:
    # regsubsets R object -- the raw full output of the forward selection proc
    
    ### Load python packages to call R functions
    import rpy2.robjects as ro
    import pandas.rpy.common as com
    
    ### Convert x and y to R matrices <-- MAKE SURE x,y input == DATAFRAMES (or else change them to df's)!!!
    ### and declare as R objects in global environment
    ro.globalenv['x2'] = com.convert_to_r_matrix(x)
    ro.globalenv['y2'] = com.convert_to_r_matrix(y)
    if max_size==None:
        max_size = x.shape[1]
    ro.globalenv['maxv'] = ro.Vector(max_size)
    if col_incl==None:
        ro.r('coli=NULL')
    else:
        ro.globalenv['coli'] = ro.FloatVector(col_incl[:])
    
    ### Perform forward selection with regsubsets function
    ro.globalenv['fwd'] = ro.r('leaps::regsubsets(x=x2,y=y2,method="forward",nvmax=maxv,force.in=coli)')
    
    
    
""" Gamma computation """
def gamma_F(pvs, ncov, max_size=None):
    
    ### Input params:
    #   pvs      = vector of p-values (monotonically increasing) from forward sel procedure
    #   ncov     = integer total number of covariates in data
    #   max_size = integer max no. of vars in final model (largest model size desired)
    
    ### Output:
    # array of gamma_F values
    
    import numpy as np
    
    if max_size==None:
        max_size = ncov
        
    # Create indices == model size at given step, call this S
    S = np.arange(max_size)+1
    
    # gamma_F_i = p_s_i * (ncov - S_i) / (1 + S_i)
    g_F = pvs * (ncov - S) / (1 + S)
    
    # Check for duplicate p-values
    dups = list(set([x for x in list(pvs) if list(pvs).count(x) > 1]))
    for i in range(len(dups)): g_F[pvs==dups[i]] = min(g_F[pvs==dups[i]])
    
    # if table run on all vars, the last gamma = 0,
    #  instead set equal to the last pv_mono == final rate of unimp var inclusion
    if(g_F[-1]==0): 
        g_F[-1]=pvs[-1]
    
    return g_F

    
    
""" Alpha computation for model selection """
def alpha_F(g0, ncov, max_size=None):
    
    ### Input params:
    #   g0       = float pre-specified FSR (gamma0)
    #   ncov     = integer total number of covariates in data
    #   max_size = integer max no. of vars in final model (largest model size desired)
    
    ### Output:
    # array of alpha_F values
    
    import numpy as np
    
    if max_size==None:
        max_size = ncov
        
    # Create indices == model size at given step, call this S
    S = np.arange(max_size)+1
    
    # alpha_F_i = gamma_0 * (1 + S_i) / (ncov - S_i)
    alpha_F = g0 * (1 + S) / (ncov - S)
    
    # if table run on all vars, the last alpha = inf
    #  instead set equal to 1 == include all vars
    alpha_F[np.isinf(alpha_F)] = 1.
    
    return alpha_F        
    
    
    
""" Alpha computation for specific gamma """
def alpha_F_g(g, gf, ncov):
    
    ### Input params:
    #   g    = float or vector (length k) of specified FSR at which to compute alpha
    #   gf   = vector gamma_F's computed from gamma0, pv_sorted
    #          used to compute largest size model (S) for which gamma_F < g
    #   ncov = integer of total number covariates in data
    
    ### Output:
    # integer alpha_F value
    
    import numpy as np
    
    ### Compute model size for gf closest to (but still <) g
    #S = np.array([max(np.which(x<=y)) for x in gf y in g])+1
    if isinstance(g,np.ndarray): # if g is a vector
        s_s = [np.where(gf>y) for y in g]
        S = np.array([min(x[0]) for x in s_s])
        return g * (1 + S) / (ncov - S)
    else: # if g is a number
        S = min(np.where(gf>g)[0])
        return g * (1 + S) / (ncov - S)


    
""" Beta-hat computation for specific gamma """
def beta_est(x, y, g, gf, vname):
    
    ### Input params:
    #   x      = python dataframe of original p covariates, n x p
    #   y      = python outcome dataframe, n x 1
    #   g      = float of specified FSR at which to compute alpha
    #   gf     = vector gamma_F's computed from gamma0, pv_mono
    #            used to compute largest size model (S) for which gamma_F < g
    #   vname  = ordered vector of names of vars entered into model under forward selection
    
    ### Output:
    # array of estimated parameters
    
    import numpy as np
    import statsmodels.api as sm
    
    ### Compute model size corresponding to g
    S = min(np.where(gf>g)[0])

    ### Pull the cov names of those vars included in the above size model
    modvars = vname[:S]

    ### Fit the linear model using the selected model vars
    fit = sm.OLS(y,x.loc[:,list(modvars)]).fit()
    betaout = pd.DataFrame([fit.params,fit.bse]).T
    betaout.columns = ['beta','beta_se']
    
    return betaout

    
    
""" FSR Results Table """
def fsrtable(size, vname, p_orig, p_mono, alphaf, gammaf, prec_f='.4f'):
    
    ### Input params:
    #   size   = model size at each step of forward sel proc                   [S]
    #   vname  = variable name that entered at each step (num vars = p)        [Var]
    #   p_orig = p-values at each step                                         [p]
    #   p_mono = ascending p-values                                            [p_s]
    #   alphaf = alpha-to-enter (p-value cutoff) for model entry at each step  [alpha_F]
    #   gammaf = FSR at each step                                              [gamma_F]
    #   prec_f = string of precision (num digits) desired in FSR output table
    
    ### Output:
    # table of [S   Var   p   p_s   alpha_F   gamma_F], dim = num_steps(== p) x 6
    
    import numpy as np
    import pandas as pd
    
    ### Round all arrays
    p_od = [format(x,prec_f) for x in p_orig]
    p_md = [format(x,prec_f) for x in p_mono]
    ad = [format(x,prec_f) for x in alphaf]
    gd = [format(x,prec_f) for x in gammaf]
    
    ### Combine the arrays
    tab = pd.DataFrame([size,vname,p_od,p_md,ad,gd]).T
    tab.columns = ['S','Var','p','p_m','alpha_F','gamma_F']
    
    return tab
    
    
    
""" FastFSR function """
def ffsr(dat,g0=0.05,betaout=False,gs=None,max_size=None,var_incl=None,bag=False,prec_f='.4f'):
    
    ### Input params:
    #   dat      = python dataframe of original p covariates, 1 outcome (in first col.): n x p+1
    #   g0       = float pre-specified FSR of interest ("gamma0")
    #   betaout  = boolean of whether to include estimated betahats from final selected model
    #   gs       = float or vector of gamma's at which to specifically compute alpha_F
    #   max_size = integer of largest model size == max num vars to incl in final model (default = num covs in dataset)
    #   var_incl = array of cols corresponding to those vars to force into model
    #   bag      = boolean of whether to output FSR table (non-bagging results) or reduced output for bagging purposes
    #   prec_f   = string of precision (num digits) desired in FSR output table (string to be given to 'format' python fcn)
    
    ### Output: 
    #      (note: gamma = FSR, gamma_0 = pre-specified/desired FSR)
    # Table of [S   Var   p   p_s   alpha_F   gamma_F], dim = num_steps(== p) x 6
    #   S:       model size at given step
    #   Var:     name of var that entered at given step
    #   p:       p-value of var that entered at given step
    #   p_m:     mono. inc. p-value (vector or original p-values arranged to be monotonically increasing)
    #   alpha_F: cutoff value for model entry given gamma_0 and current p_s value
    #   gamma_F: FSR given current alpha_F and model size (== step num)
    #       and
    #   vector of alpha_F's for specified gamma's (g)
    #       and
    #   vector of estimated beta param's for final model (based on g0)

    import numpy as np
    import pandas as pd
    
    ### Clean and check data - make sure dat = pandas dataframes or else convert them
    if bag==False:
        if df_type(dat)==True:
            if isinstance(dat,pd.DataFrame):
                d = dat.copy()
            else:
                if isinstance(dat,np.ndarray):
                    d = pd.DataFrame(dat)
                    vnum = list(np.arange(d.shape[1])+1)
                    vchr = list(np.repeat("V",d.shape[1]))
                    d.columns = [a + str(b) for a,b in zip(vchr,vnum)]
        else:
            return df_type(dat)
        
        ### Remove missing values
        d.dropna(inplace=True)
        
        ### Check that p < n to ensure regression solutions
        if (d.shape[1]-1) >= d.shape[0]:
            raise Exception("N must be > p for valid regression solutions")
    else:
        d = dat.copy()
    
    ### If max model size not specified, select all possible cov.s
    if max_size==None:
        max_size = d.shape[1]-1
        
    y, x = pd.DataFrame(d.iloc[:,0]), d.iloc[:,1:]
        
    ### Perform forward selection
    fwd_sel = forward(x, y, max_size, var_incl)
    
    ### Save order of covariate entry into model
    cov_entry_order = cov_order(d.columns.values[1:], max_size, var_incl)
    
    ### Compute p-value of each covariate entering the model
    p_orig = pval_comp(max_size)
    
    ### Arrange p-values in mono. inc. order
    p_mono = np.array([max(p_orig[:(i+1)]) for i in range(len(p_orig))])
        
    ### Gamma_F computation
    g_F = gamma_F(p_mono, d.shape[1], max_size)
    
    ### Check if betaout desired, if so compute beta_hat of model corresponding to specific gamma0
    if betaout==True or bag==True:
        betahats = beta_est(x, y, g0, g_F, cov_entry_order)
        
    ### Check if bagging desired
    if bag==False: 
        ### Alpha_F computation for all steps in fwd sel proc
        a_F = alpha_F(g0, d.shape[1]-1, max_size)
        
        ### Model size
        S = np.arange(max_size)+1
        
        ### Combine S, Cov_names, p-vals, sorted p-vals, alpha_F, gamma_F into table
        fsr_results = fsrtable(S, cov_entry_order, p_orig, p_mono, a_F, g_F)
        
        ### Return selected output: FSR table (+ betahat) (+ alpha_specific)
        if gs!=None: 
            ### Compute alpha_F for specific gammas (gs)
            if betaout==True:
                return fsr_results, betahats, alpha_F_g(gs, g_F, d.shape[1]-1)
            else:
                return fsr_results, alpha_F_g(gs, g_F, d.shape[1]-1)
        else:
            if betaout==True:
                return fsr_results, betahats
            else:
                return fsr_results
    else:
        return betahats, alpha_F_g(g0, g_F, d.shape[1]-1), len(betahats)

    
    
def bagfsr(dat,g0,B=200,max_s=None,v_incl=None,prec=4):
    
    ### Input params:
    #   dat    = python dataframe of original p covariates, 1 outcome (in first col.): n x p+1
    #   g0     = float pre-specified FSR of interest ("gamma0")
    #   B      = integer of number of bagged samples
    #   max_s  = integer of largest model size == max num vars to incl in final model (default = num covs in dataset)
    #   v_incl = array of cols corresponding to those vars to force into model
    #   prec   = integer of precision (num digits) desired in beta-hat parameter estimates of final model
    
    ### Output: 
    #   Mean of betahats
    #   SEs of betahats
    #   Avg alpha-to-enter
    #   Avg model size
    #   Prop of times each var included in model
    
    import numpy as np
    import pandas as pd
    
    ### Clean and check data - make sure X, Y = pandas dataframes or else convert them
    if df_type(dat)==True:
        if isinstance(dat,pd.DataFrame):
            d = dat.copy()
        else:
            if isinstance(dat,np.ndarray):
                d = pd.DataFrame(dat)
                vnum = list(np.arange(d.shape[1])+1)
                vchr = list(np.repeat("V",d.shape[1]))
                d.columns = [a + str(b) for a,b in zip(vchr,vnum)]
    else:
        return df_type(dat)
    
    ### Remove missing values
    d.dropna(inplace=True)
    
    ### check that p < n to ensure regression solutions
    if (d.shape[1]-1) >= d.shape[0]:
        raise Exception("N must be > p for valid regression solutions")
    
    ### Create array to keep track of number of times vars enter model
    nentries = pd.DataFrame(np.zeros(d.shape[1]-1),index=d.columns.values[1:])
    
    ### Create array to store all estimated coefficients, ses, alphas, sizes
    allbetas = pd.DataFrame(np.zeros([B,(d.shape[1]-1)]),columns=d.columns.values[1:])
    allses = allbetas.copy()
    alphas = []
    sizes = []
    np.random.seed(1234)
    
    ### Bagging loops
    for i in range(B):

        # Draw with replacement from rows of data
        n_row = d.shape[0]
        rand_row = np.random.randint(0,n_row,n_row)
        newdat = d.iloc[rand_row,:]
        newdat.index = np.arange(n_row)+1
        
        ### Obtain FSR results
        fsrout = ffsr(newdat.iloc[:,1:],pd.DataFrame(newdat.iloc[:,0]),g0,bag=True,max_size=max_s,var_incl=v_incl)
        allbetas.loc[i,fsrout[0].index.values] = fsrout[0].iloc[:,0]
        allses.loc[i,fsrout[0].index.values] = fsrout[0].iloc[:,1]
        alphas.append(fsrout[1])
        sizes.append(fsrout[2])

        ### Update counts num times var included
        nentries.loc[fsrout[0].index[np.abs(np.around(fsrout[0].iloc[:,0],prec))>0]] += 1
        
    ### Compute averages
    avgbeta = np.around(allbetas.mean(axis=0),prec) # mean across rows / colmeans == mean of each cov's betahat
    avgse = np.around(allses.mean(axis=0),prec)
    avgalpha = np.mean(alphas)
    avgsize = np.mean(sizes)
    var_props = nentries/float(B)
    cov_res = pd.concat([avgbeta,avgse,var_props],axis=1)
    cov_res.columns = ['betahat','betase','prop_incl']
    
    return cov_res, avgalpha, avgsize
    
    

# Notes: 
# 1. appropriate transformations are expected to have been applied prior to utilization of FSR algorithm

Writing ffsr3_d11.py


In [49]:
b = bagfsr(X,Y,0.05)

In [50]:
b[0]

Unnamed: 0,betahat,betase,prop_incl
V1,0,0,0
V2,0,0,0
V3,5,0,1
V4,6,0,1
V5,0,0,0
V6,0,0,0
V7,4,0,1
V8,0,0,0
V9,0,0,0
V10,0,0,0


In [51]:
b[1:]

(0.038899999999999997, 6.0)