In [1]:
                            """ Pseudocode for Fast FSR algorithm """

""" Date: 3/27/15 
    Modified: Defined the alpha (2) and gamma functions """

    
""" p-value computation function """
def pval_comp(fwd_proc,ncov):
    
    import scipy.stats as st
    
    ### Input params:
    #   fwd_proc = leaps::regsubsets object generated in Python <-- must be declared in global R env
    #   ncov     = integer number of covariates
    
    ### Output:
    # array of p-values of each covariate at its given entry step
    
    # make fwd an R object in the global environment
    ro.globalenv['out_r'] = fwd_proc
    
    # Pull RSS values & num_obs from fwd_proc object
    rss = np.array(ro.r('out_r$rss'))
    N = np.array(ro.r('out_r$nn'))
    
    # compute the F stats as defined above where p_f - p_r = 1 for each iteration
    fstats = (rss[0:ncov] - rss[1:(ncov+1)]) / (rss[1:(ncov+1)] / (N - (ncov+1)))
    
    # return the p-values by comparing these stats to the F distn: F(1, n - p_f)
    return 1 - st.f.cdf(fstats, 1, N-(ncov+1))



""" Covariate model entry order """
def cov_order(fwd_proc,xcolnames):
    
    # Input params:
    #   fwd_proc  = leaps::regsubsets object generated in Python <-- must be declared in global R env
    #   xcolnames = array of names of covariates
    
    ### Output:
    # array of covariate names sorted according to order of entry into the model
    
    ### Declare fwd as an object in the global R environment
    ro.globalenv['out_cov'] = fwd_proc
    
    ### Pull the cov entry order
    vorder = ro.r('out_cov$vorder[-1]')
    
    ### Shift these values down by two (one to exclude intercept, one to make python indices)
    vorderinds = np.array(vorder)-2
    
    ### Sort the columns of X in order to obtain the var names in the entry order
    return xcolnames[vorderinds[::]]

    

""" Forward selection function """
def forward(x,y):
    
    ### Input params:
    #   x  = python dataframe of original p covariates, n x p
    #   y  = python outcome dataframe, n x 1
    
    ### Output:
    # regsubsets R object -- the raw full output of the forward selection proc
    
    ### Load python packages to call R functions
    import rpy2.robjects as ro
    import pandas.rpy.common as com
    from rpy2.robjects.packages import importr
    
    ### Load base R package & regsubsets fcn from leaps R library
    regsub = ro.r('leaps::regsubsets')
    base = importr('base')
    
    ### Convert x and y to R matrices <-- MAKE SURE x,y input == DATAFRAMES (or else change them to df's)!!!
    X = com.convert_to_r_matrix(x)
    Y = com.convert_to_r_matrix(y)
    
    ### Perform forward selection with regsubsets function
    # Note #2: make nvmax an input argument
    return regsub(x=X,y=Y,method="forward",nvmax=base.ncol(X))
    
    
    
""" Alpha computation for model selection """
def alpha_F_m(g0, ncov):
    
    ### Input params:
    #   g0   = float pre-specified FSR (gamma0)
    #   ncov = integer number of covariates
    
    ### Output:
    # array of alpha_F values
    
    # Create indices == model size at given step, call this S
    S = np.arange(ncov)+1
    
    # alpha_F_i = gamma_0 * (1 + S_i) / (ncov - S_i)
    alpha_F = g0 * (1 + S) / (ncov - S)
    
    # if table run on all vars, the last alpha = inf
    #  instead set equal to 1 == include all vars
    alpha_F[np.isinf(alpha_F)] = 1
    
    return alpha_F
    
    
    
""" Gamma computation """
def gamma_F(pvs, ncov):
    
    import numpy as np
    
    ### Input params:
    #   pvs  = vector of p-values (sorted or unsorted) from forward sel procedure
    #   ncov = integer number of covariates
    
    ### Output:
    # array of gamma_F values
    
    # sort pvalues to be monotonically increasing 
    pv_s = np.sort(pvs)
    
    # Create indices == model size at given step, call this S
    S = np.arange(ncov)+1
    
    # gamma_F_i = p_s_i * (ncov - S_i) / (1 + S_i)
    g_F = pv_s * (ncov - S) / (1 + S)
    
    # if table run on all vars, the last gamma = 0,
    #  instead set equal to the last pv_sort == final rate of unimp var inclusion
    if(g_F[-1]==0): 
        g_F[-1]=pv_s[-1]
    
    return g_F
        
    
    
""" Alpha computation for specific gamma """
def alpha_F_g(g, gf, ncov):
    
    ### Input params:
    #   g    = float or vector (length k) of specified FSR at which to compute alpha
    #   gf   = vector gamma_F's computed from gamma0, pv_sorted
    #          used to compute largest size model (S) for which gamma_F < g
    #   ncov = integer of total number covariates in data
    
    ### Output:
    # integer alpha_F value
    
    ### Compute model size for (each) g
    S = np.array([max(np.which(x<=y)) for x in gf, y in g])+1
    
    return g * (1 + S) / (ncov - S)    

    
    
""" FSR Results Table """
def fsrtable(size, vname, p_orig, p_sort, alphaf, gammaf):
    
    ### Input params:
    #   size   = model size at each step of forward sel proc                   [S]
    #   vname  = variable name that entered at each step (num vars = p)        [Var]
    #   p_orig = p-values at each step                                         [p]
    #   p_sort = ascending p-values                                            [p_s]
    #   alphaf = alpha-to-enter (p-value cutoff) for model entry at each step  [alpha_F]
    #   gammaf = FSR at each step                                              [gamma_F]
    
    ### Output:
    # table of [S   Var   p   p_s   alpha_F   gamma_F], dim = num_steps(== p) x 6
    
    ### Convert all arrays to dataframes
    sized = pd.DataFrame(size)
    vnamed = pd.DataFrame(vname)
    p_od = pd.DataFrame(p_orig)
    p_sd = pd.DataFrame(p_sort)
    ad = pd.DataFrame(alphaf)
    gd = pd.DataFrame(gammaf)
    
    ### Combine the arrays
    tab = pd.concat([sized,vnamed,p_od,p_sd,ad,gd],axis=1)
    tab.columns = ['S','Var','p','p_s','alpha_F','gamma_F']
    
    return tab
    
    
    
""" FastFSR function """
def ffsr(x,y,g0):#,gs=Null):
    
    ### Input params:
    #   x  = python dataframe of original p covariates, n x p
    #   y  = python outcome dataframe, n x 1
    #   g0 = float pre-specified FSR of interest
    #   gs = float or vector of gamma's at which to specifically compute alpha_F
    
    ### Output: 
    #      (note: gamma = FSR, gamma_0 = pre-specified/desired FSR)
    # Table of [S   Var   p   p_s   alpha_F   gamma_F], dim = num_steps(== p) x 6
    #   S:       model size at given step
    #   Var:     name of var that entered at given step
    #   p:       p-value of var that entered at given step
    #   p_s:     sorted p-value (vector or original p-values sorted in increasing order)
    #   alpha_F: cutoff value for model entry given gamma_0 and current p_s value
    #   gamma_F: FSR given current alpha_F and model size (== step num)
    #       and
    # Vector of alpha_F's for specified gamma's (g)

    import numpy as np
    import pandas as pd
    
    ### Clean and check data
    # make sure x,y = pandas dataframes or else convert them
    # remove missing values
    # check that p < n to ensure regression solutions
    
    ### Perform forward selection
    fwd_sel = forward(x, y)
    
    ### Save order of covariate entry into model
    cov_entry_order = cov_order(fwd_sel, x.columns.values)
    
    ### Compute p-value of each covariate entering the model
    p_orig = pval_comp(fwd_sel, x.shape[1])
    
    ### Sort p-values in ascending order
    p_sort = np.sort(p_orig)
    
    ### Alpha_F computation for all steps in fwd sel proc
    a_F = alpha_F_m(g0, x.shape[1])
    
    ### Gamma_F computation
    g_F = gamma_F(p_sort, x.shape[1])
    
    ### Model size
    S = np.arange(x.shape[1])+1
    
    ### Combine S, Cov_names, p-vals, sorted p-vals, alpha_F, gamma_F into table
    fsr_results = fsrtable(S, cov_entry_order, p_orig, p_sort, a_Fm, g_F)
    
    ### Compute alpha_F for specific gamma0
    alphas = alpha_F_g(gs, g_f, x.shape[1])
    
    return fsr_results, alphas

# Notes: 
# 1. will need to adjust above functions to handle fwd_sel steps with tied p-values for >1 cov
# 2. need to adjust function to control how many vars shown in final output
# 3. appropriate transformations are expected to have been applied prior to utilization of FSR algorithm

In [1]:
###########################################################
### Code to test / build functions:

In [1]:
%load_ext rpy2.ipython # code to load/connect to R software

In [4]:
import rpy2.robjects as ro
import pandas.rpy.common as com
from rpy2.robjects.packages import importr

# load R package
leaps = importr('leaps')
stats = importr('stats')
base = importr('base')

regsub = ro.r('leaps::regsubsets')

In [2]:
import numpy as np
import pandas as pd

np.random.seed(1234)

X = np.random.multivariate_normal(np.zeros(15),np.eye(15),(100))
beta = np.array([0,0,5,6,0,0,4,0,0,0,5,0,0,0,0]).reshape(15,1)
Y = X.dot(beta)

In [3]:
Y2 = pd.DataFrame(Y)
X2 = pd.DataFrame(X)
X2.columns = ["V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15"]

In [5]:
y2 = com.convert_to_r_matrix(Y2)
x2 = com.convert_to_r_matrix(X2)

In [6]:
out = regsub(x=x2,y=y2,method="forward",nvmax=base.ncol(x2))
print(base.summary(out))

Subset selection object
15 Variables  (and intercept)
    Forced in Forced out
V1      FALSE      FALSE
V2      FALSE      FALSE
V3      FALSE      FALSE
V4      FALSE      FALSE
V5      FALSE      FALSE
V6      FALSE      FALSE
V7      FALSE      FALSE
V8      FALSE      FALSE
V9      FALSE      FALSE
V10     FALSE      FALSE
V11     FALSE      FALSE
V12     FALSE      FALSE
V13     FALSE      FALSE
V14     FALSE      FALSE
V15     FALSE      FALSE
1 subsets of each size up to 15
Selection Algorithm: forward
          V1  V2  V3  V4  V5  V6  V7  V8  V9  V10 V11 V12 V13 V14 V15
1  ( 1 )  " " " " " " "*" " " " " " " " " " " " " " " " " " " " " " "
2  ( 1 )  " " " " " " "*" " " " " "*" " " " " " " " " " " " " " " " "
3  ( 1 )  " " " " "*" "*" " " " " "*" " " " " " " " " " " " " " " " "
4  ( 1 )  " " " " "*" "*" " " " " "*" " " " " " " "*" " " " " " " " "
5  ( 1 )  " " "*" "*" "*" " " " " "*" " " " " " " "*" " " " " " " " "
6  ( 1 )  " " "*" "*" "*" " " " " "*" " " " " "*" "*" " " " " " "

In [7]:
ro.globalenv['out2'] = out
print(ro.r('out2$vorder'))
print(ro.r('out2$vorder[-1]'))

 [1]  1  5  8  4 12  3 11  2  6 16 13 15  9 10  7 14

 [1]  5  8  4 12  3 11  2  6 16 13 15  9 10  7 14



In [8]:
varorder = ro.r('out2$vorder[-1]')
vorder2 = np.array(varorder)-2
print list(vorder2+1)
col_nam = X2.columns.values
vnames = col_nam[vorder2[::]]
print vnames
print type(vnames)

[4, 7, 3, 11, 2, 10, 1, 5, 15, 12, 14, 8, 9, 6, 13]
['V4' 'V7' 'V3' 'V11' 'V2' 'V10' 'V1' 'V5' 'V15' 'V12' 'V14' 'V8' 'V9' 'V6'
 'V13']
<type 'numpy.ndarray'>


In [9]:
nc = X2.shape[1]
ncs = np.arange(nc)
nn = np.array(ro.r('out2$nn'))
rss = np.array(ro.r('out2$rss'))
fstats = (rss[:nc] - rss[1:(nc+1)]) / ( rss[1:(nc+1)] / (nn - (nc+1)))
print np.sort(fstats)
import scipy.stats as st
ps = 1 - st.f.cdf(fstats, 1, nn-(nc+1))
print
print np.around(np.sort(ps),3)

[  1.38184810e-01   2.08184008e-01   8.00253031e-01   1.30616893e+00
   1.56296418e+00   2.25275801e+00   2.59152172e+00   2.92586365e+00
   6.06031952e+00   6.67913008e+00   1.28731035e+01   5.10036417e+01
   6.12612668e+01   1.20047339e+02   8.19949956e+31]

[ 0.     0.     0.     0.     0.001  0.011  0.016  0.091  0.111  0.137
  0.215  0.256  0.374  0.649  0.711]


In [10]:
sizes = np.arange(X.shape[1])+1

In [18]:
# Date: 4/20/15
# Goal: more efficient table creation
#tttest = np.array([vnames,sizes,ps]).T.reshape(15,3)
pd.DataFrame([vnames,sizes,ps]).T#.T.reshape(15,3)
pd.DataFrame(np.array([vnames,sizes,ps])).T
#pd.DataFrame(tttest)

ValueError: only 2 non-keyword arguments accepted

In [44]:
cnam = pd.DataFrame(vnames)
sz = pd.DataFrame(sizes)
psd = pd.DataFrame(ps)
ttt = pd.concat([cnam,sz,psd],axis=1)
ttt.columns = ['Vname','Size','p']
print type(ttt)
print ttt

<class 'pandas.core.frame.DataFrame'>
   Vname  Size             p
0     V4     1  3.070005e-10
1     V7     2  1.345091e-11
2     V3     3  1.110223e-16
3    V11     4  0.000000e+00
4     V2     5  5.592588e-04
5    V10     6  1.148310e-02
6     V1     7  1.587059e-02
7     V5     8  1.111889e-01
8    V15     9  9.086165e-02
9    V12    10  1.371262e-01
10   V14    11  2.147034e-01
11    V8    12  2.563364e-01
12    V9    13  3.735738e-01
13    V6    14  6.493720e-01
14   V13    15  7.110283e-01


In [48]:
###########################################################
### Test functions:

In [62]:
fwd2 = forward(X2,Y2)

In [63]:
codnames = cov_order(fwd2,X2.columns.values)

In [64]:
po = pval_comp(fwd2,X2.shape[1])

In [65]:
gg00 = 0.05
af = alpha_F_m(gg00, X2.shape[1])

In [66]:
gf = gamma_F(po, X2.shape[1])

In [67]:
sss = np.arange(X2.shape[1])+1

In [68]:
fsr_results = fsrtable(sss, codnames, po, np.sort(po), af, gf)

In [69]:
fsr_results

Unnamed: 0,S,Var,p,p_s,alpha_F,gamma_F
0,1,V4,3.070005e-10,0.0,0.007143,0.0
1,2,V7,1.345091e-11,1.110223e-16,0.011538,4.810966e-16
2,3,V3,1.110223e-16,1.345091e-11,0.016667,4.035272e-11
3,4,V11,0.0,3.070005e-10,0.022727,6.754012e-10
4,5,V2,0.0005592588,0.0005592588,0.03,0.000932098
5,6,V10,0.0114831,0.0114831,0.038889,0.01476399
6,7,V1,0.01587059,0.01587059,0.05,0.01587059
7,8,V5,0.1111889,0.09086165,0.064286,0.07067017
8,9,V15,0.09086165,0.1111889,0.083333,0.06671337
9,10,V12,0.1371262,0.1371262,0.11,0.06233007
