In [None]:
""" Pseudocode for Fast FSR algorithm """

""" Date: 3/25/15 
    First running, basic version of FFSR functions """

    
""" p-value computation function """
def pval_comp(fwd_proc,ncov):
    """Compute the p-value of each covariate at its entry step.

    Every step k = 1..ncov is tested with the partial F-test for adding a
    single covariate (p_f - p_r = 1):

        F_k  = (RSS_r - RSS_f) / (RSS_f / (n - p_f))   ~   F(1, n - p_f)
        pval = P(F > F_k)                               (upper tail)

    where, as indexed below, RSS_r = rss[k], RSS_f = rss[k+1] and
    p_f = k + 1 (R indices are 1-based).

    Input params:
      fwd_proc = leaps::regsubsets object (e.g. the value returned by forward())
      ncov     = number of covariates

    Output:
      numpy array of p-values of each covariate at its given entry step

    Side effects:
      assigns out_r, ncov, ncovs and fstats in the global R environment.

    Relies on the module-level names `ro` (rpy2.robjects) and `np` (numpy).
    """

    # make fwd an R object in the global environment
    ro.globalenv['out_r'] = fwd_proc

    # create vector from 1 to number of covariates
    ro.globalenv['ncov'] = ncov
    ro.globalenv['ncovs'] = ro.r('1:ncov')

    # compute the F stats as defined above; the denominator RSS_f / (n - p_f)
    # must be fully parenthesised (the previous string was missing its last ')')
    ro.globalenv['fstats'] = ro.r(
        '(out_r$rss[ncovs]-out_r$rss[ncovs+1]) / '
        '(out_r$rss[ncovs+1] / (out_r$nn-(ncovs+1)))')

    # upper-tail probability under F(1, n - p_f); FALSE is spelled out because
    # the shorthand F is an ordinary R variable that can be reassigned
    return np.array(ro.r('pf(fstats,1,out_r$nn-(ncovs+1),lower.tail=FALSE)'))



""" Covariate model entry order """
def cov_order(fwd_proc,xcolnames):
    """Return the covariate names in their order of entry into the model.

    Input params:
      fwd_proc  = leaps::regsubsets object (e.g. the value returned by forward())
      xcolnames = names of covariates, in the column order of X

    Output:
      numpy array of covariate names sorted according to order of entry
      into the model (one name per covariate; the intercept is not included)

    Side effects:
      assigns out_cov in the global R environment.

    Relies on the module-level names `ro` (rpy2.robjects) and `np` (numpy).
    """

    ### Declare fwd as an object in the global R environment
    ro.globalenv['out_cov'] = fwd_proc

    ### Pull the cov entry order (1-based columns of the design matrix)
    vorder = np.array(ro.r('out_cov$vorder'), dtype=int)

    ### The first entry is the intercept (design column 1), so it is dropped.
    ### Without this, "-2" maps it to index -1, which prepends the LAST
    ### covariate name and returns p+1 names.
    ### Remaining values are shifted down by two: one because the intercept
    ### occupies column 1, one to make python (0-based) indices.
    vorderinds = vorder[1:] - 2

    ### Index the names to obtain the var names in the entry order
    return np.asarray(xcolnames)[vorderinds]
    


""" Forward selection function """
def forward(x,y,nvmax=None):
    """Run forward selection of y on x with R's leaps::regsubsets.

    Input params:
      x     = python dataframe of original p covariates, n x p
      y     = python outcome dataframe, n x 1
      nvmax = value passed to regsubsets' `nvmax` argument (maximum subset
              size); the default, None, uses ncol(x), which is what this
              function always did before the argument existed

    Output:
      a regsubsets R object -- the raw full output of the forward selection proc

    Notes:
      * x and y are handed to convert_to_r_matrix unchanged, so they must
        already be DataFrames.
      * pval_comp indexes rss[1..ncov+1]; a fit truncated through nvmax has to
        be paired with a matching ncov there.
    """

    ### Load python packages to call R functions
    import rpy2.robjects as ro
    import pandas.rpy.common as com
    from rpy2.robjects.packages import importr

    ### Load base R package & regsubsets fcn from leaps R library
    regsub = ro.r('leaps::regsubsets')
    base = importr('base')

    ### Convert x and y to R matrices <-- MAKE SURE x,y input == DATAFRAMES (or else change them to df's)!!!
    X = com.convert_to_r_matrix(x)
    Y = com.convert_to_r_matrix(y)

    ### Default: allow every column of X to enter
    if nvmax is None:
        nvmax = base.ncol(X)

    ### Perform forward selection with regsubsets function
    return regsub(x=X,y=Y,method="forward",nvmax=nvmax)
    
    
    
""" FSR Results Table """
def fsrtable(size, vname, p_orig, p_sort, alphaf, gammaf):
    """Assemble the FSR results table from its six per-step columns.

    Input params:
      size   = model size at each step of forward sel proc                   [S]
      vname  = variable name that entered at each step (num vars = p)        [Var]
      p_orig = p-values at each step                                         [p]
      p_sort = ascending p-values                                            [p_s]
      alphaf = alpha-to-enter (p-value cutoff) for model entry at each step  [alpha_F]
      gammaf = FSR at each step                                              [gamma_F]

    Output:
      DataFrame with columns [S   Var   p   p_s   alpha_F   gamma_F],
      dim = num_steps(== p) x 6

    Relies on the module-level name `pd` (pandas).
    """
    labels = ['S','Var','p','p_s','alpha_F','gamma_F']
    pieces = (size, vname, p_orig, p_sort, alphaf, gammaf)

    # Wrap each input in its own frame, then lay the frames side by side
    tab = pd.concat([pd.DataFrame(piece) for piece in pieces], axis=1)
    tab.columns = labels

    return tab
    
    
    
""" FastFSR function """
def ffsr(x,y,g0):
    """Run the Fast FSR procedure and return its results table.

    Input params:
      x  = python dataframe of original p covariates, n x p
      y  = python outcome dataframe, n x 1
      g0 = pre-specified FSR of interest

    Output:
         (note: gamma = FSR, gamma_0 = pre-specified/desired FSR)
      Table of [S   Var   p   p_s   alpha_F   gamma_F], dim = num_steps(== p) x 6
        S:       model size at given step
        Var:     name of var that entered at given step
        p:       p-value of var that entered at given step
        p_s:     sorted p-value (vector of original p-values sorted in increasing order)
        alpha_F: cutoff value for model entry given gamma_0 and current p_s value
        gamma_F: FSR given current alpha_F and model size (== step num)

    Calls forward, cov_order, pval_comp and fsrtable, plus alpha_F and gamma_F,
    which must be defined before this function is called.
    """

    import numpy as np

    ### Clean and check data  (TODO: none of these checks is implemented yet)
    # make sure x,y = pandas dataframes or else convert them
    # remove missing values
    # check that p < n to ensure regression solutions

    ncov = x.shape[1]

    ### Perform forward selection
    fwd_sel = forward(x,y)

    ### Save order of covariate entry into model
    cov_entry_order = cov_order(fwd_sel,x.columns.values)

    ### Compute p-value of each covariate entering the model
    p_orig = pval_comp(fwd_sel,ncov)

    ### Sort p-values in ascending order
    p_sort = np.sort(p_orig)

    ### Alpha_F computation
    # g0 is the pre-specified FSR; this line previously used the undefined
    # name `gamma0`, which raised NameError.
    # NOTE(review): the description above says alpha_F depends on the current
    # p_s value, yet the unsorted p_orig is passed here while gamma_F receives
    # p_sort -- confirm against alpha_F's definition.
    a_F = alpha_F(g0, p_orig, ncov)

    ### Gamma_F computation
    g_F = gamma_F(a_F, p_sort, ncov)

    ### Model size
    S = np.arange(ncov)+1

    ### Combine S, Cov_names, p-vals, sorted p-vals, alpha_F, gamma_F into table
    fsr_results = fsrtable(S, cov_entry_order, p_orig, p_sort, a_F, g_F)

    return fsr_results

# Notes: 
# 1. will need to adjust above functions to handle fwd_sel steps with tied p-values for >1 cov
# 2. need to adjust function to control how many vars shown in final output
# 3. appropriate transformations are expected to have been applied prior to utilization of FSR algorithm

In [1]:
%load_ext rpy2.ipython # code to load/connect to R software

In [19]:
import rpy2.robjects as ro
import pandas.rpy.common as com
from rpy2.robjects.packages import importr

# Load the R packages through rpy2; `base` is used below for base.ncol(...)
leaps = importr('leaps')
stats = importr('stats')
base = importr('base')

# Look up leaps::regsubsets so it can be called from Python as regsub(...)
regsub = ro.r('leaps::regsubsets')

In [20]:
import numpy as np
import pandas as pd

# Fix the seed so the simulated design matrix is reproducible
np.random.seed(1234)

# 100 draws of 15 covariates with zero mean and identity covariance
X = np.random.multivariate_normal(mean=np.zeros(15), cov=np.eye(15), size=100)

# Coefficient column vector: only covariates 3, 4, 7 and 11 (1-based) are non-zero
beta = np.zeros((15, 1), dtype=int)
beta[[2, 3, 6, 10], 0] = [5, 6, 4, 5]

# NOTE(review): no error term is added, so Y is an exact linear function of X
Y = np.dot(X, beta)

In [21]:
# Wrap the simulated arrays in DataFrames; covariates are labelled V1..V15
Y2 = pd.DataFrame(Y)
X2 = pd.DataFrame(X, columns=["V%d" % idx for idx in range(1, 16)])

In [22]:
# Convert the pandas DataFrames to R matrices for use as regsubsets inputs
y2 = com.convert_to_r_matrix(Y2)
x2 = com.convert_to_r_matrix(X2)

In [23]:
# Forward selection with the subset-size cap (nvmax) set to the number of columns of x2
out = regsub(x=x2,y=y2,method="forward",nvmax=base.ncol(x2))

In [24]:
# Expose the fit to R as `out2`, then print the entry order with and without
# its first element (R's negative index [-1] drops element 1)
ro.globalenv['out2'] = out
print(ro.r('out2$vorder'))
print(ro.r('out2$vorder[-1]'))

 [1]  1  5  8  4 12  3 11  2  6 16 13 15  9 10  7 14

 [1]  5  8  4 12  3 11  2  6 16 13 15  9 10  7 14



In [25]:
# Entry order with the leading element removed (see the two printouts above)
varorder = ro.r('out2$vorder[-1]')

# Shift by two so the values can index the columns of X2 (0-based)
vorder2 = np.array(varorder) - 2
print(list(vorder2 + 1))

# Covariate names arranged in order of entry
col_nam = X2.columns.values
vnames = col_nam[vorder2]
print(vnames)
print(type(vnames))

[4, 7, 3, 11, 2, 10, 1, 5, 15, 12, 14, 8, 9, 6, 13]
['V4' 'V7' 'V3' 'V11' 'V2' 'V10' 'V1' 'V5' 'V15' 'V12' 'V14' 'V8' 'V9' 'V6'
 'V13']
<type 'numpy.ndarray'>


In [27]:
# Step index vector 1..ncov in the R global environment
ro.globalenv['ncov'] = base.ncol(x2)
ro.globalenv['ncovs'] = ro.r('1:ncov')

# Per-step F statistic from the drop in RSS, and its upper-tail p-value
ro.globalenv['fstats'] = ro.r('(out2$rss[ncovs]-out2$rss[ncovs+1])*(out2$nn-(ncovs+1)) / out2$rss[ncovs+1]')
ps = np.array(ro.r('pf(fstats,1,out2$nn-(ncovs+1),lower.tail=F)'))

# Sorted F statistics, a blank line, then the sorted p-values rounded to 3 places
print(ro.r('sort(fstats)'))
print('')
print(np.around(np.sort(ps), 3))

 [1] 1.381848e-01 2.106624e-01 8.193067e-01 1.352818e+00 1.637391e+00
 [6] 2.386851e+00 2.807482e+00 3.134854e+00 6.637493e+00 7.394751e+00
[11] 1.440562e+01 5.950425e+01 7.074218e+01 1.371970e+02 9.273244e+31


[ 0.     0.     0.     0.     0.     0.008  0.012  0.08   0.097  0.126
  0.204  0.248  0.368  0.647  0.711]


In [28]:
# Model size at each step: 1..p, where p is the number of columns of X
sizes = np.arange(X.shape[1])+1

In [30]:
# One single-column frame per quantity: entry name, model size, entry p-value
cnam, sz, psd = (pd.DataFrame(col) for col in (vnames, sizes, ps))

# Place the three columns side by side and label them
ttt = pd.concat([cnam, sz, psd], axis=1)
ttt.columns = ['Vname','Size','p']
print(type(ttt))
print(ttt)

<class 'pandas.core.frame.DataFrame'>
   Vname  Size             p
0     V4     1  1.029111e-11
1     V7     2  3.567563e-13
2     V3     3  3.312340e-20
3    V11     4  0.000000e+00
4     V2     5  2.608114e-04
5    V10     6  7.805512e-03
6     V1     7  1.157750e-02
7     V5     8  9.725851e-02
8    V15     9  8.002123e-02
9    V12    10  1.259084e-01
10   V14    11  2.040472e-01
11    V8    12  2.479664e-01
12    V9    13  3.679118e-01
13    V6    14  6.474203e-01
14   V13    15  7.110283e-01
