In [2]:
                            """ Pseudocode for Fast FSR algorithm """

""" Date: 4/1/15 
    Modified: Output beta-hat estimates for final desired model """

    
""" p-value computation function """
def pval_comp(max_size=None,prec_f=4):
    
    import scipy.stats as st
    
    ### Input params:
    #   max_size = integer max no. of vars in final model (largest model size desired)
    #   prec_f   = integer of precision (num digits) desired in FSR output table
    
    ### Output:
    # array of p-values of each covariate at its given entry step
    
    ### NOTE: fwd should be a global-env R object (requires running 'forward' fcn prior to this fcn) ###
    
    # Pull RSS values & num_obs from fwd_proc object
    rss = np.array(ro.r('fwd$rss'))
    N = np.array(ro.r('fwd$nn'))
    
    if max_size==None:
        max_size = len(rss)-1
    
    # compute the F stats as defined above where p_f - p_r = 1 for each iteration
    fstats = (rss[0:max_size] - rss[1:(max_size+1)]) / (rss[1:(max_size+1)] / (N - (max_size+1)))
    
    # return the p-values by comparing these stats to the F distn: F(1, n - p_f)
    return np.around(1 - st.f.cdf(fstats, 1, N-(max_size+1)),prec_f)



""" Covariate model entry order """
def cov_order(xcolnames,max_size=None,col_incl=None):
    
    # Input params:
    #   xcolnames = array of names of covariates (same order as columns in original dataset)
    #   max_size  = integer max no. of vars in final model (largest model size desired)
    #   col_incl  = array vector of columns to forcefully include in all models
    
    ### Output:
    # array of covariate names sorted according to order of entry into the model
    
    ### NOTE: fwd should be a global-env R object (requires running 'forward' fcn prior to this fcn) ###
    
    if max_size==None:
        max_size = len(xcolnames)
        
    ### Pull the cov entry order
    vorder = ro.r('fwd$vorder[-1]') # remove intercept
    vorder = vorder[0:max_size] # keep only the max model size number of covs
    
    ### Shift these values down by two (one to exclude intercept, one to make python indices)
    vorderinds = np.array(vorder)-2
    
    ### Rearrange the var order st forced vars are at start of list
    if col_incl==None:
        col_incl = np.arange(max_size)+1
    keep = xcolnames[[col_incl-1]] # pull var names of those vars forced into model (this is an array)
    poss = [x for x in xcolnames if x not in keep] # pull var names of those not forced in (this is a list)
    col_names = np.array(list(keep)+poss) # = rearranged array of varnames w/forced-in vars at start of list
    
    ### Sort the columns of X in order to obtain the var names in the entry order
    return col_names[vorderinds[::]]

    

""" Forward selection function """
def forward(x,y,max_size=None,col_incl=None):
    
    ### Input params:
    #   x        = python dataframe of original p covariates, n x p
    #   y        = python outcome dataframe, n x 1
    #   max_size = integer max no. of vars in final model (largest model size desired)
    #   col_incl = array vector of columns to forcefully include in all models
    
    ### Output:
    # regsubsets R object -- the raw full output of the forward selection proc
    
    ### Load python packages to call R functions
    import rpy2.robjects as ro
    import pandas.rpy.common as com
    
    ### Convert x and y to R matrices <-- MAKE SURE x,y input == DATAFRAMES (or else change them to df's)!!!
    X = com.convert_to_r_matrix(x)
    Y = com.convert_to_r_matrix(y)
        
    ### Declare all objects as R objects in global environment
    ro.globalenv['x2'] = X
    ro.globalenv['y2'] = Y
    if max_size==None:
        max_size = x.shape[1]
    ro.globalenv['maxv'] = ro.Vector(max_size)
    if col_incl==None:
        ro.r('coli=NULL')
    else:
        ro.globalenv['coli'] = ro.FloatVector(col_incl[:])
    
    ### Perform forward selection with regsubsets function
    ro.globalenv['fwd'] = ro.r('leaps::regsubsets(x=x2,y=y2,method="forward",nvmax=maxv,force.in=coli)')
    
    
    
""" Alpha computation for model selection """
def alpha_F(g0, ncov, max_size=None, prec_f=6):
    
    ### Input params:
    #   g0       = float pre-specified FSR (gamma0)
    #   ncov     = integer total number of covariates in data
    #   max_size = integer max no. of vars in final model (largest model size desired)
    #   prec_f   = integer of precision (num digits) desired in FSR output table
    
    ### Output:
    # array of alpha_F values
    
    if max_size==None:
        max_size = ncov
        
    # Create indices == model size at given step, call this S
    S = np.arange(max_size)+1
    
    # alpha_F_i = gamma_0 * (1 + S_i) / (ncov - S_i)
    alpha_F = g0 * (1 + S) / (ncov - S)
    
    # if table run on all vars, the last alpha = inf
    #  instead set equal to 1 == include all vars
    alpha_F[np.isinf(alpha_F)] = 1.
    
    return np.around(alpha_F,prec_f)
    
    
    
""" Gamma computation """
def gamma_F(pvs, ncov, max_size=None, prec_f=4):
    
    import numpy as np
    
    ### Input params:
    #   pvs      = vector of p-values (sorted or unsorted) from forward sel procedure
    #   ncov     = integer total number of covariates in data
    #   max_size = integer max no. of vars in final model (largest model size desired)
    #   prec_f   = integer of precision (num digits) desired in FSR output table
    
    ### Output:
    # array of gamma_F values
    
    # sort pvalues to be monotonically increasing 
    pv_s = np.sort(pvs)
    
    if max_size==None:
        max_size = ncov
        
    # Create indices == model size at given step, call this S
    S = np.arange(max_size)+1
    
    # gamma_F_i = p_s_i * (ncov - S_i) / (1 + S_i)
    g_F = pv_s * (ncov - S) / (1 + S)
    
    # if table run on all vars, the last gamma = 0,
    #  instead set equal to the last pv_sort == final rate of unimp var inclusion
    if(g_F[-1]==0): 
        g_F[-1]=pv_s[-1]
    
    return np.around(g_F,prec_f)
        
    
    
""" Alpha computation for specific gamma """
def alpha_F_g(g, gf, ncov, prec_f):
    
    ### Input params:
    #   g    = float or vector (length k) of specified FSR at which to compute alpha
    #   gf   = vector gamma_F's computed from gamma0, pv_sorted
    #          used to compute largest size model (S) for which gamma_F < g
    #   ncov = integer of total number covariates in data
    #   prec_f   = integer of precision (num digits) desired in FSR output table
    
    ### Output:
    # integer alpha_F value
    
    ### Compute model size for gf closest to (but still <) g
    #S = np.array([max(np.which(x<=y)) for x in gf y in g])+1
    if isinstance(g,np.ndarray): # if g is a vector
        s_s = [np.where(gf>y) for y in g]
        S = np.array([min(x[0]) for x in s_s])
        return np.around(g * (1 + S) / (ncov - S),prec_f)
    else: # if g is a number
        S = min(np.where(gf>g)[0])
        return round(g * (1 + S) / (ncov - S),prec_f)


    
""" Beta-hat computation for specific gamma """
def beta_est(x, y, g, gf, vname, prec_b=6):
    
    ### Input params:
    #   x      = python dataframe of original p covariates, n x p
    #   y      = python outcome dataframe, n x 1
    #   g      = float of specified FSR at which to compute alpha
    #   gf     = vector gamma_F's computed from gamma0, pv_sorted
    #            used to compute largest size model (S) for which gamma_F < g
    #   vname  = ordered vector of names of vars entered into model under forward selection
    #   prec_b = precision on beta-hat estimates
    
    ### Output:
    # array of estimated parameters
    
    ### Compute model size corresponding to g
    S = min(np.where(gf>g)[0])
    
    ### Pull the cov names of those vars included in the above size model
    modvars = vname[:S]
    
    ### Create linear regression object
    from sklearn import linear_model
    linmod = linear_model.LinearRegression()

    ### Fit the linear model using the selected model vars
    linmod.fit(X2.loc[:,list(modvars)], Y2)
    
    return np.around(linmod.coef_, prec_b)

    
    
""" FSR Results Table """
def fsrtable(size, vname, p_orig, p_sort, alphaf, gammaf):
    
    ### Input params:
    #   size   = model size at each step of forward sel proc                   [S]
    #   vname  = variable name that entered at each step (num vars = p)        [Var]
    #   p_orig = p-values at each step                                         [p]
    #   p_sort = ascending p-values                                            [p_s]
    #   alphaf = alpha-to-enter (p-value cutoff) for model entry at each step  [alpha_F]
    #   gammaf = FSR at each step                                              [gamma_F]
    
    ### Output:
    # table of [S   Var   p   p_s   alpha_F   gamma_F], dim = num_steps(== p) x 6
    
    ### Convert all arrays to dataframes
    sized = pd.DataFrame(size)
    vnamed = pd.DataFrame(vname)
    p_od = pd.DataFrame(p_orig)
    p_sd = pd.DataFrame(p_sort)
    ad = pd.DataFrame(alphaf)
    gd = pd.DataFrame(gammaf)
    
    ### Combine the arrays
    tab = pd.concat([sized,vnamed,p_od,p_sd,ad,gd],axis=1)
    tab.columns = ['S','Var','p','p_s','alpha_F','gamma_F']
    
    return tab
    
    
    
""" FastFSR function """
def ffsr(x,y,g0=0.05,betaout=False,gs=None,max_size=None,var_incl=None,prec_f=4,prec_b=6):
    
    ### Input params:
    #   x        = python dataframe of original p covariates, n x p
    #   y        = python outcome dataframe, n x 1
    #   g0       = float pre-specified FSR of interest ("gamma0")
    #   gs       = float or vector of gamma's at which to specifically compute alpha_F
    #   max_size = integer of largest model size == max num vars to incl in final model (default = num covs in dataset)
    #   var_incl = array of cols corresponding to those vars to force into model
    #   prec_f   = integer of precision (num digits) desired in FSR output table
    #   prec_b   = integer of precision (num digits) desired in beta-hat parameter estimates of final model
    
    ### Output: 
    #      (note: gamma = FSR, gamma_0 = pre-specified/desired FSR)
    # Table of [S   Var   p   p_s   alpha_F   gamma_F], dim = num_steps(== p) x 6
    #   S:       model size at given step
    #   Var:     name of var that entered at given step
    #   p:       p-value of var that entered at given step
    #   p_s:     sorted p-value (vector or original p-values sorted in increasing order)
    #   alpha_F: cutoff value for model entry given gamma_0 and current p_s value
    #   gamma_F: FSR given current alpha_F and model size (== step num)
    #       and
    # Vector of alpha_F's for specified gamma's (g)

    import numpy as np
    import pandas as pd
    
    ### Clean and check data
    # make sure x,y = pandas dataframes or else convert them
    try: 
        x.columns.values #isinstance(x,pd.DataFrame)
    except:
        if isinstance(x,np.ndarray):
            x = pd.DataFrame(x)
            vnum = list(np.arange(x.shape[1])+1)
            vchr = list(np.repeat("V",x.shape[1]))
            x.columns = [a + str(b) for a,b in zip(vchr,vnum)]
        else:
            raise Exception("x must be pandas DataFrame")
    try: 
        y.columns.values #isinstance(y,pd.DataFrame)
    except:
        if isinstance(y,np.ndarray):
            y = pd.DataFrame(y)
        else:
            raise Exception("y must be pandas DataFrame")
    
    # remove missing values
    yna = np.isnan(np.array(y))
    xna = np.isnan(x).any(axis=1).reshape(x.shape[0],1)
    anyna = np.array([int(max(a,b)) for a,b in zip(xna,yna)])
    missrow = np.where(anyna==1)[0]
    y = y.drop(y.index[missrow])
    x = x.drop(x.index[missrow])
    
    # check that p < n to ensure regression solutions
    if x.shape[1] >= x.shape[0]:
        raise Exception("N must be > p for valid regression solutions")
    
    ### If max model size not specified, select all possible cov.s
    if max_size==None:
        max_size = x.shape[1]
        
    ### Perform forward selection
    fwd_sel = forward(x, y, max_size, var_incl)
    
    ### Save order of covariate entry into model
    cov_entry_order = cov_order(x.columns.values, max_size, var_incl)
    
    ### Compute p-value of each covariate entering the model
    p_orig = pval_comp(max_size, prec_f)
    
    ### Sort p-values in ascending order
    p_sort = np.sort(p_orig)
    
    ### Alpha_F computation for all steps in fwd sel proc
    a_F = alpha_F(g0, x.shape[1], max_size, prec_f)
    
    ### Gamma_F computation
    g_F = gamma_F(p_sort, x.shape[1], max_size, prec_f)
    
    ### Model size
    S = np.arange(max_size)+1
    
    ### Combine S, Cov_names, p-vals, sorted p-vals, alpha_F, gamma_F into table
    fsr_results = fsrtable(S, cov_entry_order, p_orig, p_sort, a_F, g_F)
    
    ### Return selected output: FSR table (+ betahat) (+ alpha_specific)
    if gs!=None: 
        ### Compute alpha_F for specific gammas (gs)
        if betaout==True:
            ### Compute beta_hat of model corresponding to specific gamma0
            return fsr_results, beta_est(x, y, g0, g_F, cov_entry_order, prec_b), alpha_F_g(gs, g_f, x.shape[1])
        else:
            return fsr_results, alpha_F_g(gs, g_f, x.shape[1])
    else:
        if betaout==True:
            ### Compute beta_hat of model corresponding to specific gamma0
            return fsr_results, beta_est(x, y, g0, g_F, cov_entry_order, prec_b)
        else:
            return fsr_results

# Notes: 
# 1. appropriate transformations are expected to have been applied prior to utilization of FSR algorithm

# To-do:
# 1. will need to adjust above functions to handle fwd_sel steps with tied p-values for >1 cov
# 2. adjust for optimal bagging

In [1]:
###########################################################
### Code to test / build functions:

In [1]:
#%load_ext rpy2.ipython # code to load/connect to R software

In [8]:
import rpy2.robjects as ro
import pandas.rpy.common as com
from rpy2.robjects.packages import importr

# load R package
leaps = importr('leaps')
stats = importr('stats')
base = importr('base')

regsub = ro.r('leaps::regsubsets')

In [3]:
import numpy as np
import pandas as pd

np.random.seed(1234)

X = np.random.multivariate_normal(np.zeros(15),np.eye(15),(100))
beta = np.array([0,0,5,6,0,0,4,0,0,0,5,0,0,0,0]).reshape(15,1) # signif betas: 3,4,7,11
Y = X.dot(beta)

In [4]:
Y2 = pd.DataFrame(Y)
X2 = pd.DataFrame(X)
X2.columns = ["V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15"]

In [6]:
# check type of Y2, Y
if isinstance(Y,np.ndarray):
    print 2
else:
    print 3

if isinstance(Y2,pd.DataFrame):
    print 2
else:
    print 3
    
if isinstance(Y2,np.ndarray):
    print 2
else:
    print 3

2


In [35]:
# remove missing obs
tx = np.random.multivariate_normal(np.zeros(15),np.eye(15),(100))
tb = np.array([0,0,5,6,0,0,4,0,0,0,5,0,0,0,0]).reshape(15,1) # signif betas: 3,4,7,11
ty= tx.dot(tb)
ty[[0,5,14]] = 'NaN'
tx[8,0], tx[40,5], tx[33,14] = 'NaN', 'NaN', 'NaN'
tty = pd.DataFrame(ty)
ttx = pd.DataFrame(tx)
ttx.columns = ["V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15"]
#yna = np.isnan(np.array(tty))
yna = np.isnan(tty).any(axis=1)
xna = np.isnan(ttx).any(axis=1).reshape(100,1)
anyna = np.array([int(max(x,y)) for x,y in zip(xna,yna)])

mrow = np.where(anyna==1)[0]
tyt = tty.drop(tty.index[mrow])
tyt.shape
txt = ttx.drop(ttx.index[mrow])
txt.shape
print ttx.iloc[6:11,0]
print
print txt.iloc[4:8,0]
print
print ttx.iloc[9,0], txt.iloc[6,0], txt.iloc[7,0] # original 9(8) --> new 6 [moves up by one for each of 0,5,8]

6    -0.359556
7    -1.033754
8          NaN
9    -0.701835
10   -0.638298
Name: V1, dtype: float64

6    -0.359556
7    -1.033754
9    -0.701835
10   -0.638298
Name: V1, dtype: float64

-0.701834673191 -0.701834673191 -0.6382981958


In [26]:
blah = [1,2,3,4]
blaah = list(np.repeat("V",4))
blaaah = [a + str(b) for a,b in zip(blaah,blah)]
print blaaah

list1 = [1,2,3,4,5]
list2 = ["one", "two", "three", "four", "five"]

print ''.join([str(a) + b for a,b in zip(list1,list2)])
print type(list2[0])

['V1', 'V2', 'V3', 'V4']
1one2two3three4four5five
<type 'str'>


In [46]:
ffsr(ttx.iloc[0:14,:],tty.iloc[0:14,:])
#np.where(np.isnan(ttx))

Exception: N must be > p for valid regression solutions

In [55]:
y2 = com.convert_to_r_matrix(Y2)
x2 = com.convert_to_r_matrix(X2)
fi = np.array([2,3,13])
mv = 8
ro.globalenv['mv'] = ro.Vector(mv)
ro.globalenv['x2'] = x2
ro.globalenv['y2'] = y2
ro.globalenv['fi'] = ro.FloatVector(fi[:])

In [89]:
ro.globalenv['out2'] = ro.r('leaps::regsubsets(x=x2,y=y2,method="forward",nvmax=ncol(x2),force.in=fi)')

ro.globalenv['out2'] = ro.r('leaps::regsubsets(x=x2,y=y2,method="forward",nvmax=mv,force.in=fi)')
out = ro.r('leaps::regsubsets(x=x2,y=y2,method="forward",nvmax=mv,force.in=fi)')

In [83]:
#ro.globalenv['out2'] = out
print (ro.r('summary(out2)'))
print(ro.r('out2$vorder'))    ### NOTE: even though restricted to nvmax = 8 vars, vorder is still full dataset col length!!
print(ro.r('out2$vorder[-1]'))

Subset selection object
15 Variables  (and intercept)
    Forced in Forced out
V2      FALSE      FALSE
V3       TRUE      FALSE
V13      TRUE      FALSE
V1      FALSE      FALSE
V4      FALSE      FALSE
V5      FALSE      FALSE
V6      FALSE      FALSE
V7      FALSE      FALSE
V8      FALSE      FALSE
V9      FALSE      FALSE
V10     FALSE      FALSE
V11     FALSE      FALSE
V12      TRUE      FALSE
V14     FALSE      FALSE
V15     FALSE      FALSE
1 subsets of each size up to 8
Selection Algorithm: forward
         V2  V3  V13 V1  V4  V5  V6  V7  V8  V9  V10 V11 V12 V14 V15
4  ( 1 ) "*" "*" "*" " " "*" " " " " " " " " " " " " " " " " " " " "
5  ( 1 ) "*" "*" "*" " " "*" " " " " " " " " " " " " "*" " " " " " "
6  ( 1 ) "*" "*" "*" " " "*" " " " " "*" " " " " " " "*" " " " " " "
7  ( 1 ) "*" "*" "*" " " "*" " " " " "*" "*" " " " " "*" " " " " " "
8  ( 1 ) "*" "*" "*" " " "*" " " " " "*" "*" " " " " "*" " " "*" " "

 [1]  1  2  3  4  6 13  9 10 15 12 11 16  8  5  7 14

 [1]  2  3  4  6 

In [69]:
varorder = ro.r('out2$vorder[-1]') # NOTE: this order assumes the forced vars at the start of the list!!!
print varorder
print varorder[0:mv]
varorder = varorder[0:mv] # keep only the max-size desired
print
print varorder
print 
vorder2 = np.array(varorder)-2 # remove intercept and make python indices
print list(vorder2+1)
print
# rearrange the var order st forced vars are at start of list
col_nam = X2.columns.values
keep = col_nam[[fi-1]]
poss = [x for x in col_nam if x not in keep]
col_nam2 = np.array(list(keep)+poss)
# print the vars in order of inclusion in model
vnames = col_nam2[vorder2[::]]
print vnames
print type(vnames)

 [1]  2  3  4  6 13  9 10 15 12 11 16  8  5  7 14

[1]  2  3  4  6 13  9 10 15


[1]  2  3  4  6 13  9 10 15


[1, 2, 3, 5, 12, 8, 9, 14]

['V2' 'V3' 'V13' 'V4' 'V11' 'V7' 'V8' 'V14']
<type 'numpy.ndarray'>


In [46]:
ncs = np.arange(mv)
nn = np.array(ro.r('out2$nn'))
rss = np.array(ro.r('out2$rss'))
fstats = (rss[:mv] - rss[1:(mv+1)]) / ( rss[1:(mv+1)] / (nn - (mv+1)))
print np.sort(fstats)
import scipy.stats as st
ps = 1 - st.f.cdf(fstats, 1, nn-(mv+1))
print
print np.around(np.sort(ps),3)

[  6.23703732e-01   1.60657427e+00   2.04019097e+00   2.67362903e+00
   3.71414778e+01   8.57468967e+01   1.61829793e+02   7.71589798e+31]

[ 0.     0.     0.     0.     0.105  0.157  0.208  0.432]


In [49]:
sizes = np.arange(mv)+1
print ps

[  4.31727329e-01   2.60529781e-08   2.08207884e-01   8.99280650e-15
   1.11022302e-16   0.00000000e+00   1.05477783e-01   1.56613699e-01]


In [48]:
cnam = pd.DataFrame(vnames)
sz = pd.DataFrame(sizes)
psd = pd.DataFrame(ps)
ttt = pd.concat([cnam,sz,psd],axis=1)
ttt.columns = ['Vname','Size','p']
print type(ttt)
print ttt

<class 'pandas.core.frame.DataFrame'>
  Vname  Size             p
0    V2     1  4.317273e-01
1    V3     2  2.605298e-08
2   V13     3  2.082079e-01
3    V4     4  8.992806e-15
4   V11     5  1.110223e-16
5    V7     6  0.000000e+00
6    V8     7  1.054778e-01
7   V14     8  1.566137e-01


In [48]:
###########################################################
### Test functions:

In [9]:
fwd_r = forward(X2,Y2)

In [10]:
codnames = cov_order(X2.columns.values)

print codnames

print ro.r('fwd$vorder')
# ro.globalenv['out_cov'] = fwd_proc
    
# ### Pull the cov entry order
# vorder = ro.r('out_cov$vorder[-1]') # remove intercept
# vorder = vorder[0:max_size] # keep only the max model size number of covs

# ### Shift these values down by two (one to exclude intercept, one to make python indices)
# vorderinds = np.array(vorder)-2

# ### Rearrange the var order st forced vars are at start of list
# col_names = xcolnames
# keep = col_nam[[col_incl-1]] # pull var names of those vars forced into model (this is an array)
# poss = [x for x in col_nam if x not in keep] # pull var names of those not forced in (this is a list)
# col_names2 = np.array(list(keep)+poss) # = rearranged array of varnames w/forced-in vars at start of list

# ### Sort the columns of X in order to obtain the var names in the entry order
# return col_names2[vorderinds[::]]

['V4' 'V7' 'V3' 'V11' 'V2' 'V10' 'V1' 'V5' 'V15' 'V12' 'V14' 'V8' 'V9' 'V6'
 'V13']
 [1]  1  5  8  4 12  3 11  2  6 16 13 15  9 10  7 14



In [11]:
po = pval_comp(X2.shape[1])

In [12]:
gg00 = 0.05
af = alpha_F(gg00, X2.shape[1])

In [13]:
gf = gamma_F(po, X2.shape[1])

In [14]:
sss = np.arange(X2.shape[1])+1

In [15]:
fsr_results = fsrtable(sss, codnames, po, np.sort(po), af, gf)

In [16]:
fsr_results

Unnamed: 0,S,Var,p,p_s,alpha_F,gamma_F
0,1,V4,0.0,0.0,0.007143,0.0
1,2,V7,0.0,0.0,0.011538,0.0
2,3,V3,0.0,0.0,0.016667,0.0
3,4,V11,0.0,0.0,0.022727,0.0
4,5,V2,0.0006,0.0006,0.03,0.001
5,6,V10,0.0115,0.0115,0.038889,0.0148
6,7,V1,0.0159,0.0159,0.05,0.0159
7,8,V5,0.1112,0.0909,0.064286,0.0707
8,9,V15,0.0909,0.1112,0.083333,0.0667
9,10,V12,0.1371,0.1371,0.11,0.0623


In [18]:
ftab, bhats = ffsr(X2,Y2,0.05,betaout=True)
print ftab
print
print bhats

     S  Var       p     p_s  alpha_F  gamma_F
0    1   V4  0.0000  0.0000   0.0071   0.0000
1    2   V7  0.0000  0.0000   0.0115   0.0000
2    3   V3  0.0000  0.0000   0.0167   0.0000
3    4  V11  0.0000  0.0000   0.0227   0.0000
4    5   V2  0.0006  0.0006   0.0300   0.0010
5    6  V10  0.0115  0.0115   0.0389   0.0148
6    7   V1  0.0159  0.0159   0.0500   0.0159
7    8   V5  0.1112  0.0909   0.0643   0.0707
8    9  V15  0.0909  0.1112   0.0833   0.0667
9   10  V12  0.1371  0.1371   0.1100   0.0623
10  11  V14  0.2147  0.2147   0.1500   0.0716
11  12   V8  0.2563  0.2563   0.2167   0.0591
12  13   V9  0.3736  0.3736   0.3500   0.0534
13  14   V6  0.6494  0.6494   0.7500   0.0433
14  15  V13  0.7110  0.7110   1.0000   0.7110

[[ 6.  4.  5.  5.  0.  0.  0.]]


In [6]:
gs = ftab.gamma_F
isinstance(np.array([0.05,0.1]),np.ndarray)
sub = [np.where(gs>y) for y in np.array([0.05,0.1])]
sub
ss = [min(x[0]) for x in sub]
ss
min(np.where(gs>0.05)[0])
print alpha_F_g(0.005,gs,15)
print alpha_F_g(np.array([0.05,0.005]),gs,15)
#ss = np.array([max(np.which(x<=0.05)) for x in ftab.gamma_F])+1

0.003
[ 0.05   0.003]


In [7]:
ss = min(np.where(gs>0.005)[0])
ss
vs = ftab.Var[0:ss]

In [8]:
import time

In [9]:
start = time.time()

from sklearn import linear_model
# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X2, Y2)

# The coefficients
modvar = regr.coef_

(time.time() - start)

0.05408811569213867

In [10]:
start = time.time()

import statsmodels.api as sm
mod = sm.OLS(Y2, X2)
rs = mod.fit()
modvar2 = rs.params

(time.time() - start)

0.10325002670288086

In [15]:
np.isclose(rs.params,regr.coef_)

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True]], dtype=bool)

In [29]:
list(vs)

['V4', 'V7', 'V3', 'V11', 'V2']

In [32]:
#X2.loc[:,'V1']
regr.fit(X2.loc[:,list(vs)], Y2)

# The coefficients
modvar = regr.coef_
modvar

array([[  6.00000000e+00,   4.00000000e+00,   5.00000000e+00,
          5.00000000e+00,  -1.33226763e-15]])

In [161]:
ffsr(X2,Y2,0.05,max_size=8)

Unnamed: 0,S,Var,p,p_s,alpha_F,gamma_F
0,1,V4,5.613099e-11,0.0,0.007143,0.0
1,2,V7,1.902922e-12,1.110223e-16,0.011538,4.810966e-16
2,3,V3,1.110223e-16,1.902922e-12,0.016667,5.708767e-12
3,4,V11,0.0,5.613099e-11,0.022727,1.234882e-10
4,5,V2,0.0003277062,0.0003277062,0.03,0.000546177
5,6,V10,0.008502068,0.008502068,0.038889,0.01093123
6,7,V1,0.01204123,0.01204123,0.05,0.01204123
7,8,V5,0.09725851,0.09725851,0.064286,0.07564551


In [166]:
ffsr(X2,Y2,0.05,var_incl=np.array([5]))

Unnamed: 0,S,Var,p,p_s,alpha_F,gamma_F
0,1,V5,0.3534489,0.0,0.007143,0.0
1,2,V4,3.970527e-10,1.110223e-16,0.011538,4.810966e-16
2,3,V3,1.44148e-11,1.44148e-11,0.016667,4.324441e-11
3,4,V11,1.110223e-16,3.970527e-10,0.022727,8.73516e-10
4,5,V7,0.0,0.03916484,0.03,0.06527473
5,6,V2,0.03916484,0.1283149,0.038889,0.1649763
6,7,V6,0.1285371,0.1285371,0.05,0.1285371
7,8,V1,0.1283149,0.2354998,0.064286,0.1831665
8,9,V13,0.2354998,0.2582605,0.083333,0.1549563
9,10,V15,0.2582605,0.3126082,0.11,0.1420947
