In [30]:
%%file ffsr2.py
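""" Fast FSR algorithm: forward selection (via R's leaps::regsubsets) plus
    the p-value, gamma_F and alpha_F computations and a bagged variant.

    Change log:
    Date: 4/17/15 
    Modified: Correct p-value computations 
    Date: 4/19/15
    Modified: Added corrections from Draft8-R_comparisons: p-mono, gamma_F"""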


""" Data type check """
def df_type(dat):
    
    ### Input params:
    #   dat = dataset whose type is to be checked
    
    ### Output:
    #   True when dat is a pandas DataFrame or a numpy ndarray
    
    ### Raises:
    #   TypeError for any other type (TypeError is a subclass of Exception,
    #   so handlers written as `except Exception` keep working)
    
    import numpy as np
    import pandas as pd
    
    # Both container types are accepted, so the message names both of them
    if not isinstance(dat, (pd.DataFrame, np.ndarray)):
        raise TypeError("Data must be a pandas DataFrame or a numpy ndarray")
    return True


    
""" p-value computation function """
def pval_comp(max_size=None):
    
    ### Input params:
    #   max_size = integer max no. of vars in final model (largest model size desired);
    #              defaults to len(fwd$rss) - 1, i.e. every step stored in the R object
    
    ### Output:
    # array of p-values of each covariate at its given entry step
    
    ### NOTE: fwd should be a global-env R object (requires running 'forward' fcn prior to this fcn) ###
    
    import numpy as np
    import scipy.stats as st
    import rpy2.robjects as ro
    
    # Pull RSS values & num_obs from the R object 'fwd'
    rss = np.array(ro.r('fwd$rss'))
    N = np.array(ro.r('fwd$nn'))
    
    if max_size is None:
        max_size = len(rss)-1
    
    # vector of model sizes
    sizes = np.arange(max_size)+1
    
    # denominator df used at each step: N - (size + 1)
    df_resid = N - (sizes+1)
    
    # RSS before (reduced model) and after (full model) each variable enters
    rss_reduced = rss[0:max_size]
    rss_full = rss[1:(max_size+1)]
    
    # F stat for adding one variable (p_f - p_r = 1 at every step)
    fstats = (rss_reduced - rss_full) / (rss_full / df_resid)
    
    # Upper-tail probability of F(1, df_resid).  The survival function is used
    # instead of 1 - cdf so that very small p-values do not collapse to exactly 0.
    return st.f.sf(fstats, 1, df_resid)



""" Covariate model entry order """
def cov_order(xcolnames,max_size=None,col_incl=None):
    
    # Input params:
    #   xcolnames = array of names of covariates (same order as columns in original dataset)
    #   max_size  = integer max no. of vars in final model (largest model size desired)
    #   col_incl  = array/list of 1-based column numbers to forcefully include in all models
    
    ### Output:
    # array of covariate names sorted according to order of entry into the model
    
    ### NOTE: fwd should be a global-env R object (requires running 'forward' fcn prior to this fcn) ###
    
    import numpy as np
    import rpy2.robjects as ro
    
    xcolnames = np.asarray(xcolnames)
    
    if max_size is None:
        max_size = len(xcolnames)
        
    ### Pull the cov entry order
    vorder = ro.r('fwd$vorder[-1]') # remove intercept
    vorder = vorder[0:max_size] # keep only the max model size number of covs
    
    ### Shift these values down by two (one to exclude intercept, one to make python indices)
    vorderinds = np.array(vorder).astype(int)-2
    
    ### Rearrange the var order st forced vars are at start of list
    # `is None` (not `== None`) so that an array argument is not compared elementwise
    if col_incl is None:
        col_incl = np.arange(max_size)+1
    # plain integer-array index; also lets col_incl be given as a list
    forced_idx = np.asarray(col_incl, dtype=int).ravel()-1
    keep = xcolnames[forced_idx] # var names of those vars forced into model (this is an array)
    forced_names = set(keep)
    poss = [name for name in xcolnames if name not in forced_names] # var names not forced in (this is a list)
    col_names = np.array(list(keep)+poss) # = rearranged array of varnames w/forced-in vars at start of list
    
    ### Sort the columns of X in order to obtain the var names in the entry order
    return col_names[vorderinds]

    

""" Forward selection function """
def forward(x,y,max_size=None,col_incl=None):
    
    ### Input params:
    #   x        = python dataframe of original p covariates, n x p
    #   y        = python outcome dataframe, n x 1
    #   max_size = integer max no. of vars in final model (largest model size desired)
    #   col_incl = array vector of columns to forcefully include in all models
    
    ### Output:
    # regsubsets R object -- the raw full output of the forward selection proc
    # (also stored as 'fwd' in the R global environment, where pval_comp and
    #  cov_order read it)
    
    ### Side effects:
    # sets x2, y2, maxv, coli and fwd in the R global environment
    
    ### Load python packages to call R functions
    import rpy2.robjects as ro
    # NOTE(review): pandas.rpy is not shipped with newer pandas releases --
    # confirm the pinned pandas version or migrate to rpy2's own pandas conversion
    import pandas.rpy.common as com
    
    ### Convert x and y to R matrices <-- MAKE SURE x,y input == DATAFRAMES (or else change them to df's)!!!
    ### and declare as R objects in global environment
    ro.globalenv['x2'] = com.convert_to_r_matrix(x)
    ro.globalenv['y2'] = com.convert_to_r_matrix(y)
    if max_size is None:
        max_size = x.shape[1]
    ro.globalenv['maxv'] = ro.Vector(max_size)
    # `is None` (not `== None`) so that an array of forced columns is not compared elementwise
    if col_incl is None:
        ro.r('coli=NULL')
    else:
        ro.globalenv['coli'] = ro.FloatVector(col_incl[:])
    
    ### Perform forward selection with regsubsets function
    fwd = ro.r('leaps::regsubsets(x=x2,y=y2,method="forward",nvmax=maxv,force.in=coli)')
    ro.globalenv['fwd'] = fwd
    
    # Return the fit as documented (previously the function returned None)
    return fwd
    
    
    
""" Gamma computation """
def gamma_F(pvs, ncov, max_size=None):
    
    ### Input params:
    #   pvs      = vector of p-values (monotonically increasing) from forward sel procedure,
    #              one per step (length == max_size); array or list
    #   ncov     = integer total number of covariates in data
    #   max_size = integer max no. of vars in final model (largest model size desired);
    #              defaults to ncov
    
    ### Output:
    # array of gamma_F values (a new array; pvs is not modified)
    
    import numpy as np
    
    if max_size is None:
        max_size = ncov
    
    # float array so that list input works and no integer division can occur
    pvs = np.asarray(pvs, dtype=float)
        
    # Create indices == model size at given step, call this S
    S = np.arange(max_size)+1
    
    # gamma_F_i = p_s_i * (ncov - S_i) / (1 + S_i)
    g_F = pvs * (ncov - S) / (1 + S)
    
    # Steps that share a p-value all receive the smallest gamma_F of the tie
    uniq, counts = np.unique(pvs, return_counts=True)
    for dup in uniq[counts > 1]:
        tied = pvs == dup
        g_F[tied] = g_F[tied].min()
    
    # if table run on all vars, the last gamma = 0,
    #  instead set equal to the last pv_mono == final rate of unimp var inclusion
    # (size check: an empty table has no last entry to patch)
    if g_F.size > 0 and g_F[-1] == 0:
        g_F[-1] = pvs[-1]
    
    return g_F

    
    
""" Alpha computation for model selection """
def alpha_F(g0, ncov, max_size=None):
    
    ### Input params:
    #   g0       = float pre-specified FSR (gamma0)
    #   ncov     = integer total number of covariates in data
    #   max_size = integer max no. of vars in final model (largest model size desired);
    #              defaults to ncov
    
    ### Output:
    # float array of alpha_F values, one per model size 1..max_size
    
    import numpy as np
    
    if max_size is None:
        max_size = ncov
        
    # Create indices == model size at given step, call this S
    S = np.arange(max_size)+1
    
    # Number of covariates still outside the model at each step (float => true division)
    remaining = (ncov - S).astype(float)
    full_model = remaining == 0
    
    # if table run on all vars, the last alpha would be a division by zero
    #  instead set equal to 1 == include all vars
    # (done up front: no divide-by-zero warning, and g0 == 0 no longer yields nan)
    alphas = np.ones(S.shape, dtype=float)
    
    # alpha_F_i = gamma_0 * (1 + S_i) / (ncov - S_i)
    alphas[~full_model] = g0 * (1 + S[~full_model]) / remaining[~full_model]
    
    return alphas
    
    
    
""" Alpha computation for specific gamma """
def alpha_F_g(g, gf, ncov):
    
    ### Input params:
    #   g    = float or vector (array or list, length k) of specified FSR at which to compute alpha
    #   gf   = vector of gamma_F's, one per forward-selection step
    #   ncov = integer of total number covariates in data
    
    ### Output:
    # alpha_F value(s): a scalar when g is a scalar, an array of length k otherwise
    
    ### Raises:
    #   ValueError when no gamma_F exceeds one of the requested g values
    #   (the model size cannot be determined in that case)
    
    import numpy as np
    
    gf_arr = np.asarray(gf, dtype=float)
    g_arr = np.atleast_1d(np.asarray(g, dtype=float))
    
    ### Model size S for each g = 0-based position of the first step whose
    ### gamma_F exceeds g, i.e. the number of steps that precede it
    exceeds = gf_arr[np.newaxis, :] > g_arr[:, np.newaxis]  # k x n_steps
    if not exceeds.any(axis=1).all():
        raise ValueError("No gamma_F value exceeds the requested gamma; "
                         "cannot determine the model size")
    S = exceeds.argmax(axis=1)  # argmax of a boolean row = index of its first True
    
    # alpha = g * (1 + S) / (ncov - S)
    alpha = g_arr * (1 + S) / (ncov - S)
    
    # Hand back the same kind of object the caller passed in
    if np.ndim(g) == 0:
        return alpha[0]
    return alpha


    
""" Beta-hat computation for specific gamma """
def beta_est(x, y, g, gf, vname):
    
    ### Input params:
    #   x      = python dataframe of original p covariates, n x p
    #   y      = python outcome dataframe, n x 1
    #   g      = float of specified FSR at which to compute alpha
    #   gf     = vector gamma_F's computed from gamma0, pv_mono
    #   vname  = ordered vector of names of vars entered into model under forward selection
    
    ### Output:
    # dataframe indexed by selected var name with columns 'beta' and 'beta_se'
    
    ### Raises:
    #   ValueError when no gamma_F exceeds g (the model size cannot be determined)
    
    import numpy as np
    import pandas as pd  # previously missing: pd was only found if the caller's globals defined it
    import statsmodels.api as sm
    
    ### Compute model size corresponding to g: position of the first step whose
    ### gamma_F exceeds g == number of steps that precede it
    exceed = np.where(np.asarray(gf) > g)[0]
    if exceed.size == 0:
        raise ValueError("No gamma_F value exceeds the requested gamma; "
                         "cannot determine the model size")
    S = exceed[0]  # np.where returns indices in ascending order

    ### Pull the cov names of those vars included in the above size model
    modvars = vname[:S]

    ### Fit the linear model using the selected model vars
    ### (the selected columns are passed as-is; no intercept column is added here)
    fit = sm.OLS(y,x.loc[:,list(modvars)]).fit()
    betaout = pd.DataFrame([fit.params,fit.bse]).T
    betaout.columns = ['beta','beta_se']
    
    return betaout

    
    
""" FSR Results Table """
def fsrtable(size, vname, p_orig, p_mono, alphaf, gammaf, prec_f=4):
    
    ### Input params:
    #   size   = model size at each step of forward sel proc                   [S]
    #   vname  = variable name that entered at each step                       [Var]
    #   p_orig = p-values at each step                                         [p]
    #   p_mono = ascending (running-max) p-values                              [p_m]
    #   alphaf = alpha-to-enter (p-value cutoff) for model entry at each step  [alpha_F]
    #   gammaf = FSR at each step                                              [gamma_F]
    #   prec_f = integer number of decimals kept in the four numeric columns
    
    ### Output:
    # dataframe with columns [S   Var   p   p_m   alpha_F   gamma_F], one row per step
    
    import numpy as np
    import pandas as pd
    
    header = ['S','Var','p','p_m','alpha_F','gamma_F']
    
    ### Size and name columns go in untouched
    pieces = [pd.DataFrame(size), pd.DataFrame(vname)]
    
    ### The numeric columns are rounded to prec_f decimals before being added
    for values in (p_orig, p_mono, alphaf, gammaf):
        pieces.append(pd.DataFrame(np.around(values, prec_f)))
    
    ### Glue the single-column frames side by side and label them
    tab = pd.concat(pieces, axis=1)
    tab.columns = header
    
    return tab
    
    
    
""" FastFSR function """
def ffsr(X,Y,g0=0.05,betaout=False,gs=None,max_size=None,var_incl=None,bag=False,prec_f=4,prec_b=6):
    
    ### Input params:
    #   X        = pandas dataframe or numpy array of original p covariates, n x p
    #   Y        = pandas dataframe or numpy array of the outcome, n x 1
    #   g0       = float pre-specified FSR of interest ("gamma0")
    #   betaout  = boolean of whether to include estimated betahats from final selected model
    #   gs       = float or vector of gamma's at which to specifically compute alpha_F
    #   max_size = integer of largest model size == max num vars to incl in final model (default = num covs in dataset)
    #   var_incl = array of cols corresponding to those vars to force into model
    #   bag      = boolean; True skips the data checks and returns the reduced output used for bagging
    #   prec_f   = integer of precision (num digits) desired in FSR output table
    #   prec_b   = integer of precision (num digits) desired in beta-hat parameter estimates of final model
    
    ### Output (bag=False): 
    #      (note: gamma = FSR, gamma_0 = pre-specified/desired FSR)
    # Table of [S   Var   p   p_m   alpha_F   gamma_F], one row per forward-selection step
    #   S:       model size at given step
    #   Var:     name of var that entered at given step
    #   p:       p-value of var that entered at given step
    #   p_m:     mono. inc. p-value (running max of the original p-values)
    #   alpha_F: cutoff value for model entry given gamma_0 and model size
    #   gamma_F: FSR given current p_m and model size (== step num)
    #       followed, when betaout is True, by
    #   dataframe of estimated beta param's (and SEs) for final model (based on g0)
    #       followed, when gs is given, by
    #   alpha_F value(s) for the specified gamma's (gs)
    # A single table is returned on its own; several outputs are returned as a tuple.
    
    ### Output (bag=True):
    #   (betahats dataframe, alpha_F at g0, number of rows in betahats)
    
    ### Raises:
    #   whatever df_type raises for unsupported input types;
    #   ValueError if X and Y have different numbers of rows;
    #   Exception if the number of covariates is not smaller than the number of complete rows

    import numpy as np
    import pandas as pd
    
    ### Clean and check data - make sure X, Y = pandas dataframes or else convert them
    if not bag:
        # df_type raises for anything that is neither a DataFrame nor an ndarray
        df_type(X)
        df_type(Y)
        if isinstance(X,pd.DataFrame):
            x = X.copy()
        else:
            x = pd.DataFrame(X)
            # unnamed array columns are labelled V1, V2, ...
            x.columns = ["V" + str(j+1) for j in range(x.shape[1])]
        if isinstance(Y,pd.DataFrame):
            y = Y.copy()
        else:
            y = pd.DataFrame(Y)
        if x.shape[0] != y.shape[0]:
            raise ValueError("X and Y must have the same number of rows")
        # Remove missing values: keep (by position) the rows complete in both x and y
        complete = ~(x.isnull().any(axis=1).values | y.isnull().any(axis=1).values)
        x = x.loc[complete,:]
        y = y.loc[complete,:]
        # Check that p < n to ensure regression solutions
        if x.shape[1] >= x.shape[0]:
            raise Exception("N must be > p for valid regression solutions")
    else:
        x, y = X.copy(), Y.copy()
    
    ncov = x.shape[1]
    
    ### If max model size not specified, select all possible cov.s
    if max_size is None:
        max_size = ncov
        
    ### Perform forward selection (result is stored as 'fwd' in the R global env,
    ### which cov_order and pval_comp read)
    forward(x, y, max_size, var_incl)
    
    ### Save order of covariate entry into model
    cov_entry_order = cov_order(x.columns.values, max_size, var_incl)
    
    ### Compute p-value of each covariate entering the model
    p_orig = pval_comp(max_size)
    
    ### Arrange p-values in mono. inc. order (running maximum)
    p_mono = np.maximum.accumulate(p_orig)
        
    ### Gamma_F computation
    g_F = gamma_F(p_mono, ncov, max_size)
    
    ### Check if betaout desired, if so compute beta_hat of model corresponding to specific gamma0
    if betaout or bag:
        betahats = beta_est(x, y, g0, g_F, cov_entry_order)
        
    ### Bagging only needs the reduced output
    if bag:
        return betahats, alpha_F_g(g0, g_F, ncov), len(betahats)
    
    ### Alpha_F computation for all steps in fwd sel proc
    a_F = alpha_F(g0, ncov, max_size)
    
    ### Model size
    S = np.arange(max_size)+1
    
    ### Combine S, Cov_names, p-vals, mono p-vals, alpha_F, gamma_F into table
    ### (prec_f is forwarded so the caller's requested precision is honoured)
    fsr_results = fsrtable(S, cov_entry_order, p_orig, p_mono, a_F, g_F, prec_f)
    
    ### Return selected output: FSR table (+ betahat) (+ alpha_specific)
    outputs = [fsr_results]
    if betaout:
        outputs.append(np.around(betahats, prec_b))
    # `is not None` so that a vector of gammas is not compared elementwise
    if gs is not None:
        outputs.append(alpha_F_g(gs, g_F, ncov))
    
    if len(outputs) == 1:
        return fsr_results
    return tuple(outputs)

    
    
def bagfsr(X,Y,g0,B=200,max_s=None,v_incl=None,prec=4,seed=1234):
    
    ### Input params:
    #   X      = pandas dataframe or numpy array of original p covariates, n x p
    #   Y      = pandas dataframe or numpy array of the outcome, n x 1
    #   g0     = float pre-specified FSR of interest ("gamma0")
    #   B      = integer of number of bagged samples
    #   max_s  = integer of largest model size == max num vars to incl in final model (default = num covs in dataset)
    #   v_incl = array of cols corresponding to those vars to force into model
    #   prec   = integer of precision (num digits) desired in beta-hat parameter estimates of final model
    #   seed   = seed of the bootstrap random number generator (default 1234, the
    #            value that used to be hard-coded)
    
    ### Output: 
    #   dataframe indexed by covariate with columns
    #       betahat   = mean of betahats over the B samples (0 where var not selected)
    #       betase    = mean of betahat SEs over the B samples
    #       prop_incl = prop of samples in which the var had a non-zero (rounded) betahat
    #   Avg alpha-to-enter
    #   Avg model size
    
    ### Raises:
    #   whatever df_type raises for unsupported input types;
    #   ValueError if X and Y have different numbers of rows;
    #   Exception if the number of covariates is not smaller than the number of complete rows
    
    import numpy as np
    import pandas as pd
    
    ### Clean and check data - make sure X, Y = pandas dataframes or else convert them
    # df_type raises for anything that is neither a DataFrame nor an ndarray
    df_type(X)
    df_type(Y)
    if isinstance(X,pd.DataFrame):
        x = X.copy()
    else:
        x = pd.DataFrame(X)
        # unnamed array columns are labelled V1, V2, ...
        x.columns = ["V" + str(j+1) for j in range(x.shape[1])]
    if isinstance(Y,pd.DataFrame):
        y = Y.copy()
    else:
        y = pd.DataFrame(Y)
    if x.shape[0] != y.shape[0]:
        raise ValueError("X and Y must have the same number of rows")
    
    # Remove missing values: keep (by position) the rows complete in both x and y.
    # Indices are reset so the column-wise concat below pairs rows by position
    # even when X and Y arrived with different index labels.
    complete = ~(x.isnull().any(axis=1).values | y.isnull().any(axis=1).values)
    x = x.loc[complete,:].reset_index(drop=True)
    y = y.loc[complete,:].reset_index(drop=True)
    
    # check that p < n to ensure regression solutions
    if x.shape[1] >= x.shape[0]:
        raise Exception("N must be > p for valid regression solutions")
    
    ### Combine data into single dataframe (outcome first, then covariates)
    dat = pd.concat([y,x],axis=1)
    n_row = dat.shape[0]
    var_names = x.columns.values
    
    ### Create array to keep track of number of times vars enter model
    nentries = pd.DataFrame(np.zeros(x.shape[1]),index=var_names)
    
    ### Create array to store all estimated coefficients, ses, alphas, sizes
    allbetas = pd.DataFrame(np.zeros([B,x.shape[1]]),columns=var_names)
    allses = allbetas.copy()
    alphas = []
    sizes = []
    
    # Private generator: same draws as seeding the global numpy RNG with `seed`,
    # without disturbing the caller's random state
    rng = np.random.RandomState(seed)
    
    ### Bagging loops
    for i in range(B):

        # Draw with replacement from rows of data
        rand_row = rng.randint(0,n_row,n_row)
        newdat = dat.iloc[rand_row,:].copy()
        newdat.index = np.arange(n_row)+1
        
        ### Obtain FSR results: (betahats dataframe, alpha-to-enter, model size)
        betahats, alpha, size = ffsr(newdat.iloc[:,1:],pd.DataFrame(newdat.iloc[:,0]),g0,
                                     bag=True,max_size=max_s,var_incl=v_incl)
        selected = betahats.index.values
        allbetas.loc[i,selected] = betahats.iloc[:,0].values
        allses.loc[i,selected] = betahats.iloc[:,1].values
        alphas.append(alpha)
        sizes.append(size)

        ### Update counts num times var included (betahat non-zero after rounding to prec)
        nonzero = (np.abs(np.around(betahats.iloc[:,0],prec)) > 0).values
        nentries.loc[betahats.index[nonzero]] += 1
        
    ### Compute averages
    avgbeta = np.around(allbetas.mean(axis=0),prec) # mean across rows / colmeans == mean of each cov's betahat
    avgse = np.around(allses.mean(axis=0),prec)
    avgalpha = np.mean(alphas)
    avgsize = np.mean(sizes)
    var_props = nentries/float(B)
    cov_res = pd.concat([avgbeta,avgse,var_props],axis=1)
    cov_res.columns = ['betahat','betase','prop_incl']
    
    return cov_res, avgalpha, avgsize
    
    

# Notes: 
# 1. Any transformations of the data are expected to have been applied before calling the FSR functions.

# To-do:
# 1. Adjust beta_est and ffsr so the caller can specify whether to include an intercept and whether the data should be normalized before estimation.

Writing ffsr2.py


In [2]:
import numpy as np
import pandas as pd

# Fix the RNG so the simulated data set is reproducible
np.random.seed(1234)

# 100 draws of 15 covariates with zero mean and identity covariance
X = np.random.multivariate_normal(np.zeros(15),np.eye(15),(100))
beta = np.array([0,0,5,6,0,0,4,0,0,0,5,0,0,0,0]).reshape(15,1) # signif betas: 3,4,7,11
# No noise term is added: Y is an exact linear combination of the covariates
Y = X.dot(beta)

In [3]:
# Wrap the simulated arrays in DataFrames; the covariates are labelled V1..V15
X2 = pd.DataFrame(X, columns=["V" + str(k) for k in range(1, 16)])
Y2 = pd.DataFrame(Y)

In [6]:
%load_ext rpy2.ipython

In [10]:
%%R -i X2,Y2

fsr.fast<-function(x,y,gam0=.05,digits=4,print=T,plot=F){
# estimated alpha for forward selection using Fast FSR (no simulation)
# typical call: fsr.fast(x=ncaa2[,1:19],y=ncaa2[,20])->out
# for use inside simulation loops, set print=F and plot=F
# version 7 circa Nov. 2009, modified to handle partially blank colnames
#
# Arguments:
#   x      = covariates (matrix or data frame), y = response
#   gam0   = target false selection rate
#   digits = rounding used when the step table is printed
#   print  = whether to print the step table
#   plot   = has no effect in this version (the plotting code below is commented out)
# Returns: list(mod = lm fit of the selected model, size = number of selected
#          covariates, x.ind = their column indices (0 if none), alphahat.ER =
#          estimated alpha-to-enter)
require(leaps)
ok<-complete.cases(x,y)
x<-x[ok,]                            # get rid of na's
y<-y[ok]                             # since regsubsets can't handle na's
m<-ncol(x)
n<-nrow(x)
if(m >= n) m1 <- n-5  else m1<-m     # to get rid of NA's in pv
vm<-1:m1
as.matrix(x)->x                      # in case x is a data frame
if(any(colnames(x)==""))colnames(x)<-NULL       # if only partially named columns
colnames(x)<-colnames(x,do.NULL=F,prefix="")    # corrects for no colnames
pvm<-rep(0,m1)                       # to create pvm below
regsubsets(x,y,method="forward")->out.x
# p-value of each entering variable: F test (1 numerator df) on the drop in RSS between successive steps
pv.orig<-1-pf((out.x$rss[vm]-out.x$rss[vm+1])*(out.x$nn-(vm+1))/out.x$rss[vm+1],1,out.x$nn-(vm+1))
for (i in 1:m1){pvm[i]<-max(pv.orig[1:i])}  # sequential max of pvalues
alpha<-c(0,pvm)
ng<-length(alpha)
S<-rep(0,ng)                         # will contain num. of true entering in orig.
# step table: entering variable (column index), its p-value, running-max p-value, R-squared after entry
real.seq<-data.frame(var=(out.x$vorder-1)[2:(m1+1)],pval=pv.orig,
         pvmax=pvm,Rsq=round(1-out.x$rss[2:(m1+1)]/out.x$rss[1],4))
for (ia in 2:ng){                    # loop through alpha values for S=size
S[ia] <- sum(pvm<=alpha[ia])         # size of models at alpha[ia], S[1]=0
}
ghat<-(m-S)*alpha/(1+S)              # gammahat_ER
# add additional points to make jumps
# (each alpha shifted down by 1e-7 and paired with the previous model size)
alpha2<-alpha[2:ng]-.0000001
ghat2<-(m-S[1:(ng-1)])*alpha2/(1+S[1:(ng-1)])
zp<-data.frame(a=c(alpha,alpha2),g=c(ghat,ghat2))
zp<-zp[order(zp$a),]
gmax<-max(zp$g)                      # not used below
index.max<-which.max(zp$g)           # index of largest ghat
alphamax<-zp$a[index.max]            # alpha with largest ghat
# gmax<-max(ghat)
# index.max<-which.max(ghat)           # index of largest ghat
# alphamax<-alpha[index.max]           # alpha with largest ghat
# candidate steps: ghat at or below gam0 and alpha not beyond the alpha of the largest ghat
ind<-(ghat <= gam0 & alpha<=alphamax)*1
Sind<-S[max(which(ind > 0))]           # model size with ghat just below gam0
alphahat.fast<-(1+Sind)*gam0/(m-Sind)  # ER est.
size1<-sum(pvm<=alphahat.fast)+1       # size of model including intercept
# NOTE(review): this subsetting runs before the size1 checks below; when size1 == 1
# the index 2:size1 counts down (2,1) -- confirm that is harmless for the intercept-only case
x<-x[,colnames(x)[(out.x$vorder-1)[2:size1]]]
if(size1>1) x.ind<-(out.x$vorder-1)[2:size1]  else x.ind<-0
if (size1==1) {mod <- lm(y~1)} else {mod <- lm(y~x)}
# ghat3<-(m-size1+1)*alpha/(1+S)         # uses final ku est.
ghat4<-(m-size1+1)*alpha/(1+0:m)       # not used below
#res<-data.frame(real.seq,ghigh=ghat2,glow=ghat[2:ng])
res<-data.frame(real.seq,g=ghat[2:ng])
if(print)print(round(res,digits))
#if(plot){
#plot(zp$a,zp$g,type="b",xlab="Alpha",ylab="Estimated Gamma",xlim=c(0,alphamax))
#points(alphahat.fast,gam0,pch=19)
#lines(c(-1,alphahat.fast),c(gam0,gam0))
#lines(c(alphahat.fast,alphahat.fast),c(-1,gam0))
#}  # ends plot
return(list(mod=mod,size=size1-1,x.ind=x.ind,alphahat.ER=alphahat.fast))
}

# Convert the objects passed in with -i (X2, Y2) to plain matrices
x2 = as.matrix(X2)
y2 = as.matrix(Y2)

# Time one Fast FSR run; the step table is printed because print=T is the default
system.time(fsr.fast(x=x2,y=y2))

   var   pval  pvmax    Rsq      g
1    4 0.0000 0.0000 0.3778 0.0000
2    7 0.0000 0.0000 0.6402 0.0000
3    3 0.0000 0.0000 0.8519 0.0000
4   11 0.0000 0.0000 1.0000 0.0000
5    2 0.0003 0.0003 1.0000 0.0004
6   10 0.0078 0.0078 1.0000 0.0100
7    1 0.0116 0.0116 1.0000 0.0116
8    5 0.0973 0.0973 1.0000 0.0584
9   15 0.0800 0.0973 1.0000 0.0584
10  12 0.1259 0.1259 1.0000 0.0572
11  14 0.2040 0.2040 1.0000 0.0680
12   8 0.2480 0.2480 1.0000 0.0572
13   9 0.3679 0.3679 1.0000 0.0526
14   6 0.6474 0.6474 1.0000 0.0432
15  13 0.7110 0.7110 1.0000 0.0000
   user  system elapsed 
  0.005   0.001   0.006 


In [11]:
%%time

ffsr(X2,Y2,0.05)

CPU times: user 22.4 ms, sys: 941 µs, total: 23.3 ms
Wall time: 23.5 ms


Unnamed: 0,S,Var,p,p_m,alpha_F,gamma_F
0,1,V4,0.0,0.0,0.0071,0.0
1,2,V7,0.0,0.0,0.0115,0.0
2,3,V3,0.0,0.0,0.0167,0.0
3,4,V11,0.0,0.0,0.0227,0.0
4,5,V2,0.0003,0.0003,0.03,0.0004
5,6,V10,0.0078,0.0078,0.0389,0.01
6,7,V1,0.0116,0.0116,0.05,0.0116
7,8,V5,0.0973,0.0973,0.0643,0.0584
8,9,V15,0.08,0.0973,0.0833,0.0584
9,10,V12,0.1259,0.1259,0.11,0.0572


In [14]:
pstats = %prun -r -q ffsr(X2, Y2, 0.05)
pstats.sort_stats('time').print_stats();

          22350 function calls (22283 primitive calls) in 0.054 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     3243    0.005    0.000    0.007    0.000 numeric.py:1910(isscalar)
     1600    0.004    0.000    0.019    0.000 common.py:261(notnull)
     1620    0.004    0.000    0.011    0.000 common.py:132(_isnull_new)
     4175    0.003    0.000    0.004    0.000 {isinstance}
       23    0.003    0.000    0.004    0.000 functions.py:95(__call__)
        2    0.003    0.001    0.037    0.019 common.py:282(convert_to_r_dataframe)
     1620    0.002    0.000    0.006    0.000 {pandas.lib.isscalar}
     1620    0.002    0.000    0.013    0.000 common.py:111(isnull)
        2    0.002    0.001    0.002    0.001 {method 'rcall' of 'rpy2.rinterface.SexpClosure' objects}
     1616    0.001    0.000    0.001    0.000 {pandas.lib.checknull}
       90    0.001    0.000    0.001    0.000 vectors.py:230(__init__)
       69    0.0

In [15]:
%load_ext line_profiler

In [16]:
lstats = %lprun -r -f ffsr ffsr(X2,Y2, 0.05)

In [17]:
lstats.print_stats()

Timer unit: 1e-06 s

Total time: 0.035481 s
File: <ipython-input-1-e76dcf916adc>
Function: ffsr at line 289

Line #      Hits         Time  Per Hit   % Time  Line Contents
   289                                           def ffsr(X,Y,g0=0.05,betaout=False,gs=None,max_size=None,var_incl=None,bag=False,prec_f=4,prec_b=6):
   290                                               
   291                                               ### Input params:
   292                                               #   x        = python dataframe of original p covariates, n x p
   293                                               #   y        = python outcome dataframe, n x 1
   294                                               #   g0       = float pre-specified FSR of interest ("gamma0")
   295                                               #   betaout  = boolean of whether to include estimated betahats from final selected model
   296                                               #   gs       = float or v

In [18]:
%load_ext memory_profiler

In [19]:
%memit ffsr(X2,Y2,0.05)

peak memory: 109.88 MiB, increment: 2.14 MiB


In [26]:
import numpy as np

In [31]:
import ffsr2
mstats = %mprun -f ffsr2.ffsr ffsr2.ffsr(X2,Y2,0.05)

('',)


In [23]:
###########################################################
### Test functions:

In [24]:
fwd_r = forward(X2,Y2)

In [25]:
codnames = cov_order(X2.columns.values)

print codnames

print ro.r('fwd$vorder')
# ro.globalenv['out_cov'] = fwd_proc
    
# ### Pull the cov entry order
# vorder = ro.r('out_cov$vorder[-1]') # remove intercept
# vorder = vorder[0:max_size] # keep only the max model size number of covs

# ### Shift these values down by two (one to exclude intercept, one to make python indices)
# vorderinds = np.array(vorder)-2

# ### Rearrange the var order st forced vars are at start of list
# col_names = xcolnames
# keep = col_nam[[col_incl-1]] # pull var names of those vars forced into model (this is an array)
# poss = [x for x in col_nam if x not in keep] # pull var names of those not forced in (this is a list)
# col_names2 = np.array(list(keep)+poss) # = rearranged array of varnames w/forced-in vars at start of list

# ### Sort the columns of X in order to obtain the var names in the entry order
# return col_names2[vorderinds[::]]

['V4' 'V7' 'V3' 'V11' 'V2' 'V10' 'V1' 'V5' 'V15' 'V12' 'V14' 'V8' 'V9' 'V6'
 'V13']
 [1]  1  5  8  4 12  3 11  2  6 16 13 15  9 10  7 14



In [26]:
po = pval_comp(X2.shape[1])

In [27]:
gg00 = 0.05
af = alpha_F(gg00, X2.shape[1])

In [28]:
gf = gamma_F(po, X2.shape[1])

In [29]:
sss = np.arange(X2.shape[1])+1

In [30]:
fsr_results = fsrtable(sss, codnames, po, np.sort(po), af, gf)

In [31]:
fsr_results

Unnamed: 0,S,Var,p,p_s,alpha_F,gamma_F
0,1,V4,0.0,0.0,0.0071,0.0
1,2,V7,0.0,0.0,0.0115,0.0
2,3,V3,0.0,0.0,0.0167,0.0
3,4,V11,0.0,0.0,0.0227,0.0
4,5,V2,0.0002,0.0002,0.03,0.0003
5,6,V10,0.0075,0.0075,0.0389,0.0096
6,7,V1,0.0111,0.0111,0.05,0.0111
7,8,V5,0.0954,0.0784,0.0643,0.061
8,9,V15,0.0784,0.0954,0.0833,0.0572
9,10,V12,0.1238,0.1238,0.11,0.0563


In [32]:
ftab, bhats = ffsr(X2,Y2,0.05,betaout=True)
print ftab
print
print bhats

     S  Var       p     p_s  alpha_F  gamma_F
0    1   V4  0.0000  0.0000   0.0071   0.0000
1    2   V7  0.0000  0.0000   0.0115   0.0000
2    3   V3  0.0000  0.0000   0.0167   0.0000
3    4  V11  0.0000  0.0000   0.0227   0.0000
4    5   V2  0.0002  0.0002   0.0300   0.0003
5    6  V10  0.0075  0.0075   0.0389   0.0096
6    7   V1  0.0111  0.0111   0.0500   0.0111
7    8   V5  0.0954  0.0784   0.0643   0.0610
8    9  V15  0.0784  0.0954   0.0833   0.0572
9   10  V12  0.1238  0.1238   0.1100   0.0563
10  11  V14  0.2015  0.2015   0.1500   0.0672
11  12   V8  0.2453  0.2453   0.2167   0.0566
12  13   V9  0.3651  0.3651   0.3500   0.0522
13  14   V6  0.6455  0.6455   0.7500   0.0430
14  15  V13  0.7094  0.7094   1.0000   0.7094

     beta  beta_se
V4      6        0
V7      4        0
V3      5        0
V11     5        0
V2      0        0
V10     0        0
V1     -0        0


In [33]:
gs = ftab.gamma_F
isinstance(np.array([0.05,0.1]),np.ndarray)
sub = [np.where(gs>y) for y in np.array([0.05,0.1])]
sub
ss = [min(x[0]) for x in sub]
ss
min(np.where(gs>0.05)[0])
print alpha_F_g(0.005,gs,15)
print alpha_F_g(np.array([0.05,0.005]),gs,15)
#ss = np.array([max(np.which(x<=0.05)) for x in ftab.gamma_F])+1

0.003
[ 0.05   0.003]


In [34]:
ss = min(np.where(gs>0.005)[0])
ss
vs = ftab.Var[0:ss]

In [35]:
import time
from sklearn import linear_model
import statsmodels.api as sm

In [56]:
start = time.time()

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X2, Y2)

# The coefficients
modvar = regr.coef_

print (time.time() - start)
print
print modvar

0.00100994110107

[[ -9.43689571e-16   5.55111512e-16   5.00000000e+00   6.00000000e+00
   -1.49186219e-15   1.77635684e-15   4.00000000e+00  -2.22044605e-15
   -6.66133815e-16  -1.88737914e-15   5.00000000e+00   1.55431223e-15
   -2.66453526e-15   7.21644966e-16   5.16253706e-15]]


In [37]:
start = time.time()

mod = sm.OLS(Y2, X2)
rs = mod.fit()
modvar2 = rs.params
#modvarse2 = rs.bse

print (time.time() - start)
print
print modvar2

0.00221705436707

V1     4.440892e-16
V2    -1.387779e-15
V3     5.000000e+00
V4     6.000000e+00
V5     1.720846e-15
V6     2.220446e-16
V7     4.000000e+00
V8     6.661338e-16
V9     4.420075e-15
V10   -3.164136e-15
V11    5.000000e+00
V12   -7.771561e-16
V13   -1.505740e-15
V14    6.383782e-16
V15   -6.661338e-16
dtype: float64


In [38]:
# to add intercept:

X3 = sm.add_constant(X2)
X4 = X2.copy()
X4.insert(0,'int',1)
print X4.shape, X3.shape, X2.shape
print X4.iloc[:5,:5]
mod = sm.OLS(Y2, X4)
rs = mod.fit()
modvar2 = rs.params
#modvarse2 = rs.bse

print
print modvar2

(100, 16) (100, 16) (100, 15)
   int        V1        V2        V3        V4
0    1  0.471435 -1.190976  1.432707 -0.312652
1    1  0.002118  0.405453  0.289092  1.321158
2    1 -0.397840  0.337438  1.047579  1.045938
3    1 -0.897157 -0.136795  0.018289  0.755414
4    1 -0.974236 -0.070345  0.307969 -0.208499

int    2.345346e-15
V1    -4.440892e-16
V2    -1.609823e-15
V3     5.000000e+00
V4     6.000000e+00
V5     1.221245e-15
V6     7.938095e-15
V7     4.000000e+00
V8     3.219647e-15
V9     2.782496e-15
V10    3.275158e-15
V11    5.000000e+00
V12   -1.332268e-15
V13   -4.725387e-15
V14   -4.315992e-15
V15   -3.774758e-15
dtype: float64


In [39]:
# Fit a Gaussian GLM (with intercept, via the formula interface) to a noisy
# copy of the predictors.

# Add independent uniform [0, 1) noise to every entry of X2.  Passing the shape
# directly draws the same values as drawing a flat vector and reshaping it.
X3 = X2 + np.random.random(X2.shape)
dd = pd.concat([Y2, X3], axis=1)

# Rename the response column to 'Y' by assigning a fresh column index.
# Writing into dd.columns.values mutates the Index's backing array in place,
# which pandas does not support (an Index is meant to be immutable), so the
# rename is not guaranteed to take effect.
dd.columns = ['Y'] + list(dd.columns[1:])

# Build 'Y ~ V1 + V2 + ...' from the actual predictor names instead of
# hard-coding all of the terms.
formula = 'Y ~ ' + ' + '.join(str(col) for col in X3.columns)
fit2 = sm.GLM.from_formula(formula, data=dd, family=sm.families.Gaussian()).fit()
fit2.summary()

0,1,2,3
Dep. Variable:,Y,No. Observations:,100.0
Model:,GLM,Df Residuals:,84.0
Model Family:,Gaussian,Df Model:,15.0
Link Function:,identity,Scale:,9.25311095482
Method:,IRLS,Log-Likelihood:,-244.42
Date:,"Wed, 15 Apr 2015",Deviance:,777.26
Time:,17:32:51,Pearson chi2:,777.0
No. Iterations:,3,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-8.6063,0.684,-12.580,0.000,-9.947 -7.265
V1,-0.4899,0.319,-1.535,0.125,-1.115 0.135
V2,-0.0009,0.361,-0.002,0.998,-0.709 0.708
V3,4.7984,0.333,14.405,0.000,4.146 5.451
V4,5.4745,0.312,17.562,0.000,4.863 6.085
V5,-0.0869,0.292,-0.298,0.766,-0.660 0.486
V6,0.0632,0.334,0.189,0.850,-0.591 0.717
V7,3.8846,0.339,11.446,0.000,3.219 4.550
V8,-0.3027,0.322,-0.941,0.347,-0.933 0.328


In [40]:
# Fit the Gaussian GLM again, this time passing the response and design
# directly instead of using a formula.  X3 carries no constant column here, so
# this model has no intercept term (the formula-based fit in the previous cell
# does), which is why the two summaries differ.
fit2 = sm.GLM(Y2,X3,family=sm.families.Gaussian()).fit()
fit2.summary()

0,1,2,3
Dep. Variable:,0,No. Observations:,100.0
Model:,GLM,Df Residuals:,85.0
Model Family:,Gaussian,Df Model:,14.0
Link Function:,identity,Scale:,26.3716116733
Method:,IRLS,Log-Likelihood:,-297.38
Date:,"Wed, 15 Apr 2015",Deviance:,2241.6
Time:,17:32:56,Pearson chi2:,2240.0
No. Iterations:,3,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
V1,-1.4800,0.522,-2.835,0.005,-2.503 -0.457
V2,-1.3713,0.582,-2.357,0.018,-2.512 -0.231
V3,3.4876,0.534,6.530,0.000,2.441 4.534
V4,4.7535,0.517,9.189,0.000,3.740 5.767
V5,-0.9667,0.479,-2.019,0.044,-1.905 -0.028
V6,-1.5131,0.522,-2.896,0.004,-2.537 -0.489
V7,3.3028,0.568,5.819,0.000,2.190 4.415
V8,-1.3393,0.525,-2.550,0.011,-2.369 -0.310
V9,-0.4280,0.546,-0.784,0.433,-1.498 0.642


In [62]:
# Element-wise check that the fitted slopes in rs agree with regr.coef_.
# rs.params[1:] skips the first coefficient (presumably the 'int' intercept
# from the fit that included the constant column - confirm which fit rs holds
# when this cell is run).  reshape(1, -1) infers the number of predictors
# rather than hard-coding it.
np.isclose(np.array(rs.params[1:]).reshape(1,-1),regr.coef_)

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True]], dtype=bool)

In [42]:
# Display the contents of vs as a plain Python list.
# NOTE(review): vs is defined outside this section; it is later used to select
# columns of X2, so it presumably holds the selected variable names.
list(vs)

['V4', 'V7', 'V3', 'V11', 'V2']

In [43]:
btests = beta_est(X2, Y2, 0.05, gs, vs)
print btests
vps = pd.DataFrame(np.zeros(15),index=X2.columns.values)
#print vps
vps.loc[btests.index] += 1
print vps
print type(btests.index)

             beta       beta_se
V4   6.000000e+00  9.710566e-16
V7   4.000000e+00  9.734326e-16
V3   5.000000e+00  9.458140e-16
V11  5.000000e+00  1.149754e-15
V2   4.884981e-15  1.037405e-15
     0
V1   0
V2   1
V3   1
V4   1
V5   0
V6   0
V7   1
V8   0
V9   0
V10  0
V11  1
V12  0
V13  0
V14  0
V15  0
<class 'pandas.core.index.Index'>


In [44]:
# Refit regr on only the columns named in vs, then tabulate the estimates and
# standard errors held in rs.
#X2.loc[:,'V1']
regr.fit(X2.loc[:,list(vs)], Y2)

# The coefficients
# (from regr, i.e. the model restricted to the vs columns)
modvar = regr.coef_
print modvar
#print rs.params
# NOTE: the table below is built from rs, not from regr, so it reflects
# whichever fit rs currently holds rather than the restricted model.
# Stack params and standard errors as rows, then transpose so each
# coefficient becomes one row with two columns.
dftest = pd.DataFrame([rs.params,rs.bse]).T
dftest.columns = ['beta','beta_se']
print dftest
# Round to 5 decimals so floating-point noise around zero prints as 0.
print np.around(dftest,5)

[[  6.00000000e+00   4.00000000e+00   5.00000000e+00   5.00000000e+00
   -1.33226763e-15]]
             beta       beta_se
int  2.345346e-15  1.950624e-15
V1  -4.440892e-16  2.018584e-15
V2  -1.609823e-15  2.210142e-15
V3   5.000000e+00  1.953846e-15
V4   6.000000e+00  1.926021e-15
V5   1.221245e-15  1.843307e-15
V6   7.938095e-15  1.976243e-15
V7   4.000000e+00  2.069584e-15
V8   3.219647e-15  2.053536e-15
V9   2.782496e-15  2.009177e-15
V10  3.275158e-15  2.094942e-15
V11  5.000000e+00  2.427111e-15
V12 -1.332268e-15  1.962414e-15
V13 -4.725387e-15  1.714302e-15
V14 -4.315992e-15  1.687020e-15
V15 -3.774758e-15  1.993998e-15
     beta  beta_se
int     0        0
V1     -0        0
V2     -0        0
V3      5        0
V4      6        0
V5      0        0
V6      0        0
V7      4        0
V8      0        0
V9      0        0
V10     0        0
V11     5        0
V12    -0        0
V13    -0        0
V14    -0        0
V15    -0        0


In [45]:
# Run ffsr on (X2, Y2), limiting the result to at most 8 model steps via
# max_size.
# NOTE(review): the third positional argument (0.05) is presumably the target
# false-selection rate - confirm against the ffsr definition.
ffsr(X2,Y2,0.05,max_size=8)

Unnamed: 0,S,Var,p,p_s,alpha_F,gamma_F
0,1,V4,0.0,0.0,0.0071,0.0
1,2,V7,0.0,0.0,0.0115,0.0
2,3,V3,0.0,0.0,0.0167,0.0
3,4,V11,0.0,0.0,0.0227,0.0
4,5,V2,0.0002,0.0002,0.03,0.0003
5,6,V10,0.0075,0.0075,0.0389,0.0096
6,7,V1,0.0111,0.0111,0.05,0.0111
7,8,V5,0.0954,0.0954,0.0643,0.0742


In [46]:
# Run ffsr with a forced-in variable (var_incl) and with betaout=True.
# NOTE(review): var_incl=[5] apparently forces V5 into the model first, and
# betaout=True apparently returns a coefficient table alongside the step
# table - both inferred from the displayed result; confirm against ffsr.
ffsr(X2,Y2,0.05,var_incl=np.array([5]),betaout=True)



(     S  Var       p     p_s  alpha_F  gamma_F
 0    1   V5  0.3135  0.0000   0.0071   0.0000
 1    2   V4  0.0000  0.0000   0.0115   0.0000
 2    3   V3  0.0000  0.0000   0.0167   0.0000
 3    4  V11  0.0000  0.0000   0.0227   0.0000
 4    5   V7  0.0000  0.0291   0.0300   0.0485
 5    6   V2  0.0291  0.1097   0.0389   0.1410
 6    7   V6  0.1097  0.1114   0.0500   0.1114
 7    8   V1  0.1114  0.2168   0.0643   0.1686
 8    9  V13  0.2168  0.2418   0.0833   0.1451
 9   10  V15  0.2418  0.3013   0.1100   0.1370
 10  11   V9  0.3280  0.3135   0.1500   0.1045
 11  12  V12  0.3013  0.3280   0.2167   0.0757
 12  13  V10  0.5214  0.5214   0.3500   0.0745
 13  14  V14  0.5820  0.5820   0.7500   0.0388
 14  15   V8  0.8641  0.8641   1.0000   0.8641,      beta  beta_se
 V5     -0        0
 V4      6        0
 V3      5        0
 V11     5        0
 V7      4        0)

In [47]:
# Combine both options: cap the number of steps at 8 and force variable 5 in.
# NOTE(review): this call uses X / Y rather than X2 / Y2; they are defined
# outside this section - confirm they hold the intended data.
ffsr(X,Y,0.05,max_size=8,var_incl=np.array([5]))



Unnamed: 0,S,Var,p,p_s,alpha_F,gamma_F
0,1,V5,0.3135,0.0,0.0071,0.0
1,2,V4,0.0,0.0,0.0115,0.0
2,3,V3,0.0,0.0,0.0167,0.0
3,4,V11,0.0,0.0,0.0227,0.0
4,5,V7,0.0,0.0291,0.03,0.0485
5,6,V2,0.0291,0.1097,0.0389,0.141
6,7,V6,0.1097,0.1114,0.05,0.1114
7,8,V1,0.1114,0.3135,0.0643,0.2438


In [48]:
blahdat = pd.concat([Y2,X2],axis=1)
save = pd.DataFrame(np.zeros([5,X2.shape[1]]),columns=X2.columns.values)
print ffsr(blahdat.iloc[:,1:],pd.DataFrame(blahdat.iloc[:,0]),0.05,bag=True)[0].index.values
print ffsr(blahdat.iloc[:,1:],pd.DataFrame(blahdat.iloc[:,0]),0.05,bag=True)[0]
save.loc[0,ffsr(blahdat.iloc[:,1:],pd.DataFrame(blahdat.iloc[:,0]),0.05,bag=True)[0].index.values] = np.around(ffsr(blahdat.iloc[:,1:],pd.DataFrame(blahdat.iloc[:,0]),0.05,bag=True)[0].iloc[:,0],8)
print save
crap = ffsr(blahdat.iloc[:,1:],pd.DataFrame(blahdat.iloc[:,0]),0.05,bag=True)[0]
crap.index[np.abs(np.around(crap.iloc[:,0],8))>0]
#np.abs(np.around(crap.iloc[:,0],8))>0
crapnp = pd.DataFrame(np.zeros(X2.shape[1]),index=X2.columns.values)
print crapnp
crapnp.loc[crap.index[np.abs(np.around(crap.iloc[:,0],8))>0]] += 1
print crapnp

['V4' 'V7' 'V3' 'V11' 'V2' 'V10' 'V1']
             beta       beta_se
V4   6.000000e+00  1.672676e-15
V7   4.000000e+00  1.676875e-15
V3   5.000000e+00  1.668047e-15
V11  5.000000e+00  2.016266e-15
V2   1.137979e-14  1.855839e-15
V10  5.481726e-15  1.817640e-15
V1  -1.110223e-15  1.727931e-15
   V1  V2  V3  V4  V5  V6  V7  V8  V9  V10  V11  V12  V13  V14  V15
0  -0   0   5   6   0   0   4   0   0    0    5    0    0    0    0
1   0   0   0   0   0   0   0   0   0    0    0    0    0    0    0
2   0   0   0   0   0   0   0   0   0    0    0    0    0    0    0
3   0   0   0   0   0   0   0   0   0    0    0    0    0    0    0
4   0   0   0   0   0   0   0   0   0    0    0    0    0    0    0
     0
V1   0
V2   0
V3   0
V4   0
V5   0
V6   0
V7   0
V8   0
V9   0
V10  0
V11  0
V12  0
V13  0
V14  0
V15  0
     0
V1   0
V2   0
V3   1
V4   1
V5   0
V6   0
V7   1
V8   0
V9   0
V10  0
V11  1
V12  0
V13  0
V14  0
V15  0


In [49]:
# Run bagfsr on (X, Y) and keep the result for inspection in the next cells.
# NOTE(review): bagfsr is defined outside this section; 0.05 is presumably the
# target false-selection rate - confirm.
b = bagfsr(X,Y,0.05)


In [50]:
# First element of the bagfsr result: a per-variable table (the displayed
# columns are betahat, betase and prop_incl).
b[0]

Unnamed: 0,betahat,betase,prop_incl
V1,0,0,0
V2,0,0,0
V3,5,0,1
V4,6,0,1
V5,0,0,0
V6,0,0,0
V7,4,0,1
V8,0,0,0
V9,0,0,0
V10,0,0,0


In [51]:
# Remaining elements of the bagfsr result (two scalars in the displayed output).
# NOTE(review): presumably an averaged alpha level and model size - confirm
# against the bagfsr definition.
b[1:]

(0.038899999999999997, 6.0)