In [1]:
                            """ Pseudocode for Fast FSR algorithm """

""" Date: 4/28/15 
    Modified: Create super fast FFSR fcn for bagging """



""" Data type check """
def df_type(dat):
    
    """
    ### Purpose: 
     Check if 'dat' is a pandas Dataframe or a numpy Ndarray
    
    ### Input params:
     dat = dataset whose type is to be checked / transformed
    
    ### Output:
     error msg or True boolean
    """
    
    import numpy as np
    import pandas as pd
    
    if isinstance(dat,pd.DataFrame)==False and isinstance(dat,np.ndarray)==False:
        raise Exception("Data must be pandas DataFrame")
    else:
        return True


    
""" p-value computation function """
def pval_comp(max_size=None):
    
    ### Input params:
    #   max_size = integer max no. of vars in final model (largest model size desired)
    
    ### Output:
    # array of p-values of each covariate at its given entry step
    
    ### NOTE: fwd should be a global-env R object (requires running 'forward' fcn prior to this fcn) ###
    
    import numpy as np
    import scipy.stats as st
    import rpy2.robjects as ro
    
    # Pull RSS values & num_obs from fwd_proc object
    rss = np.array(ro.r('fwd$rss'))
    N = np.array(ro.r('fwd$nn'))
    
    if max_size==None:
        max_size = len(rss)-1
    
    # vector of model sizes
    sizes = np.arange(max_size)+1
    
    # compute the F stats as defined above where p_f - p_r = 1 for each iteration
    fstats = (rss[0:max_size] - rss[1:(max_size+1)]) / (rss[1:(max_size+1)] / (N - (sizes+1)))
    
    # return the p-values by comparing these stats to the F distn: F(1, n - p_f)
    return 1 - st.f.cdf(fstats, 1, N-(sizes+1))



""" Covariate model entry order """
def cov_order(xcolnames,max_size=None,col_incl=None):
    
    # Input params:
    #   xcolnames = array of names of covariates (same order as columns in original dataset)
    #   max_size  = integer max no. of vars in final model (largest model size desired)
    #   col_incl  = array vector of columns to forcefully include in all models
    
    ### Output:
    # array of covariate names sorted according to order of entry into the model
    
    ### NOTE: fwd should be a global-env R object (requires running 'forward' fcn prior to this fcn) ###
    
    import numpy as np
    import rpy2.robjects as ro
    
    if max_size==None:
        max_size = len(xcolnames)
        
    ### Pull the cov entry order
    vorder = ro.r('fwd$vorder[-1]') # remove intercept
    vorder = vorder[0:max_size] # keep only the max model size number of covs
    
    ### Shift these values down by two (one to exclude intercept, one to make python indices)
    vorderinds = np.array(vorder)-2
    
    ### Rearrange the var order st forced vars are at start of list
    if col_incl==None:
        col_incl = np.arange(max_size)+1
    keep = xcolnames[[col_incl-1]] # pull var names of those vars forced into model (this is an array)
    poss = [x for x in xcolnames if x not in keep] # pull var names of those not forced in (this is a list)
    col_names = np.array(list(keep)+poss) # = rearranged array of varnames w/forced-in vars at start of list
    
    ### Sort the columns of X in order to obtain the var names in the entry order
    return col_names[vorderinds[::]]

    

""" Forward selection function """
def forward(x,y,max_size=None,col_incl=None):
    
    ### Input params:
    #   x        = python dataframe of original p covariates, n x p
    #   y        = python outcome dataframe, n x 1
    #   max_size = integer max no. of vars in final model (largest model size desired)
    #   col_incl = array vector of columns to forcefully include in all models
    
    ### Output:
    # regsubsets R object -- the raw full output of the forward selection proc
    
    ### Load python packages to call R functions
    import rpy2.robjects as ro
    import pandas.rpy.common as com
    
    ### Convert x and y to R matrices <-- MAKE SURE x,y input == DATAFRAMES (or else change them to df's)!!!
    ### and declare as R objects in global environment
    ro.globalenv['x2'] = com.convert_to_r_matrix(x)
    ro.globalenv['y2'] = com.convert_to_r_matrix(y)
    if max_size==None:
        max_size = x.shape[1]
    ro.globalenv['maxv'] = ro.Vector(max_size)
    if col_incl==None:
        ro.r('coli=NULL')
    else:
        ro.globalenv['coli'] = ro.FloatVector(col_incl[:])
    
    ### Perform forward selection with regsubsets function
    ro.globalenv['fwd'] = ro.r('leaps::regsubsets(x=x2,y=y2,method="forward",nvmax=maxv,force.in=coli)')
    
    
    
""" Gamma computation """
def gamma_F(pvs, ncov, max_size=None):
    
    ### Input params:
    #   pvs      = vector of p-values (monotonically increasing) from forward sel procedure
    #   ncov     = integer total number of covariates in data
    #   max_size = integer max no. of vars in final model (largest model size desired)
    
    ### Output:
    # array of gamma_F values
    
    import numpy as np
    
    if max_size==None:
        max_size = ncov
        
    # Create indices == model size at given step, call this S
    S = np.arange(max_size)+1
    
    # gamma_F_i = p_s_i * (ncov - S_i) / (1 + S_i)
    g_F = pvs * (ncov - S) / (1 + S)
    
    # Check for duplicate p-values
    dups = list(set([x for x in list(pvs) if list(pvs).count(x) > 1]))
    for i in range(len(dups)): g_F[pvs==dups[i]] = min(g_F[pvs==dups[i]])
    
    # if table run on all vars, the last gamma = 0,
    #  instead set equal to the last pv_mono == final rate of unimp var inclusion
    if(g_F[-1]==0): 
        g_F[-1]=pvs[-1]
    
    return g_F

    
    
""" Alpha computation for model selection """
def alpha_F(g0, ncov, max_size=None):
    
    ### Input params:
    #   g0       = float pre-specified FSR (gamma0)
    #   ncov     = integer total number of covariates in data
    #   max_size = integer max no. of vars in final model (largest model size desired)
    
    ### Output:
    # array of alpha_F values
    
    import numpy as np
    
    if max_size==None:
        max_size = ncov
        
    # Create indices == model size at given step, call this S
    S = np.arange(max_size)+1
    
    # alpha_F_i = gamma_0 * (1 + S_i) / (ncov - S_i)
    alpha_F = g0 * (1 + S) / (ncov - S)
    
    # if table run on all vars, the last alpha = inf
    #  instead set equal to 1 == include all vars
    alpha_F[np.isinf(alpha_F)] = 1.
    
    return alpha_F        
    
    
    
""" Alpha computation for specific gamma """
def alpha_F_g(g, gf, ncov):
    
    ### Input params:
    #   g    = float or vector (length k) of specified FSR at which to compute alpha
    #   gf   = vector gamma_F's computed from gamma0, pv_sorted
    #          used to compute largest size model (S) for which gamma_F < g
    #   ncov = integer of total number covariates in data
    
    ### Output:
    # integer alpha_F value
    
    import numpy as np
    
    ### Compute model size for gf closest to (but still <) g
    #S = np.array([max(np.which(x<=y)) for x in gf y in g])+1
    if isinstance(g,np.ndarray): # if g is a vector
        s_s = [np.where(gf>y) for y in g]
        S = np.array([min(x[0]) for x in s_s])
        return g * (1 + S) / (ncov - S)
    else: # if g is a number
        S = min(np.where(gf>g)[0])
        return g * (1 + S) / (ncov - S)


    
""" Beta-hat computation for specific gamma """
def beta_est(x, y, g, gf, vname):
    
    ### Input params:
    #   x      = python dataframe of original p covariates, n x p
    #   y      = python outcome dataframe, n x 1
    #   g      = float of specified FSR at which to compute alpha
    #   gf     = vector gamma_F's computed from gamma0, pv_mono
    #            used to compute largest size model (S) for which gamma_F < g
    #   vname  = ordered vector of names of vars entered into model under forward selection
    
    ### Output:
    # array of estimated parameters
    
    import numpy as np
    import statsmodels.api as sm
    
    ### Compute model size corresponding to g
    S = min(np.where(gf>g)[0])

    ### Pull the cov names of those vars included in the above size model
    modvars = vname[:S]

    ### Fit the linear model using the selected model vars
    fit = sm.OLS(y,x.loc[:,list(modvars)]).fit()
    betaout = pd.DataFrame([fit.params,fit.bse]).T
    betaout.columns = ['beta','beta_se']
    
    return betaout

    
    
""" FSR Results Table """
def fsrtable(size, vname, p_orig, p_mono, alphaf, gammaf, prec_f='.4f'):
    
    ### Input params:
    #   size   = model size at each step of forward sel proc                   [S]
    #   vname  = variable name that entered at each step (num vars = p)        [Var]
    #   p_orig = p-values at each step                                         [p]
    #   p_mono = ascending p-values                                            [p_s]
    #   alphaf = alpha-to-enter (p-value cutoff) for model entry at each step  [alpha_F]
    #   gammaf = FSR at each step                                              [gamma_F]
    #   prec_f = string of precision (num digits) desired in FSR output table
    
    ### Output:
    # table of [S   Var   p   p_s   alpha_F   gamma_F], dim = num_steps(== p) x 6
    
    import numpy as np
    import pandas as pd
    
    ### Round all arrays
    p_od = [format(x,prec_f) for x in p_orig]
    p_md = [format(x,prec_f) for x in p_mono]
    ad = [format(x,prec_f) for x in alphaf]
    gd = [format(x,prec_f) for x in gammaf]
    
    ### Combine the arrays
    tab = pd.DataFrame([size,vname,p_od,p_md,ad,gd]).T
    tab.columns = ['S','Var','p','p_m','alpha_F','gamma_F']
    
    return tab
    
    
    
""" FastFSR function """
def ffsr(dat,g0=0.05,betaout=False,gs=None,max_size=None,var_incl=None,bag=False,prec_f='.4f'):
    
    ### Input params:
    #   dat      = python dataframe of original p covariates, 1 outcome (in first col.): n x p+1
    #   g0       = float pre-specified FSR of interest ("gamma0")
    #   betaout  = boolean of whether to include estimated betahats from final selected model
    #   gs       = float or vector of gamma's at which to specifically compute alpha_F
    #   max_size = integer of largest model size == max num vars to incl in final model (default = num covs in dataset)
    #   var_incl = array of cols corresponding to those vars to force into model
    #   bag      = boolean of whether to output FSR table (non-bagging results) or reduced output for bagging purposes
    #   prec_f   = string of precision (num digits) desired in FSR output table (string to be given to 'format' python fcn)
    
    ### Output: 
    #      (note: gamma = FSR, gamma_0 = pre-specified/desired FSR)
    # Table of [S   Var   p   p_s   alpha_F   gamma_F], dim = num_steps(== p) x 6
    #   S:       model size at given step
    #   Var:     name of var that entered at given step
    #   p:       p-value of var that entered at given step
    #   p_m:     mono. inc. p-value (vector or original p-values arranged to be monotonically increasing)
    #   alpha_F: cutoff value for model entry given gamma_0 and current p_s value
    #   gamma_F: FSR given current alpha_F and model size (== step num)
    #       and
    #   vector of alpha_F's for specified gamma's (g)
    #       and
    #   vector of estimated beta param's for final model (based on g0)

    import numpy as np
    import pandas as pd
    
    ### Clean and check data - make sure dat = pandas dataframes or else convert them
    if bag==False:
        if df_type(dat)==True:
            if isinstance(dat,pd.DataFrame):
                d = dat.copy()
            else:
                if isinstance(dat,np.ndarray):
                    d = pd.DataFrame(dat)
                    vnum = list(np.arange(d.shape[1])+1)
                    vchr = list(np.repeat("V",d.shape[1]))
                    d.columns = [a + str(b) for a,b in zip(vchr,vnum)]
        else:
            return df_type(dat)
        
        ### Remove missing values
        d.dropna(inplace=True)
        
        ### Check that p < n to ensure regression solutions
        if (d.shape[1]-1) >= d.shape[0]:
            raise Exception("N must be > p for valid regression solutions")
    else:
        d = dat.copy()
    
    ### If max model size not specified, select all possible cov.s
    if max_size==None:
        max_size = d.shape[1]-1
        
    ### Perform forward selection
    fwd_sel = forward(d.iloc[:,1:], pd.DataFrame(d.iloc[:,0]), max_size, var_incl)
    
    ### Save order of covariate entry into model
    cov_entry_order = cov_order(d.columns.values[1:], max_size, var_incl)
    
    ### Compute p-value of each covariate entering the model
    p_orig = pval_comp(max_size)
    
    ### Arrange p-values in mono. inc. order
    p_mono = np.array([max(p_orig[:(i+1)]) for i in range(len(p_orig))])
        
    ### Gamma_F computation
    g_F = gamma_F(p_mono, d.shape[1]-1, max_size)
    
    ### Check if betaout desired, if so compute beta_hat of model corresponding to specific gamma0
    if betaout==True or bag==True:
        betahats = beta_est(d.iloc[:,1:], pd.DataFrame(d.iloc[:,0]), g0, g_F, cov_entry_order)
        
    ### Check if bagging desired
    if bag==False: 
        ### Alpha_F computation for all steps in fwd sel proc
        a_F = alpha_F(g0, d.shape[1]-1, max_size)
        
        ### Model size
        S = np.arange(max_size)+1
        
        ### Combine S, Cov_names, p-vals, sorted p-vals, alpha_F, gamma_F into table
        fsr_results = fsrtable(S, cov_entry_order, p_orig, p_mono, a_F, g_F)
        
        ### Return selected output: FSR table (+ betahat) (+ alpha_specific)
        if gs!=None: 
            ### Compute alpha_F for specific gammas (gs)
            if betaout==True:
                return fsr_results, betahats, alpha_F_g(gs, g_F, d.shape[1]-1)
            else:
                return fsr_results, alpha_F_g(gs, g_F, d.shape[1]-1)
        else:
            if betaout==True:
                return fsr_results, betahats
            else:
                return fsr_results
    else:
        return betahats, alpha_F_g(g0, g_F, d.shape[1]-1), len(betahats)
    
    
    
""" FastFSR for bagging function """
def ffsr_bag(dat,g0=0.05,max_size=None,var_incl=None,prec_f='.4f'):
    
    ### NOTE: it is assumed that data has been transformed, cleaned, and is given in correct format
    
    ### Input params:
    #   dat      = python dataframe of original p covariates, 1 outcome (in first col.): n x p+1
    #   g0       = float pre-specified FSR of interest ("gamma0")
    #   betaout  = boolean of whether to include estimated betahats from final selected model
    #   gs       = float or vector of gamma's at which to specifically compute alpha_F
    #   max_size = integer of largest model size == max num vars to incl in final model (default = num covs in dataset)
    #   var_incl = array of cols corresponding to those vars to force into model
    #   bag      = boolean of whether to output FSR table (non-bagging results) or reduced output for bagging purposes
    #   prec_f   = string of precision (num digits) desired in FSR output table (string to be given to 'format' python fcn)
    
    ### Output: 
    #      (note: gamma = FSR, gamma_0 = pre-specified/desired FSR)
    #   vector of estimated beta param's for final model (based on g0)
    #       and
    #   vector of alpha_F's for specified gamma's (g)
    #       and
    #   number of param's in final model

    import numpy as np
    import pandas as pd
    
    ### If max model size not specified, select all possible cov.s
    if max_size==None:
        max_size = dat.shape[1]-1
        
    ### Perform forward selection
    fwd_sel = forward(dat.iloc[:,1:], pd.DataFrame(dat.iloc[:,0]), max_size, var_incl)
    
    ### Save order of covariate entry into model
    cov_entry_order = cov_order(dat.columns.values[1:], max_size, var_incl)
    
    ### Compute p-value of each covariate entering the model
    p_orig = pval_comp(max_size)
    
    ### Arrange p-values in mono. inc. order
    p_mono = np.array([max(p_orig[:(i+1)]) for i in range(len(p_orig))])
        
    ### Gamma_F computation
    g_F = gamma_F(p_mono, dat.shape[1]-1, max_size)
    
    ### Compute beta_hat of model corresponding to specific gamma0
    betahats = beta_est(dat.iloc[:,1:], pd.DataFrame(dat.iloc[:,0]), g0, g_F, cov_entry_order)
        
    return betahats, alpha_F_g(g0, g_F, dat.shape[1]-1), len(betahats)

    
    
def bagfsr(dat,g0,B=200,max_s=None,v_incl=None,prec=4):
    
    ### Input params:
    #   dat    = python dataframe of original p covariates, 1 outcome (in first col.): n x p+1
    #   g0     = float pre-specified FSR of interest ("gamma0")
    #   B      = integer of number of bagged samples
    #   max_s  = integer of largest model size == max num vars to incl in final model (default = num covs in dataset)
    #   v_incl = array of cols corresponding to those vars to force into model
    #   prec   = integer of precision (num digits) desired in beta-hat parameter estimates of final model
    
    ### Output: 
    #   Mean of betahats
    #   SEs of betahats
    #   Avg alpha-to-enter
    #   Avg model size
    #   Prop of times each var included in model
    
    import numpy as np
    import pandas as pd
    from sklearn.utils import resample
    
    ### Clean and check data - make sure X, Y = pandas dataframes or else convert them
    if df_type(dat)==True:
        if isinstance(dat,pd.DataFrame):
            d = dat.copy()
        else:
            if isinstance(dat,np.ndarray):
                d = pd.DataFrame(dat)
                vnum = list(np.arange(d.shape[1])+1)
                vchr = list(np.repeat("V",d.shape[1]))
                d.columns = [a + str(b) for a,b in zip(vchr,vnum)]
    else:
        return df_type(dat)
    
    ### Remove missing values
    d.dropna(inplace=True)
    
    ### check that p < n to ensure regression solutions
    if (d.shape[1]-1) >= d.shape[0]:
        raise Exception("N must be > p for valid regression solutions")
    
    ### Create array to keep track of number of times vars enter model
    nentries = pd.DataFrame(np.zeros(d.shape[1]-1),index=d.columns.values[1:])
    
    ### Create array to store all estimated coefficients, ses, alphas, sizes
    allbetas = pd.DataFrame(np.zeros([B,(d.shape[1]-1)]),columns=d.columns.values[1:])
    allses = allbetas.copy()
    alphas = []
    sizes = []
    np.random.seed(1234)
    
    ### Bagging loops
    for i in range(B):
        # Draw with replacement from rows of data
        newdat = pd.DataFrame(resample(d, replace=True))
        newdat.columns = d.columns.values
        
        ### Obtain FSR results
        fsrout = ffsr_bag(newdat,g0,max_size=max_s,var_incl=v_incl)
        allbetas.loc[i,fsrout[0].index.values] = fsrout[0].iloc[:,0]
        allses.loc[i,fsrout[0].index.values] = fsrout[0].iloc[:,1]
        alphas.append(fsrout[1])
        sizes.append(fsrout[2])

        ### Update counts of num times var included
        nentries.loc[fsrout[0].index[np.abs(np.around(fsrout[0].iloc[:,0],prec))>0]] += 1
        
    ### Compute averages
    avgbeta = allbetas.mean(axis=0) # mean across rows / colmeans == mean of each cov's betahat
    avgse = allses.mean(axis=0)
    avgalpha = np.mean(alphas)
    avgsize = np.mean(sizes)
    var_props = nentries/float(B)
    cov_res = pd.concat([avgbeta,avgse,var_props],axis=1)
    cov_res.columns = ['betahat','betase','prop_incl']
    
    return cov_res, avgalpha, avgsize
    
    

# Notes: 
# 1. appropriate transformations are expected to have been applied prior to utilization of FSR algorithm

""" Draft 13 module defined at bottom of notebook """

In [4]:
###########################################################

In [7]:
import numpy as np
import pandas as pd

np.random.seed(1234)

X = np.random.multivariate_normal(np.zeros(15),np.eye(15),(100))
beta = np.array([0,0,5,6,0,0,4,0,0,0,5,0,0,0,0]).reshape(15,1) # signif betas: 3,4,7,11
Y = X.dot(beta)

In [8]:
Y2 = pd.DataFrame(Y)
X2 = pd.DataFrame(X)
X2.columns = ["V1","V2","V3","V4","V5","V6","V7","V8","V9","V10","V11","V12","V13","V14","V15"]

In [9]:
d = pd.concat([Y2,X2],axis=1)

In [5]:
##### Check speed of ffsr_bag vs ffsr
%timeit ffsr(d,0.05)
%timeit ffsr_bag(d,0.05)

The slowest run took 26.17 times longer than the fastest. This could mean that an intermediate result is being cached 
1 loops, best of 3: 19.4 ms per loop
The slowest run took 7.13 times longer than the fastest. This could mean that an intermediate result is being cached 
100 loops, best of 3: 18.6 ms per loop


In [6]:
%load_ext line_profiler

In [7]:
lstats = %lprun -r -f ffsr_bag ffsr_bag(d,0.05)
lstats.print_stats()

Timer unit: 1e-06 s

Total time: 0.037304 s
File: <ipython-input-1-0eacd6692129>
Function: ffsr_bag at line 391

Line #      Hits         Time  Per Hit   % Time  Line Contents
   391                                           def ffsr_bag(dat,g0=0.05,max_size=None,var_incl=None,prec_f='.4f'):
   392                                               
   393                                               ### NOTE: it is assumed that data has been transformed, cleaned, and is given in correct format
   394                                               
   395                                               ### Input params:
   396                                               #   dat      = python dataframe of original p covariates, 1 outcome (in first col.): n x p+1
   397                                               #   g0       = float pre-specified FSR of interest ("gamma0")
   398                                               #   betaout  = boolean of whether to include estimated betahats f

In [8]:
#### The time suck occurs primarily in the forward function (need faster alternative)

In [None]:
# Could try defining a forward selection proc in Python
# but would this really speed up the code? Not with large (p) datasets since this still steps through all p cov's

# Need to translate the Fortran Leaps functions which rely on the 'efficient' branch-and-bound algorithm in order to perform
# forward selection

def seq_forw_select(features, max_k, criterion_func, print_steps=False):
    """
    Implementation of a Sequential Forward Selection algorithm.
    
    Keyword Arguments:
        features (list): The feature space as a list of features.
        max_k: Termination criterion; the size of the returned feature subset.
        criterion_func (function): Function that is used to evaluate the
            performance of the feature subset.
        print_steps (bool): Prints the algorithm procedure if True.
    
    Returns the selected feature subset, a list of features of length max_k.

    """
    
    # Initialization
    feat_sub = [features[0]]
    k = 0
    d = len(features)
    if max_k > d:
        max_k = d
    
    while True:
        
        # Inclusion step
        if print_steps:
            print('\nInclusion from feature space', features)
        crit_func_max = criterion_func(feat_sub + [features[1]])
        best_feat = features[1]
        for x in features[2:]:
            crit_func_eval = criterion_func(feat_sub + [x])
            if crit_func_eval < crit_func_max:
                crit_func_max = crit_func_eval
                best_feat = x
        feat_sub.append(best_feat)
        if print_steps:
            print('include: {} -> feature subset: {}'.format(best_feat, feat_sub))
        features.remove(best_feat)
        
        # Termination condition
        k = len(feat_sub)
        if k == max_k:
            break
                
    return feat_sub



def criterion_f(f):

    x = X2.copy()
    x.insert(0,'int',1.)
    y = Y2

    import statsmodels.api as sm

    mod = sm.OLS(y, x.loc[:,f])
    rs = mod.fit()
    
    return rs.pvalues[-1]


In [None]:
# feats = list(X2)

# X3 = X2.copy()
# X3.insert(0,'int',1.)

# b = ['int'] + [feats[3]]
# mod = sm.OLS(Y2, X3.loc[:,b])
# rs = mod.fit()
# print rs.pvalues[-1]
# print criterion_f(b)

# b = b + [feats[2]]
# mod = sm.OLS(Y2, X2.loc[:,b])
# rs = mod.fit()
# print rs.pvalues
# print criterion_f(b)

# b = b + [feats[6]]
# mod = sm.OLS(Y2, X3.loc[:,b])
# rs = mod.fit()
# print rs.pvalues
# print criterion_f(b)

# b = b + [feats[2]]
# mod = sm.OLS(Y2, X3.loc[:,b])
# rs = mod.fit()
# print rs.pvalues
# print criterion_f(b)

# b = b + [feats[10]]
# mod = sm.OLS(Y2, X3.loc[:,b])
# rs = mod.fit()
# print rs.pvalues
# print criterion_f(b)

# b = b + [feats[1]]
# mod = sm.OLS(Y2, X3.loc[:,b])
# rs = mod.fit()
# print rs.pvalues
# print criterion_f(b)

# feats = list(X3)
# print feats
# seq_forw_select(feats, 6, criterion_f, print_steps=True)

# import statsmodels.api as sm
# mod = sm.OLS(Y2, X2.loc[:,b])
# rs = mod.fit()
# rs.pvalues

In [None]:
#### Attempt at using cython_gsl in order to call linear fitting fcn from GNU Scientific Library

In [32]:
%load_ext Cython

In [33]:
%%cython -l gsl -l gslcblas

cimport cython
from cython_gsl cimport *

def main ():
    cdef int i, n
    n = 4
    cdef double x[4], y[4], w[4]
    x[0] =  1970
    x[1] = 1980
    x[2] = 1990
    x[3] = 2000
    y[0] = 12
    y[1] = 11
    y[2] = 14
    y[3] = 13
    w[0] = 0.1
    w[1] = 0.2
    w[2] = 0.3
    w[3] = 0.4

    cdef double c0, c1, cov00, cov01, cov11, chisq

    gsl_fit_wlinear (x, 1, w, 1, y, 1, n,
                     &c0, &c1, &cov00, &cov01, &cov11,
                     &chisq)

    print "# best fit: Y = %g + %g X\n" % (c0, c1)
    print "# covariance matrix:\n"
    print "# [ %g, %g\n#   %g, %g]\n" % (cov00, cov01, cov01, cov11)
    print "# chisq = %g\n", chisq

    for i from 0 <= i < n:
        print "data: %g %g %g\n" % ( x[i], y[i], 1/sqrt(w[i]))

    print "\n"

    cdef double xf, yf, yf_err
    for i from -30 <= i < 130:
        xf = x[0] + (i/100.0) * (x[n-1] - x[0])

        gsl_fit_linear_est (xf, c0, c1, cov00, cov01, cov11, &yf, &yf_err)

        print "fit: %g %g\n" %(xf, yf)
        print "hi : %g %g\n" %(xf, yf + yf_err)
        print "lo : %g %g\n" %(xf, yf - yf_err)



CompileError: command 'gcc' failed with exit status 1

In [35]:
%load_ext Cython
from matplotlib import pyplot as plt

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [36]:
%%cython -lgsl -lgslcblas
'''
Gibbs sampler for function:

f(x,y) = x x^2 \exp(-xy^2 - y^2 + 2y - 4x)

using conditional distributions:

x|y \sim Gamma(3, y^2 +4)
y|x \sim Normal(\frac{1}{1+x}, \frac{1}{2(1+x)})

Original version written by Flavio Coelho.
Tweaked by Chris Fonnesbeck.
Ported to CythonGSL Thomas V. Wiecki.
'''
cimport cython
from cython_gsl cimport *

import numpy as np
cimport numpy as np

from libc.math cimport sqrt

cdef gsl_rng *r = gsl_rng_alloc(gsl_rng_mt19937)

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
def gibbs(int N=20000, int thin=500):
    cdef: 
        double x = 0
        double y = 0
        Py_ssize_t i, j
        np.ndarray[np.float64_t, ndim=2] samples = np.empty((N, 2), dtype=np.float64)

    for i in range(N):
        for j in range(thin):
            x = gsl_ran_gamma(r, 3, 1.0 / (y * y + 4))
            y = gsl_ran_gaussian(r, 1.0 / sqrt(x + 1))
        samples[i, 0] = x
        samples[i, 1] = y
    return samples

CompileError: command 'gcc' failed with exit status 1

In [1]:
%%file ffsr5_d13.py
""" Pseudocode for Fast FSR algorithm """

""" Date: 4/28/15 
    Modified: Create super fast FFSR fcn for bagging """



""" Data type check """
def df_type(dat):
    
    """
    ### Purpose: 
    # Check if 'dat' is a pandas Dataframe or a numpy Ndarray
    
    ### Input params:
    #   dat = dataset whose type is to be checked / transformed
    
    ### Output:
    # error msg or True boolean
    """
    
    import numpy as np
    import pandas as pd
    
    if isinstance(dat,pd.DataFrame)==False and isinstance(dat,np.ndarray)==False:
        raise Exception("Data must be pandas DataFrame")
    else:
        return True


    
""" p-value computation function """
def pval_comp(max_size=None):
    
    """
    ### Purpose:
     Compute the sequential p-values of the variables added to the model
    
    ### Input params:
    #   max_size = integer max no. of vars in final model (largest model size desired)
    
    ### Output:
    # array of p-values of each covariate at its given entry step
    
    ### NOTE: fwd should be a global-env R object 
    ###       (requires running 'forward' fcn prior to this fcn)
    """
    
    import numpy as np
    import scipy.stats as st
    import rpy2.robjects as ro
    
    # Pull RSS values & num_obs from fwd_proc object
    rss = np.array(ro.r('fwd$rss'))
    N = np.array(ro.r('fwd$nn'))
    
    if max_size==None:
        max_size = len(rss)-1
    
    # vector of model sizes
    sizes = np.arange(max_size)+1
    
    # compute the F stats as defined above where p_f - p_r = 1 for each iteration
    fstats = (rss[0:max_size] - rss[1:(max_size+1)]) / (rss[1:(max_size+1)] / (N - (sizes+1)))
    
    # return the p-values by comparing these stats to the F distn: F(1, n - p_f)
    return 1 - st.f.cdf(fstats, 1, N-(sizes+1))



""" Covariate model entry order """
def cov_order(xcolnames,max_size=None,col_incl=None):
    
    """
    ### Purpose: 
    # Determine order of covariate entry into final model
    
    ### Input params:
    # xcolnames = array of names of covariates (same order as columns in original dataset)
    # max_size  = integer max no. of vars in final model (largest model size desired)
    # col_incl  = array vector of columns to forcefully include in all models
    
    ### Output:
    # array of covariate names sorted according to order of entry into the model
    
    ### NOTE: fwd should be a global-env R object 
    ###       (requires running 'forward' fcn prior to this fcn)
    """
    
    import numpy as np
    import rpy2.robjects as ro
    
    if max_size==None:
        max_size = len(xcolnames)
        
    ### Pull the cov entry order
    vorder = ro.r('fwd$vorder[-1]') # remove intercept
    vorder = vorder[0:max_size] # keep only the max model size number of covs
    
    ### Shift these values down by two (one to exclude intercept, one to make python indices)
    vorderinds = np.array(vorder)-2
    
    ### Rearrange the var order st forced vars are at start of list
    if col_incl==None:
        col_incl = np.arange(max_size)+1
    keep = xcolnames[[col_incl-1]] # pull var names of those vars forced into model (this is an array)
    poss = [x for x in xcolnames if x not in keep] # pull var names of those not forced in (this is a list)
    col_names = np.array(list(keep)+poss) # = rearranged array of varnames w/forced-in vars at start of list
    
    ### Sort the columns of X in order to obtain the var names in the entry order
    return col_names[vorderinds[::]]

    

""" Forward selection function """
def forward(x,y,max_size=None,col_incl=None):
    
    """
    ### Purpose:
    # Perform the forward selection procedure via the R function leaps::regsubsets
    
    ### Input params:
    #   x        = python dataframe of original p covariates, n x p
    #   y        = python outcome dataframe, n x 1
    #   max_size = integer max no. of vars in final model (largest model size desired)
    #   col_incl = array vector of columns to forcefully include in all models
    
    ### Output:
    # regsubsets R object -- the raw full output of the forward selection proc
    """
    
    ### Load python packages to call R functions
    import rpy2.robjects as ro
    import pandas.rpy.common as com
    
    ### Convert x and y to R matrices <-- MAKE SURE x,y input == DATAFRAMES (or else change them to df's)!!!
    ### and declare as R objects in global environment
    ro.globalenv['x2'] = com.convert_to_r_matrix(x)
    ro.globalenv['y2'] = com.convert_to_r_matrix(y)
    if max_size==None:
        max_size = x.shape[1]
    ro.globalenv['maxv'] = ro.Vector(max_size)
    if col_incl==None:
        ro.r('coli=NULL')
    else:
        ro.globalenv['coli'] = ro.FloatVector(col_incl[:])
    
    ### Perform forward selection with regsubsets function
    ro.globalenv['fwd'] = ro.r('leaps::regsubsets(x=x2,y=y2,method="forward",nvmax=maxv,force.in=coli)')
    
    
    
""" Gamma computation """
def gamma_F(pvs, ncov, max_size=None):
    
    """
    ### Purpose:
    # Compute the gamma (FSR) values at each step in the model build procedure
     
    ### Input params:
    #   pvs      = vector of p-values (monotonically increasing) from forward sel procedure
    #   ncov     = integer total number of covariates in data
    #   max_size = integer max no. of vars in final model (largest model size desired)
    
    ### Output:
    # array of gamma_F values
    """    
    
    import numpy as np
    
    if max_size==None:
        max_size = ncov
        
    # Create indices == model size at given step, call this S
    S = np.arange(max_size)+1
    
    # gamma_F_i = p_s_i * (ncov - S_i) / (1 + S_i)
    g_F = pvs * (ncov - S) / (1 + S)
    
    # Check for duplicate p-values
    dups = list(set([x for x in list(pvs) if list(pvs).count(x) > 1]))
    for i in range(len(dups)): g_F[pvs==dups[i]] = min(g_F[pvs==dups[i]])
    
    # if table run on all vars, the last gamma = 0,
    #  instead set equal to the last pv_mono == final rate of unimp var inclusion
    if(g_F[-1]==0): 
        g_F[-1]=pvs[-1]
    
    return g_F

    
    
""" Alpha computation for model selection """
def alpha_F(g0, ncov, max_size=None):
    
    """
    ### Purpose:
     Compute alpha-to-enter value corresponding to each step in model build procedure
     
    ### Input params:
    #   g0       = float pre-specified FSR (gamma0)
    #   ncov     = integer total number of covariates in data
    #   max_size = integer max no. of vars in final model (largest model size desired)
    
    ### Output:
    # array of alpha_F values
    """    
    
    import numpy as np
    
    if max_size==None:
        max_size = ncov
        
    # Create indices == model size at given step, call this S
    S = np.arange(max_size)+1
    
    # alpha_F_i = gamma_0 * (1 + S_i) / (ncov - S_i)
    alpha_F = g0 * (1 + S) / (ncov - S)
    
    # if table run on all vars, the last alpha = inf
    #  instead set equal to 1 == include all vars
    alpha_F[np.isinf(alpha_F)] = 1.
    
    return alpha_F        
    
    
    
""" Alpha computation for specific gamma """
def alpha_F_g(g, gf, ncov):
    
    """
    ### Purpose:
    # Compute alpha-to-enter for a pre-specified gamma (FSR)
     
    ### Input params:
    #   g    = float or vector (length k) of specified FSR at which to compute alpha
    #   gf   = vector gamma_F's computed from gamma0, pv_sorted
            used to compute largest size model (S) for which gamma_F < g
    #   ncov = integer of total number covariates in data
    
    ### Output:
    # integer alpha_F value
    """
    
    import numpy as np
    
    ### Compute model size for gf closest to (but still <) g
    #S = np.array([max(np.which(x<=y)) for x in gf y in g])+1
    if isinstance(g,np.ndarray): # if g is a vector
        s_s = [np.where(gf>y) for y in g]
        S = np.array([min(x[0]) for x in s_s])
        return g * (1 + S) / (ncov - S)
    else: # if g is a number
        S = min(np.where(gf>g)[0])
        return g * (1 + S) / (ncov - S)


    
""" Beta-hat computation for specific gamma """
def beta_est(x, y, g, gf, vname):
    
    """
    ### Purpose:
    # Compute parameter estimates for final model given a pre-specified gamma (FSR)
     
    ### Input params:
    #   x      = python dataframe of original p covariates, n x p
    #   y      = python outcome dataframe, n x 1
    #   g      = float of specified FSR at which to compute alpha
    #   gf     = vector gamma_F's computed from gamma0, pv_mono
                 used to compute largest size model (S) for which gamma_F < g
    #   vname  = ordered vector of names of vars entered into model under forward selection
    
    ### Output:
    # array of estimated parameters
    """
    
    import numpy as np
    import statsmodels.api as sm
    
    ### Compute model size corresponding to g
    S = min(np.where(gf>g)[0])

    ### Pull the cov names of those vars included in the above size model
    modvars = vname[:S]

    ### Fit the linear model using the selected model vars
    fit = sm.OLS(y,x.loc[:,list(modvars)]).fit()
    betaout = pd.DataFrame([fit.params,fit.bse]).T
    betaout.columns = ['beta','beta_se']
    
    return betaout

    
    
""" FSR Results Table """
def fsrtable(size, vname, p_orig, p_mono, alphaf, gammaf, prec_f='.4f'):
    
    """
    ### Purpose:
    # Build the results table for the ffsr function
     
    ### Input params:
    #   size   = model size at each step of forward sel proc                   [S]
    #   vname  = variable name that entered at each step (num vars = p)        [Var]
    #   p_orig = p-values at each step                                         [p]
    #   p_mono = ascending p-values                                            [p_s]
    #   alphaf = alpha-to-enter (p-value cutoff) for model entry at each step  [alpha_F]
    #   gammaf = FSR at each step                                              [gamma_F]
    #   prec_f = string of precision (num digits) desired in FSR output table
    
    ### Output:
    # table of [S   Var   p   p_s   alpha_F   gamma_F], dim = num_steps(== p) x 6
    """
    
    import numpy as np
    import pandas as pd
    
    ### Round all arrays
    p_od = [format(x,prec_f) for x in p_orig]
    p_md = [format(x,prec_f) for x in p_mono]
    ad = [format(x,prec_f) for x in alphaf]
    gd = [format(x,prec_f) for x in gammaf]
    
    ### Combine the arrays
    tab = pd.DataFrame([size,vname,p_od,p_md,ad,gd]).T
    tab.columns = ['S','Var','p','p_m','alpha_F','gamma_F']
    
    return tab



# class ffsr_obj(object):
#     if bag==False:
#         if gs!=None: 
#             if betaout==True:
#                 def __init__(self, fsr_results, betahats, gs):
#                     self.fsres = fsr_results
#                     self.beta = betahats
#                     self.alpha = alpha_F_g(gs, g_F, d.shape[1]-1)
#             else:
#                 def __init__(self, fsr_results, gs):
#                     self.fsres = fsr_results
#                     self.alpha = alpha_F_g(gs, g_F, d.shape[1]-1)
#         else:
#             if betaout==True:
#                 def __init__(self, fsr_results, betahats):
#                     self.fsres = fsr_results
#                     self.beta = betahats
#             else:
#                 def __init__(self, fsr_results):
#                     self.fsres = fsr_results
#     else:
#         def __init__(self, betahats, g0):
#             self.beta = betahats
#             self.alpha = alpha_F_g(g0, g_F, d.shape[1]-1)
#             self.size = len(betahats)

    
    
""" FastFSR function """
def ffsr(dat,g0=0.05,betaout=False,gs=None,max_size=None,var_incl=None,bag=False,prec_f='.4f'):
    
    """
    ### Purpose:
     Perform the Fast False Selection Rate procedure with linear regression
     
    ### Input params:
    #   dat      = python dataframe of original p covariates, 1 outcome (in first col.): n x p+1
    #   g0       = float pre-specified FSR of interest ("gamma0")
    #   betaout  = boolean of whether to include estimated betahats from final selected model
    #   gs       = float or vector of gamma's at which to specifically compute alpha_F
    #   max_size = integer of largest model size == max num vars to incl in final model (default = num covs in dataset)
    #   var_incl = array of cols corresponding to those vars to force into model
    #   bag      = boolean of whether to output FSR table (non-bagging results) or reduced output for bagging purposes
    #   prec_f   = string of precision (num digits) desired in FSR output table (string to be given to 'format' python fcn)
    
    ### Output: 
    #      (note: gamma = FSR, gamma_0 = pre-specified/desired FSR)
    # Table of [S   Var   p   p_s   alpha_F   gamma_F], dim = num_steps(== p) x 6
    #   S:       model size at given step
    #   Var:     name of var that entered at given step
    #   p:       p-value of var that entered at given step
    #   p_m:     mono. inc. p-value (vector or original p-values arranged to be monotonically increasing)
    #   alpha_F: cutoff value for model entry given gamma_0 and current p_s value
    #   gamma_F: FSR given current alpha_F and model size (== step num)
    #       and
    #   vector of alpha_F's for specified gamma's (g)
    #       and
    #   vector of estimated beta param's for final model (based on g0)
    """
    
    import numpy as np
    import pandas as pd
    
    ### Clean and check data - make sure dat = pandas dataframes or else convert them
    if bag==False:
        if df_type(dat)==True:
            if isinstance(dat,pd.DataFrame):
                d = dat.copy()
            else:
                if isinstance(dat,np.ndarray):
                    d = pd.DataFrame(dat)
                    vnum = list(np.arange(d.shape[1])+1)
                    vchr = list(np.repeat("V",d.shape[1]))
                    d.columns = [a + str(b) for a,b in zip(vchr,vnum)]
        else:
            return df_type(dat)
        
        ### Remove missing values
        d.dropna(inplace=True)
        
        ### Check that p < n to ensure regression solutions
        if (d.shape[1]-1) >= d.shape[0]:
            raise Exception("N must be > p for valid regression solutions")
    else:
        d = dat.copy()
    
    ### If max model size not specified, select all possible cov.s
    if max_size==None:
        max_size = d.shape[1]-1
        
    ### Perform forward selection
    fwd_sel = forward(d.iloc[:,1:], pd.DataFrame(d.iloc[:,0]), max_size, var_incl)
    
    ### Save order of covariate entry into model
    cov_entry_order = cov_order(d.columns.values[1:], max_size, var_incl)
    
    ### Compute p-value of each covariate entering the model
    p_orig = pval_comp(max_size)
    
    ### Arrange p-values in mono. inc. order
    p_mono = np.array([max(p_orig[:(i+1)]) for i in range(len(p_orig))])
        
    ### Gamma_F computation
    g_F = gamma_F(p_mono, d.shape[1]-1, max_size)
    
    ### Check if betaout desired, if so compute beta_hat of model corresponding to specific gamma0
    if betaout==True or bag==True:
        betahats = beta_est(d.iloc[:,1:], pd.DataFrame(d.iloc[:,0]), g0, g_F, cov_entry_order)
        
    ### Check if bagging desired
    if bag==False: 
        ### Alpha_F computation for all steps in fwd sel proc
        a_F = alpha_F(g0, d.shape[1]-1, max_size)
        
        ### Model size
        S = np.arange(max_size)+1
        
        ### Combine S, Cov_names, p-vals, sorted p-vals, alpha_F, gamma_F into table
        fsr_results = fsrtable(S, cov_entry_order, p_orig, p_mono, a_F, g_F)
        
        ### Return selected output: FSR table (+ betahat) (+ alpha_specific)
        if gs!=None: 
            ### Compute alpha_F for specific gammas (gs)
            if betaout==True:
                return fsr_results, betahats, alpha_F_g(gs, g_F, d.shape[1]-1)
            else:
                return fsr_results, alpha_F_g(gs, g_F, d.shape[1]-1)
        else:
            if betaout==True:
                return fsr_results, betahats
            else:
                return fsr_results
    else:
        return betahats, alpha_F_g(g0, g_F, d.shape[1]-1), len(betahats)
    
    
    
""" FastFSR for bagging function """
def ffsr_bag(dat,g0=0.05,max_size=None,var_incl=None,prec_f='.4f'):
    
    """
    ### Purpose:
     Perform Fast False Selection Rate procedure in efficient manner conducive for bagging.
     
    ### NOTE: it is assumed that data has been transformed, cleaned, and is given in correct format.
    
    ### Input params:
    #   dat      = python dataframe of original p covariates, 1 outcome (in first col.): n x p+1
    #   g0       = float pre-specified FSR of interest ("gamma0")
    #   betaout  = boolean of whether to include estimated betahats from final selected model
    #   gs       = float or vector of gamma's at which to specifically compute alpha_F
    #   max_size = integer of largest model size == max num vars to incl in final model (default = num covs in dataset)
    #   var_incl = array of cols corresponding to those vars to force into model
    #   bag      = boolean of whether to output FSR table (non-bagging results) or reduced output for bagging purposes
    #   prec_f   = string of precision (num digits) desired in FSR output table (string appropriate for 'format' python fcn)
    
    ### Output: 
    #      (note: gamma = FSR, gamma_0 = pre-specified/desired FSR)
    #   vector of estimated beta param's for final model (based on g0)
    #       and
    #   vector of alpha_F's for specified gamma's (g)
    #       and
    #   number of param's in final model
    """

    import numpy as np
    import pandas as pd
    
    ### If max model size not specified, select all possible cov.s
    if max_size==None:
        max_size = dat.shape[1]-1
        
    ### Perform forward selection
    fwd_sel = forward(dat.iloc[:,1:], pd.DataFrame(dat.iloc[:,0]), max_size, var_incl)
    
    ### Save order of covariate entry into model
    cov_entry_order = cov_order(dat.columns.values[1:], max_size, var_incl)
    
    ### Compute p-value of each covariate entering the model
    p_orig = pval_comp(max_size)
    
    ### Arrange p-values in mono. inc. order
    p_mono = np.array([max(p_orig[:(i+1)]) for i in range(len(p_orig))])
        
    ### Gamma_F computation
    g_F = gamma_F(p_mono, dat.shape[1]-1, max_size)
    
    ### Compute beta_hat of model corresponding to specific gamma0
    betahats = beta_est(dat.iloc[:,1:], pd.DataFrame(dat.iloc[:,0]), g0, g_F, cov_entry_order)
        
    return betahats, alpha_F_g(g0, g_F, dat.shape[1]-1), len(betahats)

    
    
def bagfsr(dat,g0,B=200,max_s=None,v_incl=None,prec=4):
    
    """
    ### Purpose:
     Perform bagging with Fast False Selection Rate procedure to allow for more accurate predictions.
     
    ### Input params:
    #   dat    = python dataframe of original p covariates, 1 outcome (in first col.): n x p+1
    #   g0     = float pre-specified FSR of interest ("gamma0")
    #   B      = integer of number of bagged samples
    #   max_s  = integer of largest model size == max num vars to incl in final model (default = num covs in dataset)
    #   v_incl = array of cols corresponding to those vars to force into model
    #   prec   = integer of precision (num digits) desired in beta-hat parameter estimates of final model
    
    ### Output: 
    #   Mean of betahats
    #   SEs of betahats
    #   Avg alpha-to-enter
    #   Avg model size
    #   Prop of times each var included in model
    """
    
    import numpy as np
    import pandas as pd
    from sklearn.utils import resample
    
    ### Clean and check data - make sure X, Y = pandas dataframes or else convert them
    if df_type(dat)==True:
        if isinstance(dat,pd.DataFrame):
            d = dat.copy()
        else:
            if isinstance(dat,np.ndarray):
                d = pd.DataFrame(dat)
                vnum = list(np.arange(d.shape[1])+1)
                vchr = list(np.repeat("V",d.shape[1]))
                d.columns = [a + str(b) for a,b in zip(vchr,vnum)]
    else:
        return df_type(dat)
    
    ### Remove missing values
    d.dropna(inplace=True)
    
    ### check that p < n to ensure regression solutions
    if (d.shape[1]-1) >= d.shape[0]:
        raise Exception("N must be > p for valid regression solutions")
    
    ### Create array to keep track of number of times vars enter model
    nentries = pd.DataFrame(np.zeros(d.shape[1]-1),index=d.columns.values[1:])
    
    ### Create array to store all estimated coefficients, ses, alphas, sizes
    allbetas = pd.DataFrame(np.zeros([B,(d.shape[1]-1)]),columns=d.columns.values[1:])
    allses = allbetas.copy()
    alphas = []
    sizes = []
    np.random.seed(1234)
    
    ### Bagging loops
    for i in range(B):
        # Draw with replacement from rows of data
        newdat = pd.DataFrame(resample(d, replace=True))
        newdat.columns = d.columns.values
        
        ### Obtain FSR results
        fsrout = ffsr_bag(newdat,g0,max_size=max_s,var_incl=v_incl)
        allbetas.loc[i,fsrout[0].index.values] = fsrout[0].iloc[:,0]
        allses.loc[i,fsrout[0].index.values] = fsrout[0].iloc[:,1]
        alphas.append(fsrout[1])
        sizes.append(fsrout[2])

        ### Update counts of num times var included
        nentries.loc[fsrout[0].index[np.abs(np.around(fsrout[0].iloc[:,0],prec))>0]] += 1
        
    ### Compute averages
    avgbeta = allbetas.mean(axis=0) # mean across rows / colmeans == mean of each cov's betahat
    avgse = allses.mean(axis=0)
    avgalpha = np.mean(alphas)
    avgsize = np.mean(sizes)
    var_props = nentries/float(B)
    cov_res = pd.concat([avgbeta,avgse,var_props],axis=1)
    cov_res.columns = ['betahat','betase','prop_incl']
    
    return cov_res, avgalpha, avgsize
    
    

# Notes: 
# 1. appropriate transformations are expected to have been applied prior to utilization of FSR algorithm

Overwriting ffsr5_d13.py
