## Create files to run timing profiles on Python and R functions

#### Standard FFSR

In [2]:
%%file ffsr_r_run.py

import rpy2.robjects as ro
import pandas.rpy.common as com

# Define the R function `fsr.fast` in the embedded R session (Fast FSR forward
# selection, no simulation).  The R source below is passed to R verbatim and is
# deliberately left untouched because it is the reference implementation whose
# run time is being measured.
#
# R arguments:
#   x      - predictors (matrix or data frame; coerced with as.matrix)
#   y      - response vector
#   gam0   - target false selection rate (default .05)
#   digits - rounding applied to the printed/returned summary table (default 4)
#   print  - when TRUE (the default) the summary table is printed
#   plot   - accepted, but the plotting code in the body is commented out
#
# What the body does: drops rows with missing values, runs
# leaps::regsubsets(method="forward"), turns the RSS drops into per-step
# p-values, takes their running maximum, computes the estimated gamma curve
# (ghat) and the alpha-to-enter estimate (alphahat.fast), then refits the
# selected model with lm().
#
# R return value: list(res, mod, size, x.ind, alphahat.ER), where `size`
# excludes the intercept and `x.ind` is 0 when no variable is selected.
#
# Requires the R package `leaps` (loaded with require() inside the function).
ro.r("""fsr.fast<-function(x,y,gam0=.05,digits=4,print=T,plot=F){
# estimated alpha for forward selection using Fast FSR (no simulation)
# typical call: fsr.fast(x=ncaa2[,1:19],y=ncaa2[,20])->out
# for use inside simulation loops, set print=F and plot=F
# version 7 circa Nov. 2009, modified to handle partially blank colnames
require(leaps)
ok<-complete.cases(x,y)
x<-x[ok,]                            # get rid of na's
y<-y[ok]                             # since regsubsets can't handle na's
m<-ncol(x)
n<-nrow(x)
if(m >= n) m1 <- n-5  else m1<-m     # to get rid of NA's in pv
vm<-1:m1
as.matrix(x)->x                      # in case x is a data frame
if(any(colnames(x)==""))colnames(x)<-NULL       # if only partially named columns
colnames(x)<-colnames(x,do.NULL=F,prefix="")    # corrects for no colnames
pvm<-rep(0,m1)                       # to create pvm below
regsubsets(x,y,method="forward")->out.x
pv.orig<-1-pf((out.x$rss[vm]-out.x$rss[vm+1])*(out.x$nn-(vm+1))/out.x$rss[vm+1],1,out.x$nn-(vm+1))
for (i in 1:m1){pvm[i]<-max(pv.orig[1:i])}  # sequential max of pvalues
alpha<-c(0,pvm)
ng<-length(alpha)
S<-rep(0,ng)                         # will contain num. of true entering in orig.
real.seq<-data.frame(var=(out.x$vorder-1)[2:(m1+1)],pval=pv.orig,
         pvmax=pvm)#,Rsq=round(1-out.x$rss[2:(m1+1)]/out.x$rss[1],4))
for (ia in 2:ng){                    # loop through alpha values for S=size
S[ia] <- sum(pvm<=alpha[ia])         # size of models at alpha[ia], S[1]=0
}
ghat<-(m-S)*alpha/(1+S)              # gammahat_ER
# add additional points to make jumps
alpha2<-alpha[2:ng]-.0000001
ghat2<-(m-S[1:(ng-1)])*alpha2/(1+S[1:(ng-1)])
zp<-data.frame(a=c(alpha,alpha2),g=c(ghat,ghat2))
zp<-zp[order(zp$a),]
gmax<-max(zp$g)
index.max<-which.max(zp$g)           # index of largest ghat
alphamax<-zp$a[index.max]            # alpha with largest ghat
# gmax<-max(ghat)
# index.max<-which.max(ghat)           # index of largest ghat
# alphamax<-alpha[index.max]           # alpha with largest ghat
ind<-(ghat <= gam0 & alpha<=alphamax)*1
Sind<-S[max(which(ind > 0))]           # model size with ghat just below gam0
alphahat.fast<-(1+Sind)*gam0/(m-Sind)  # ER est.
size1<-sum(pvm<=alphahat.fast)+1       # size of model including intercept
x<-x[,colnames(x)[(out.x$vorder-1)[2:size1]]]
if(size1>1) x.ind<-(out.x$vorder-1)[2:size1]  else x.ind<-0
if (size1==1) {mod <- lm(y~1)} else {mod <- lm(y~x)}
# ghat3<-(m-size1+1)*alpha/(1+S)         # uses final ku est.
ghat4<-(m-size1+1)*alpha/(1+0:m)
#res<-data.frame(real.seq,ghigh=ghat2,glow=ghat[2:ng])
alphas<-gam0 * (1. + S[2:ng]) / (m - S[2:ng])
res<-data.frame(S=S[2:ng],real.seq,alpha=alphas,g=ghat[2:ng])
if(print)print(round(res,digits))
#if(plot){
#plot(zp$a,zp$g,type="b",xlab="Alpha",ylab="Estimated Gamma",xlim=c(0,alphamax))
#points(alphahat.fast,gam0,pch=19)
#lines(c(-1,alphahat.fast),c(gam0,gam0))
#lines(c(alphahat.fast,alphahat.fast),c(-1,gam0))
#}  # ends plot
return(list(res=round(res,digits),mod=mod,size=size1-1,x.ind=x.ind,alphahat.ER=alphahat.fast))
}""")

# Load the NCAA data file into the R session as the matrix `ncaa`.
ro.r('ncaa <- as.matrix(read.table("ncaa_data2.txt",header=T))')

# Single-argument print(...) produces the same text on Python 2 and is also
# valid on Python 3, where the bare print statement is a SyntaxError.
print("NCSU R Results:\n")

# Time one fsr.fast call in R: columns 1-19 are the predictors, column 20 is
# the response.  system.time() reports user/system/elapsed seconds.
print(ro.r('system.time(fsr.fast(x=ncaa[,1:19],y=ncaa[,20]))'))

Overwriting ffsr_r_run.py


In [2]:
%%file ffsr_p_run.py

from ffsr import ffsr
import pandas as pd
import numpy as np

### Read in NCAA2 data from NCSU webpage
# sep=r"\s+" splits fields on runs of whitespace; it is the documented
# equivalent of delim_whitespace=True, which newer pandas releases deprecate.
ncaadata = pd.read_csv("ncaa_data2.txt", sep=r"\s+", skipinitialspace=True)

# move outcome variable to first column
cols = ncaadata.columns.tolist()
cols = cols[-1:] + cols[:-1]

# Reordered copy of the data with every column cast to float.
ncaa2 = pd.DataFrame(ncaadata[cols],dtype='float')

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("Python Results:\n")

# Run the Python FFSR implementation (second argument 0.05) and show its
# result table.
print(ffsr(ncaa2,0.05).fsres)

Overwriting ffsr_p_run.py


#### Force-in FFSR

In [5]:
%%file ffsr_force_r_run.py

import rpy2.robjects as ro
import pandas.rpy.common as com

# Define the R function `fsr.fast.include` in the embedded R session: Fast FSR
# forward selection with a set of predictors forced into the model.  The R
# source is passed verbatim and left untouched because it is the reference
# implementation being timed.
#
# R arguments:
#   x      - predictors (columns are renamed "1".."m" inside the function)
#   y      - response vector
#   gam0   - target false selection rate (default .05)
#   digits - rounding used when the step table is printed (default 4)
#   print  - when TRUE (the default) the step table `real.seq` is printed
#   inc    - column indices of x to force in; has no default and, per the
#            function's own header comment, inc=NULL is not supported
#
# What the body does: drops rows with missing values, runs
# leaps::regsubsets(force.in=inc, method="forward"), sets the p-values of the
# forced-in steps to 0, and computes ghat / alphahat.fast over the remaining
# m - length(inc) candidates.  The final model size counts the forced-in
# variables but not the intercept.
#
# R return value: list(mod, size, x.ind, alphahat.ER, inc).
#
# Requires the R package `leaps` (loaded with require() inside the function).
ro.r("""fsr.fast.include<-function(x,y,gam0=.05,digits=4,print=T,inc){
# estimated alpha for forward selection
# this program allows variables to be forced in
# for example inc=c(12,3,5) forces in variables in columns 12,3, and 5 of x
# not set up to handle inc=NULL, use fsr.fast when not including variables
require(leaps)
ok<-complete.cases(x,y)
x<-x[ok,]                            # get rid of na's
y<-y[ok]                             # since regsubsets can't handle na's
m<-ncol(x)
n<-nrow(x)
colnames(x)<- as.character(1:m)
m.inc=length(inc)
inc.reo=c(inc,setdiff(1:m,inc))        # new order for x's, inc at beginning
if(m >= n) m1 <- n-5  else m1<-m     # to get rid of NA's in pv
vm<-1:m1
as.matrix(x)->x                      # in case x is a data frame
pvm<-rep(0,m1)                       # to create pvm below
regsubsets(x,y,force.in=inc,method="forward")->out.x
ch=out.x$vorder-1
vorder.new=inc.reo[ch[2:(m1+1)]]     # order without intercept
pv.orig<-1-pf((out.x$rss[vm]-out.x$rss[vm+1])*(out.x$nn-(vm+1))/out.x$rss[vm+1],1,out.x$nn-(vm+1))
pv.orig[1:m.inc]=rep(0,m.inc)
for (i in (m.inc+1):m1){pvm[i]<-max(pv.orig[1:i])}  # sequential max of pvalues
alpha<-c(0,pvm)
ng<-length(alpha)
S<-rep(0,ng)                         # will contain num. of true entering in orig.
for (ia in 2:ng){                    # loop through alpha values for S=size
S[ia] <- sum(pvm<=alpha[ia])         # size of models at alpha[ia], S[1]=0
}
alphas<-gam0 * (1. + S[2:ng]) / (m - S[2:ng])
ghat<-(m-S)*alpha/(1+S)              # gammahat_ER
real.seq<-data.frame(S=S[2:ng],var=vorder.new,pval=pv.orig,
         pvmax=pvm,alpha=alphas,g=ghat[2:ng])#,Rsq=round(1-out.x$rss[2:(m1+1)]/out.x$rss[1],4))
alpha<-c(0,pvm[(m.inc+1):m1])        # note alpha reduced by number forced in
ng<-length(alpha)
S<-rep(0,ng)                         # will contain num. of true entering in orig.
pvm=pvm[(m.inc+1):m1]                # redefine to get rid of 0's at beginnning
for (ia in 2:ng){                    # loop through alpha values for S=size
S[ia] <- sum(pvm<=alpha[ia])         # size of models at alpha[ia], S[1]=0
}
ghat<-(m-m.inc-S)*alpha/(1+S)              # gammahat_ER
####
if(print)print(round(real.seq,digits),S)
# add additional points to make jumps
alpha2<-alpha[2:ng]-.0000001
ghat2<-(m-m.inc-S[1:(ng-1)])*alpha2/(1+S[1:(ng-1)])
zp<-data.frame(a=c(alpha,alpha2),g=c(ghat,ghat2))
zp<-zp[order(zp$a),]
gmax<-max(zp$g)
index.max<-which.max(zp$g)           # index of largest ghat
alphamax<-zp$a[index.max]            # alpha with largest ghat
ind<-(ghat <= gam0 & alpha<=alphamax)*1
Sind<-S[max(which(ind > 0))]          # model size with ghat just below gam0
alphahat.fast<-(1+Sind)*gam0/(m-m.inc-Sind)  # ER est.
size<-sum(pvm<=alphahat.fast)+m.inc       # size of model without intercept
colnames(x)<-colnames(x,do.NULL=F,prefix="")      # corrects for no colnames
x<-x[,colnames(x)[vorder.new[1:size]]]
x.ind<-vorder.new[1:size]
mod <- lm(y~x)
return(list(mod=mod,size=size,x.ind=x.ind,alphahat.ER=alphahat.fast,inc=inc))
}""")

# Load the NCAA data file into the R session as the matrix `ncaa`.
ro.r('ncaa <- as.matrix(read.table("ncaa_data2.txt",header=T))')

# Single-argument print(...) produces the same text on Python 2 and is also
# valid on Python 3, where the bare print statement is a SyntaxError.
print("NCSU R Results:\n")

# Time one fsr.fast.include call in R with columns 12, 3 and 5 forced in;
# columns 1-19 are the predictors and column 20 is the response.
print(ro.r('system.time(fsr.fast.include(x=ncaa[,1:19],y=ncaa[,20],inc=c(12,3,5)))'))

Overwriting ffsr_force_r_run.py


In [4]:
%%file ffsr_force_p_run.py

from ffsr import ffsr
import pandas as pd
import numpy as np

import warnings
# Silence FutureWarning messages so they do not clutter the printed results.
warnings.simplefilter(action = "ignore", category = FutureWarning)

### Read in NCAA2 data from NCSU webpage
# sep=r"\s+" splits fields on runs of whitespace; it is the documented
# equivalent of delim_whitespace=True, which newer pandas releases deprecate.
ncaadata = pd.read_csv("ncaa_data2.txt", sep=r"\s+", skipinitialspace=True)

# move outcome variable to first column
cols = ncaadata.columns.tolist()
cols = cols[-1:] + cols[:-1]

# Reordered copy of the data with every column cast to float.
ncaa2 = pd.DataFrame(ncaadata[cols],dtype='float')

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("Python Results:\n")

# Run the Python FFSR implementation with variables 12, 3 and 5 passed as
# var_incl (the same indices the R run forces in) and show its result table.
print(ffsr(ncaa2,0.05,var_incl=np.array([12,3,5])).fsres)

Overwriting ffsr_force_p_run.py


#### Bagging FFSR

In [6]:
#%%file ffsr_bag_r_run.py

import rpy2.robjects as ro
import pandas.rpy.common as com

# Define the R function `bag.fsr` in the embedded R session: bagged Fast FSR.
# The R source is passed verbatim and left untouched because it is the
# reference implementation being timed.
#
# R arguments:
#   x    - predictors
#   y    - response vector
#   B    - number of bootstrap resamples (default 100)
#   gam0 - target false selection rate handed to fsr.fast6.sim (default .05)
#
# What the body does: drops rows with missing values, then for each of B
# resamples (rows drawn with replacement via sample()) calls fsr.fast6.sim and
# records the fitted coefficients (0 for unselected columns), the intercept,
# the alpha-to-enter estimate and the model size.
#
# R return value: list(coeff.av, coeff.sd, interc.av, pred, interc.sd, prop,
# amean, sizem) where `prop` is the fraction of resamples in which each
# column received a nonzero coefficient and `pred` is x %*% coeff.av + interc.av.
#
# `fsr.fast6.sim` must exist in the R session by the time bag.fsr is called.
# No seed is set inside the function, so results vary between runs.
ro.r("""bag.fsr<-function(x,y,B=100,gam0=.05){
# gives average coefficients from fsr.fast6.sim
ok<-complete.cases(x,y)
x<-x[ok,]                            # get rid of na's
y<-y[ok]                             # since regsubsets can't handle na's
m<-ncol(x)
n<-nrow(x)
hold<-matrix(rep(0,m*B),nrow=B)      # holds coefficients
interc<-rep(0,B)                     # holds intercepts
alphahat<-rep(0,B)                   # holds alphahats
size<-rep(0,B)                       # holds sizes
for(i in 1:B){
index<-sample(1:n,n,replace=T)
out<-fsr.fast6.sim(x=x[index,],y=y[index],gam0=gam0)
if (out$size>0) hold[i,out$x.ind]<-out$mod$coeff[2:(out$size+1)]
interc[i]<-out$mod$coeff[1]
alphahat[i]<-out$alphahat.ER
size[i]<-out$size
}                                    # ends i loop
coeff.av<-apply(hold,2, mean)
coeff.sd<-rep(0,m)
coeff.sd<-sqrt(apply(hold,2, var))
interc.av<-mean(interc)
interc.sd<-sd(interc)
amean<-mean(alphahat)
sizem<-mean(size)
prop<-rep(0,m)
for(j in 1:m){prop[j]<-sum(abs(hold[,j])>0)/B}
as.matrix(x)->x                      # in case x is a data frame
pred<-x%*%coeff.av+interc.av
return(list(coeff.av=coeff.av,coeff.sd=coeff.sd,interc.av=interc.av,pred=pred,
            interc.sd=interc.sd,prop=prop,amean=amean,sizem=sizem))
}""")

# Define the R function `fsr.fast6.sim` in the embedded R session: the
# short-output Fast FSR variant that bag.fsr calls once per bootstrap
# resample.  The R source is passed verbatim and left untouched because it is
# the reference implementation being timed.
#
# R arguments:
#   x    - predictors (coerced with as.matrix)
#   y    - response vector
#   gam0 - target false selection rate (default .05)
#
# What the body does: the same forward-selection / running-max p-value /
# ghat / alphahat.fast computation as fsr.fast, but nothing is printed and the
# step table (`real.seq`) is built without being returned.
#
# R return value: list(mod, size, x.ind, alphahat.ER), where `size` excludes
# the intercept and `x.ind` is 0 when no variable is selected.
#
# Requires the R package `leaps` (loaded with require() inside the function).
ro.r("""fsr.fast6.sim<-function(x,y,gam0=.05){
# estimated alpha for forward selection
# short output version
require(leaps)
ok<-complete.cases(x,y)
x<-x[ok,]                            # get rid of na's
y<-y[ok]                             # since regsubsets can't handle na's
m<-ncol(x)
n<-nrow(x)
if(m >= n) m1 <- n-5  else m1<-m     # to get rid of NA's in pv
vm<-1:m1
as.matrix(x)->x                      # in case x is a data frame
pvm<-rep(0,m1)                       # to create pvm below
regsubsets(x,y,method="forward")->out.x
pv.orig<-1-pf((out.x$rss[vm]-out.x$rss[vm+1])*(out.x$nn-(vm+1))/out.x$rss[vm+1],1,out.x$nn-(vm+1))
for (i in 1:m1){pvm[i]<-max(pv.orig[1:i])}  # sequential max of pvalues
alpha<-c(0,pvm)
ng<-length(alpha)
S<-rep(0,ng)                         # will contain num. of true entering in orig.
real.seq<-data.frame(var=(out.x$vorder-1)[2:(m1+1)],pval=pv.orig,
         pvmax=pvm,Rsq=round(1-out.x$rss[2:(m1+1)]/out.x$rss[1],4))
for (ia in 2:ng){                    # loop through alpha values for S=size
S[ia] <- sum(pvm<=alpha[ia])         # size of models at alpha[ia], S[1]=0
}
ghat<-(m-S)*alpha/(1+S)              # gammahat_ER
# add additional points to make jumps
alpha2<-alpha[2:ng]-.0000001
ghat2<-(m-S[1:(ng-1)])*alpha2/(1+S[1:(ng-1)])
zp<-data.frame(a=c(alpha,alpha2),g=c(ghat,ghat2))
zp<-zp[order(zp$a),]
gmax<-max(zp$g)
index.max<-which.max(zp$g)           # index of largest ghat
alphamax<-zp$a[index.max]            # alpha with largest ghat
ind<-(ghat <= gam0 & alpha<=alphamax)*1
Sind<-S[max(which(ind > 0))]          # model size with ghat just below gam0
alphahat.fast<-(1+Sind)*gam0/(m-Sind)  # ER est.
size1<-sum(pvm<=alphahat.fast)+1       # size of model including intercept
colnames(x)<-colnames(x,do.NULL=F,prefix="")      # corrects for no colnames
x<-x[,colnames(x)[(out.x$vorder-1)[2:size1]]]
if(size1>1) x.ind<-(out.x$vorder-1)[2:size1]  else x.ind<-0
if (size1==1) {mod <- lm(y~1)} else {mod <- lm(y~x)}
return(list(mod=mod,size=size1-1,x.ind=x.ind,alphahat.ER=alphahat.fast))
}""")

# Load the NCAA data file into the R session as the matrix `ncaa`.
ro.r('ncaa <- as.matrix(read.table("ncaa_data2.txt",header=T))')

# bag.fsr draws its bootstrap resamples with R's sample(); fixing R's seed
# (arbitrary value) makes the printed means below reproducible across runs.
ro.r('set.seed(663)')

# Single-argument print(...) produces the same text on Python 2 and is also
# valid on Python 3, where the bare print statement is a SyntaxError.
print("NCSU R Results:\n")

# Time 200 bootstrap replicates; the result is kept in the R session as
# `out.ncaa` so the two summaries below can read it.
print(ro.r('system.time(bag.fsr(x=ncaa[,1:19],y=ncaa[,20],B=200)->out.ncaa)'))

print(ro.r('paste("Mean of estimated alpha-to-enter:",round(out.ncaa$amean,4))'))
print(ro.r('paste("Mean size of selected model:",round(out.ncaa$sizem,4))'))

NCSU R Results:

   user  system elapsed 
  0.821   0.019   0.848 

[1] "Mean of estimated alpha-to-enter: 0.0462"

[1] "Mean size of selected model: 7.685"



In [1]:
#%%file ffsr_bag_p_run.py

from ffsr import bagfsr
import pandas as pd
import numpy as np

### Read in NCAA2 data from NCSU webpage
# sep=r"\s+" splits fields on runs of whitespace; it is the documented
# equivalent of delim_whitespace=True, which newer pandas releases deprecate.
ncaadata = pd.read_csv("ncaa_data2.txt", sep=r"\s+", skipinitialspace=True)

# move outcome variable to first column
cols = ncaadata.columns.tolist()
cols = cols[-1:] + cols[:-1]

# Reordered copy of the data with every column cast to float.
ncaa2 = pd.DataFrame(ncaadata[cols],dtype='float')

# NOTE(review): the R run above uses B=200; this call relies on bagfsr's
# default number of replicates - confirm the two runs are comparable.
f = bagfsr(ncaa2,0.05)

# Each print receives exactly one string so the output is the same on
# Python 2 and Python 3.  print("") emits the blank line that a bare `print`
# statement produces on Python 2 (a bare `print` is a silent no-op on 3).
print("Python Results:")
print("")
print("Mean of estimated alpha-to-enter: %s" % (round(f.alpha,4),))
print("")
print("Mean size of selected model: %s" % (f.size,))

Python Results:

Mean of estimated alpha-to-enter: 0.0503

Mean size of selected model: 7.095


## Change directory to run unit tests

In [12]:
import os
# Show the kernel's current working directory; as the cell's last expression
# its value is echoed as the cell output.
os.getcwd()

'/home/bitnami/STA-663-Nicole-Solomon-Project/Report'

In [3]:
import os
os.chdir('/home/bitnami/STA-663-Nicole-Solomon-Project/Tests')
!py.test

platform linux2 -- Python 2.7.9 -- py-1.4.25 -- pytest-2.6.3
collected 31 items 
[0m
test_alpha.py ....
test_alphag.py ..........
test_bagfsr.py ...
test_beta.py .....
test_covnames.py .
test_df_type.py ..
test_ffsr.py ..
test_gamma.py ...
test_pvals.py .

