## Compare Python algorithm to R algorithm ##

In [None]:
# Date: 4/28/15

In [1]:
from ffsr import *
import pandas as pd
import numpy as np

# Load the NCAA2 data (whitespace-delimited text file, originally from the NCSU webpage).
ncaadata = pd.read_csv(
    "ncaa_data2.txt",
    delim_whitespace=True,
    skipinitialspace=True,
)

# Rotate the column order so the last column of the file (the outcome) comes first.
cols = list(ncaadata.columns)
cols = cols[-1:] + cols[:-1]
print(cols)

# Reordered copy of the data with every column stored as float.
ncaa2 = pd.DataFrame(ncaadata[cols], dtype='float')

['y', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19']


In [2]:
%load_ext rpy2.ipython

In [3]:
%%R -i ncaa2

# Fast FSR: estimate the alpha-to-enter for forward selection (no simulation)
# and fit the linear model on the variables selected at that alpha.
#
# Typical call: out <- fsr.fast(x = ncaa2[, 1:19], y = ncaa2[, 20])
# For use inside simulation loops, set print = FALSE.
# Version 7 circa Nov. 2009, modified to handle partially blank colnames.
#
# Arguments:
#   x      matrix or data frame of candidate predictors, one column each
#   y      response vector, one entry per row of x
#   gam0   bound that the estimated gamma (ghat) must not exceed
#          (presumably the target false selection rate -- confirm)
#   digits number of decimals used when printing the summary table
#   print  if TRUE, print the forward-selection table
#   plot   kept for backward compatibility; currently ignored
#
# Returns a list with
#   mod          lm fit of y on the selected columns (intercept only if none)
#   size         number of selected predictors
#   x.ind        column indices of x in order of entry (0 if none selected)
#   alphahat.ER  estimated alpha-to-enter
fsr.fast <- function(x, y, gam0 = .05, digits = 4, print = TRUE, plot = FALSE) {
  # library() stops with a clear error when leaps is missing, whereas
  # require() would only warn and fail later at the regsubsets() call.
  library(leaps)

  ok <- complete.cases(x, y)
  x <- x[ok, , drop = FALSE]           # get rid of NA's (regsubsets can't handle
  y <- y[ok]                           # them); keep x 2-d even with one column
  m <- ncol(x)
  n <- nrow(x)
  m1 <- if (m >= n) n - 5 else m       # to get rid of NA's in pv
  if (m1 < 1) {
    stop("fsr.fast: too few complete rows or predictors to run forward selection",
         call. = FALSE)
  }
  vm <- seq_len(m1)

  x <- as.matrix(x)                    # in case x is a data frame
  if (any(colnames(x) == "")) {
    colnames(x) <- NULL                # if only partially named columns
  }
  colnames(x) <- colnames(x, do.NULL = FALSE, prefix = "")  # corrects for no colnames

  out.x <- regsubsets(x, y, method = "forward")
  rss <- out.x$rss
  entry.order <- out.x$vorder - 1      # column of x entering at each step (first entry is the intercept)

  # p-value of the F test for each variable as it enters
  df.resid <- out.x$nn - (vm + 1)
  pv.orig <- 1 - pf((rss[vm] - rss[vm + 1]) * df.resid / rss[vm + 1], 1, df.resid)
  pvm <- cummax(pv.orig)               # sequential max of p-values

  alpha <- c(0, pvm)
  ng <- length(alpha)
  # size of the model at each alpha; S[1] = 0 by construction
  S <- c(0, vapply(pvm, function(a) sum(pvm <= a), numeric(1)))
  ghat <- (m - S) * alpha / (1 + S)    # gammahat_ER

  # add additional points just below each alpha to make the jumps
  alpha2 <- alpha[-1] - 1e-7
  S.prev <- S[-ng]
  ghat2 <- (m - S.prev) * alpha2 / (1 + S.prev)
  zp <- data.frame(a = c(alpha, alpha2), g = c(ghat, ghat2))
  zp <- zp[order(zp$a), ]
  alphamax <- zp$a[which.max(zp$g)]    # alpha with largest ghat

  eligible <- ghat <= gam0 & alpha <= alphamax
  Sind <- S[max(which(eligible))]      # model size with ghat just below gam0
  alphahat.fast <- (1 + Sind) * gam0 / (m - Sind)  # ER est.
  size1 <- sum(pvm <= alphahat.fast) + 1           # size of model including intercept

  if (size1 > 1) {
    x.ind <- entry.order[2:size1]
    # Index by position so duplicated column names cannot pick the wrong
    # column. The formula must keep the names y and x: lm() derives the
    # coefficient names from them.
    x <- x[, x.ind]
    mod <- lm(y ~ x)
  } else {
    x.ind <- 0
    mod <- lm(y ~ 1)
  }

  if (print) {
    res <- data.frame(var = entry.order[2:(m1 + 1)],
                      pval = pv.orig,
                      pvmax = pvm,
                      Rsq = round(1 - rss[2:(m1 + 1)] / rss[1], 4),
                      g = ghat[2:ng])
    print(round(res, digits))
  }

  list(mod = mod, size = size1 - 1, x.ind = x.ind, alphahat.ER = alphahat.fast)
}

# Matrix copy of the data frame passed in from Python (outcome in column 1).
ncaa <- as.matrix(ncaa2)

# Time one run of the R implementation; the selection table is printed by fsr.fast.
system.time(
  fsr.fast(x = ncaa[, 2:20], y = ncaa[, 1])
)

Loading required package: leaps
   var   pval  pvmax    Rsq      g
1    2 0.0000 0.0000 0.7069 0.0000
2    3 0.0001 0.0001 0.7539 0.0004
3    5 0.0116 0.0116 0.7708 0.0270
4    4 0.0053 0.0116 0.7901 0.0270
5    7 0.0025 0.0116 0.8110 0.0270
6   17 0.0433 0.0433 0.8197 0.0804
7   15 0.0527 0.0527 0.8274 0.0791
8    6 0.1056 0.1056 0.8327 0.0864
9    9 0.0826 0.1056 0.8386 0.0864
10   8 0.0536 0.1056 0.8457 0.0864
11  12 0.2350 0.2350 0.8484 0.1566
12  10 0.2864 0.2864 0.8505 0.1542
13  13 0.3163 0.3163 0.8524 0.1054
14  18 0.2697 0.3163 0.8546 0.1054
15  11 0.4953 0.4953 0.8555 0.1238
16   1 0.6326 0.6326 0.8559 0.1116
17  14 0.7056 0.7056 0.8562 0.0784
18  19 0.8605 0.8605 0.8563 0.0453
19  16 0.9032 0.9032 0.8563 0.0000
   user  system elapsed 
  0.034   0.002   0.037 


In [5]:
%%time
# Time the Python implementation on the same data frame; the cell displays the
# `fsres` attribute of the result as its value.
# NOTE(review): 0.05 presumably plays the role of gam0 = .05 in the R call -- confirm against ffsr.
ffsr(ncaa2,0.05).fsres

CPU times: user 26.4 ms, sys: 4.72 ms, total: 31.2 ms
Wall time: 29.1 ms


Unnamed: 0,S,Var,p,p_m,alpha_F,gamma_F
0,1,x2,0.0,0.0,0.0056,0.0
1,2,x3,0.0001,0.0001,0.0088,0.0004
2,3,x5,0.0116,0.0116,0.0125,0.027
3,4,x4,0.0053,0.0116,0.0167,0.027
4,5,x7,0.0025,0.0116,0.0214,0.027
5,6,x17,0.0433,0.0433,0.0269,0.0804
6,7,x15,0.0527,0.0527,0.0333,0.0791
7,8,x6,0.1056,0.1056,0.0409,0.0864
8,9,x9,0.0826,0.1056,0.05,0.0864
9,10,x8,0.0536,0.1056,0.0611,0.0864


## Compare bagging FFSR algorithms ##

In [19]:
%%R -i ncaa2

# Bagged Fast FSR: gives average coefficients from fsr.fast6.sim run on
# B bootstrap resamples of the rows of (x, y).
#
# Arguments:
#   x     matrix or data frame of candidate predictors, one column each
#   y     response vector, one entry per row of x
#   B     number of bootstrap resamples
#   gam0  passed through to fsr.fast6.sim
#
# Returns a list with
#   coeff.av   mean coefficient per column of x (0 counted when not selected)
#   coeff.sd   standard deviation of those coefficients across resamples
#   interc.av  mean intercept
#   pred       x %*% coeff.av + interc.av on the complete rows
#   interc.sd  standard deviation of the intercepts
#   prop       proportion of resamples with a nonzero coefficient per column
#   amean      mean estimated alpha-to-enter
#   sizem      mean selected model size
bag.fsr <- function(x, y, B = 100, gam0 = .05) {
  ok <- complete.cases(x, y)
  x <- x[ok, , drop = FALSE]           # get rid of NA's (regsubsets can't handle
  y <- y[ok]                           # them); keep x 2-d even with one column
  m <- ncol(x)
  n <- nrow(x)

  hold <- matrix(0, nrow = B, ncol = m)  # holds coefficients
  interc <- numeric(B)                   # holds intercepts
  alphahat <- numeric(B)                 # holds alphahats
  size <- numeric(B)                     # holds sizes

  for (i in seq_len(B)) {
    index <- sample.int(n, size = n, replace = TRUE)
    out <- fsr.fast6.sim(x = x[index, , drop = FALSE], y = y[index], gam0 = gam0)
    # coef() returns the full coefficient vector without relying on `$`
    # partial matching of the name "coefficients".
    beta <- coef(out$mod)
    if (out$size > 0) {
      hold[i, out$x.ind] <- beta[2:(out$size + 1)]
    }
    interc[i] <- beta[1]
    alphahat[i] <- out$alphahat.ER
    size[i] <- out$size
  }

  coeff.av <- apply(hold, 2, mean)
  coeff.sd <- apply(hold, 2, sd)
  interc.av <- mean(interc)
  interc.sd <- sd(interc)
  amean <- mean(alphahat)
  sizem <- mean(size)
  prop <- colSums(abs(hold) > 0) / B   # share of resamples selecting each column

  x <- as.matrix(x)                    # in case x is a data frame
  pred <- x %*% coeff.av + interc.av

  list(coeff.av = coeff.av, coeff.sd = coeff.sd, interc.av = interc.av, pred = pred,
       interc.sd = interc.sd, prop = prop, amean = amean, sizem = sizem)
}

# Estimated alpha for forward selection (Fast FSR), short output version:
# same computation as fsr.fast but without the printed table.
#
# Arguments:
#   x     matrix or data frame of candidate predictors, one column each
#   y     response vector, one entry per row of x
#   gam0  bound that the estimated gamma (ghat) must not exceed
#         (presumably the target false selection rate -- confirm)
#
# Returns a list with
#   mod          lm fit of y on the selected columns (intercept only if none)
#   size         number of selected predictors
#   x.ind        column indices of x in order of entry (0 if none selected)
#   alphahat.ER  estimated alpha-to-enter
fsr.fast6.sim <- function(x, y, gam0 = .05) {
  # library() stops with a clear error when leaps is missing, whereas
  # require() would only warn and fail later at the regsubsets() call.
  library(leaps)

  ok <- complete.cases(x, y)
  x <- x[ok, , drop = FALSE]           # get rid of NA's (regsubsets can't handle
  y <- y[ok]                           # them); keep x 2-d even with one column
  m <- ncol(x)
  n <- nrow(x)
  m1 <- if (m >= n) n - 5 else m       # to get rid of NA's in pv
  if (m1 < 1) {
    stop("fsr.fast6.sim: too few complete rows or predictors to run forward selection",
         call. = FALSE)
  }
  vm <- seq_len(m1)

  x <- as.matrix(x)                    # in case x is a data frame
  out.x <- regsubsets(x, y, method = "forward")
  rss <- out.x$rss
  entry.order <- out.x$vorder - 1      # column of x entering at each step (first entry is the intercept)

  # p-value of the F test for each variable as it enters
  df.resid <- out.x$nn - (vm + 1)
  pv.orig <- 1 - pf((rss[vm] - rss[vm + 1]) * df.resid / rss[vm + 1], 1, df.resid)
  pvm <- cummax(pv.orig)               # sequential max of p-values

  alpha <- c(0, pvm)
  ng <- length(alpha)
  # size of the model at each alpha; S[1] = 0 by construction
  S <- c(0, vapply(pvm, function(a) sum(pvm <= a), numeric(1)))
  ghat <- (m - S) * alpha / (1 + S)    # gammahat_ER

  # add additional points just below each alpha to make the jumps
  alpha2 <- alpha[-1] - 1e-7
  S.prev <- S[-ng]
  ghat2 <- (m - S.prev) * alpha2 / (1 + S.prev)
  zp <- data.frame(a = c(alpha, alpha2), g = c(ghat, ghat2))
  zp <- zp[order(zp$a), ]
  alphamax <- zp$a[which.max(zp$g)]    # alpha with largest ghat

  eligible <- ghat <= gam0 & alpha <= alphamax
  Sind <- S[max(which(eligible))]      # model size with ghat just below gam0
  alphahat.fast <- (1 + Sind) * gam0 / (m - Sind)  # ER est.
  size1 <- sum(pvm <= alphahat.fast) + 1           # size of model including intercept

  colnames(x) <- colnames(x, do.NULL = FALSE, prefix = "")  # corrects for no colnames
  if (size1 > 1) {
    x.ind <- entry.order[2:size1]
    # Index by position so duplicated column names cannot pick the wrong
    # column. The formula must keep the names y and x: lm() derives the
    # coefficient names from them.
    x <- x[, x.ind]
    mod <- lm(y ~ x)
  } else {
    x.ind <- 0
    mod <- lm(y ~ 1)
  }

  list(mod = mod, size = size1 - 1, x.ind = x.ind, alphahat.ER = alphahat.fast)
}

# Time 200 bootstrap fits and keep the result in out.ncaa for the summary cell.
system.time(
  out.ncaa <- bag.fsr(x = ncaa2[, 2:20], y = ncaa2[, 1], B = 200)
)

   user  system elapsed 
  0.907   0.010   0.924 


In [4]:
%%time
# Time the Python bagging implementation on the same data frame and keep the
# result in `f` for the summary cell below.
# NOTE(review): the number of bootstrap resamples is not passed here -- confirm
# bagfsr's default matches B=200 used in the R run before comparing timings.
f = bagfsr(ncaa2,0.05)

CPU times: user 6.46 s, sys: 202 ms, total: 6.66 s
Wall time: 5.87 s


In [20]:
%%R
# Summaries of the bagged R fit stored in out.ncaa, one section per quantity.
print("Coefficent estimates")
print(round(out.ncaa[["coeff.av"]], digits = 3))
cat("\n")

print("Bootstrap standard deviations of coeff. estimates")
print(round(out.ncaa[["coeff.sd"]], digits = 3))
cat("\n")

print("Proportion of times variable appears in selected model")
print(round(out.ncaa[["prop"]], digits = 3))
cat("\n")

print("Mean of estimated alpha-to-enter")
print(round(out.ncaa[["amean"]], digits = 4))
cat("\n")

print("Mean size of selected model")
print(round(out.ncaa[["sizem"]], digits = 4))

[1] "Coefficent estimates"
 [1]  0.023  3.057  0.195  0.785  0.207  0.102 -2.257 -0.351  1.254  0.000
[11]  0.075 -0.126  0.026  0.022  0.000  0.028 -0.041 -0.028  0.022

[1] "Bootstrap standard deviations of coeff. estimates"
 [1] 0.058 0.963 0.148 0.279 0.159 0.133 1.564 0.471 1.524 0.000 0.171 0.285
[13] 0.068 0.160 0.000 0.138 0.082 0.070 0.068

[1] "Proportion of times variable appears in selected model"
 [1] 0.200 1.000 0.685 0.950 0.675 0.400 0.730 0.400 0.440 0.250 0.260 0.170
[13] 0.170 0.100 0.300 0.175 0.250 0.195 0.165

[1] "Mean of estimated alpha-to-enter"
[1] 0.0445

[1] "Mean size of selected model"
[1] 7.515


In [18]:
# Summaries of the bagged Python fit: per-variable table, then mean alpha and mean size.
# Each print takes one pre-formatted argument, so the output is the same text the
# Python 2 print statements produced (label, newline, value).
print(f.covs)
print("Mean alpha\n%s" % (f.alpha,))
print("Mean size\n%s" % (f.size,))

      betahat    betase  prop_incl
x1   0.027735  0.007111      0.160
x2   1.765775  0.349283      1.000
x3   0.295532  0.045073      0.695
x4   0.573319  0.182151      0.940
x5   0.210366  0.056223      0.705
x6   0.110519  0.034700      0.375
x7  -1.356295  0.665157      0.775
x8  -0.145946  0.072469      0.300
x9   0.624109  0.343975      0.370
x10 -0.000104  0.000035      0.195
x11  0.044783  0.028402      0.230
x12 -0.121148  0.028495      0.115
x13  0.009884  0.007564      0.125
x14 -0.035171  0.036668      0.140
x15 -0.000000  0.000000      0.000
x16 -0.015030  0.016207      0.155
x17 -0.058610  0.010006      0.230
x18 -0.045009  0.010923      0.180
x19  0.015478  0.007667      0.125
Mean alpha
0.0502519486233
Mean size
7.095
