In [None]:
""" Compare Python algorithm to R algorithm """

In [1]:
import ffsr3_d11 as f11

In [3]:
import pandas as pd

ncaadata = pd.read_csv("../ncaa_data2.txt", delim_whitespace=True, skipinitialspace=True)
cols = ncaadata.columns.tolist()
cols = cols[-1:] + cols[:-1]
print cols
ncaa2 = pd.DataFrame(ncaadata[cols],dtype='float')

['y', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19']


In [3]:
%load_ext rpy2.ipython

In [4]:
%%R -i ncaa2

fsr.fast<-function(x,y,gam0=.05,digits=4,print=T,plot=F){
# estimated alpha for forward selection using Fast FSR (no simulation)
# typical call: fsr.fast(x=ncaa2[,1:19],y=ncaa2[,20])->out
# for use inside simulation loops, set print=F and plot=F
# version 7 circa Nov. 2009, modified to handle partially blank colnames
require(leaps)
ok<-complete.cases(x,y)
x<-x[ok,]                            # get rid of na's
y<-y[ok]                             # since regsubsets can't handle na's
m<-ncol(x)
n<-nrow(x)
if(m >= n) m1 <- n-5  else m1<-m     # to get rid of NA's in pv
vm<-1:m1
as.matrix(x)->x                      # in case x is a data frame
if(any(colnames(x)==""))colnames(x)<-NULL       # if only partially named columns
colnames(x)<-colnames(x,do.NULL=F,prefix="")    # corrects for no colnames
pvm<-rep(0,m1)                       # to create pvm below
regsubsets(x,y,method="forward")->out.x
pv.orig<-1-pf((out.x$rss[vm]-out.x$rss[vm+1])*(out.x$nn-(vm+1))/out.x$rss[vm+1],1,out.x$nn-(vm+1))
for (i in 1:m1){pvm[i]<-max(pv.orig[1:i])}  # sequential max of pvalues
alpha<-c(0,pvm)
ng<-length(alpha)
S<-rep(0,ng)                         # will contain num. of true entering in orig.
real.seq<-data.frame(var=(out.x$vorder-1)[2:(m1+1)],pval=pv.orig,
         pvmax=pvm,Rsq=round(1-out.x$rss[2:(m1+1)]/out.x$rss[1],4))
for (ia in 2:ng){                    # loop through alpha values for S=size
S[ia] <- sum(pvm<=alpha[ia])         # size of models at alpha[ia], S[1]=0
}
ghat<-(m-S)*alpha/(1+S)              # gammahat_ER
# add additional points to make jumps
alpha2<-alpha[2:ng]-.0000001
ghat2<-(m-S[1:(ng-1)])*alpha2/(1+S[1:(ng-1)])
zp<-data.frame(a=c(alpha,alpha2),g=c(ghat,ghat2))
zp<-zp[order(zp$a),]
gmax<-max(zp$g)
index.max<-which.max(zp$g)           # index of largest ghat
alphamax<-zp$a[index.max]            # alpha with largest ghat
# gmax<-max(ghat)
# index.max<-which.max(ghat)           # index of largest ghat
# alphamax<-alpha[index.max]           # alpha with largest ghat
ind<-(ghat <= gam0 & alpha<=alphamax)*1
Sind<-S[max(which(ind > 0))]           # model size with ghat just below gam0
alphahat.fast<-(1+Sind)*gam0/(m-Sind)  # ER est.
size1<-sum(pvm<=alphahat.fast)+1       # size of model including intercept
x<-x[,colnames(x)[(out.x$vorder-1)[2:size1]]]
if(size1>1) x.ind<-(out.x$vorder-1)[2:size1]  else x.ind<-0
if (size1==1) {mod <- lm(y~1)} else {mod <- lm(y~x)}
# ghat3<-(m-size1+1)*alpha/(1+S)         # uses final ku est.
ghat4<-(m-size1+1)*alpha/(1+0:m)
#res<-data.frame(real.seq,ghigh=ghat2,glow=ghat[2:ng])
res<-data.frame(real.seq,g=ghat[2:ng])
if(print)print(round(res,digits))
#if(plot){
#plot(zp$a,zp$g,type="b",xlab="Alpha",ylab="Estimated Gamma",xlim=c(0,alphamax))
#points(alphahat.fast,gam0,pch=19)
#lines(c(-1,alphahat.fast),c(gam0,gam0))
#lines(c(alphahat.fast,alphahat.fast),c(-1,gam0))
#}  # ends plot
return(list(mod=mod,size=size1-1,x.ind=x.ind,alphahat.ER=alphahat.fast))
}

ncaa2 = as.matrix(ncaa2)

system.time(fsr.fast(x=ncaa2[,1:19],y=ncaa2[,20]))

Loading required package: leaps
   var   pval  pvmax    Rsq      g
1    9 0.0000 0.0000 0.5590 0.0000
2    3 0.0004 0.0004 0.6170 0.0020
3   14 0.0045 0.0045 0.6500 0.0134
4   19 0.0044 0.0045 0.6807 0.0134
5    6 0.0794 0.0794 0.6917 0.1852
6   10 0.0948 0.0948 0.7015 0.1761
7    8 0.1260 0.1260 0.7095 0.1890
8   13 0.4703 0.4703 0.7113 0.3848
9   12 0.4501 0.4703 0.7133 0.3848
10   5 0.4625 0.4703 0.7152 0.3848
11  16 0.5203 0.5203 0.7166 0.3469
12   4 0.5283 0.5283 0.7180 0.2845
13  15 0.5584 0.5584 0.7192 0.2393
14  18 0.7567 0.7567 0.7196 0.2522
15   1 0.8386 0.8386 0.7197 0.2097
16  17 0.8692 0.8692 0.7198 0.0966
17   7 0.8614 0.8692 0.7199 0.0966
18  11 0.8722 0.8722 0.7200 0.0000
19   2 0.8648 0.8722 0.7201 0.0000
   user  system elapsed 
  0.030   0.003   0.034 


In [6]:
%timeit -n1 -r1 f11.ffsr(ncaa2,0.05)

1 loops, best of 1: 28.1 ms per loop
