--- title: "Test BAS" output: pdf_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) library(BAS) library(ISLR) library(dplyr) library(ROCR) library(alr3) rmse = function(obs, pred) { sqrt(sum((obs - pred)^2)/length(obs))} ``` 1. Generate a dataset with $p = 20$ features and $n=1000$ as follows: First let's set our random seed in case we need to rerun parts later. ```{r jenny, echo=TRUE} # set the random seed so that we can replicate results. set.seed(8675309) ``` In order to simulate data, we need to specify the values of the "true" parameters. For this study we will use ```{r true} # true parameters sigma = 5 betatrue = c(1,2,0,0,0,-1,0,1.5, 0,0,0,1,0,.5,0,0,0,0,-1,1.5,3.5) # int| X1 | X2 |X3 truemodel = betatrue != 0 ``` Generate Data with correlated columns. ```{r data, cache=TRUE} #sample size n = 1000 # generate some standard normals Z = matrix(rnorm(n*10, 0, 1), ncol=10, nrow=n) # Create X1 by taking linear cominations of Z to induce correlation among X1 components X1 = cbind(Z, (Z[,1:5] %*% c(.3, .5, .7, .9, 1.1) %*% t(rep(1,5)) + matrix(rnorm(n*5, 0, 1), ncol=5, nrow=n)) ) # generate X2 as a standard normal X2 <- matrix(rnorm(n*4,0,1), ncol=4, nrow=n) # Generate X3 as a linear combination of X2 and noise X3 <- X2[,4]+rnorm(n,0,sd=0.1) # combine them X <- cbind(X1,X2,X3) # Generate mu # X does not have a column of ones for the intercept so need to add the intercept # for true mu mu = betatrue[1] + X %*% betatrue[-1] # now generate Y Y = mu + rnorm(n,0,sigma) # make a dataframe and save it df = data.frame(Y, X, mu) ``` 2. Split your data set into a training set containing $300$ observations and a test set containing $700$ observations. Before splitting reset the random seed based on your team number ```{r new-seed} set.seed(10) # replace 0 with team number before runing train = sample(1:900, size=200, rep=FALSE) df.train=df[train,] df.test=df[-train,] ``` ```{r} data(Caravan, package="ISLR") set.seed(0) #update test= 1:1000 train = -test Caravan = Caravan %>% mutate(MOSTYPE = factor(MOSTYPE), MGEMLEEF = factor(MGEMLEEF), MOSTYPE = factor(MOSTYPE), MGODRK = factor(MGODRK), MOSHOOFD = factor(MOSHOOFD)) df.car.train = Caravan[train, ] df.car.test = Caravan[test, ] ``` ## Using BAS for Subset Selection (optional) Notes: in addition to the packages discussed in ISLR, we can enumerate all possible models or sample models using the package `BAS`. The resulting objects will be large under enumeration with $p=10$ as there are over 1 million models. For linear models we can enumerate all models using AIC based on the following code: ```{r bas_aic } sim.aic = bas.lm(Y ~ . -mu, data=df[train,], method="deterministic", n.models=2^20, prior="AIC") ``` Get predictions with the best AIC model "Highest Posterior Probabilty model = 'HPM' and extract the variables. ```{r} pred = predict(sim.aic, newdata = df[-train,], estimator='HPM') variable.names(pred) ``` Find the RMSE ```{r rmse-bas} rmse(df[-train, "Y"], pred$fit) ``` For BIC, change the prior to 'BIC"; see `help(bas.lm)` for other priors if you want to explore more options. Now try the above using MCMC, ```{r bas.aic.mcmc } sim.aic.mcmc = bas.lm(Y ~ . -mu, data=df[train,], method="MCMC", MCMC.iterations=50000, prior="AIC") sim.aic.mcmc$n.Unique ``` `n.Unique` is the number of unique models that were sampled using MCMC ```{r} diagnostics(sim.aic.mcmc, eval=FALSE) # Is there a strong 1:1 relationship suggesting convergence? 
```{r}
pred = predict(sim.aic.mcmc, newdata = df[-train,], estimator='HPM')
variable.names(pred)
rmse(df[-train, "Y"], pred$fit)
```

Did you find the same model?

For GLMs the syntax is slightly different for specifying which prior to use.

```{r}
set.seed(0)
test  = 1:1000  # as in ISLR Chapter 4
train = -test

# start with the model formula from the AIC stepwise fit `step.car`
car.bas = bas.glm(Purchase ~ ., betaprior=aic.prior(), data=Caravan,
                  subset=train, family=binomial(),
                  method='MCMC', MCMC.iterations=5000)  # short run to check timing
diagnostics(car.bas)
image(car.bas)

# refit with the formula from the stepwise AIC object `step.car`
car.bas = bas.glm(step.car$formula, betaprior=aic.prior(), data=Caravan,
                  subset=train, family=binomial(),
                  method='MCMC', MCMC.iterations=50000)  # longer run to check convergence; the object takes over 4.3 MB
diagnostics(car.bas)
image(car.bas)
```

```{r}
pred = predict(car.bas, newdata=Caravan[-train,], type="response", estimator="HPM")
table(pred$fit > .25, Caravan[-train, "Purchase"])
```
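The $0.25$ cutoff above is just one choice. Since `ROCR` is already loaded in the setup chunk, the sketch below (a supplementary addition, not part of the original lab) traces out the full ROC curve for the HPM predicted probabilities and reports the area under the curve.

```{r roc-hpm}
# Sketch: ROC curve and AUC for the HPM predicted probabilities on the
# Caravan test set, using the ROCR package loaded in the setup chunk.
roc.pred = prediction(pred$fit, Caravan[-train, "Purchase"])
roc.perf = performance(roc.pred, measure = "tpr", x.measure = "fpr")
plot(roc.perf)

# area under the ROC curve
performance(roc.pred, measure = "auc")@y.values[[1]]
```

Comparing cutoffs along this curve gives a more complete picture than the single confusion matrix at $0.25$.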