# HW5 Problem:  Predicting Solubility -- using Applied Predictive Modeling
## Due Sunday May 28, at 11:55pm

Jennifer MacDonald 604501712

CS249 -- Spring 2017 -- D.S. Parker &copy; 2017

#  The Goal

For this problem you will analyze a variant of the <i>Solubility</i> dataset. You are to predict the solubility values for a set of test data:
<ul><li>
Given the file <tt>training_set.csv</tt>, develop a regression model that is as accurate as possible.
</li><li>
Use your model to predict solubility for each row of data in <tt>test_set.csv</tt>.
</li><li>
Put your predictions in a .csv file called  <tt>HW5_Solubility_Predictions.csv</tt> and upload it to CCLE.
</li></ul>

#  Applied Predictive Modeling

<p>The <tt>caret</tt> library (<a href="http://caret.r-forge.r-project.org">http://caret.r-forge.r-project.org</a>) is integrated with R packages that support Supervised Learning using
mainstream model evaluation methods and  100 popular models.</p>

In [1]:
# Report, for each name in `pkg`, whether it is absent from the table of
# installed packages (TRUE = not installed). Vectorized over `pkg`.
not.installed <- function(pkg) {
  installed_names <- installed.packages()[, "Package"]
  !(pkg %in% installed_names)
}
    
# Install whichever of the modeling/plotting packages are missing (checked in
# the order caret, lattice, ggplot2), always from the same CRAN mirror.
invisible(lapply(
  c("caret", "lattice", "ggplot2"),
  function(pkg_name) {
    if (not.installed(pkg_name)) {
      install.packages(pkg_name, repos = "http://cran.us.r-project.org")
    }
  }
))

# Attach the packages for the rest of the notebook.
library(caret)
library(lattice)
library(ggplot2)

Loading required package: lattice
Loading required package: ggplot2


<p>An integrated package for supervised learning, using over 50 kinds of
models, and a variety of different metrics:</p>

In [2]:
# Attach AppliedPredictiveModeling. On first use (package not yet installed)
# it is installed and the per-chapter companion packages are fetched as well.
if (not.installed("AppliedPredictiveModeling")) {
  # Name the CRAN mirror explicitly, as the other install calls in this
  # notebook do: without `repos`, install.packages() fails in sessions that
  # have no mirror configured.
  install.packages("AppliedPredictiveModeling",
                   repos = "http://cran.us.r-project.org")

  library(AppliedPredictiveModeling)

  # Fetch the packages used by the listed chapters -- this takes a while.
  for (chapter in c(2, 3, 4, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 19)) {
    getPackages(chapter)
  }
} else {
  library(AppliedPredictiveModeling)
}

In [3]:
# Grid Search is often used in APM to search a model's parameter space, and
# some chapters use the "doMC" package to do Multi-Core computation
# (supported only on Linux or MacOS):

# Install missing packages from an explicit CRAN mirror, matching the caret
# install calls earlier in the notebook; without `repos`, install.packages()
# fails in sessions that have no mirror configured.
if (not.installed("doMC")) {
  install.packages("doMC", repos = "http://cran.us.r-project.org")  # multicore computation in R
}
if (not.installed("foreach")) {
  install.packages("foreach", repos = "http://cran.us.r-project.org")
}
if (not.installed("iterators")) {
  install.packages("iterators", repos = "http://cran.us.r-project.org")
}
# "parallel" is a base package that ships with R, so it needs no install check.

library(doMC)
library(foreach)
library(iterators)
library(parallel)

Loading required package: foreach
Loading required package: iterators
Loading required package: parallel


## Get the datasets

In [4]:
# Read both CSV files from the working directory and coerce each data frame
# to a numeric matrix.
my.training.set <- data.matrix(
  read.csv("training_set.csv")
)
my.test.set <- data.matrix(
  read.csv("test_set.csv")
)

## Construct a model from <tt>training_set.csv</tt>

Using the <tt>training_set.csv</tt> data, construct a regression model.

In [5]:
# Separate the response (solubility) from the predictors in the training data.
is_response <- colnames(my.training.set) == "solubility"
solTrainY <- my.training.set[, "solubility"]
solTrainX <- my.training.set[, !is_response, drop = FALSE]

solTestX <- as.data.frame(my.test.set)

## Use caret's preProcess function to estimate Box-Cox transformations
## (to reduce skewness) from the TRAINING predictors only.
solTrainXPP <- preProcess(solTrainX, method = c("BoxCox"))
## Apply the transformations to the training predictors.
solTrainXtrans <- predict(solTrainXPP, solTrainX)

# Transform the test predictors with the SAME transformation learned on the
# training set. Fitting a separate preProcess on the test data would estimate
# different Box-Cox parameters, so train and test columns would end up on
# different scales.
# NOTE(review): assumes test_set.csv carries the same predictor column names
# as the training set -- confirm.
solTestXtrans <- predict(solTrainXPP, solTestX)

In [6]:
### Shared resampling set-up for every model fitted below. The fold
### membership is generated once, explicitly, so that each model sees exactly
### the same folds without depending on the RNG state at the time it is fit.

set.seed(100)
indx <- createFolds(solTrainY, k = 10, returnTrain = TRUE)  # k = 10 is caret's default
ctrl <- trainControl(
  method = "cv",
  index = indx
)

## Generate predictions from <tt>test_set.csv</tt>

In [7]:
### Using a set of predictors reduced by unsupervised
### filtering. We apply a filter to reduce extreme between-predictor
### correlations (cutoff 0.9).

tooHigh <- findCorrelation(cor(solTrainXtrans), .9)

# Negative indexing with an empty index vector selects ZERO columns, so only
# filter when findCorrelation() actually flagged something. drop = FALSE keeps
# the result two-dimensional even if a single column remains.
# NOTE(review): the test set is filtered by column position, which assumes its
# columns are in the same order as the training predictors -- confirm.
if (length(tooHigh) > 0) {
  trainXfiltered <- solTrainXtrans[, -tooHigh, drop = FALSE]
  testXfiltered  <- solTestXtrans[, -tooHigh, drop = FALSE]
} else {
  trainXfiltered <- solTrainXtrans
  testXfiltered  <- solTestXtrans
}

# Cross-validated linear model on the filtered, transformed predictors.
set.seed(100)
lmTune <- train(x = trainXfiltered, y = solTrainY,
                method = "lm",
                trControl = ctrl)

# Plain linear model on the raw (untransformed, unfiltered) training data.
# NOTE(review): the predictions below come from this fit, not from lmTune;
# confirm that is intended.
lmFit <- lm(solubility ~ ., data = as.data.frame(my.training.set))

# cat() returns NULL, so keep the value first and print it afterwards.
# This is the in-sample R^2 on the training data, not a cross-validated one.
r_squared <- summary(lmFit)$r.squared
cat(r_squared)

# Predict solubility for the test rows; warnings raised by predict() are
# suppressed here.
predictions <- suppressWarnings(as.data.frame(predict(lmFit, newdata = solTestX)))
colnames(predictions) <- "Solubility"

# write.csv(predictions, file = "HW5_Solubility_Predictions.csv")

predictions

0.9394862

Solubility
-2.7069885
-2.0753398
-1.5059886
-3.2890363
-1.5640699
-8.3175572
-0.9259844
-5.7480065
-0.2230922
-1.5473190
