## R SWAT Modelagem

Github do SWAT: https://github.com/sassoftware/R-swat

Action sets: https://go.documentation.sas.com/?docsetId=allprodsactions&docsetTarget=actionSetsByName.htm&docsetVersion=3.5&locale=en

Documentação: https://developer.sas.com/apis/swat/r/v1.3.0/R-swat.pdf

In [None]:
#install.packages('https://github.com/sassoftware/R-swat/releases/download/v1.5.0/R-swat-1.5.0-linux64.tar.gz',
#                   repos=NULL, type='file')
#install.packages('https://github.com/sassoftware/R-swat/releases/download/v1.5.0/R-swat-1.5.0-win64.tar.gz',
#                   repos=NULL, type='file')


In [1]:
# Load necessary packages
library('swat')
library('ggplot2')
library('reshape2')
#options(cas.print.messages = FALSE)

SWAT 1.5.0

"package 'ggplot2' was built under R version 4.0.2"


In [4]:
# Open a CAS session over HTTP on port 8777. Credentials come from the local
# .authinfo file and 'casuser' is requested as the session caslib.
conn <- CAS('pdcesx14134.exnet.sas.com', 
            port=8777, protocol = "http",
            caslib = 'casuser', 
            authinfo = './.authinfo')

NOTE: Connecting to CAS and generating CAS action functions for loaded

      action sets...

NOTE: To generate the functions with signatures (for tab completion), set 

      options(cas.gen.function.sig=TRUE).



In [None]:
# Run the table.tableInfo action on the session (lists table metadata).
cas.table.tableInfo(conn)

In [None]:
# Run the table.caslibInfo action on the session (lists caslib metadata).
cas.table.caslibInfo(conn)

In [None]:
## Load the CAS action sets used in the rest of the notebook
## (sampling, tree-based models, neural networks and model assessment).
actionsets <- c('sampling', 'decisionTree', 'neuralNet', 'percentile')
for (action_set in actionsets) {
    loadActionSet(conn, action_set)
}

In [None]:
# Upload the local CSV file to CAS; `castbl` refers to the resulting table.
castbl <- cas.read.csv(conn, './data/hmeq.csv')

In [None]:
# Show the class of the object returned by cas.read.csv
class(castbl)

In [None]:
# Preview the first rows of the table
head(castbl)

In [None]:
# Run the simple.summary action on the table (descriptive statistics)
cas.simple.summary(castbl)

In [None]:
# Bring the data to the local machine; `obs` is set to the table's row count
# so that every row is requested.
df <- to.casDataFrame(castbl, obs = nrow(castbl))

In [None]:
# Preview the local copy
head(df)

In [None]:
# Data formatting: keep only the numeric columns of `df` and reshape them to
# long format (columns `variable` and `value`) for the faceted histograms.
# vapply() guarantees a logical mask, unlike sapply(), which returns an
# empty list when `df` has no columns.
# id.vars = NULL means no id columns: every selected column is melted.
d <- melt(df[vapply(df, is.numeric, logical(1))], id.vars = NULL)
head(d)

In [None]:
# Set the notebook plot size, then draw one 25-bin histogram per numeric
# variable, each panel with its own x axis.
options(repr.plot.width = 12, repr.plot.height = 6)

ggplot(data = d, mapping = aes(x = value)) +
    geom_histogram(bins = 25, fill = 'blue') +
    facet_wrap(~ variable, scales = 'free_x')

In [None]:
# Inspect missing data for all variables: keep the `Distinct` result table
# of the simple.distinct action and display it.
tbl <- cas.simple.distinct(castbl)$Distinct
tbl

In [None]:
# Get the missing-value counts
cas.nmiss(castbl)

In [None]:
# Prepare the missing-value plot: NMiss divided by the row count, i.e. a
# fraction between 0 and 1 (despite the "Pct" name).
tbl$PctMiss <- tbl$NMiss/nrow(castbl)

In [None]:
# Bar chart of the missing-value share per column, with a centred title.
ggplot(data = tbl, mapping = aes(x = Column, y = PctMiss)) +
    geom_col(fill = 'blue') +
    ggtitle('Pct Missing Values') +
    theme(plot.title = element_text(hjust = 0.5))

In [None]:
# Impute missing values for every column except the first one: median for
# continuous inputs, mode for nominal inputs. All variables are copied to
# the output table 'hmeq', which is replaced if it already exists.
cas.dataPreprocess.impute(
    castbl,
    inputs           = colnames(castbl)[-1],
    methodContinuous = 'MEDIAN',
    methodNominal    = 'MODE',
    copyAllVars      = TRUE,
    casOut           = list(name = 'hmeq', replace = TRUE)
)

In [None]:
# Data partitioning: simple random sample of 30% of the rows of 'hmeq'.
# With partind = TRUE the output carries a partition indicator column
# (referenced later as _PartInd_), so the table is overwritten in place
# with all original variables plus that flag.
cas.sampling.srs(conn,
    table = 'hmeq',
    samppct = 30,
    partind = TRUE,
    # Fixed seed so the train/validation split, and therefore every model
    # and metric below, is reproducible between runs.
    seed = 12345,
    output = list(casOut = list(name = 'hmeq', replace = TRUE), 
                  copyVars = 'ALL')
)

In [None]:
# Create an R reference to the CAS table 'hmeq' (the table written by the
# imputation and partitioning steps).
hmeq1 <- defCasTable(conn, 'hmeq')

In [None]:
# Preview the first rows
head(hmeq1)

In [None]:
# Name of the CAS table used for modelling
indata <- 'hmeq'

# Get the variable metadata, excluding the partition indicator.
# The indicator is dropped by name (case-insensitively) rather than by
# position, so the result does not depend on _PartInd_ being the last column.
colinfo <- cas.table.columnInfo(conn, table = indata)$ColumnInfo
colinfo <- colinfo[toupper(colinfo$Column) != '_PARTIND_', , drop = FALSE]

In [None]:
# Target variable: the first column listed in `colinfo`
target <- colinfo$Column[1]


In [None]:
# Variable lists for models that can handle missing values:
# inputs are all columns but the first (the target); nominals are the
# target plus every column whose type is 'varchar'.
inputs <- colinfo$Column[-1]
nominals <- c(target, colinfo$Column[which(colinfo$Type == 'varchar')])

In [None]:
# Variable lists for models that cannot handle missing values: only the
# imputed columns, i.e. names that START with 'IMP_'. The pattern is anchored
# so a column that merely contains 'IMP_' elsewhere in its name is not
# selected by mistake.
imp.inputs <- grep('^IMP_', inputs, value = TRUE)
imp.nominals <- c(target, grep('^IMP_', nominals, value = TRUE))

In [None]:
# Train models
## Decision tree, fitted on the rows with _PartInd_ = 0; the model is stored
## in the CAS table 'dt_model'.
cas.decisionTree.dtreeTrain(
    conn,
    table    = list(name = indata, where = '_PartInd_ = 0'),
    inputs   = inputs,
    nominals = nominals,
    target   = target,
    varImp   = TRUE,
    casOut   = list(name = 'dt_model', replace = TRUE)
)

In [None]:
## Random forest, fitted on the rows with _PartInd_ = 0; the model is stored
## in the CAS table 'rf_model'.
cas.decisionTree.forestTrain(
    conn,
    table    = list(name = indata, where = '_PartInd_ = 0'),
    inputs   = inputs,
    nominals = nominals,
    target   = target,
    casOut   = list(name = 'rf_model', replace = TRUE)
)

In [None]:
## Gradient boosting, fitted on the rows with _PartInd_ = 0; the model is
## stored in the CAS table 'gbt_model'.
cas.decisionTree.gbtreeTrain(
    conn,
    table    = list(name = indata, where = '_PartInd_ = 0'),
    inputs   = inputs,
    nominals = nominals,
    target   = target,
    casOut   = list(name = 'gbt_model', replace = TRUE)
)

In [None]:
## Neural network, fitted on the rows with _PartInd_ = 0 using only the
## imputed variables; the model is stored in the CAS table 'nn_model'.
cas.neuralNet.annTrain(
    conn,
    table    = list(name = indata, where = '_PartInd_ = 0'),
    inputs   = imp.inputs,
    nominals = imp.nominals,
    target   = target,
    hidden   = 7,
    casOut   = list(name = 'nn_model', replace = TRUE)
)

In [None]:
### Scoring with a single model: apply the stored decision tree ('dt_model')
### to the whole table and write the result to 'dt_scored', carrying over the
### target and the partition indicator so the scores can be assessed later.
cas.decisionTree.dtreeScore(
    object       = hmeq1,
    modelTable   = list(name = 'dt_model'),
    copyVars     = list(target, '_PartInd_'),
    assessonerow = TRUE,
    # TRUE rather than T: T is an ordinary variable that can be reassigned
    casOut       = list(name = 'dt_scored', replace = TRUE)
)

In [None]:
# Create an R reference to the scored table 'dt_scored'
dt_scores <- defCasTable(conn, 'dt_scored')

In [None]:
# Preview the scored rows
head(dt_scores)

In [None]:
# Short ids of the trained models; they are the prefixes of the
# '<id>_model' and '<id>_scored' table names.
models <- c('dt', 'rf', 'gbt', 'nn')

# Scoring function for each model id, looked up by name.
scores <- list(
    dt  = cas.decisionTree.dtreeScore,
    rf  = cas.decisionTree.forestScore,
    gbt = cas.decisionTree.gbtreeScore,
    nn  = cas.neuralNet.annScore
)

In [None]:
# Build the argument list for a scoring action, to automate scoring of
# several models.
#
# model: a single model id (e.g. 'dt'). The trained model is read from the
#        table '<model>_model' and the scores are written to '<model>_scored'.
# Returns a named list meant to be passed to a scoring function via do.call().
# Uses the variables `conn`, `indata` and `target` defined outside the function.
score.params <- function(model) {
    # paste0() would silently vectorise over several ids and produce
    # invalid table names, so require exactly one string.
    stopifnot(is.character(model), length(model) == 1L)
    list(
        object       = defCasTable(conn, indata),
        modelTable   = list(name = paste0(model, '_model')),
        copyVars     = list(target, '_PartInd_'),
        assessonerow = TRUE,
        casOut       = list(name = paste0(model, '_scored'), replace = TRUE)
    )
}

In [None]:
# Score every model: look up its scoring function by id and call it with
# the matching argument list.
lapply(models, function(model_id) {
    score_fun <- scores[[model_id]]
    do.call(score_fun, score.params(model_id))
})

In [None]:
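# Load the action set used for model assessment (left commented out:
# 'percentile' is already loaded together with the other action sets above)
# loadActionSet(conn, 'percentile')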

In [None]:
## Assessment of a single model: evaluate the decision tree scores on the
## rows with _PartInd_ = 1 (training used _PartInd_ = 0), treating '1' as the
## event level of the target.
asses <- cas.percentile.assess(
    conn,
    table    = list(name = 'dt_scored', where = '_PartInd_ = 1'),
    inputs   = '_dt_P_           1',
    response = target,
    event    = '1'
)

In [None]:
# First rows of the lift table returned by the assessment
head(asses$LIFTInfo)

In [None]:
# First rows of the ROC table returned by the assessment
head(asses$ROCInfo)

In [None]:
# Assess one model so that models can be compared.
#
# model: model id (e.g. 'dt'); scores are read from '<model>_scored',
#        restricted to the rows with _PartInd_ = 1.
# Returns the result of the percentile.assess action, with '1' as the event.
# Uses the variables `conn` and `target` defined outside the function.
assess.model <- function(model) {
    scored.table <- list(
        name  = paste0(model, '_scored'),
        where = '_PartInd_ = 1'
    )
    # Name of the score column passed to the action; the run of spaces
    # before '1' is part of the column name and must be kept as is.
    prob.column <- paste0('_', model, '_P_           1')
    cas.percentile.assess(
        conn,
        table    = scored.table,
        inputs   = prob.column,
        response = target,
        event    = '1'
    )
}

In [None]:
# Human-readable labels, matched to `models` by position.
model.names <- c('Decision Tree', 'Random Forest', 
                 'Gradient Boosting', 'Neural Network')
# A length mismatch would silently label some rows NA, so fail early.
stopifnot(length(model.names) == length(models))

# Assess every model and stack the ROC tables, tagging each row with its
# model label. The tables are built first and bound once instead of growing
# `roc.df` inside a loop; seq_along() is also safe when `models` is empty
# (1:length(models) would iterate over c(1, 0)). The leading empty data frame
# keeps the result a data frame even when there is nothing to bind.
roc.df <- do.call(rbind, c(
    list(data.frame()),
    lapply(seq_along(models), function(i) {
        tmp <- assess.model(models[i])$ROCInfo
        tmp$Model <- model.names[i]
        tmp
    })
))

In [None]:
# Keep the ROC rows whose cutoff rounds to 0.5 and show the confusion-matrix
# counts for each model. which() drops NA comparisons, as subset() does.
compare <- roc.df[which(round(roc.df$CutOff, 2) == 0.5), , drop = FALSE]
rownames(compare) <- NULL
compare[, c('Model', 'TP', 'FP', 'FN', 'TN')]

In [None]:
# Build a data frame to compare misclassification (1 - accuracy) across
# models, sorted from lowest to highest.
compare[['Misclassification']] <- 1 - compare[['ACC']]
miss <- compare[order(compare[['Misclassification']]),
                c('Model', 'Misclassification')]
rownames(miss) <- NULL
miss

In [None]:
# Add a column used as the ROC curve legend label: the model name followed
# by column `C` rounded to 3 decimals.
# NOTE(review): `C` is presumably the c-statistic (area under the ROC curve) — confirm.
roc.df$Models <- paste(roc.df$Model, round(roc.df$C, 3), sep = ' - ')

In [None]:
# Draw the ROC curves: one line per model, coloured by the legend label.
options(repr.plot.width = 14, repr.plot.height = 6)

plot <- ggplot(
    data    = roc.df[c('FPR', 'Sensitivity', 'Models')],
    mapping = aes(x = FPR, y = Sensitivity, colour = Models)
) +
    geom_line(size = 1.2) +
    labs(x = 'False Positive Rate', y = 'True Positive Rate')
plot

In [None]:
# Convert the ggplot object to a plotly widget and embed it in the notebook
plotly::embed_notebook(plotly::ggplotly(plot))

In [None]:
# End the CAS session
cas.session.endSession(conn)