In [1]:

library(caret, quiet=TRUE);
library(base64enc)
library(httr)
library(party)


Attaching package: ‘httr’

The following object is masked from ‘package:caret’:

    progress

Loading required package: grid
Loading required package: mvtnorm
Loading required package: modeltools
Loading required package: stats4
Loading required package: strucchange
Loading required package: zoo

Attaching package: ‘zoo’

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric

Loading required package: sandwich


# Build a Model

In [2]:

## Binary classification on the iris dataset: "setosa" versus every other species.

# NOTE(review): num_class and lb are not referenced by any cell visible here;
# confirm nothing else needs them before removing.
num_class <- 3
lb <- as.numeric(iris$Species) - 1

# Two-level target: "yes" for setosa, "no" otherwise (factor levels: no, yes).
iris$Species2 <- factor(ifelse(iris$Species == "setosa", "yes", "no"))
# Numeric matrix holding the four measurement columns.
dataset <- as.matrix(iris[, 1:4])
    
# Fit the cforest model used by the rest of the notebook.
#
# Takes no arguments. Reads `iris` (including its Species2 column) from the
# enclosing environment and returns the object produced by train().
create_model <- function() {
    # Fixed seed so every call starts from the same RNG state.
    set.seed(1960)
    target_formula <- Species2 ~
        Sepal.Length + Sepal.Width + Petal.Length + Petal.Width
    train(target_formula, data = iris, method = "cforest",
          controls = cforest_control(maxdepth = 3, ntree = 512, mtry = 3))
}

In [3]:
# Preview the first rows of iris, including the derived Species2 column.
head(iris)

Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species,Species2
5.1,3.5,1.4,0.2,setosa,yes
4.9,3.0,1.4,0.2,setosa,yes
4.7,3.2,1.3,0.2,setosa,yes
4.6,3.1,1.5,0.2,setosa,yes
5.0,3.6,1.4,0.2,setosa,yes
5.4,3.9,1.7,0.4,setosa,yes


In [4]:
# dataset
# Fit the model once; later cells reuse this object for prediction and SQL generation.
model = create_model()
# cat(model$feature_names)
# plot(model)


In [5]:
# Score the training data twice: class probabilities, then hard labels.
pred <- predict(model, dataset, type = "prob")
pred_labels <- predict(model, dataset, type = "raw")
# Share of predicted labels that differ from the Species2 target.
error_rate <- mean(pred_labels != iris$Species2)
error_rate

# SQL Code Generation

In [6]:

# Ask the SQL code-generation web service to translate a fitted model into SQL.
#
# Args:
#   mod:    model object to translate; it is serialized with serialize() and
#           sent base64-encoded in a JSON body.
#   ws_url: endpoint of the service. Defaults to the local instance that was
#           previously hard-coded; pass
#           "https://sklearn2sql.herokuapp.com/model" to use the hosted one.
#
# Returns:
#   The SQL element of the first entry of the service's generation result.
#
# Stops with an error when the HTTP request fails or the response has no SQL.
test_ws_sql_gen <- function(mod, ws_url = "http://localhost:1888/model") {
    model_serialized <- serialize(mod, NULL)
    b64_data <- base64encode(model_serialized)
    payload <- list(Name = "caret_test_model", SerializedModel = b64_data,
                    SQLDialect = "postgresql", Mode = "caret")
    response <- POST(ws_url, body = payload, encode = "json", verbose())
    # Fail loudly on an HTTP error instead of silently returning NULL below.
    stop_for_status(response)
    # Named `parsed` so the httr content() function is not shadowed.
    parsed <- content(response)
    # "SQLGenrationResult" (sic) is the key read from the response; the
    # spelling must be kept as-is.
    generation_results <- parsed$model$SQLGenrationResult
    if (length(generation_results) == 0 ||
        is.null(generation_results[[1]]$SQL)) {
        stop("The SQL generation service returned no SQL code", call. = FALSE)
    }
    generation_results[[1]]$SQL
}

In [7]:
# labels(model$finalModel)

In [8]:
# Generate the SQL for the model, then print three excerpts of it
# (beginning, middle, end), each roughly L characters long.
lModelSQL <- test_ws_sql_gen(model)
N <- nchar(lModelSQL)
L <- 2000

sql_head <- substr(lModelSQL, 1, L)
sql_middle <- substr(lModelSQL, N / 2 - L / 2, N / 2 + L / 2)
sql_tail <- substr(lModelSQL, N - L, N)
cat(sql_head, "\n ... \n", sql_middle, "\n ... \n", sql_tail, "\n")












































































































































































































































WITH "RF_0" AS 
(WITH "DT_node_lookup" AS 
(SELECT "ADS"."KEY" AS "KEY", CASE WHEN ("ADS"."Feature_2" <= 1.9) THEN 2 ELSE 3 END AS node_id_2 
FROM "INPUT_DATA" AS "ADS"), 
"DT_node_data" AS 
(SELECT "Values".nid AS nid, "Values"."P_0" AS "P_0", "Values"."P_1" AS "P_1", "Values"."D" AS "D", "Values"."DP" AS "DP" 
FROM (SELECT 2 AS nid, 0.0 AS "P_0", 1.0 AS "P_1", 1 AS "D", 1.0 AS "DP" UNION ALL SELECT 3 AS nid, 1.0 AS "P_0", 0.0 AS "P_1", 0 AS "D", 1.0 AS "DP") AS "Values"), 
"DT_Output" AS 
(SELECT "DT_node_lookup"."KEY" AS "KEY", "DT_node_lookup".node_id_2 AS node_id_2, "DT_node_data".nid AS nid, "DT_node_data"."P_0" AS "P_0", "DT_node_data"."P_1" AS "P_1", "DT_node_data"."D" AS "D", "DT_node_data"."DP" AS "DP" 
FROM "DT_node_lookup" LEFT OUTER JOIN "DT_n

# Execute the SQL Code

In [9]:
library(RODBC)
# Open the ODBC connection to the "pgsql" data source and turn auto-commit on.
conn <- odbcConnect(dsn = "pgsql", uid = "db", pwd = "db", case = "nochange")
odbcSetAutoCommit(channel = conn, autoCommit = TRUE)

In [10]:
# Stage the feature matrix for the database: columns are renamed
# Feature_0, Feature_1, ... and an integer KEY column numbers the rows from 1.
df_sql <- as.data.frame(dataset)
names(df_sql) <- paste0("Feature_", 0:(ncol(df_sql) - 1))
df_sql$KEY <- seq.int(nrow(dataset))

# Drop any previous INPUT_DATA table, then upload the staged data under that name.
sqlDrop(conn, "INPUT_DATA", errors = FALSE)
sqlSave(conn, df_sql, tablename = "INPUT_DATA", verbose = FALSE)

head(df_sql)

Feature_0,Feature_1,Feature_2,Feature_3,KEY
5.1,3.5,1.4,0.2,1
4.9,3.0,1.4,0.2,2
4.7,3.2,1.3,0.2,3
4.6,3.1,1.5,0.2,4
5.0,3.6,1.4,0.2,5
5.4,3.9,1.7,0.4,6


In [11]:
# colnames(df_sql)
# odbcGetInfo(conn)
# sqlTables(conn)

In [12]:
# Run the generated SQL against the database.
df_sql_out <- sqlQuery(conn, lModelSQL)
# sqlQuery() reports a failure by returning the error messages (not a data
# frame) rather than raising, so stop here instead of failing later in merge().
if (!is.data.frame(df_sql_out)) {
    stop("Execution of the generated SQL failed: ",
         paste(df_sql_out, collapse = "\n"), call. = FALSE)
}
head(df_sql_out)

KEY,Score_no,Score_yes,Proba_no,Proba_yes,LogProba_no,LogProba_yes,Decision,DecisionProba
116,,,1,0,0,-1.797693e+308,no,1
87,,,1,0,0,-1.797693e+308,no,1
71,,,1,0,0,-1.797693e+308,no,1
68,,,1,0,0,-1.797693e+308,no,1
51,,,1,0,0,-1.797693e+308,no,1
146,,,1,0,0,-1.797693e+308,no,1


In [13]:
# colnames(df1)

# R CFOREST Output

In [14]:
# Rebuild on the R side the same columns that the SQL query returns.
feature_matrix <- as.matrix(dataset)
pred <- predict(model, feature_matrix, type = "prob")
df_r_out <- data.frame(pred)
names(df_r_out) <- paste0("Proba_", model$levels)

# KEY numbers the rows from 1, matching the KEY column of the uploaded table.
df_r_out$KEY <- seq.int(nrow(dataset))
# Score columns are left as NA.
df_r_out$Score_no <- NA
df_r_out$Score_yes <- NA
df_r_out$LogProba_no <- log(df_r_out$Proba_no)
df_r_out$LogProba_yes <- log(df_r_out$Proba_yes)
df_r_out$Decision <- predict(model, feature_matrix, type = "raw")
# Probability of the winning class: the row-wise maximum of the probabilities.
df_r_out$DecisionProba <- apply(pred, 1, max)
head(df_r_out)


Proba_no,Proba_yes,KEY,Score_no,Score_yes,LogProba_no,LogProba_yes,Decision,DecisionProba
0,1,1,,,-inf,2.220446e-16,yes,1
0,1,2,,,-inf,2.220446e-16,yes,1
0,1,3,,,-inf,2.220446e-16,yes,1
0,1,4,,,-inf,2.220446e-16,yes,1
0,1,5,,,-inf,2.220446e-16,yes,1
0,1,6,,,-inf,2.220446e-16,yes,1


In [15]:
# df_r_out

# Compare R and SQL output

In [16]:
# Join the R-side and SQL-side results on KEY. all = TRUE keeps keys present
# on only one side; clashing column names get the suffix _1 (from df_r_out)
# or _2 (from df_sql_out).
df_merge <- merge(x = df_r_out, y = df_sql_out, by = "KEY", all = TRUE,
                  suffixes = c("_1", "_2"))
head(df_merge)

KEY,Proba_no_1,Proba_yes_1,Score_no_1,Score_yes_1,LogProba_no_1,LogProba_yes_1,Decision_1,DecisionProba_1,Score_no_2,Score_yes_2,Proba_no_2,Proba_yes_2,LogProba_no_2,LogProba_yes_2,Decision_2,DecisionProba_2
1,0,1,,,-inf,2.220446e-16,yes,1,,,0,1,-1.797693e+308,0,yes,1
2,0,1,,,-inf,2.220446e-16,yes,1,,,0,1,-1.797693e+308,0,yes,1
3,0,1,,,-inf,2.220446e-16,yes,1,,,0,1,-1.797693e+308,0,yes,1
4,0,1,,,-inf,2.220446e-16,yes,1,,,0,1,-1.797693e+308,0,yes,1
5,0,1,,,-inf,2.220446e-16,yes,1,,,0,1,-1.797693e+308,0,yes,1
6,0,1,,,-inf,2.220446e-16,yes,1,,,0,1,-1.797693e+308,0,yes,1


In [17]:
# Rows where the two decisions disagree. Labels are compared as text so that
# factors with different level sets do not raise an error, and a decision
# missing on either side (possible because the merge uses all = TRUE) counts
# as a mismatch and shows the actual row instead of an all-NA one.
decision_r <- as.character(df_merge$Decision_1)
decision_sql <- as.character(df_merge$Decision_2)
is_mismatch <- is.na(decision_r) | is.na(decision_sql) |
    decision_r != decision_sql
diffs_df <- df_merge[is_mismatch, ]
head(diffs_df)

KEY,Proba_no_1,Proba_yes_1,Score_no_1,Score_yes_1,LogProba_no_1,LogProba_yes_1,Decision_1,DecisionProba_1,Score_no_2,Score_yes_2,Proba_no_2,Proba_yes_2,LogProba_no_2,LogProba_yes_2,Decision_2,DecisionProba_2


In [18]:
# Halt with an error unless diffs_df is empty, i.e. no decision mismatches were found.
stopifnot(nrow(diffs_df) == 0)

In [19]:
# Per-column summary of the SQL-side output.
summary(df_sql_out)

      KEY         Score_no       Score_yes         Proba_no     
 Min.   :  1.00   Mode:logical   Mode:logical   Min.   :0.0000  
 1st Qu.: 38.25   NA's:150       NA's:150       1st Qu.:0.0000  
 Median : 75.50                                 Median :1.0000  
 Mean   : 75.50                                 Mean   :0.6684  
 3rd Qu.:112.75                                 3rd Qu.:1.0000  
 Max.   :150.00                                 Max.   :1.0000  
   Proba_yes       LogProba_no           LogProba_yes         Decision 
 Min.   :0.0000   Min.   :-1.798e+308   Min.   :-1.798e+308   no :100  
 1st Qu.:0.0000   1st Qu.:-1.798e+308   1st Qu.:-1.798e+308   yes: 50  
 Median :0.0000   Median :  0.000e+00   Median :-1.798e+308            
 Mean   :0.3316   Mean   :-5.513e+307   Mean   :-1.198e+308            
 3rd Qu.:1.0000   3rd Qu.:  0.000e+00   3rd Qu.:  0.000e+00            
 Max.   :1.0000   Max.   :  0.000e+00   Max.   :  0.000e+00            
 DecisionProba   
 Min.   :0.9121  
 1st 

In [20]:
# Per-column summary of the R-side output, for comparison with the SQL-side summary.
summary(df_r_out)

    Proba_no        Proba_yes           KEY         Score_no      
 Min.   :0.0000   Min.   :0.0000   Min.   :  1.00   Mode:logical  
 1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 38.25   NA's:150      
 Median :1.0000   Median :0.0000   Median : 75.50                 
 Mean   :0.6684   Mean   :0.3316   Mean   : 75.50                 
 3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:112.75                 
 Max.   :1.0000   Max.   :1.0000   Max.   :150.00                 
 Score_yes       LogProba_no        LogProba_yes      Decision 
 Mode:logical   Min.   :    -Inf   Min.   :    -Inf   no :100  
 NA's:150       1st Qu.:    -Inf   1st Qu.:    -Inf   yes: 50  
                Median :2.22e-16   Median :    -Inf            
                Mean   :    -Inf   Mean   :    -Inf            
                3rd Qu.:2.22e-16   3rd Qu.:2.22e-16            
                Max.   :2.22e-16   Max.   :2.22e-16            
 DecisionProba   
 Min.   :0.9121  
 1st Qu.:1.0000  
 Median :1.0000  
 Mean   :0.