In [1]:
library(caret, quiet=TRUE);
library(base64enc)
library(party)
library(httr)


Loading required package: grid
Loading required package: mvtnorm
Loading required package: modeltools
Loading required package: stats4
Loading required package: strucchange
Loading required package: zoo

Attaching package: ‘zoo’

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric

Loading required package: sandwich

Attaching package: ‘httr’

The following object is masked from ‘package:caret’:

    progress



# Build a Model

In [2]:
# Seed the RNG before the model is trained.
set.seed(1960)

# Fit a "cforest" classifier for Species on the full iris data set through
# train(), forwarding the forest controls (depth, tree count, mtry) to it.
# Returns the object produced by train().
create_model <- function() {
    forest_controls <- cforest_control(maxdepth = 3, ntree = 512, mtry = 3)
    train(Species ~ ., data = iris, method = "cforest",
          controls = forest_controls)
}

In [3]:
# Train the cforest classifier on iris.
model <- create_model()

In [4]:
# Score the training data: class probabilities, hard labels, and finally the
# fraction of training rows whose predicted label differs from the truth.
iris_features <- as.matrix(iris[, -5])
pred <- predict(model, iris_features, type = "prob")
pred_labels <- predict(model, iris_features, type = "raw")
n_wrong <- sum(pred_labels != iris$Species)
n_wrong / length(pred_labels)


# SQL Code Generation

In [5]:

# Ask the model-to-SQL web service to translate a fitted model into SQL.
#
# Args:
#   mod:     fitted model object; it is serialized with serialize() and sent
#            base64-encoded in the JSON request body.
#   ws_url:  endpoint of the web service. Defaults to the local instance; the
#            public instance is "https://sklearn2sql.herokuapp.com/model".
#   dialect: SQL dialect requested from the service.
#
# Returns:
#   The SQL field of the first generation result in the response.
#
# Stops with an error if the HTTP request fails or the response carries no
# generated SQL.
test_ws_sql_gen <- function(mod,
                            ws_url = "http://localhost:1888/model",
                            dialect = "postgresql") {
    payload <- list(
        Name = "caret_test_model",
        SerializedModel = base64encode(serialize(mod, NULL)),
        SQLDialect = dialect,
        Mode = "caret"
    )
    response <- POST(ws_url, body = payload, encode = "json")
    # Fail here with the HTTP status instead of later on a NULL result.
    stop_for_status(response)
    parsed <- content(response)
    # The key spelling "SQLGenrationResult" is intentional: it is the key this
    # code has always read from the response.
    results <- if (is.list(parsed)) parsed$model$SQLGenrationResult else NULL
    if (length(results) == 0 || is.null(results[[1]]$SQL)) {
        stop("web service response contains no generated SQL", call. = FALSE)
    }
    results[[1]]$SQL
}

In [6]:
# Generate the SQL for the model and print three excerpts of it (beginning,
# middle, end), each roughly L characters long, separated by " ... " lines.
lModelSQL <- test_ws_sql_gen(model)
N <- nchar(lModelSQL)
L <- 2000
sql_head <- substr(lModelSQL, 0, L)
sql_middle <- substr(lModelSQL, N/2 - L/2, N/2 + L/2)
sql_tail <- substr(lModelSQL, N-L, N)
cat(sql_head, "\n ... \n", sql_middle, "\n ... \n", sql_tail, "\n")


WITH "RF_0" AS 
(WITH "DT_node_lookup" AS 
(SELECT "ADS"."KEY" AS "KEY", CASE WHEN ("ADS"."Feature_2" <= 1.9) THEN 2 ELSE CASE WHEN ("ADS"."Feature_3" <= 1.6) THEN CASE WHEN ("ADS"."Feature_2" <= 4.6) THEN 5 ELSE 6 END ELSE CASE WHEN ("ADS"."Feature_3" <= 1.8) THEN 8 ELSE 9 END END END AS node_id_2 
FROM "INPUT_DATA" AS "ADS"), 
"DT_node_data" AS 
(SELECT "Values".nid AS nid, "Values"."P_0" AS "P_0", "Values"."P_1" AS "P_1", "Values"."P_2" AS "P_2", "Values"."D" AS "D", "Values"."DP" AS "DP" 
FROM (SELECT 2 AS nid, 1.0 AS "P_0", 0.0 AS "P_1", 0.0 AS "P_2", 0 AS "D", 1.0 AS "DP" UNION ALL SELECT 5 AS nid, 0.0 AS "P_0", 1.0 AS "P_1", 0.0 AS "P_2", 1 AS "D", 1.0 AS "DP" UNION ALL SELECT 6 AS nid, 0.0 AS "P_0", 0.692307692307692 AS "P_1", 0.307692307692308 AS "P_2", 1 AS "D", 0.692307692307692 AS "DP" UNION ALL SELECT 8 AS nid, 0.0 AS "P_0", 0.2 AS "P_1", 0.8 AS "P_2", 2 AS "D", 0.8 AS "DP" UNION ALL SELECT 9 AS nid, 0.0 AS "P_0", 0.0 AS "P_1", 1.0 AS "P_2", 2 AS "D", 1.0 AS "DP") AS "Valu

# Execute the SQL Code

In [7]:
# Open the ODBC connection to the "pgsql" data source and switch
# auto-commit on for it.
library(RODBC)
conn <- odbcConnect(dsn = "pgsql", uid = "db", pwd = "db", case = "nochange")
odbcSetAutoCommit(conn, autoCommit = TRUE)

In [8]:
# Build the table the generated SQL reads from: the four iris feature
# columns renamed to Feature_0 .. Feature_3 plus a 1-based KEY row id.
dataset <- iris[, -5]

df_sql <- as.data.frame(dataset)
names(df_sql) <- paste0("Feature_", seq_len(ncol(df_sql)) - 1L)
df_sql$KEY <- seq_len(nrow(dataset))

# Drop any earlier copy of the table (errors suppressed), then upload the
# fresh one as INPUT_DATA.
sqlDrop(conn, "INPUT_DATA", errors = FALSE)
sqlSave(conn, df_sql, tablename = "INPUT_DATA", verbose = FALSE)

head(df_sql)

Feature_0,Feature_1,Feature_2,Feature_3,KEY
5.1,3.5,1.4,0.2,1
4.9,3.0,1.4,0.2,2
4.7,3.2,1.3,0.2,3
4.6,3.1,1.5,0.2,4
5.0,3.6,1.4,0.2,5
5.4,3.9,1.7,0.4,6


In [9]:
# colnames(df_sql)
# odbcGetInfo(conn)
# sqlTables(conn)

In [23]:
# Run the generated SQL against the database and show a fixed random sample
# of five result rows.
df_sql_out <- sqlQuery(conn, lModelSQL)
# sqlQuery() does not raise on failure; it hands back the error messages (or
# an error code) instead of a data frame. Surface that here instead of
# failing on the `$KEY` access below with an unrelated message.
if (!is.data.frame(df_sql_out)) {
    stop("SQL execution failed: ", paste(df_sql_out, collapse = "\n"),
         call. = FALSE)
}
# Order by KEY so rows line up with the R-side output, and reset row names
# so the sampled row labels match KEY.
df_sql_out <- df_sql_out[order(df_sql_out$KEY), ]
rownames(df_sql_out) <- NULL
# Same seed as the R-side sample so both cells display the same rows.
set.seed(1960)
df_sql_out[sample(nrow(df_sql_out), 5), ]

Unnamed: 0,KEY,Score_setosa,Score_versicolor,Score_virginica,Proba_setosa,Proba_versicolor,Proba_virginica,LogProba_setosa,LogProba_versicolor,LogProba_virginica,Decision,DecisionProba
84,84,,,,0,0.4889994,0.511000612,-1.797693e+308,-0.715394,-0.6713845,virginica,0.5110006
83,83,,,,0,0.9941722,0.005827845,-1.797693e+308,-0.005844893,-5.145108,versicolor,0.9941722
53,53,,,,0,0.7195683,0.280431725,-1.797693e+308,-0.3291039,-1.271425,versicolor,0.7195683
30,30,,,,1,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,setosa,1.0
1,1,,,,1,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,setosa,1.0


# R CFOREST Output

In [18]:
# Assemble the R-side predictions using the same column names as the SQL
# output: Proba_*, KEY, Score_* (all NA), LogProba_*, Decision, DecisionProba.
pred_proba <- predict(model, as.matrix(iris[, -5]), type = "prob")
df_r_out <- data.frame(pred_proba)
names(df_r_out) <- paste0("Proba_", model$levels)

df_r_out$KEY <- seq_len(nrow(dataset))

# The Score_* columns are filled with NA.
df_r_out[c("Score_setosa", "Score_versicolor", "Score_virginica")] <- NA

# Natural log of each class probability (log(0) gives -Inf here).
df_r_out$LogProba_setosa <- log(df_r_out$Proba_setosa)
df_r_out$LogProba_versicolor <- log(df_r_out$Proba_versicolor)
df_r_out$LogProba_virginica <- log(df_r_out$Proba_virginica)

# Predicted label and the largest class probability of each row.
df_r_out$Decision <- predict(model, as.matrix(iris[, -5]), type = "raw")
df_r_out$DecisionProba <- apply(pred_proba, 1, max)

# Fixed seed, then display five sampled rows.
set.seed(1960)
df_r_out[sample(nrow(df_r_out), 5), ]
# head(df_r_out)



Unnamed: 0,Proba_setosa,Proba_versicolor,Proba_virginica,KEY,Score_setosa,Score_versicolor,Score_virginica,LogProba_setosa,LogProba_versicolor,LogProba_virginica,Decision,DecisionProba
84,0,0.4889994,0.511000612,84,,,,-inf,-0.715394041,-0.6713845,virginica,0.5110006
83,0,0.9941722,0.005827845,83,,,,-inf,-0.005844893,-5.145108,versicolor,0.9941722
53,0,0.7195683,0.280431725,53,,,,-inf,-0.329103865,-1.271425,versicolor,0.7195683
30,1,0.0,0.0,30,,,,0.0,-inf,-inf,setosa,1.0
1,1,0.0,0.0,1,,,,0.0,-inf,-inf,setosa,1.0


# Compare R and SQL output

In [12]:
# Full outer join of the R and SQL outputs on KEY. Columns present on both
# sides get the suffix "_1" (R) or "_2" (SQL).
# The stray empty argument of the original call ("all = TRUE, , suffixes")
# has been removed.
df_merge <- merge(x = df_r_out, y = df_sql_out, by = "KEY", all = TRUE,
                  suffixes = c("_1", "_2"))
head(df_merge)

KEY,Proba_setosa_1,Proba_versicolor_1,Proba_virginica_1,Score_setosa_1,Score_versicolor_1,Score_virginica_1,LogProba_setosa_1,LogProba_versicolor_1,LogProba_virginica_1,⋯,Score_versicolor_2,Score_virginica_2,Proba_setosa_2,Proba_versicolor_2,Proba_virginica_2,LogProba_setosa_2,LogProba_versicolor_2,LogProba_virginica_2,Decision_2,DecisionProba_2
1,1,0,0,,,,0,-inf,-inf,⋯,,,1,0,0,0,-1.797693e+308,-1.797693e+308,setosa,1
2,1,0,0,,,,0,-inf,-inf,⋯,,,1,0,0,0,-1.797693e+308,-1.797693e+308,setosa,1
3,1,0,0,,,,0,-inf,-inf,⋯,,,1,0,0,0,-1.797693e+308,-1.797693e+308,setosa,1
4,1,0,0,,,,0,-inf,-inf,⋯,,,1,0,0,0,-1.797693e+308,-1.797693e+308,setosa,1
5,1,0,0,,,,0,-inf,-inf,⋯,,,1,0,0,0,-1.797693e+308,-1.797693e+308,setosa,1
6,1,0,0,,,,0,-inf,-inf,⋯,,,1,0,0,0,-1.797693e+308,-1.797693e+308,setosa,1


In [13]:
# Collect the rows on which the R decision and the SQL decision disagree.
# Both decision columns must exist; otherwise the comparison below would
# silently produce zero differences.
stopifnot(all(c("Decision_1", "Decision_2") %in% names(df_merge)))
# Compare as character: `!=` on two factors raises an error when their level
# sets differ, which can happen if one side never emits a given class.
decision_r <- as.character(df_merge$Decision_1)
decision_sql <- as.character(df_merge$Decision_2)
# A missing decision on either side (row present in only one output after
# the outer join) counts as a difference, so the offending row itself is
# reported instead of an all-NA placeholder row.
mismatch <- is.na(decision_r) | is.na(decision_sql) | decision_r != decision_sql
diffs_df <- df_merge[mismatch, ]
head(diffs_df)

“number of rows of result is not a multiple of vector length (arg 2)”

KEY,Proba_setosa_1,Proba_versicolor_1,Proba_virginica_1,Score_setosa_1,Score_versicolor_1,Score_virginica_1,LogProba_setosa_1,LogProba_versicolor_1,LogProba_virginica_1,⋯,Score_versicolor_2,Score_virginica_2,Proba_setosa_2,Proba_versicolor_2,Proba_virginica_2,LogProba_setosa_2,LogProba_versicolor_2,LogProba_virginica_2,Decision_2,DecisionProba_2


In [14]:
# Abort the notebook run if diffs_df contains any row.
stopifnot(nrow(diffs_df) == 0)

In [15]:
# Per-column summary of the SQL-side output.
summary(df_sql_out)

      KEY         Score_setosa   Score_versicolor Score_virginica
 Min.   :  1.00   Mode:logical   Mode:logical     Mode:logical   
 1st Qu.: 38.25   NA's:150       NA's:150         NA's:150       
 Median : 75.50                                                  
 Mean   : 75.50                                                  
 3rd Qu.:112.75                                                  
 Max.   :150.00                                                  
  Proba_setosa    Proba_versicolor  Proba_virginica   LogProba_setosa      
 Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :-1.798e+308  
 1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:-1.798e+308  
 Median :0.0000   Median :0.01338   Median :0.00937   Median :-1.798e+308  
 Mean   :0.3317   Mean   :0.33635   Mean   :0.33199   Mean   :-1.198e+308  
 3rd Qu.:1.0000   3rd Qu.:0.97372   3rd Qu.:0.95672   3rd Qu.:  0.000e+00  
 Max.   :1.0000   Max.   :0.99417   Max.   :0.99869   Max.   :  0.000e+00  
 LogPr

In [16]:
# Per-column summary of the R-side output.
summary(df_r_out)

  Proba_setosa    Proba_versicolor  Proba_virginica        KEY        
 Min.   :0.0000   Min.   :0.00000   Min.   :0.00000   Min.   :  1.00  
 1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.: 38.25  
 Median :0.0000   Median :0.01338   Median :0.00937   Median : 75.50  
 Mean   :0.3317   Mean   :0.33635   Mean   :0.33199   Mean   : 75.50  
 3rd Qu.:1.0000   3rd Qu.:0.97372   3rd Qu.:0.95672   3rd Qu.:112.75  
 Max.   :1.0000   Max.   :0.99417   Max.   :0.99869   Max.   :150.00  
 Score_setosa   Score_versicolor Score_virginica LogProba_setosa
 Mode:logical   Mode:logical     Mode:logical    Min.   :-Inf   
 NA's:150       NA's:150         NA's:150        1st Qu.:-Inf   
                                                 Median :-Inf   
                                                 Mean   :-Inf   
                                                 3rd Qu.:   0   
                                                 Max.   :   0   
 LogProba_versicolor LogProba_virginica        D