In [1]:
library(caret, quiet = TRUE)
library(base64enc)
library(httr, quiet = TRUE)



Attaching package: ‘httr’

The following object is masked from ‘package:caret’:

    progress



# Build a Model

In [2]:

## Multiclass classification on the iris dataset.

# Seed the random number generator so later random steps are repeatable.
set.seed(1960)

# Feature matrix: every iris column except the fifth one (Species).
dataset <- as.matrix(iris[, -5])

# Fit the classifier used by the rest of this notebook.
#
# Calls train() with the formula Species ~ . on the iris data frame and
# method "xgbTree"; every other train() setting is left at its default.
#
# Returns: the fitted object produced by train().
create_model <- function() {
    train(Species ~ ., data = iris, method = "xgbTree")
}


In [3]:
# Train once; the fitted object is reused by the cells below.
model <- create_model()
# cat(model$feature_names)

In [4]:
# Predictions on the training features: class probabilities first,
# then the predicted class labels.
pred <- predict(model, as.matrix(iris[, -5]), type = "prob")
pred_labels <- predict(model, as.matrix(iris[, -5]), type = "raw")

# Share of rows whose predicted label differs from the true species.
sum(pred_labels != iris$Species) / length(pred_labels)



# SQL Code Generation

In [5]:

# Send a fitted model to the SQL-generation web service and return the SQL.
#
# The model is serialized with serialize(), base64-encoded, and POSTed as
# JSON together with a fixed model name, the "postgresql" dialect and the
# "caret" mode.
#
# Args:
#   mod:    the fitted model object to translate.
#   ws_url: endpoint of the service. Defaults to the local instance; the
#           hosted instance is "https://sklearn2sql.herokuapp.com/model".
#
# Returns: the SQL text found in the first SQLGenrationResult entry of the
#   response.
#
# Raises: an error when the HTTP request fails or when the response carries
#   no SQL, instead of silently returning NULL.
test_ws_sql_gen <- function(mod, ws_url = "http://localhost:1888/model") {
    payload <- list(
        Name = "xgboost_test_model",
        SerializedModel = base64encode(serialize(mod, NULL)),
        SQLDialect = "postgresql",
        Mode = "caret"
    )

    # verbose() is kept so the request/response exchange stays visible.
    response <- POST(ws_url, body = payload, encode = "json", verbose())
    stop_for_status(response)

    parsed <- content(response)
    # "SQLGenrationResult" is the key as spelled by the service.
    results <- parsed$model$SQLGenrationResult
    if (length(results) == 0) {
        stop("SQL generation response contains no result", call. = FALSE)
    }

    sql <- results[[1]]$SQL
    if (is.null(sql)) {
        stop("SQL generation response contains no SQL text", call. = FALSE)
    }
    sql
}

In [6]:
# Request the SQL translation of the trained model and print it.
lModelSQL <- test_ws_sql_gen(model)
cat(lModelSQL)
































































































































WITH "XGB_0" AS 
(WITH "DT_node_lookup" AS 
(SELECT "ADS"."KEY" AS "KEY", CASE WHEN ("ADS"."Feature_3" < 0.800000012) THEN 1 ELSE 2 END AS node_id_2 
FROM "INPUT_DATA" AS "ADS"), 
"DT_node_data" AS 
(SELECT "Values".nid AS nid, "Values"."Score" AS "Score" 
FROM (SELECT 1 AS nid, 0.430622011 AS "Score" UNION ALL SELECT 2 AS nid, -0.220048919 AS "Score") AS "Values"), 
"DT_Output" AS 
(SELECT "DT_node_lookup"."KEY" AS "KEY", "DT_node_lookup".node_id_2 AS node_id_2, "DT_node_data".nid AS nid, "DT_node_data"."Score" AS "Score" 
FROM "DT_node_lookup" LEFT OUTER JOIN "DT_node_data" ON "DT_node_lookup".node_id_2 = "DT_node_data".nid), 
"XGB_Model_0_0" AS 
(SELECT "DT_Output"."KEY" AS "KEY", "DT_Output"."Score" AS "Score_setosa", 0.0 AS "Score_versicolor", 0.0 AS "Score_virginica" 
FROM "DT_Output"), 
"DT_node_lookup_1" AS 
(SELECT "ADS"."KEY" AS "KEY", CASE WHEN ("ADS"

# Execute the SQL Code

In [7]:
library(RODBC)
# Open the ODBC connection to the "pgsql" data source and switch on
# auto-commit for it.
conn <- odbcConnect(dsn = "pgsql", uid = "db", pwd = "db", case = "nochange")
odbcSetAutoCommit(conn, autoCommit = TRUE)

In [8]:
# Input table for the database: the iris feature columns renamed to
# Feature_0, Feature_1, ... plus a 1-based KEY column identifying each row.
df_sql <- as.data.frame(iris[, -5])
names(df_sql) <- sprintf("Feature_%d", seq_len(ncol(df_sql)) - 1L)
df_sql$KEY <- seq_len(nrow(iris))

# Drop any previous INPUT_DATA table (a failed drop is ignored), then upload.
sqlDrop(conn, "INPUT_DATA", errors = FALSE)
sqlSave(conn, df_sql, tablename = "INPUT_DATA", verbose = FALSE)

head(df_sql)

Feature_0,Feature_1,Feature_2,Feature_3,KEY
5.1,3.5,1.4,0.2,1
4.9,3.0,1.4,0.2,2
4.7,3.2,1.3,0.2,3
4.6,3.1,1.5,0.2,4
5.0,3.6,1.4,0.2,5
5.4,3.9,1.7,0.4,6


In [9]:
# colnames(df_sql)
# odbcGetInfo(conn)
# sqlTables(conn)

In [10]:
# Run the generated SQL against the database.
df_sql_out <- sqlQuery(conn, lModelSQL)

# sqlQuery() hands back error messages instead of a data frame when the
# statement fails; surface them here rather than failing later on `$KEY`.
if (!is.data.frame(df_sql_out)) {
    stop(
        "Execution of the generated SQL failed: ",
        paste(df_sql_out, collapse = "\n"),
        call. = FALSE
    )
}

# Show the first rows in KEY order.
head(df_sql_out[order(df_sql_out$KEY), ])

Unnamed: 0,KEY,Score_setosa,Score_versicolor,Score_virginica,Proba_setosa,Proba_versicolor,Proba_virginica,LogProba_setosa,LogProba_versicolor,LogProba_virginica,Decision,DecisionProba
135,1,,,,0.9972031,0.002345572,0.0004513598,-0.00280085,-6.055226,-7.703246,setosa,0.9972031
103,2,,,,0.9972279,0.002177048,0.0005950183,-0.002775916,-6.129785,-7.426918,setosa,0.9972279
61,3,,,,0.9973712,0.002177361,0.000451436,-0.002632258,-6.129642,-7.703077,setosa,0.9973712
72,4,,,,0.997333,0.002177278,0.0004897321,-0.002670572,-6.12968,-7.621652,setosa,0.997333
66,5,,,,0.9972031,0.002345572,0.0004513598,-0.00280085,-6.055226,-7.703246,setosa,0.9972031
85,6,,,,0.9970595,0.002489236,0.0004512949,-0.002944863,-5.995779,-7.70339,setosa,0.9970595


In [11]:
# colnames(df1)

# R XGBoost Output

In [12]:
# Class probabilities from the R model, with columns named Proba_<class>.
pred_proba <- predict(model, as.matrix(iris[, -5]), type = "prob")
df_r_out <- data.frame(pred_proba)
names(df_r_out) <- sprintf("Proba_%s", model$levels)

# Row identifier used later to join against the SQL output.
df_r_out$KEY <- seq_len(nrow(dataset))

# Score columns are filled with NA on the R side.
df_r_out$Score_setosa <- NA
df_r_out$Score_versicolor <- NA
df_r_out$Score_virginica <- NA

# Natural log of each class probability.
df_r_out$LogProba_setosa <- log(df_r_out$Proba_setosa)
df_r_out$LogProba_versicolor <- log(df_r_out$Proba_versicolor)
df_r_out$LogProba_virginica <- log(df_r_out$Proba_virginica)

# Predicted class and the largest class probability of each row.
df_r_out$Decision <- predict(model, as.matrix(iris[, -5]), type = "raw")
df_r_out$DecisionProba <- apply(pred_proba, 1, max)
head(df_r_out)


Proba_setosa,Proba_versicolor,Proba_virginica,KEY,Score_setosa,Score_versicolor,Score_virginica,LogProba_setosa,LogProba_versicolor,LogProba_virginica,Decision,DecisionProba
0.9972031,0.002345574,0.0004513605,1,,,,-0.002800807,-6.055225,-7.703244,setosa,0.9972031
0.997228,0.00217705,0.0005950193,2,,,,-0.002775882,-6.129785,-7.426917,setosa,0.997228
0.9973712,0.002177362,0.0004514366,3,,,,-0.002632265,-6.129641,-7.703076,setosa,0.9973712
0.997333,0.002177279,0.0004897326,4,,,,-0.002670513,-6.129679,-7.621651,setosa,0.997333
0.9972031,0.002345574,0.0004513605,5,,,,-0.002800807,-6.055225,-7.703244,setosa,0.9972031
0.9970594,0.002489238,0.0004512955,6,,,,-0.002944927,-5.995779,-7.703388,setosa,0.9970594


# Compare R and SQL output

In [13]:
# Full outer join of the R and SQL results on KEY. Columns present on both
# sides get the suffix "_1" (R) or "_2" (SQL).
df_merge <- merge(
    x = df_r_out,
    y = df_sql_out,
    by = "KEY",
    all = TRUE,
    suffixes = c("_1", "_2")
)
head(df_merge)

KEY,Proba_setosa_1,Proba_versicolor_1,Proba_virginica_1,Score_setosa_1,Score_versicolor_1,Score_virginica_1,LogProba_setosa_1,LogProba_versicolor_1,LogProba_virginica_1,⋯,Score_versicolor_2,Score_virginica_2,Proba_setosa_2,Proba_versicolor_2,Proba_virginica_2,LogProba_setosa_2,LogProba_versicolor_2,LogProba_virginica_2,Decision_2,DecisionProba_2
1,0.9972031,0.002345574,0.0004513605,,,,-0.002800807,-6.055225,-7.703244,⋯,,,0.9972031,0.002345572,0.0004513598,-0.00280085,-6.055226,-7.703246,setosa,0.9972031
2,0.997228,0.00217705,0.0005950193,,,,-0.002775882,-6.129785,-7.426917,⋯,,,0.9972279,0.002177048,0.0005950183,-0.002775916,-6.129785,-7.426918,setosa,0.9972279
3,0.9973712,0.002177362,0.0004514366,,,,-0.002632265,-6.129641,-7.703076,⋯,,,0.9973712,0.002177361,0.000451436,-0.002632258,-6.129642,-7.703077,setosa,0.9973712
4,0.997333,0.002177279,0.0004897326,,,,-0.002670513,-6.129679,-7.621651,⋯,,,0.997333,0.002177278,0.0004897321,-0.002670572,-6.12968,-7.621652,setosa,0.997333
5,0.9972031,0.002345574,0.0004513605,,,,-0.002800807,-6.055225,-7.703244,⋯,,,0.9972031,0.002345572,0.0004513598,-0.00280085,-6.055226,-7.703246,setosa,0.9972031
6,0.9970594,0.002489238,0.0004512955,,,,-0.002944927,-5.995779,-7.703388,⋯,,,0.9970595,0.002489236,0.0004512949,-0.002944863,-5.995779,-7.70339,setosa,0.9970595


In [14]:
# Rows where the R decision and the SQL decision disagree.
# Decisions are compared as character strings so that a factor on one side
# (or factors with different level sets) cannot break the comparison.
# A decision missing on either side counts as a disagreement and is kept as
# a real row instead of turning into an all-NA row.
decision_mismatch <- is.na(df_merge$Decision_1) |
    is.na(df_merge$Decision_2) |
    as.character(df_merge$Decision_1) != as.character(df_merge$Decision_2)
diffs_df <- df_merge[decision_mismatch, ]
head(diffs_df)

“number of rows of result is not a multiple of vector length (arg 2)”

KEY,Proba_setosa_1,Proba_versicolor_1,Proba_virginica_1,Score_setosa_1,Score_versicolor_1,Score_virginica_1,LogProba_setosa_1,LogProba_versicolor_1,LogProba_virginica_1,⋯,Score_versicolor_2,Score_virginica_2,Proba_setosa_2,Proba_versicolor_2,Proba_virginica_2,LogProba_setosa_2,LogProba_versicolor_2,LogProba_virginica_2,Decision_2,DecisionProba_2


In [18]:
# Abort unless diffs_df is empty.
stopifnot(nrow(diffs_df) == 0)

In [15]:
# Per-column summary of the results returned by the SQL query.
summary(df_sql_out)

      KEY         Score_setosa   Score_versicolor Score_virginica
 Min.   :  1.00   Mode:logical   Mode:logical     Mode:logical   
 1st Qu.: 38.25   NA's:150       NA's:150         NA's:150       
 Median : 75.50                                                  
 Mean   : 75.50                                                  
 3rd Qu.:112.75                                                  
 Max.   :150.00                                                  
  Proba_setosa       Proba_versicolor    Proba_virginica    
 Min.   :0.0002060   Min.   :0.0002739   Min.   :0.0004486  
 1st Qu.:0.0009005   1st Qu.:0.0021771   1st Qu.:0.0004610  
 Median :0.0026646   Median :0.0024892   Median :0.0034061  
 Mean   :0.3339933   Mean   :0.3329923   Mean   :0.3330144  
 3rd Qu.:0.9970595   3rd Qu.:0.9910231   3rd Qu.:0.9918060  
 Max.   :0.9973712   Max.   :0.9981459   Max.   :0.9995201  
 LogProba_setosa     LogProba_versicolor LogProba_virginica        Decision 
 Min.   :-8.487538   Min.   :-8.20

In [16]:
# Per-column summary of the results computed in R.
summary(df_r_out)

  Proba_setosa       Proba_versicolor    Proba_virginica          KEY        
 Min.   :0.0002060   Min.   :0.0002739   Min.   :0.0004486   Min.   :  1.00  
 1st Qu.:0.0009005   1st Qu.:0.0021771   1st Qu.:0.0004610   1st Qu.: 38.25  
 Median :0.0026646   Median :0.0024892   Median :0.0034061   Median : 75.50  
 Mean   :0.3339933   Mean   :0.3329923   Mean   :0.3330144   Mean   : 75.50  
 3rd Qu.:0.9970594   3rd Qu.:0.9910231   3rd Qu.:0.9918060   3rd Qu.:112.75  
 Max.   :0.9973712   Max.   :0.9981459   Max.   :0.9995201   Max.   :150.00  
 Score_setosa   Score_versicolor Score_virginica LogProba_setosa    
 Mode:logical   Mode:logical     Mode:logical    Min.   :-8.487540  
 NA's:150       NA's:150         NA's:150        1st Qu.:-7.012534  
                                                 Median :-5.927741  
                                                 Mean   :-4.384824  
                                                 3rd Qu.:-0.002945  
                                        