In [1]:
library(caret, quiet=TRUE);
library(base64enc)
library(httr)
library(mlbench)


Attaching package: ‘httr’

The following object is masked from ‘package:caret’:

    progress



# Build a Model

In [2]:
set.seed(1960)

data(BreastCancer)
# summary(BreastCancer)

bc = BreastCancer[,-1]

for(i in 1:(ncol(bc) - 1)){
  bc[, i] <- as.numeric(bc[, i])
  bc[is.na(bc[,i]), i] <- mean(bc[,i], na.rm = TRUE)
}
TGT_IDX = ncol(bc)

create_model  =  function() {

    model  <- train(Class ~ ., data = bc, method = "svmRadial", prob.model=TRUE)
    
    return(model)
}

In [3]:
# dataset
model = create_model()

In [4]:
pred <- predict(model, as.matrix(bc[, -TGT_IDX]) , type="prob")
pred_labels <- predict(model, as.matrix(bc[, -TGT_IDX]) , type="raw")
sum(pred_labels != bc$Class)/length(pred_labels)


In [5]:
pred[1:5,]

benign,malignant
0.9991649,0.0008351421
0.6507895,0.349210543
0.9998614,0.0001386441
0.8345979,0.1654021127
0.9992412,0.0007588317


# SQL Code Generation

In [6]:

test_ws_sql_gen = function(mod) {
    WS_URL = "https://sklearn2sql.herokuapp.com/model"
    WS_URL = "http://localhost:1888/model"
    model_serialized <- serialize(mod, NULL)
    b64_data = base64encode(model_serialized)
    data = list(Name = "caret_svm_test_model", SerializedModel = b64_data , SQLDialect = "postgresql" , Mode="caret")
    r = POST(WS_URL, body = data, encode = "json")
    # print(r)
    content = content(r)
    # print(content)
    lSQL = content$model$SQLGenrationResult[[1]]$SQL # content["model"]["SQLGenrationResult"][0]["SQL"]
    return(lSQL);
}

In [7]:
lModelSQL = test_ws_sql_gen(model)
cat(lModelSQL)

WITH kernel_input_with_scaling AS 
(SELECT "ADS"."KEY" AS "KEY", (CAST("ADS"."Feature_0" AS FLOAT) - 4.4177396280400565) / 2.8157406585949314 AS "Feature_0", (CAST("ADS"."Feature_1" AS FLOAT) - 3.13447782546495) / 3.0514591099542008 AS "Feature_1", (CAST("ADS"."Feature_2" AS FLOAT) - 3.2074391988555084) / 2.971912767215713 AS "Feature_2", (CAST("ADS"."Feature_3" AS FLOAT) - 2.8068669527896994) / 2.8553792392170236 AS "Feature_3", (CAST("ADS"."Feature_4" AS FLOAT) - 3.2160228898426317) / 2.2142998866490484 AS "Feature_4", (CAST("ADS"."Feature_5" AS FLOAT) - 3.544655929721816) / 3.6018516398045315 AS "Feature_5", (CAST("ADS"."Feature_6" AS FLOAT) - 3.4377682403433485) / 2.438364252324251 AS "Feature_6", (CAST("ADS"."Feature_7" AS FLOAT) - 2.866952789699571) / 3.0536338936127745 AS "Feature_7", (CAST("ADS"."Feature_8" AS FLOAT) - 1.569384835479256) / 1.619802614296755 AS "Feature_8" 
FROM "INPUT_DATA" AS "ADS"), 
"SV_data" AS 
(SELECT "Values".sv_idx AS sv_idx, "Values".dual_coeff AS dual

# Execute the SQL Code

In [8]:
library(RODBC)
conn = odbcConnect("pgsql", uid="db", pwd="db", case="nochange")
odbcSetAutoCommit(conn , autoCommit = TRUE)

In [9]:
dataset = bc[, -TGT_IDX]

df_sql = as.data.frame(dataset)
names(df_sql) = sprintf("Feature_%d",0:(ncol(df_sql)-1))
df_sql$KEY = seq.int(nrow(dataset))

sqlDrop(conn , "INPUT_DATA" , errors = FALSE)
sqlSave(conn, df_sql, tablename = "INPUT_DATA", verbose = FALSE)

head(df_sql)

Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,KEY
5,1,1,1,2,1,3,1,1,1
5,4,4,5,7,10,3,2,1,2
3,1,1,1,2,2,3,1,1,3
6,8,8,1,3,4,3,7,1,4
4,1,1,3,2,1,3,1,1,5
8,10,10,8,7,10,9,7,1,6


In [10]:
# colnames(df_sql)
# odbcGetInfo(conn)
# sqlTables(conn)

In [11]:
df_sql_out = sqlQuery(conn, lModelSQL)
head(df_sql_out)

KEY,Score_benign,Score_malignant,Proba_benign,Proba_malignant,LogProba_benign,LogProba_malignant,Decision,DecisionProba
184,,,0.03109583,0.9689041732,-3.4706816558,-0.03158956,malignant,0.9689042
87,,,0.04087824,0.9591217581,-3.1971573403,-0.04173725,malignant,0.9591218
652,,,0.99778928,0.0022107247,-0.002213172,-6.11443488,benign,0.9977893
477,,,0.99980556,0.0001944392,-0.0001944581,-8.54539082,benign,0.9998056
273,,,0.04083476,0.9591652418,-3.1982216426,-0.04169191,malignant,0.9591652
550,,,0.04092294,0.9590770637,-3.1960645835,-0.04178385,malignant,0.9590771


# R Caret SVM Output

In [12]:
pred_proba  =  predict(model, as.matrix(dataset), type = "prob")
df_r_out = data.frame(pred_proba)
names(df_r_out) = sprintf("Proba_%s",model$levels)

df_r_out$KEY = seq.int(nrow(dataset))
df_r_out$Score_benign  =  NA
df_r_out$Score_malignant  =  NA
df_r_out$LogProba_benign  =  log(df_r_out$Proba_benign)
df_r_out$LogProba_malignant =  log(df_r_out$Proba_malignant)
df_r_out$Decision =   predict(model, as.matrix(dataset), type = "raw")
df_r_out$DecisionProba =  apply(pred_proba, 1, function(x) max(x))
head(df_r_out)

Proba_benign,Proba_malignant,KEY,Score_benign,Score_malignant,LogProba_benign,LogProba_malignant,Decision,DecisionProba
0.99916486,0.0008351421,1,,,-0.000835491,-7.08790869,benign,0.9991649
0.65078946,0.349210543,2,,,-0.4295691039,-1.05208026,benign,0.6507895
0.99986136,0.0001386441,3,,,-0.0001386537,-8.88360033,benign,0.9998614
0.83459789,0.1654021127,4,,,-0.1808052422,-1.79937572,benign,0.8345979
0.99924117,0.0007588317,5,,,-0.0007591197,-7.18373061,benign,0.9992412
0.04095817,0.9590418333,6,,,-3.1952040575,-0.04182058,malignant,0.9590418


# Compare R and SQL output

In [13]:
df_merge = merge(x = df_r_out, y = df_sql_out, by = "KEY", all = TRUE, , suffixes = c("_1","_2"))
head(df_merge)

KEY,Proba_benign_1,Proba_malignant_1,Score_benign_1,Score_malignant_1,LogProba_benign_1,LogProba_malignant_1,Decision_1,DecisionProba_1,Score_benign_2,Score_malignant_2,Proba_benign_2,Proba_malignant_2,LogProba_benign_2,LogProba_malignant_2,Decision_2,DecisionProba_2
1,0.99916486,0.0008351421,,,-0.000835491,-7.08790869,benign,0.9991649,,,0.99916486,0.0008351421,-0.000835491,-7.08790869,benign,0.9991649
2,0.65078946,0.349210543,,,-0.4295691039,-1.05208026,benign,0.6507895,,,0.65078946,0.349210543,-0.4295691039,-1.05208026,benign,0.6507895
3,0.99986136,0.0001386441,,,-0.0001386537,-8.88360033,benign,0.9998614,,,0.99986136,0.0001386441,-0.0001386537,-8.88360033,benign,0.9998614
4,0.83459789,0.1654021127,,,-0.1808052422,-1.79937572,benign,0.8345979,,,0.83459789,0.1654021127,-0.1808052422,-1.79937572,benign,0.8345979
5,0.99924117,0.0007588317,,,-0.0007591197,-7.18373061,benign,0.9992412,,,0.99924117,0.0007588317,-0.0007591197,-7.18373061,benign,0.9992412
6,0.04095817,0.9590418333,,,-3.1952040575,-0.04182058,malignant,0.9590418,,,0.04095817,0.9590418333,-3.1952040575,-0.04182058,malignant,0.9590418


In [14]:
diffs_df = df_merge[df_merge$Decision_1 != df_merge$Decision_2,]
head(diffs_df)

KEY,Proba_benign_1,Proba_malignant_1,Score_benign_1,Score_malignant_1,LogProba_benign_1,LogProba_malignant_1,Decision_1,DecisionProba_1,Score_benign_2,Score_malignant_2,Proba_benign_2,Proba_malignant_2,LogProba_benign_2,LogProba_malignant_2,Decision_2,DecisionProba_2


In [15]:
print(c("DIFF_N_ROWS" , nrow(diffs_df)))
stopifnot(nrow(diffs_df) == 0)


[1] "DIFF_N_ROWS" "0"          


In [16]:
summary(df_sql_out)

      KEY        Score_benign   Score_malignant  Proba_benign    
 Min.   :  1.0   Mode:logical   Mode:logical    Min.   :0.02941  
 1st Qu.:175.5   NA's:699       NA's:699        1st Qu.:0.04092  
 Median :350.0                                  Median :0.99816  
 Mean   :350.0                                  Mean   :0.66571  
 3rd Qu.:524.5                                  3rd Qu.:0.99958  
 Max.   :699.0                                  Max.   :0.99995  
 Proba_malignant     LogProba_benign     LogProba_malignant       Decision  
 Min.   :0.0000452   Min.   :-3.526324   Min.   :-10.00401   benign   :459  
 1st Qu.:0.0004226   1st Qu.:-3.196120   1st Qu.: -7.76929   malignant:240  
 Median :0.0018355   Median :-0.001837   Median : -6.30043                  
 Mean   :0.3342871   Mean   :-1.099390   Mean   : -4.65004                  
 3rd Qu.:0.9590793   3rd Qu.:-0.000423   3rd Qu.: -0.04178                  
 Max.   :0.9705871   Max.   :-0.000045   Max.   : -0.02985                  

In [17]:
summary(df_r_out)

  Proba_benign     Proba_malignant          KEY        Score_benign  
 Min.   :0.02941   Min.   :0.0000452   Min.   :  1.0   Mode:logical  
 1st Qu.:0.04092   1st Qu.:0.0004226   1st Qu.:175.5   NA's:699      
 Median :0.99816   Median :0.0018355   Median :350.0                 
 Mean   :0.66571   Mean   :0.3342871   Mean   :350.0                 
 3rd Qu.:0.99958   3rd Qu.:0.9590793   3rd Qu.:524.5                 
 Max.   :0.99995   Max.   :0.9705871   Max.   :699.0                 
 Score_malignant LogProba_benign     LogProba_malignant       Decision  
 Mode:logical    Min.   :-3.526324   Min.   :-10.00401   benign   :459  
 NA's:699        1st Qu.:-3.196120   1st Qu.: -7.76929   malignant:240  
                 Median :-0.001837   Median : -6.30043                  
                 Mean   :-1.099390   Mean   : -4.65004                  
                 3rd Qu.:-0.000423   3rd Qu.: -0.04178                  
                 Max.   :-0.000045   Max.   : -0.02985                  