In [1]:
library(caret, quiet=TRUE);
library(base64enc)
library(httr, quiet=TRUE)



Attaching package: ‘httr’

The following object is masked from ‘package:caret’:

    progress



# Build a Model

In [2]:
# Seed the RNG so that the model fit below is repeatable.
set.seed(1960)

# Fit the classifier used throughout this notebook: a caret model with
# method "nnet" predicting Species from all other columns of iris.
# Takes no arguments and returns the fitted object produced by train().
create_model <- function() {
  train(Species ~ ., data = iris, method = "nnet", trace = FALSE)
}

In [3]:
# Fit the model once; every later cell reuses this object.
model <- create_model()

In [4]:
# Class probabilities and hard labels on the training data, followed by the
# fraction of rows whose predicted label differs from the true species.
pred <- predict(model, newdata = as.matrix(iris[, -5]), type = "prob")
pred_labels <- predict(model, newdata = as.matrix(iris[, -5]), type = "raw")
sum(iris$Species != pred_labels) / length(pred_labels)


# SQL Code Generation

In [5]:

# Ask the SQL-generation web service to translate a fitted model into SQL.
#
# The model is serialized with serialize(), base64-encoded and POSTed as JSON
# together with a fixed model name, the requested SQL dialect and
# Mode = "caret".
#
# Arguments:
#   mod          the fitted model object to translate.
#   ws_url       endpoint of the web service. Defaults to the local instance;
#                the hosted one is "https://sklearn2sql.herokuapp.com/model".
#   sql_dialect  SQL dialect requested from the service.
#
# Returns the SQL text of the first SQLGenrationResult entry of the reply.
# Stops with an error if the HTTP request fails or the reply carries no SQL.
test_ws_sql_gen <- function(mod,
                            ws_url = "http://localhost:1888/model",
                            sql_dialect = "postgresql") {
  payload <- list(
    Name = "caret_nnet_test_model",
    SerializedModel = base64encode(serialize(mod, NULL)),
    SQLDialect = sql_dialect,
    Mode = "caret"
  )
  response <- POST(ws_url, body = payload, encode = "json")
  # Fail here, with the HTTP status, rather than later on a NULL query string.
  stop_for_status(response)

  parsed <- content(response)
  # "SQLGenrationResult" (sic): spelling kept exactly as the original lookup
  # used it; presumably it matches the key in the service's reply.
  results <- parsed$model$SQLGenrationResult
  sql <- if (length(results) > 0) results[[1]]$SQL else NULL
  if (is.null(sql)) {
    stop("SQL generation service returned no SQL", call. = FALSE)
  }
  sql
}

In [6]:
# Generate the SQL for the fitted model and print it verbatim.
lModelSQL <- test_ws_sql_gen(model)
cat(lModelSQL)

WITH "IL" AS 
(SELECT "ADS"."KEY" AS "KEY", CAST("ADS"."Feature_0" AS FLOAT) AS "Feature_0", CAST("ADS"."Feature_1" AS FLOAT) AS "Feature_1", CAST("ADS"."Feature_2" AS FLOAT) AS "Feature_2", CAST("ADS"."Feature_3" AS FLOAT) AS "Feature_3" 
FROM "INPUT_DATA" AS "ADS"), 
"HL_BA_1" AS 
(SELECT "IL"."KEY" AS "KEY", 1.576477 * "IL"."Feature_0" + 1.740795 * "IL"."Feature_1" + -2.555659 * "IL"."Feature_2" + -3.257832 * "IL"."Feature_3" + 2.94493 AS "NEUR_1_1", 0.3633052 * "IL"."Feature_0" + 1.213003 * "IL"."Feature_1" + -1.974031 * "IL"."Feature_2" + -0.8775904 * "IL"."Feature_3" + 0.2182532 AS "NEUR_1_2", -0.3089975 * "IL"."Feature_0" + -0.9833805 * "IL"."Feature_1" + 1.421223 * "IL"."Feature_2" + 0.729276 * "IL"."Feature_3" + -0.2177701 AS "NEUR_1_3" 
FROM "IL"), 
"HL_1_logistic" AS 
(SELECT "HL_BA_1"."KEY" AS "KEY", 1.0 / (1.0 + exp(least(greatest(-100.0, -"HL_BA_1"."NEUR_1_1"), 100.0))) AS "NEUR_1_1", 1.0 / (1.0 + exp(least(greatest(-100.0, -"HL_BA_1"."NEUR_1_2"), 100.0))) AS "NEUR_1_2", 

# Execute the SQL Code

In [7]:
# Open an ODBC connection to the "pgsql" data source and turn auto-commit on.
# case = "nochange" is passed so that identifier case is left untouched.
library(RODBC)
conn <- odbcConnect("pgsql", uid = "db", pwd = "db", case = "nochange")
odbcSetAutoCommit(conn, autoCommit = TRUE)

In [8]:
# Predictor columns only (the fifth column, Species, is dropped).
dataset <- iris[, -5]

# Copy of the predictors with the column names the generated SQL reads
# (Feature_0, Feature_1, ...) plus a 1-based integer KEY identifying each row.
df_sql <- setNames(
  as.data.frame(dataset),
  sprintf("Feature_%d", seq_len(ncol(dataset)) - 1L)
)
df_sql[["KEY"]] <- seq_len(nrow(dataset))

# Replace the INPUT_DATA table with the current contents of df_sql.
# errors = FALSE on the drop: the table may not exist yet.
sqlDrop(conn, "INPUT_DATA", errors = FALSE)
sqlSave(conn, dat = df_sql, tablename = "INPUT_DATA", verbose = FALSE)

head(df_sql, n = 6L)

Feature_0,Feature_1,Feature_2,Feature_3,KEY
5.1,3.5,1.4,0.2,1
4.9,3.0,1.4,0.2,2
4.7,3.2,1.3,0.2,3
4.6,3.1,1.5,0.2,4
5.0,3.6,1.4,0.2,5
5.4,3.9,1.7,0.4,6


In [9]:
# colnames(df_sql)
# odbcGetInfo(conn)
# sqlTables(conn)

In [10]:
# Run the generated SQL against the database and preview the result.
df_sql_out <- sqlQuery(conn, query = lModelSQL)
head(df_sql_out, n = 6L)

KEY,Score_setosa,Score_versicolor,Score_virginica,Proba_setosa,Proba_versicolor,Proba_virginica,LogProba_setosa,LogProba_versicolor,LogProba_virginica,Decision,DecisionProba
87,,,,0.008454715,0.97113989,0.0204054,-4.773031,-0.02928476,-3.89195581,versicolor,0.9711399
116,,,,0.002113786,0.03291219,0.96497403,-6.159275,-3.41391232,-0.03565409,virginica,0.964974
71,,,,0.011807202,0.42347135,0.56472145,-4.439046,-0.85926943,-0.57142268,virginica,0.5647215
68,,,,0.009479021,0.98000051,0.01052047,-4.658674,-0.02020219,-4.55443232,versicolor,0.9800005
51,,,,0.007877158,0.98127802,0.01084482,-4.843788,-0.01889945,-4.52406799,versicolor,0.981278
52,,,,0.009225486,0.97389379,0.01688073,-4.685785,-0.02645303,-4.08158279,versicolor,0.9738938


# R Caret Nnet Output

In [11]:
# Rebuild on the R side the same set of columns that the SQL query returns,
# so that the two data frames can be merged and compared on KEY.
pred_proba <- predict(model, newdata = as.matrix(iris[, -5]), type = "prob")
df_r_out <- setNames(data.frame(pred_proba), sprintf("Proba_%s", model$levels))

df_r_out$KEY <- seq_len(nrow(dataset))
# Score columns are filled with NA on the R side.
df_r_out[c("Score_setosa", "Score_versicolor", "Score_virginica")] <- NA
# Natural log of each class probability.
df_r_out[c("LogProba_setosa", "LogProba_versicolor", "LogProba_virginica")] <-
  log(df_r_out[c("Proba_setosa", "Proba_versicolor", "Proba_virginica")])
# Predicted label, and the largest class probability of each row.
df_r_out$Decision <- predict(model, newdata = as.matrix(iris[, -5]), type = "raw")
df_r_out$DecisionProba <- apply(pred_proba, 1, max)
head(df_r_out, n = 6L)



Proba_setosa,Proba_versicolor,Proba_virginica,KEY,Score_setosa,Score_versicolor,Score_virginica,LogProba_setosa,LogProba_versicolor,LogProba_virginica,Decision,DecisionProba
0.9881501,0.01143637,0.0004134949,1,,,,-0.01192064,-4.470956,-7.790865,setosa,0.9881501
0.9839552,0.01551062,0.0005342213,2,,,,-0.01617495,-4.166231,-7.5347,setosa,0.9839552
0.9866005,0.01293939,0.0004600961,3,,,,-0.01349007,-4.347479,-7.684075,setosa,0.9866005
0.9820602,0.01735468,0.0005850706,4,,,,-0.01810262,-4.053893,-7.443778,setosa,0.9820602
0.9884643,0.01113156,0.0004041086,5,,,,-0.01160272,-4.497971,-7.813827,setosa,0.9884643
0.9875193,0.01205037,0.000430342,6,,,,-0.01255925,-4.41866,-7.75093,setosa,0.9875193


# Compare R and SQL output

In [12]:
# Join the R-side and SQL-side predictions on KEY. all = TRUE keeps keys that
# are present on only one side; column names shared by both inputs get the
# suffix "_1" (from df_r_out) or "_2" (from df_sql_out).
df_merge <- merge(x = df_r_out, y = df_sql_out, by = "KEY", all = TRUE,
                  suffixes = c("_1", "_2"))
head(df_merge)

KEY,Proba_setosa_1,Proba_versicolor_1,Proba_virginica_1,Score_setosa_1,Score_versicolor_1,Score_virginica_1,LogProba_setosa_1,LogProba_versicolor_1,LogProba_virginica_1,⋯,Score_versicolor_2,Score_virginica_2,Proba_setosa_2,Proba_versicolor_2,Proba_virginica_2,LogProba_setosa_2,LogProba_versicolor_2,LogProba_virginica_2,Decision_2,DecisionProba_2
1,0.9881501,0.01143637,0.0004134949,,,,-0.01192064,-4.470956,-7.790865,⋯,,,0.9881501,0.01143637,0.000413495,-0.01192064,-4.470956,-7.790865,setosa,0.9881501
2,0.9839552,0.01551062,0.0005342213,,,,-0.01617495,-4.166231,-7.5347,⋯,,,0.9839552,0.01551062,0.0005342215,-0.01617495,-4.166231,-7.5347,setosa,0.9839552
3,0.9866005,0.01293939,0.0004600961,,,,-0.01349007,-4.347479,-7.684075,⋯,,,0.9866005,0.01293939,0.0004600963,-0.01349007,-4.347479,-7.684075,setosa,0.9866005
4,0.9820602,0.01735468,0.0005850706,,,,-0.01810262,-4.053893,-7.443778,⋯,,,0.9820602,0.01735468,0.0005850708,-0.01810262,-4.053893,-7.443778,setosa,0.9820602
5,0.9884643,0.01113156,0.0004041086,,,,-0.01160272,-4.497971,-7.813827,⋯,,,0.9884643,0.01113156,0.0004041088,-0.01160272,-4.497971,-7.813827,setosa,0.9884643
6,0.9875193,0.01205037,0.000430342,,,,-0.01255925,-4.41866,-7.75093,⋯,,,0.9875193,0.01205036,0.0004303422,-0.01255925,-4.41866,-7.75093,setosa,0.9875193


In [13]:
# Rows on which the R decision and the SQL decision disagree.
# Both sides are compared as character strings, so the check does not depend
# on whether either column is a factor or on the two having the same levels.
# A missing decision on either side counts as a disagreement, so the affected
# row itself is reported rather than an all-NA placeholder row.
diffs_df <- df_merge[
  is.na(df_merge$Decision_1) | is.na(df_merge$Decision_2) |
    as.character(df_merge$Decision_1) != as.character(df_merge$Decision_2),
  ,
  drop = FALSE
]
head(diffs_df)

“number of rows of result is not a multiple of vector length (arg 2)”

KEY,Proba_setosa_1,Proba_versicolor_1,Proba_virginica_1,Score_setosa_1,Score_versicolor_1,Score_virginica_1,LogProba_setosa_1,LogProba_versicolor_1,LogProba_virginica_1,⋯,Score_versicolor_2,Score_virginica_2,Proba_setosa_2,Proba_versicolor_2,Proba_virginica_2,LogProba_setosa_2,LogProba_versicolor_2,LogProba_virginica_2,Decision_2,DecisionProba_2


In [17]:
# Abort the run unless diffs_df is empty, i.e. unless no row was selected as
# having different R and SQL decisions.
stopifnot(nrow(diffs_df) == 0)

In [15]:
# Per-column summary of the SQL-side output.
summary(df_sql_out)

      KEY         Score_setosa   Score_versicolor Score_virginica
 Min.   :  1.00   Mode:logical   Mode:logical     Mode:logical   
 1st Qu.: 38.25   NA's:150       NA's:150         NA's:150       
 Median : 75.50                                                  
 Mean   : 75.50                                                  
 3rd Qu.:112.75                                                  
 Max.   :150.00                                                  
  Proba_setosa      Proba_versicolor   Proba_virginica     LogProba_setosa   
 Min.   :0.001213   Min.   :0.008885   Min.   :0.0003322   Min.   :-6.71468  
 1st Qu.:0.004039   1st Qu.:0.015907   1st Qu.:0.0005430   1st Qu.:-5.51188  
 Median :0.010727   Median :0.034865   Median :0.0202886   Median :-4.53497  
 Mean   :0.333398   Mean   :0.335251   Mean   :0.3313515   Mean   :-3.48702  
 3rd Qu.:0.983550   3rd Qu.:0.938495   3rd Qu.:0.9099537   3rd Qu.:-0.01659  
 Max.   :0.990782   Max.   :0.981278   Max.   :0.9822518   Max.   :-0.

In [16]:
# Per-column summary of the R-side output.
summary(df_r_out)

  Proba_setosa      Proba_versicolor   Proba_virginica          KEY        
 Min.   :0.001213   Min.   :0.008885   Min.   :0.0003322   Min.   :  1.00  
 1st Qu.:0.004039   1st Qu.:0.015907   1st Qu.:0.0005430   1st Qu.: 38.25  
 Median :0.010727   Median :0.034865   Median :0.0202887   Median : 75.50  
 Mean   :0.333398   Mean   :0.335250   Mean   :0.3313516   Mean   : 75.50  
 3rd Qu.:0.983550   3rd Qu.:0.938495   3rd Qu.:0.9099539   3rd Qu.:112.75  
 Max.   :0.990782   Max.   :0.981278   Max.   :0.9822518   Max.   :150.00  
 Score_setosa   Score_versicolor Score_virginica LogProba_setosa   
 Mode:logical   Mode:logical     Mode:logical    Min.   :-6.71468  
 NA's:150       NA's:150         NA's:150        1st Qu.:-5.51188  
                                                 Median :-4.53497  
                                                 Mean   :-3.48702  
                                                 3rd Qu.:-0.01659  
                                                 Max.   :-0.