In [1]:
library(caret, quiet=TRUE);
library(base64enc)
library(httr)



Attaching package: ‘httr’


The following object is masked from ‘package:caret’:

    progress




# Build a Model

In [2]:
set.seed(1960)

# Fit a caret classifier of type "earth" that predicts Species from all other
# columns of the iris data set.
#
# Resampling is disabled (trainControl method "none"), so the model is fitted
# with the single hyper-parameter row degree = 2, nprune = 20.
#
# Returns the object produced by train().
create_model <- function() {
    tuning <- data.frame(degree = 2, nprune = 20)
    no_resampling <- trainControl(method = "none")

    train(
        Species ~ .,
        data = iris,
        method = "earth",
        trControl = no_resampling,
        tuneGrid = tuning
    )
}

In [3]:
# Fit the model once; the following cells reuse this object.
model <- create_model()

Loading required package: earth

Loading required package: Formula

Loading required package: plotmo

Loading required package: plotrix

Loading required package: TeachingDemos

“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”


In [4]:
# Class probabilities and hard labels predicted on the training features
# (every iris column except the fifth one, Species).
pred <- predict(model, as.matrix(iris[, -5]), type = "prob")
pred_labels <- predict(model, as.matrix(iris[, -5]), type = "raw")

# Fraction of rows whose predicted label differs from the true species.
sum(pred_labels != iris$Species) / length(pred_labels)


# C++ Code Generation

In [5]:

# Ask the code-generation web service to translate a fitted model.
#
# The model is serialized with serialize(), base64-encoded and POSTed as JSON
# together with the requested dialect ("CPP").
#
# Args:
#   mod:    fitted model object to send.
#   ws_url: endpoint of the web service. Defaults to the local instance; the
#           public instance is "https://sklearn2sql.herokuapp.com/model".
#
# Returns the generated source text found in the first entry of the reply's
# model$SQLGenrationResult list.
# Stops with an error when the HTTP request fails or the reply holds no code.
test_ws_sql_gen <- function(mod, ws_url = "http://localhost:1888/model") {
    payload <- list(
        Name = "caret_rpart_test_model",
        SerializedModel = base64encode(serialize(mod, NULL)),
        SQLDialect = "CPP",
        # NOTE(review): the leading blank in " caret" is kept exactly as the
        # original request sent it; confirm whether the service relies on it.
        Mode = " caret"
    )

    response <- POST(ws_url, body = payload, encode = "json")
    # Fail here instead of returning NULL further down on a 4xx/5xx reply.
    stop_for_status(response)

    reply <- content(response)
    # "SQLGenrationResult" is the key spelling used in the service's reply.
    lSQL <- reply$model$SQLGenrationResult[[1]]$SQL
    if (is.null(lSQL)) {
        stop("the web service reply contains no generated code", call. = FALSE)
    }
    lSQL
}

In [6]:
# Request the generated code for the fitted model and print it verbatim.
lModelSQL <- test_ws_sql_gen(model)
cat(lModelSQL)

namespace  {

	std::vector<std::string> get_classes(){
		std::vector<std::string> lClasses = { "setosa", "versicolor", "virginica" };

		return lClasses;
	}

	std::vector<std::string> get_input_names(){
		std::vector<std::string> lFeatures = { "Feature_0", "Feature_1", "Feature_2", "Feature_3" };

		return lFeatures;
	}

	std::vector<std::string> get_output_names(){
		std::vector<std::string> lOutputs = { 
			"Score_setosa", "Score_versicolor", "Score_virginica",
			"Proba_setosa", "Proba_versicolor", "Proba_virginica",
			"LogProba_setosa", "LogProba_versicolor", "LogProba_virginica",
			"Decision", "DecisionProba" };

		return lOutputs;
	}

	tTable compute_classification_scores(std::any Feature_0, std::any Feature_1, std::any Feature_2, std::any Feature_3) {
		auto lClasses = get_classes();

		double lScore_setosa = -4.738027109733029 * relu(Feature_3 - 1.9) + -2.092082269087399 * relu(Feature_2 - 5.1) * relu(Feature_3 - 1.2) + 2.0873495080197118 * relu(Feature_2 - 4.6) + 21.02407858

# Execute the CPP Code

In [7]:
    # Write text to a file, replacing any previous content.
    #
    # Args:
    #   iCPPCode: character vector; each element is written as one line
    #             (embedded "\n" characters produce further lines).
    #   oCPPFile: path of the file to write.
    #
    # Returns NULL invisibly.
    write_text_to_file <- function(iCPPCode, oCPPFile) {
        lConn <- file(oCPPFile, open = "w")
        # Close the connection even when writeLines() fails.
        on.exit(close(lConn), add = TRUE)
        writeLines(c(iCPPCode), lConn)
        invisible(NULL)
    }

    # Turn generated model code into a complete C++ program.
    #
    # The result is, in order: an include of "Generic.i", the given code, and a
    # main() that calls score_csv_file() on the given CSV path and returns 0.
    # (score_csv_file is not defined in this notebook; presumably Generic.i
    # provides it.)
    #
    # Args:
    #   iCPPCode: generated C++ code as a string.
    #   iCSVFile: CSV path, inserted unescaped into a C++ string literal.
    #
    # Returns the assembled program text.
    add_cpp_main_function <- function(iCPPCode, iCSVFile) {
        paste0(
            "#include \"Generic.i\"\n\n",
            iCPPCode,
            "\n\nint main() {\n",
            "\tscore_csv_file(\"", iCSVFile, "\");\n",
            "\treturn 0;\n}\n"
        )
    }

    # Compile <iName>.cpp into <iName>.exe with g++ (C++17, debug info).
    #
    # Args:
    #   iName: path of the source file without its ".cpp" extension.
    #
    # Prints the g++ argument string before running the compiler.
    # Returns the compiler's exit status (0) invisibly and stops with an error
    # when g++ reports a non-zero status, so that a failed build is not
    # followed by an attempt to run a missing or stale executable.
    compile_cpp_code_as_executable <- function(iName) {
        lCommand <- paste0(
            "-Wall -Wno-unused-function -std=c++17 -g -o ",
            iName, ".exe ", iName, ".cpp"
        )
        cat(paste0("EXECUTING ", "'", lCommand, "'"))

        lStatus <- system2("g++", args = lCommand)
        if (lStatus != 0) {
            stop(
                "g++ failed with exit status ", lStatus,
                " while compiling ", iName, ".cpp",
                call. = FALSE
            )
        }
        invisible(lStatus)
    }

    # Run <iName>.exe with the CSV path as its only argument and redirect its
    # standard output to <iName>.out.
    #
    # Args:
    #   iName:    path of the executable without its ".exe" extension.
    #   iCSVFile: path passed to the executable on its command line.
    #
    # Returns the value of system2(), i.e. the exit status of the run.
    execute_cpp_model <- function(iName, iCSVFile) {
        lStatus <- system2(
            paste0(iName, ".exe"),
            args = iCSVFile,
            stdout = paste0(iName, ".out")
        )
        lStatus
    }
        
    # Build and run a scoring program for the given generated code.
    #
    # Steps: wrap the code with a main(), write it to a time-stamped
    # /tmp/ml2cpp_r_sample_<timestamp>.cpp, compile it, then run the executable
    # on the CSV file. The exit status of the run is not inspected here.
    #
    # Args:
    #   iCPPCode: generated C++ code as a string.
    #   iCSVFile: path of the CSV file to score.
    #
    # Returns the path of the ".out" file that receives the program's output.
    execute_cpp_code <- function(iCPPCode, iCSVFile) {
        # Second-resolution stamp keeps successive runs in separate files.
        lStamp <- format(Sys.time(), "%Y-%m-%d_%H_%M_%S")
        lBase <- paste0("/tmp/ml2cpp_r_", "sample_", lStamp)

        write_text_to_file(
            add_cpp_main_function(iCPPCode, iCSVFile),
            paste0(lBase, ".cpp")
        )
        compile_cpp_code_as_executable(lBase)
        execute_cpp_model(lBase, iCSVFile)

        paste0(lBase, ".out")
    }

In [8]:
# Scoring input: the iris columns without Species, renamed to the zero-based
# Feature_<i> names listed by the generated code's get_input_names().
dataset <- iris[, -5]

df_cpp_in <- as.data.frame(dataset)
colnames(df_cpp_in) <- sprintf("Feature_%d", seq_len(ncol(df_cpp_in)) - 1L)

write.csv(df_cpp_in, "/tmp/iris2.csv", row.names = FALSE, quote = FALSE)

# Compile and run the generated code; the value displayed is the path of the
# file holding the program's output.
lOutName <- execute_cpp_code(lModelSQL, "/tmp/iris2.csv")
lOutName

EXECUTING '-Wall -Wno-unused-function -std=c++17 -g -o /tmp/ml2cpp_r_sample_2020-10-05_18_01_37.exe /tmp/ml2cpp_r_sample_2020-10-05_18_01_37.cpp'

In [9]:
# Load the scores written by the executable and number the rows from 1 in a
# KEY column.
df_cpp_out <- read.csv(file = lOutName)
df_cpp_out[["KEY"]] <- seq.int(nrow(df_cpp_out))
head(df_cpp_out)

Unnamed: 0_level_0,idx,Score_setosa,Score_versicolor,Score_virginica,Proba_setosa,Proba_versicolor,Proba_virginica,LogProba_setosa,LogProba_versicolor,LogProba_virginica,Decision,DecisionProba,KEY
Unnamed: 0_level_1,<int>,<lgl>,<lgl>,<lgl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<int>
1,0,,,,1,0,1.22e-12,-1.22e-12,-32.23619,-27.42923,setosa,1,1
2,1,,,,1,0,1.22e-12,-1.22e-12,-32.23619,-27.42923,setosa,1,2
3,2,,,,1,0,6.4e-13,-6.4e-13,-32.23619,-28.07492,setosa,1,3
4,3,,,,1,0,2.33e-12,-2.33e-12,-32.23619,-26.78355,setosa,1,4
5,4,,,,1,0,1.22e-12,-1.22e-12,-32.23619,-27.42923,setosa,1,5
6,5,,,,1,0,8.49e-12,-8.49e-12,-32.23619,-25.49218,setosa,1,6


# R Caret Earth (MARS) Output

In [10]:
# Build, from R's own predictions, a table with the same column names as the
# output of the generated code.
pred_proba <- predict(model, as.matrix(iris[, -5]), type = "prob")
df_r_out <- data.frame(pred_proba)
names(df_r_out) <- sprintf("Proba_%s", model$levels)

# Row number, numbered from 1.
df_r_out[["KEY"]] <- seq.int(nrow(df_r_out))

# No raw scores are computed on the R side; the columns are filled with NA.
df_r_out[c("Score_setosa", "Score_versicolor", "Score_virginica")] <- NA

df_r_out[["LogProba_setosa"]] <- log(df_r_out[["Proba_setosa"]])
df_r_out[["LogProba_versicolor"]] <- log(df_r_out[["Proba_versicolor"]])
df_r_out[["LogProba_virginica"]] <- log(df_r_out[["Proba_virginica"]])

# Predicted label and the largest class probability of each row.
df_r_out[["Decision"]] <- predict(model, as.matrix(iris[, -5]), type = "raw")
df_r_out[["DecisionProba"]] <- apply(pred_proba, 1, max)
head(df_r_out)



Unnamed: 0_level_0,Proba_setosa,Proba_versicolor,Proba_virginica,KEY,Score_setosa,Score_versicolor,Score_virginica,LogProba_setosa,LogProba_versicolor,LogProba_virginica,Decision,DecisionProba
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<int>,<lgl>,<lgl>,<lgl>,<dbl>,<dbl>,<dbl>,<fct>,<dbl>
1,1,2.220446e-16,1.223587e-12,1,,,,-1.22391e-12,-36.04365,-27.42923,setosa,1
2,1,2.220446e-16,1.223587e-12,2,,,,-1.22391e-12,-36.04365,-27.42923,setosa,1
3,1,2.220446e-16,6.415317e-13,3,,,,-6.417089e-13,-36.04365,-28.07492,setosa,1
4,1,2.220446e-16,2.333735e-12,4,,,,-2.333911e-12,-36.04365,-26.78355,setosa,1
5,1,2.220446e-16,1.223587e-12,5,,,,-1.22391e-12,-36.04365,-27.42923,setosa,1
6,1,2.220446e-16,8.489558e-12,6,,,,-8.489875e-12,-36.04365,-25.49218,setosa,1


# Compare R and C++ output

In [11]:
# Full outer join of the R and C++ results on KEY. Columns present in both
# tables get the suffix "_1" (R side, x) or "_2" (C++ side, y).
df_merge <- merge(
    x = df_r_out,
    y = df_cpp_out,
    by = "KEY",
    all = TRUE,
    suffixes = c("_1", "_2")
)
head(df_merge)

Unnamed: 0_level_0,KEY,Proba_setosa_1,Proba_versicolor_1,Proba_virginica_1,Score_setosa_1,Score_versicolor_1,Score_virginica_1,LogProba_setosa_1,LogProba_versicolor_1,LogProba_virginica_1,⋯,Score_versicolor_2,Score_virginica_2,Proba_setosa_2,Proba_versicolor_2,Proba_virginica_2,LogProba_setosa_2,LogProba_versicolor_2,LogProba_virginica_2,Decision_2,DecisionProba_2
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<dbl>,<lgl>,<lgl>,<lgl>,<dbl>,<dbl>,<dbl>,⋯,<lgl>,<lgl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>
1,1,1,2.220446e-16,1.223587e-12,,,,-1.22391e-12,-36.04365,-27.42923,⋯,,,1,0,1.22e-12,-1.22e-12,-32.23619,-27.42923,setosa,1
2,2,1,2.220446e-16,1.223587e-12,,,,-1.22391e-12,-36.04365,-27.42923,⋯,,,1,0,1.22e-12,-1.22e-12,-32.23619,-27.42923,setosa,1
3,3,1,2.220446e-16,6.415317e-13,,,,-6.417089e-13,-36.04365,-28.07492,⋯,,,1,0,6.4e-13,-6.4e-13,-32.23619,-28.07492,setosa,1
4,4,1,2.220446e-16,2.333735e-12,,,,-2.333911e-12,-36.04365,-26.78355,⋯,,,1,0,2.33e-12,-2.33e-12,-32.23619,-26.78355,setosa,1
5,5,1,2.220446e-16,1.223587e-12,,,,-1.22391e-12,-36.04365,-27.42923,⋯,,,1,0,1.22e-12,-1.22e-12,-32.23619,-27.42923,setosa,1
6,6,1,2.220446e-16,8.489558e-12,,,,-8.489875e-12,-36.04365,-25.49218,⋯,,,1,0,8.49e-12,-8.49e-12,-32.23619,-25.49218,setosa,1


In [12]:
# Rows on which the label predicted in R (Decision_1) and the label produced
# by the generated code (Decision_2) disagree.
diffs_df <- df_merge[df_merge[["Decision_1"]] != df_merge[["Decision_2"]], ]
head(diffs_df)

“number of rows of result is not a multiple of vector length (arg 2)”
“number of rows of result is not a multiple of vector length (arg 2)”
“number of rows of result is not a multiple of vector length (arg 2)”
“number of rows of result is not a multiple of vector length (arg 2)”


KEY,Proba_setosa_1,Proba_versicolor_1,Proba_virginica_1,Score_setosa_1,Score_versicolor_1,Score_virginica_1,LogProba_setosa_1,LogProba_versicolor_1,LogProba_virginica_1,⋯,Score_versicolor_2,Score_virginica_2,Proba_setosa_2,Proba_versicolor_2,Proba_virginica_2,LogProba_setosa_2,LogProba_versicolor_2,LogProba_virginica_2,Decision_2,DecisionProba_2
<int>,<dbl>,<dbl>,<dbl>,<lgl>,<lgl>,<lgl>,<dbl>,<dbl>,<dbl>,⋯,<lgl>,<lgl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>


In [13]:
# Stop with an error unless diffs_df is empty, i.e. no row was selected as
# having differing decisions.
stopifnot(nrow(diffs_df) == 0)

In [14]:
# Per-column summary of the values read from the executable's output file.
summary(df_cpp_out)

      idx         Score_setosa   Score_versicolor Score_virginica
 Min.   :  0.00   Mode:logical   Mode:logical     Mode:logical   
 1st Qu.: 37.25   NA's:150       NA's:150         NA's:150       
 Median : 74.50                                                  
 Mean   : 74.50                                                  
 3rd Qu.:111.75                                                  
 Max.   :149.00                                                  
  Proba_setosa    Proba_versicolor Proba_virginica  LogProba_setosa 
 Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :-32.24  
 1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:-28.38  
 Median :0.0000   Median :0.0000   Median :0.0000   Median :-27.50  
 Mean   :0.3333   Mean   :0.3333   Mean   :0.3333   Mean   :-19.04  
 3rd Qu.:1.0000   3rd Qu.:0.9991   3rd Qu.:1.0000   3rd Qu.:  0.00  
 Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :  0.00  
 LogProba_versicolor LogProba_virginica   Decision     

In [15]:
# Per-column summary of the table built from R's own predictions.
summary(df_r_out)

  Proba_setosa    Proba_versicolor Proba_virginica       KEY        
 Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :  1.00  
 1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 38.25  
 Median :0.0000   Median :0.0000   Median :0.0000   Median : 75.50  
 Mean   :0.3333   Mean   :0.3333   Mean   :0.3333   Mean   : 75.50  
 3rd Qu.:1.0000   3rd Qu.:0.9991   3rd Qu.:1.0000   3rd Qu.:112.75  
 Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :150.00  
 Score_setosa   Score_versicolor Score_virginica LogProba_setosa 
 Mode:logical   Mode:logical     Mode:logical    Min.   :-36.04  
 NA's:150       NA's:150         NA's:150        1st Qu.:-28.38  
                                                 Median :-27.50  
                                                 Mean   :-19.80  
                                                 3rd Qu.:  0.00  
                                                 Max.   :  0.00  
 LogProba_versicolor LogProba_virginica       Decision 