In [1]:
library(caret, quiet = TRUE)
library(base64enc)
library(httr, quiet = TRUE)

library(mlbench)


Attaching package: ‘httr’


The following object is masked from ‘package:caret’:

    progress




# Build a Model

In [2]:
# Print numbers with up to 15 significant digits.
options(digits = 15)

## regression on the BostonHousing dataset (the model formula below uses medv as the response):

data(BostonHousing)
# NOTE(review): if chas is stored as a factor, as.numeric() returns the level codes
# rather than the label values — confirm this is the intended encoding.
BostonHousing$chas = as.numeric(BostonHousing$chas)

# Fix the RNG seed before the model is trained.
set.seed(1960)

# Copy of the data without column 14 (that column is read back as "medv" in a later cell).
dataset = BostonHousing[, -14] 

# Train the random forest regressor used by the rest of the notebook.
# Fits medv against every other column of the global BostonHousing data frame
# through caret's "rf" method, passing maxnodes = 6 and ntree = 64 along.
# Returns the object produced by train().
create_model <- function() {
    train(medv ~ ., data = BostonHousing, method = "rf", maxnodes = 6, ntree = 64)
}


In [3]:
# Fit the model once; the following cells reuse this object.
model <- create_model()
# cat(model$feature_names)
# print(model)

In [4]:
# In-sample check: predict on the training predictors and compare with medv.
pred_labels <- predict(model, BostonHousing[, -14], type = "raw")
df <- data.frame(medv = BostonHousing[, 14])
df$Estimator <- pred_labels
df$Error <- df$Estimator - df$medv
# Mean absolute percentage error, expressed as a fraction of medv.
MAPE <- mean(abs(df$Error / df$medv))
summary(df)
MAPE

      medv                 Estimator                 Error                   
 Min.   : 5.0000000000   Min.   :13.0500498569   Min.   :-21.78936421840000  
 1st Qu.:17.0250000000   1st Qu.:17.6834445932   1st Qu.: -2.00326562836000  
 Median :21.2000000000   Median :21.1847038432   Median :  0.36179596996200  
 Mean   :22.5328063241   Mean   :22.5269983586   Mean   : -0.00580796550462  
 3rd Qu.:25.0000000000   3rd Qu.:24.3222606421   3rd Qu.:  2.25276588731000  
 Max.   :50.0000000000   Max.   :44.6482565751   Max.   : 12.61393227700000  

# C++ Code Generation

In [5]:

# Ask the sklearn2sql web service to translate a fitted caret model into code.
#
# mod: model object; it is serialized with serialize() and sent base64-encoded.
# Returns the `SQL` field of the first entry of `model$SQLGenrationResult`
# in the parsed reply (the request asks for the "CPP" dialect).
# Stops with an error when the HTTP request fails or the reply holds no code,
# instead of silently returning NULL.
test_ws_sql_gen <- function(mod) {
    WS_URL <- "https://sklearn2sql.herokuapp.com/model"
    # WS_URL = "http://localhost:1888/model"
    b64_data <- base64encode(serialize(mod, NULL))
    data <- list(Name = "xgboost_test_model", SerializedModel = b64_data, SQLDialect = "CPP", Mode = "caret")
    r <- POST(WS_URL, body = data, encode = "json")
    # Surface HTTP-level failures (4xx/5xx) right away.
    stop_for_status(r, task = "generate code with the sklearn2sql web service")
    reply <- content(r)
    # The reply key is spelled "SQLGenrationResult"; keep it exactly as is.
    # Any shape mismatch in the reply (missing key, empty list, non-list body)
    # is folded into NULL here and reported by the check below.
    lSQL <- tryCatch(reply$model$SQLGenrationResult[[1]]$SQL, error = function(e) NULL)
    if (is.null(lSQL)) {
        stop("the sklearn2sql reply does not contain any generated code", call. = FALSE)
    }
    return(lSQL)
}

In [6]:
# Generate the code for the fitted model and print it.
lModelSQL <- test_ws_sql_gen(model)
cat(lModelSQL)


namespace  {

	namespace EXT_SubModel_0 {
	
		typedef std::vector<double> tNodeData;
		std::map<int, tNodeData> Decision_Tree_Node_data = {
				{ 5 ,  {31.03333333333333 }} ,
				{ 6 ,  {46.00689655172414 }} ,
				{ 7 ,  {50.0 }} ,
				{ 8 ,  {22.60267857142857 }} ,
				{ 9 ,  {21.56086956521739 }} ,
				{ 10 ,  {14.327096774193544 }} 
		};
		
	
		int get_decision_tree_node_index(std::any Feature_0, std::any Feature_1, std::any Feature_2, std::any Feature_3, std::any Feature_4, std::any Feature_5, std::any Feature_6, std::any Feature_7, std::any Feature_8, std::any Feature_9, std::any Feature_10, std::any Feature_11, std::any Feature_12) {
			int lNodeIndex = (Feature_5 <= 6.754) ? ( (Feature_12 <= 14.395) ? ( (Feature_12 <= 4.025) ? ( 7 ) : ( 8 ) ) : ( (Feature_4 <= 0.531) ? ( 9 ) : ( 10 ) ) ) : ( (Feature_5 <= 7.445) ? ( 5 ) : ( 6 ) );
		
			return lNodeIndex;
		}
		
	
		std::vector<std::string> get_input_names(){
			std::vector<std::string> lFeatures = { "Feature_0", "Feature_1", "F

# Execute the CPP Code

In [7]:
    # Write text to a file, replacing any previous content.
    # iCPPCode: character vector; each element is written followed by a newline.
    # oCPPFile: path of the file to write.
    # Returns NULL invisibly.
    write_text_to_file <- function(iCPPCode, oCPPFile) {
        lConnection <- file(oCPPFile, open = "wt")
        # Close the connection even when writeLines() fails, so it never leaks.
        on.exit(close(lConnection), add = TRUE)
        writeLines(iCPPCode, lConnection)
        invisible(NULL)
    }

    # Wrap generated model code into a complete C++ program.
    # iCPPCode: the generated code, placed after an `#include "Generic.i"` line.
    # iCSVFile: path embedded in the score_csv_file() call made by main().
    # Returns the assembled program text.
    add_cpp_main_function <- function(iCPPCode, iCSVFile) {
        lHeader <- "#include \"Generic.i\"\n\n"
        lMain <- paste0(
            "\n\nint main() {\n",
            "\tscore_csv_file(\"", iCSVFile, "\");\n",
            "\treturn 0;\n}\n"
        )
        paste0(lHeader, iCPPCode, lMain)
    }

    # Compile <iName>.cpp into <iName>.exe with g++.
    # iName: path of the source file without its ".cpp" extension.
    # Prints the g++ arguments, then stops with an error when g++ cannot be run
    # or exits with a non-zero status, so a failed build is not silently ignored.
    # Returns the g++ exit status invisibly.
    compile_cpp_code_as_executable <- function(iName) {
        lCommand <- paste0("-Wall -Wno-unused-function -std=c++17 -g -o ", iName, ".exe ", iName, ".cpp")
        cat(paste0("EXECUTING ", "'", lCommand, "'"))
        result <- system2("g++", args = lCommand)
        if (!identical(as.integer(result), 0L)) {
            stop("g++ failed with exit status ", result, " while compiling ", iName, ".cpp", call. = FALSE)
        }
        invisible(result)
    }


    # Run <iName>.exe with the CSV path as its only argument.
    # Standard output of the program is redirected to <iName>.out.
    # Prints the value returned by system2() and returns it.
    execute_cpp_model <- function(iName, iCSVFile) {
        lExecutable <- paste0(iName, ".exe")
        lStatus <- system2(lExecutable, args = iCSVFile, stdout = paste0(iName, ".out"))
        cat(lStatus)
        lStatus
    }
        
    # Build and run the generated model against a CSV file.
    # Steps: add a main() to iCPPCode, write it to a timestamped .cpp file under
    # /tmp, compile it, then run the executable on iCSVFile.
    # Returns the path of the .out file that receives the program's output.
    execute_cpp_code <- function(iCPPCode, iCSVFile) {
        lStamp <- format(Sys.time(), "%Y-%m-%d_%H_%M_%S")
        lBase <- paste0("/tmp/ml2cpp_r_", "sample_", lStamp)
        lProgram <- add_cpp_main_function(iCPPCode, iCSVFile)
        write_text_to_file(lProgram, paste0(lBase, ".cpp"))
        compile_cpp_code_as_executable(lBase)
        execute_cpp_model(lBase, iCSVFile)
        paste0(lBase, ".out")
    }


In [8]:

# Rename the predictors to the positional names used by the generated code
# (Feature_0, Feature_1, ...), dump them to CSV and score that file with the
# compiled model.
df_cpp_in <- as.data.frame(dataset)
# cat(names(df_cpp_in))

names(df_cpp_in) <- paste0("Feature_", 0:(ncol(df_cpp_in) - 1))

write.csv(df_cpp_in, file = "/tmp/boston2.csv", quote = FALSE)

lOutName <- execute_cpp_code(lModelSQL, "/tmp/boston2.csv")
lOutName

EXECUTING '-Wall -Wno-unused-function -std=c++17 -g -o /tmp/ml2cpp_r_sample_2020-09-29_18_32_13.exe /tmp/ml2cpp_r_sample_2020-09-29_18_32_13.cpp'0

In [9]:
# Load the estimates written by the compiled model and add a 1-based row key
# (KEY) used to join them with the R estimates.
df_cpp_out <- read.csv(file = lOutName)
# seq_len() yields an empty key for an empty result, whereas seq.int(0) is c(1, 0).
df_cpp_out$KEY <- seq_len(nrow(df_cpp_out))
head(df_cpp_out)


Unnamed: 0_level_0,idx,Estimator,KEY
Unnamed: 0_level_1,<int>,<dbl>,<int>
1,0,25.7096870641091,1
2,1,23.0852487090328,2
3,2,34.5743804306574,3
4,3,34.120843255677,4
5,4,31.9095949658386,5
6,5,25.3071515060204,6


In [10]:
# df_sql_out

# R RandomForest Output

In [11]:
# Score the same predictors with the R model and key the rows by 1-based position.
estimator <- predict(model, dataset, type = "raw")
df_r_out <- data.frame(Estimator = estimator)

df_r_out$KEY <- seq.int(nrow(dataset))
head(df_r_out)


Unnamed: 0_level_0,Estimator,KEY
Unnamed: 0_level_1,<dbl>,<int>
1,25.7096870641091,1
2,23.0852487090328,2
3,34.5743804306574,3
4,34.120843255677,4
5,31.9095949658386,5
6,25.3071515060204,6


# Compare R and C++ output

In [12]:
# Full outer join of the R and C++ estimates on the row key; the two Estimator
# columns become Estimator_1 (from df_r_out) and Estimator_2 (from df_cpp_out).
# The stray empty argument between `all` and `suffixes` has been removed.
df_merge <- merge(x = df_r_out, y = df_cpp_out, by = "KEY", all = TRUE, suffixes = c("_1", "_2"))
head(df_merge)

Unnamed: 0_level_0,KEY,Estimator_1,idx,Estimator_2
Unnamed: 0_level_1,<int>,<dbl>,<int>,<dbl>
1,1,25.7096870641091,0,25.7096870641091
2,2,23.0852487090328,1,23.0852487090328
3,3,34.5743804306574,2,34.5743804306574
4,4,34.120843255677,3,34.120843255677
5,5,31.9095949658386,4,31.9095949658386
6,6,25.3071515060204,5,25.3071515060204


In [13]:
# Signed and absolute difference between the two estimates of every row.
df_merge[["Error"]] <- df_merge[["Estimator_1"]] - df_merge[["Estimator_2"]]
df_merge[["AbsError"]] <- abs(df_merge[["Error"]])
head(df_merge)


Unnamed: 0_level_0,KEY,Estimator_1,idx,Estimator_2,Error,AbsError
Unnamed: 0_level_1,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
1,1,25.7096870641091,0,25.7096870641091,3.5527136788005e-15,3.5527136788005e-15
2,2,23.0852487090328,1,23.0852487090328,-3.5527136788005e-15,3.5527136788005e-15
3,3,34.5743804306574,2,34.5743804306574,0.0,0.0
4,4,34.120843255677,3,34.120843255677,-2.1316282072803002e-14,2.1316282072803002e-14
5,5,31.9095949658386,4,31.9095949658386,1.4210854715202e-14,1.4210854715202e-14
6,6,25.3071515060204,5,25.3071515060204,-1.0658141036401501e-14,1.0658141036401501e-14


In [14]:
# Keep the rows whose two estimates differ by more than 0.0000001.
df_merge_largest_errors <- df_merge[df_merge[["AbsError"]] > 0.0000001, ]
df_merge_largest_errors

Unnamed: 0_level_0,KEY,Estimator_1,idx,Estimator_2,Error,AbsError
Unnamed: 0_level_1,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
176,176,25.5512241404877,175,25.4609248739061,0.0902992665816349,0.0902992665816349


In [15]:
# Number of rows whose estimates disagree beyond the tolerance.
nrow(df_merge_largest_errors)
# Abort if more than two rows disagree.
stopifnot(nrow(df_merge_largest_errors) <= 2)


In [16]:
# Summary statistics of the data read back from the compiled model's output file.
summary(df_cpp_out)

      idx           Estimator                  KEY        
 Min.   :  0.00   Min.   :13.0500498569   Min.   :  1.00  
 1st Qu.:126.25   1st Qu.:17.6834445932   1st Qu.:127.25  
 Median :252.50   Median :21.1847038432   Median :253.50  
 Mean   :252.50   Mean   :22.5268199016   Mean   :253.50  
 3rd Qu.:378.75   3rd Qu.:24.3222606421   3rd Qu.:379.75  
 Max.   :505.00   Max.   :44.6482565751   Max.   :506.00  

In [17]:
# Summary statistics of the estimates produced by predict() in R.
summary(df_r_out)

   Estimator                  KEY        
 Min.   :13.0500498569   Min.   :  1.00  
 1st Qu.:17.6834445932   1st Qu.:127.25  
 Median :21.1847038432   Median :253.50  
 Mean   :22.5269983586   Mean   :253.50  
 3rd Qu.:24.3222606421   3rd Qu.:379.75  
 Max.   :44.6482565751   Max.   :506.00  

In [18]:
# Summary statistics of the joined table, including the Error and AbsError columns.
summary(df_merge)

      KEY          Estimator_1                 idx        
 Min.   :  1.00   Min.   :13.0500498569   Min.   :  0.00  
 1st Qu.:127.25   1st Qu.:17.6834445932   1st Qu.:126.25  
 Median :253.50   Median :21.1847038432   Median :252.50  
 Mean   :253.50   Mean   :22.5269983586   Mean   :252.50  
 3rd Qu.:379.75   3rd Qu.:24.3222606421   3rd Qu.:378.75  
 Max.   :506.00   Max.   :44.6482565751   Max.   :505.00  
  Estimator_2                Error                   
 Min.   :13.0500498569   Min.   :-3.55000000000e-14  
 1st Qu.:17.6834445932   1st Qu.:-3.60000000000e-15  
 Median :21.1847038432   Median : 0.00000000000e+00  
 Mean   :22.5268199016   Mean   : 1.78457048581e-04  
 3rd Qu.:24.3222606421   3rd Qu.: 3.60000000000e-15  
 Max.   :44.6482565751   Max.   : 9.02992665816e-02  
    AbsError                
 Min.   :0.00000000000e+00  
 1st Qu.:3.60000000000e-15  
 Median :3.60000000000e-15  
 Mean   :1.78457048586e-04  
 3rd Qu.:7.10000000000e-15  
 Max.   :9.02992665816e-02  