In [1]:
library(caret, quiet = TRUE)
library(base64enc)
library(httr, quiet = TRUE)

library(mlbench)


Attaching package: ‘httr’


The following object is masked from ‘package:caret’:

    progress




# Build a Model

In [2]:
# Print up to 15 significant digits in all subsequent output.
options(digits = 15)

## Regression on the Boston housing dataset (target: medv).

data(BostonHousing)
# Convert chas to numeric.
# NOTE(review): if chas is a factor, as.numeric() yields its level codes - confirm this is intended.
BostonHousing[["chas"]] <- as.numeric(BostonHousing[["chas"]])

set.seed(1960)

# All columns except column 14 (the column later labelled medv).
dataset <- BostonHousing[, -14]

# Fit a caret model of method "earth" for medv ~ . on the global BostonHousing
# data frame.
#
# A single tuning combination (degree = 2, nprune = 20) is supplied, the
# train control method is "none", and "pca" preprocessing is requested.
#
# Returns:
#   The object returned by caret::train().
create_model <- function() {
    grid <- data.frame(degree = 2, nprune = 20)
    ctrl <- trainControl(method = "none")
    train(
        medv ~ .,
        data = BostonHousing,
        method = "earth",
        preProcess = c("pca"),
        trControl = ctrl,
        tuneGrid = grid
    )
}


In [3]:
# Fit the model once; the following cells reuse `model`.
model <- create_model()
# cat(model$feature_names)
# print(model)

Loading required package: earth

Loading required package: Formula

Loading required package: plotmo

Loading required package: plotrix

Loading required package: TeachingDemos



In [4]:
# Predict on the training data (all columns except column 14) and compare the
# predictions with the observed medv values.
pred_labels <- predict(model, BostonHousing[, -14], type = "raw")
df <- data.frame(medv = BostonHousing[, 14])
df$Estimator <- pred_labels
df$Error <- df$Estimator - df$medv
# Mean absolute percentage error, expressed as a fraction of medv.
MAPE <- mean(abs(df$Error / df$medv))
summary(df)
MAPE

      medv                     Estimator.y                 Error.y         
 Min.   : 5.0000000000   Min.   : 3.41514338182   Min.   :-28.64878580020  
 1st Qu.:17.0250000000   1st Qu.:16.87419040090   1st Qu.: -1.46435170462  
 Median :21.2000000000   Median :21.24411931190   Median :  0.43840076751  
 Mean   :22.5328063241   Mean   :22.53280632410   Mean   :  0.00000000000  
 3rd Qu.:25.0000000000   3rd Qu.:26.05895092910   3rd Qu.:  2.10026903498  
 Max.   :50.0000000000   Max.   :53.93315968310   Max.   : 10.05355276410  

# C++ Code Generation

In [5]:

# Ask the code-generation web service to translate a fitted model.
#
# The model is serialized with serialize(), base64-encoded and POSTed as JSON
# together with the requested dialect ("CPP") and mode ("caret").
#
# Args:
#   mod:    fitted model object to translate.
#   ws_url: endpoint of the service. Defaults to the local instance that the
#           original code ended up using; the original also listed
#           "https://sklearn2sql.herokuapp.com/model" before overriding it.
# Returns:
#   The generated code found at model$SQLGenrationResult[[1]]$SQL in the
#   parsed response.
# Raises:
#   An error if the HTTP request reports a failure status or the response
#   contains no generated code.
test_ws_sql_gen = function(mod, ws_url = "http://localhost:1888/model") {
    model_serialized <- serialize(mod, NULL)
    b64_data <- base64encode(model_serialized)
    payload <- list(
        Name = "earth_test_model",
        SerializedModel = b64_data,
        SQLDialect = "CPP",
        Mode = "caret"
    )
    response <- POST(ws_url, body = payload, encode = "json")
    # Fail loudly on an HTTP error instead of silently returning NULL.
    stop_for_status(response)
    parsed <- content(response)
    # "SQLGenrationResult" is the key spelling the original code reads; keep it.
    lSQL <- parsed$model$SQLGenrationResult[[1]]$SQL
    if (is.null(lSQL)) {
        stop("code generation service returned no generated code", call. = FALSE)
    }
    return(lSQL)
}

In [6]:
# Request the generated code for the fitted model and print it.
lModelSQL <- test_ws_sql_gen(model)
cat(lModelSQL)


namespace  {

	namespace prep_scale_pca {
	
		namespace scale {
		
			std::vector<std::string> get_input_names(){
				std::vector<std::string> lFeatures = { "Feature_0", "Feature_1", "Feature_2", "Feature_3", "Feature_4", "Feature_5", "Feature_6", "Feature_7", "Feature_8", "Feature_9", "Feature_10", "Feature_11", "Feature_12" };
		
				return lFeatures;
			}
		
			std::vector<std::string> get_output_names(){
				std::vector<std::string> lOutputs = { "Feature_0", "Feature_1", "Feature_2", "Feature_3", "Feature_4", "Feature_5", "Feature_6", "Feature_7", "Feature_8", "Feature_9", "Feature_10", "Feature_11", "Feature_12" };
		
				return lOutputs;
			}
		
			tTable compute_features(std::any Feature_0, std::any Feature_1, std::any Feature_2, std::any Feature_3, std::any Feature_4, std::any Feature_5, std::any Feature_6, std::any Feature_7, std::any Feature_8, std::any Feature_9, std::any Feature_10, std::any Feature_11, std::any Feature_12) {
		
				tTable lTable;
		
				lTable["Feature_0"]

# Execute the CPP Code

In [7]:
    # Write text to a file, one element of iCPPCode per line.
    #
    # Args:
    #   iCPPCode: character vector to write.
    #   oCPPFile: path of the file to create or overwrite.
    #
    # The path is handed directly to writeLines(), which opens and closes the
    # connection itself, so no connection is left behind when writing fails.
    write_text_to_file = function(iCPPCode, oCPPFile) {
        writeLines(c(iCPPCode), con = oCPPFile)
    }

    # Turn generated C++ code into a complete program text.
    #
    # The result is: an include of "Generic.i", the code passed in, and a main()
    # that calls score_csv_file() on iCSVFile and returns 0.
    #
    # Args:
    #   iCPPCode: generated C++ code.
    #   iCSVFile: CSV path embedded as the string argument of score_csv_file().
    # Returns:
    #   The assembled program text.
    add_cpp_main_function = function(iCPPCode, iCSVFile) {
        paste0(
            "#include \"Generic.i\"\n\n",
            iCPPCode,
            "\n\nint main() {\n",
            "\tscore_csv_file(\"", iCSVFile, "\");\n",
            "\treturn 0;\n}\n"
        )
    }

    # Compile iName.cpp into iName.exe with g++ (-std=c++17, -g, -Wall).
    #
    # Args:
    #   iName: path prefix shared by the source file and the executable.
    # Returns:
    #   The exit status reported by system2(), invisibly.
    # Raises:
    #   An error when the status is non-zero, so that a failed build is not
    #   followed by an attempt to run a missing or stale executable.
    compile_cpp_code_as_executable = function(iName) {
        lCommand <- paste0("-Wall -Wno-unused-function -std=c++17 -g -o ", iName, ".exe ", iName, ".cpp")
        cat(paste0("EXECUTING ", "'", lCommand, "'"))
        result <- system2("g++", args = lCommand)
        if (!isTRUE(result == 0)) {
            stop("g++ failed with exit status ", result, " for ", iName, ".cpp", call. = FALSE)
        }
        invisible(result)
    }


    # Run iName.exe with iCSVFile as its argument, redirecting standard output
    # to iName.out.
    #
    # Args:
    #   iName:    path prefix of the executable and of the output file.
    #   iCSVFile: CSV path passed to the executable on the command line.
    # Returns:
    #   The value returned by system2(); it is also printed with cat().
    execute_cpp_model = function(iName, iCSVFile) {
        exe_path <- paste0(iName, ".exe")
        out_path <- paste0(iName, ".out")
        status <- system2(exe_path, args = iCSVFile, stdout = out_path)
        cat(status)
        status
    }
        
    # Write, compile and run generated C++ code against a CSV file.
    #
    # Files are created under /tmp with a name derived from the current time
    # (second resolution): .cpp for the source, .exe for the binary, .out for
    # the captured standard output.
    #
    # Args:
    #   iCPPCode: generated C++ code (without main()).
    #   iCSVFile: path of the CSV file to score.
    # Returns:
    #   The path of the .out file.
    execute_cpp_code = function(iCPPCode, iCSVFile) {
        stamp <- format(Sys.time(), "%Y-%m-%d_%H_%M_%S")
        base_path <- paste0("/tmp/ml2cpp_r_", "sample_", stamp)
        program <- add_cpp_main_function(iCPPCode, iCSVFile)
        write_text_to_file(program, paste0(base_path, ".cpp"))
        compile_cpp_code_as_executable(base_path)
        execute_cpp_model(base_path, iCSVFile)
        paste0(base_path, ".out")
    }


In [8]:

df_cpp_in <- as.data.frame(dataset)
# cat(names(df_cpp_in))

# Rename the columns to Feature_0, Feature_1, ... in column order.
names(df_cpp_in) <- paste0("Feature_", 0:(ncol(df_cpp_in) - 1))

# Row names are written too (write.csv default), unquoted.
write.csv(df_cpp_in, file = "/tmp/boston2.csv", quote = FALSE)

# Build and run the generated program; the result is the path of its output file.
lOutName <- execute_cpp_code(lModelSQL, "/tmp/boston2.csv")
lOutName

EXECUTING '-Wall -Wno-unused-function -std=c++17 -g -o /tmp/ml2cpp_r_sample_2020-10-05_17_56_41.exe /tmp/ml2cpp_r_sample_2020-10-05_17_56_41.cpp'0

In [9]:
# Load the predictions written by the C++ program and add a 1-based row key.
df_cpp_out <- read.csv(file = lOutName)
# seq_len() yields an empty key for an empty frame, where seq.int(0) would
# produce c(1, 0).
df_cpp_out$KEY <- seq_len(nrow(df_cpp_out))
head(df_cpp_out)


Unnamed: 0_level_0,idx,Estimator,KEY
Unnamed: 0_level_1,<int>,<dbl>,<int>
1,0,29.80887049622,1
2,1,23.4707513255756,2
3,2,32.2971504927786,3
4,3,30.3006338134459,4
5,4,28.9106260595744,5
6,5,23.5111805672851,6


In [10]:
# df_sql_out

# R earth Model Output

In [11]:
# Predictions computed in R on the same predictors, keyed by 1-based row number.
estimator <- predict(model, dataset, type = "raw")
df_r_out <- data.frame(estimator)
names(df_r_out) <- "Estimator"

# seq_len() stays correct for zero rows, where seq.int(0) would give c(1, 0).
df_r_out$KEY <- seq_len(nrow(dataset))
head(df_r_out)


Unnamed: 0_level_0,Estimator,KEY
Unnamed: 0_level_1,<dbl>,<int>
1,29.80887049622,1
2,23.4707513255756,2
3,32.2971504927786,3
4,30.3006338134459,4
5,28.9106260595744,5
6,23.5111805672851,6


# Compare R and C++ output

In [12]:
# Full outer join of R and C++ predictions on KEY; the clashing Estimator
# columns become Estimator_1 (R) and Estimator_2 (C++).
# The stray empty argument of the original call has been removed.
df_merge <- merge(
    x = df_r_out,
    y = df_cpp_out,
    by = "KEY",
    all = TRUE,
    suffixes = c("_1", "_2")
)
head(df_merge)

Unnamed: 0_level_0,KEY,Estimator_1,idx,Estimator_2
Unnamed: 0_level_1,<int>,<dbl>,<int>,<dbl>
1,1,29.80887049622,0,29.80887049622
2,2,23.4707513255756,1,23.4707513255756
3,3,32.2971504927786,2,32.2971504927786
4,4,30.3006338134459,3,30.3006338134459
5,5,28.9106260595744,4,28.9106260595744
6,6,23.5111805672851,5,23.5111805672851


In [13]:
# Signed and absolute difference between the R and the C++ prediction per row.
df_merge[["Error"]] <- df_merge[["Estimator_1"]] - df_merge[["Estimator_2"]]
df_merge[["AbsError"]] <- abs(df_merge[["Error"]])
head(df_merge)


Unnamed: 0_level_0,KEY,Estimator_1,idx,Estimator_2,Error,AbsError
Unnamed: 0_level_1,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
1,1,29.80887049622,0,29.80887049622,0.0,0.0
2,2,23.4707513255756,1,23.4707513255756,-7.105427357601e-15,7.105427357601e-15
3,3,32.2971504927786,2,32.2971504927786,-1.4210854715202e-14,1.4210854715202e-14
4,4,30.3006338134459,3,30.3006338134459,0.0,0.0
5,5,28.9106260595744,4,28.9106260595744,3.5527136788005e-15,3.5527136788005e-15
6,6,23.5111805672851,5,23.5111805672851,-7.105427357601e-15,7.105427357601e-15


In [14]:
# Keep only the rows whose absolute difference exceeds the 0.0000001 tolerance.
df_merge_largest_errors <- df_merge[df_merge[["AbsError"]] > 0.0000001, ]
df_merge_largest_errors

KEY,Estimator_1,idx,Estimator_2,Error,AbsError
<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>


In [15]:
# Number of rows above the tolerance.
nrow(df_merge_largest_errors)
# Abort the notebook if more than two rows exceed the tolerance.
stopifnot(nrow(df_merge_largest_errors) <= 2)


In [16]:
# Column summaries of the predictions produced by the C++ program.
summary(df_cpp_out)

      idx           Estimator                   KEY        
 Min.   :  0.00   Min.   : 3.41514338182   Min.   :  1.00  
 1st Qu.:126.25   1st Qu.:16.87419040090   1st Qu.:127.25  
 Median :252.50   Median :21.24411931190   Median :253.50  
 Mean   :252.50   Mean   :22.53280632410   Mean   :253.50  
 3rd Qu.:378.75   3rd Qu.:26.05895092910   3rd Qu.:379.75  
 Max.   :505.00   Max.   :53.93315968310   Max.   :506.00  

In [17]:
# Column summaries of the predictions computed in R.
summary(df_r_out)

   Estimator                   KEY        
 Min.   : 3.41514338182   Min.   :  1.00  
 1st Qu.:16.87419040090   1st Qu.:127.25  
 Median :21.24411931190   Median :253.50  
 Mean   :22.53280632410   Mean   :253.50  
 3rd Qu.:26.05895092910   3rd Qu.:379.75  
 Max.   :53.93315968310   Max.   :506.00  

In [18]:
# Column summaries of the merged frame, including Error and AbsError.
summary(df_merge)

      KEY          Estimator_1                  idx        
 Min.   :  1.00   Min.   : 3.41514338182   Min.   :  0.00  
 1st Qu.:127.25   1st Qu.:16.87419040090   1st Qu.:126.25  
 Median :253.50   Median :21.24411931190   Median :252.50  
 Mean   :253.50   Mean   :22.53280632410   Mean   :252.50  
 3rd Qu.:379.75   3rd Qu.:26.05895092910   3rd Qu.:378.75  
 Max.   :506.00   Max.   :53.93315968310   Max.   :505.00  
  Estimator_2                 Error                   
 Min.   : 3.41514338182   Min.   :-2.84217094304e-14  
 1st Qu.:16.87419040090   1st Qu.:-5.32907051820e-15  
 Median :21.24411931190   Median :-1.77635683940e-15  
 Mean   :22.53280632410   Mean   :-2.31962012181e-15  
 3rd Qu.:26.05895092910   3rd Qu.: 0.00000000000e+00  
 Max.   :53.93315968310   Max.   : 1.42108547152e-14  
    AbsError                
 Min.   :0.00000000000e+00  
 1st Qu.:0.00000000000e+00  
 Median :3.55271367880e-15  
 Mean   :4.56639557085e-15  
 3rd Qu.:7.10542735760e-15  
 Max.   :2.8421709430