In [1]:
# Packages used by the notebook: caret (train), base64enc (base64encode),
# httr (POST/content), mlbench (BostonHousing data), party (cforest_control).
# `quietly` is spelled out in full instead of relying on partial argument
# matching of `quiet`.
library(caret, quietly = TRUE)
library(base64enc)
library(httr, quietly = TRUE)

library(mlbench)
library(party, quietly = TRUE)



Attaching package: ‘httr’

The following object is masked from ‘package:caret’:

    progress


Attaching package: ‘zoo’

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric



# Build a Model

In [2]:


# Load the BostonHousing data and recode its `chas` column as numeric.
data(BostonHousing)
BostonHousing[["chas"]] <- as.numeric(BostonHousing[["chas"]])

# Seed the random number generator before any model is trained.
set.seed(1960)

# Copy of the data without column 14 (the column used as `medv` further down).
dataset <- BostonHousing[, -14, drop = FALSE]

# Train a "cforest" model for `medv` on all other columns of the global
# BostonHousing data through caret's `train()` and return the fitted object.
#
# NOTE(review): caret tunes `mtry` itself for this method, so the `mtry = 3`
# given in the controls may be overridden - confirm against the caret docs.
create_model <- function() {
  forest_controls <- cforest_control(maxdepth = 3, ntree = 512, mtry = 3)

  train(
    medv ~ .,
    data = BostonHousing,
    method = "cforest",
    controls = forest_controls
  )
}


In [3]:
# Train the model once; later cells reuse `model`.
model <- create_model()
# cat(model$feature_names)
# print(model)

In [4]:
# Predict on the training rows, using every column except column 14.
pred_labels <- predict(model, BostonHousing[, -14], type = "raw")

# Put the observed target (column 14) next to the estimate and signed error.
df <- data.frame(medv = BostonHousing[, 14])
df[["Estimator"]] <- pred_labels
df[["Error"]] <- df[["Estimator"]] - df[["medv"]]

# Mean absolute error relative to the observed value (a fraction, not a %).
MAPE <- mean(abs(df[["Error"]] / df[["medv"]]))
summary(df)
MAPE

      medv         Estimator         Error          
 Min.   : 5.00   Min.   :11.95   Min.   :-25.63816  
 1st Qu.:17.02   1st Qu.:17.75   1st Qu.: -1.71140  
 Median :21.20   Median :21.12   Median :  0.40157  
 Mean   :22.53   Mean   :22.49   Mean   : -0.03809  
 3rd Qu.:25.00   3rd Qu.:25.13   3rd Qu.:  2.03438  
 Max.   :50.00   Max.   :45.17   Max.   : 11.83786  

# SQL Code Generation

In [5]:

# Ask the SQL-generation web service to translate a fitted model into SQL.
#
# The model is serialized with `serialize()`, base64-encoded and POSTed as
# JSON together with a fixed model name, the SQL dialect and Mode = "caret".
#
# Args:
#   mod:         fitted model object to translate.
#   ws_url:      URL of the service endpoint. The default is the local
#                instance that the original code effectively used; the hosted
#                alternative is "https://sklearn2sql.herokuapp.com/model".
#   sql_dialect: value sent in the `SQLDialect` field of the request.
#
# Returns:
#   The `SQL` element of the first entry of `model$SQLGenrationResult` in the
#   parsed response.
#
# Raises:
#   An error when the HTTP request fails or the response holds no SQL.
test_ws_sql_gen <- function(mod,
                            ws_url = "http://localhost:1888/model",
                            sql_dialect = "postgresql") {
  payload <- list(
    Name = "caret_test_model",
    SerializedModel = base64encode(serialize(mod, NULL)),
    SQLDialect = sql_dialect,
    Mode = "caret"
  )

  response <- POST(ws_url, body = payload, encode = "json")
  # Surface HTTP failures directly instead of an opaque subscript error below.
  stop_for_status(response)

  # Named `parsed` so the local value does not shadow httr's `content()`.
  parsed <- content(response)
  # `SQLGenrationResult` (sic) is the field name read from the response.
  results <- parsed$model$SQLGenrationResult
  if (length(results) == 0 || is.null(results[[1]]$SQL)) {
    stop("web service response contains no generated SQL", call. = FALSE)
  }

  results[[1]]$SQL
}

In [6]:
lModelSQL <- test_ws_sql_gen(model)
N <- nchar(lModelSQL)
L <- 2000

# Preview three excerpts of the generated SQL: the head, a window of about
# L characters around the midpoint, and the tail.
cat(
  substr(lModelSQL, 1, L), "\n ... \n",
  substr(lModelSQL, N / 2 - L / 2, N / 2 + L / 2), "\n ... \n",
  substr(lModelSQL, N - L, N), "\n"
)


WITH "RF_0" AS 
(WITH "DT_node_lookup" AS 
(SELECT "ADS"."KEY" AS "KEY", CASE WHEN ("ADS"."Feature_5" <= 6.897) THEN CASE WHEN ("ADS"."Feature_6" <= 73.90000000000002) THEN CASE WHEN ("ADS"."Feature_5" <= 6.59) THEN 4 ELSE 5 END ELSE CASE WHEN ("ADS"."Feature_12" <= 14.36) THEN 7 ELSE 8 END END ELSE CASE WHEN ("ADS"."Feature_5" <= 7.42) THEN CASE WHEN ("ADS"."Feature_10" <= 19.1) THEN 11 ELSE 12 END ELSE CASE WHEN ("ADS"."Feature_0" <= 0.5341199999999999) THEN 14 ELSE 15 END END END AS node_id_2 
FROM "INPUT_DATA" AS "ADS"), 
"DT_node_data" AS 
(SELECT "Values".nid AS nid, "Values"."E" AS "E" 
FROM (SELECT 4 AS nid, 22.127388535031802 AS "E" UNION ALL SELECT 5 AS nid, 28.861111111111104 AS "E" UNION ALL SELECT 7 AS nid, 23.058241758241802 AS "E" UNION ALL SELECT 8 AS nid, 14.2073170731707 AS "E" UNION ALL SELECT 11 AS nid, 33.5444444444445 AS "E" UNION ALL SELECT 12 AS nid, 25.0285714285714 AS "E" UNION ALL SELECT 14 AS nid, 46.5375 AS "E" UNION ALL SELECT 15 AS nid, 48.375 AS "E") AS 

# Execute the SQL Code

In [8]:
library(RODBC)

# Open the "pgsql" ODBC data source with case = "nochange" and switch the
# connection to auto-commit.
conn <- odbcConnect(dsn = "pgsql", uid = "db", pwd = "db", case = "nochange")
odbcSetAutoCommit(conn, autoCommit = TRUE)

In [9]:
# Copy the predictor frame and rename its columns Feature_0, Feature_1, ...
# which are the column names the generated SQL refers to.
df_sql <- dataset
# seq_len() stays empty for a zero-column frame, unlike 0:(ncol - 1).
names(df_sql) <- sprintf("Feature_%d", seq_len(ncol(df_sql)) - 1L)
# 1-based row key; seq_len() also stays empty for a zero-row frame.
df_sql$KEY <- seq_len(nrow(dataset))

# Replace any existing INPUT_DATA table with the renamed frame.
sqlDrop(conn, "INPUT_DATA", errors = FALSE)
sqlSave(conn, df_sql, tablename = "INPUT_DATA", verbose = FALSE)

# df_sql

In [10]:
# Display the column names of the frame that was saved to INPUT_DATA.
names(df_sql)
# odbcGetInfo(conn)
# sqlTables(conn)

In [11]:
# Run the generated SQL against the database.
df_sql_out <- sqlQuery(conn, lModelSQL)
# With its default settings sqlQuery() reports a failure by returning a
# character vector of messages instead of a data frame; stop with those
# messages rather than failing later on `$KEY`.
if (!is.data.frame(df_sql_out)) {
  stop(
    "SQL execution failed: ", paste(df_sql_out, collapse = "\n"),
    call. = FALSE
  )
}
head(df_sql_out[order(df_sql_out$KEY), ])

Unnamed: 0,KEY,Estimator
374,1,27.55196
84,2,23.1835
168,3,35.12198
56,4,34.20214
320,5,32.16675
64,6,25.70074


In [12]:
# df_sql_out

# R CFOREST Output

In [13]:
# Predictions of the R model on the same predictor frame that was uploaded.
estimator <- predict(model, dataset, type = "raw")
df_r_out <- setNames(data.frame(estimator), "Estimator")

# Same 1-based row key as in the uploaded table, used for the merge below.
df_r_out[["KEY"]] <- seq.int(nrow(dataset))
head(df_r_out)


Estimator,KEY
27.46501,1
23.1835,2
35.12198,3
34.07547,4
32.16675,5
25.70074,6


# Compare R and SQL output

In [14]:
# Full outer join of the R predictions (x, suffix "_1") and the SQL
# predictions (y, suffix "_2") on KEY. The original call carried a stray
# empty argument (`all = TRUE, , suffixes`), which is removed here.
df_merge <- merge(
  x = df_r_out, y = df_sql_out,
  by = "KEY", all = TRUE,
  suffixes = c("_1", "_2")
)
head(df_merge)

KEY,Estimator_1,Estimator_2
1,27.46501,27.55196
2,23.1835,23.1835
3,35.12198,35.12198
4,34.07547,34.20214
5,32.16675,32.16675
6,25.70074,25.70074


In [15]:
# Signed and absolute difference between the two estimates per KEY.
df_merge[["Error"]] <- with(df_merge, Estimator_1 - Estimator_2)
df_merge[["AbsError"]] <- abs(df_merge[["Error"]])
head(df_merge)


KEY,Estimator_1,Estimator_2,Error,AbsError
1,27.46501,27.55196,-0.08695169,0.08695169
2,23.1835,23.1835,-3.552714e-15,3.552714e-15
3,35.12198,35.12198,2.131628e-14,2.131628e-14
4,34.07547,34.20214,-0.1266736,0.1266736
5,32.16675,32.16675,7.105427e-15,7.105427e-15
6,25.70074,25.70074,0.0,0.0


In [16]:
# Keep the rows whose absolute difference exceeds 1e-4.
df_merge_largest_errors <- df_merge[df_merge[["AbsError"]] > 1e-4, ]
df_merge_largest_errors

Unnamed: 0,KEY,Estimator_1,Estimator_2,Error,AbsError
1,1,27.46501,27.55196,-0.086951687,0.086951687
4,4,34.07547,34.20214,-0.126673632,0.126673632
30,30,21.66017,21.76722,-0.107049298,0.107049298
98,98,41.33864,41.37183,-0.033191436,0.033191436
112,112,22.3597,22.34841,0.011292529,0.011292529
119,119,18.95775,18.95037,0.007379908,0.007379908
131,131,20.24134,20.27752,-0.036177605,0.036177605
168,168,20.74487,20.73102,0.013853038,0.013853038
203,203,42.78615,42.78925,-0.003096977,0.003096977
212,212,18.06877,18.0575,0.011263186,0.011263186


In [18]:
# Number of rows in `df` versus number of rows kept in
# `df_merge_largest_errors` by the filter above.
nrow(df)
nrow(df_merge_largest_errors)
# Disabled check that at most 10 rows exceed the threshold.
# stopifnot(nrow(df_merge_largest_errors) <= 10)


In [19]:
# Column summaries of the frame returned by the SQL query.
summary(df_sql_out)

      KEY          Estimator    
 Min.   :  1.0   Min.   :11.95  
 1st Qu.:127.2   1st Qu.:17.75  
 Median :253.5   Median :21.12  
 Mean   :253.5   Mean   :22.50  
 3rd Qu.:379.8   3rd Qu.:25.08  
 Max.   :506.0   Max.   :45.17  

In [20]:
# Column summaries of the frame holding the R model's predictions.
summary(df_r_out)

   Estimator          KEY       
 Min.   :11.95   Min.   :  1.0  
 1st Qu.:17.75   1st Qu.:127.2  
 Median :21.12   Median :253.5  
 Mean   :22.49   Mean   :253.5  
 3rd Qu.:25.13   3rd Qu.:379.8  
 Max.   :45.17   Max.   :506.0  

In [21]:
# Column summaries of the merged frame, including Error and AbsError.
summary(df_merge)

      KEY         Estimator_1     Estimator_2        Error           
 Min.   :  1.0   Min.   :11.95   Min.   :11.95   Min.   :-0.3045622  
 1st Qu.:127.2   1st Qu.:17.75   1st Qu.:17.75   1st Qu.: 0.0000000  
 Median :253.5   Median :21.12   Median :21.12   Median : 0.0000000  
 Mean   :253.5   Mean   :22.49   Mean   :22.50   Mean   :-0.0003372  
 3rd Qu.:379.8   3rd Qu.:25.13   3rd Qu.:25.08   3rd Qu.: 0.0000000  
 Max.   :506.0   Max.   :45.17   Max.   :45.17   Max.   : 0.3212879  
    AbsError       
 Min.   :0.000000  
 1st Qu.:0.000000  
 Median :0.000000  
 Mean   :0.005155  
 3rd Qu.:0.000000  
 Max.   :0.321288  