In [1]:
library(caret, quiet = TRUE)
library(base64enc)
library(httr, quiet = TRUE)

library(mlbench)


Attaching package: ‘httr’

The following object is masked from ‘package:caret’:

    progress



# Build a Model

In [2]:

# The `preProcess` argument passed to train() below is a character vector
# specifying the type of processing to apply to the predictors.
# Possible values are 
# "BoxCox", "YeoJohnson", "expoTrans", "center", "scale", "range", 
# "knnImpute", "bagImpute", "medianImpute", "pca", "ica", "spatialSign", 
# "corr", "zv", "nzv", and "conditionalX"

# Load the Boston housing data and coerce `chas` to numeric.
# NOTE(review): `chas` is presumably a factor in mlbench, in which case this
# stores the level codes rather than the original 0/1 labels -- confirm.
data(BostonHousing)
BostonHousing[["chas"]] <- as.numeric(BostonHousing[["chas"]])

# Fixed seed so the random noise column generated below is reproducible.
set.seed(1960)

# Overwrite / add columns for the "zv", "nzv" and "corr" filters requested in
# create_model() to act on.
BostonHousing[["zn"]] <- 1230        # constant column
BostonHousing[["b"]] <- rnorm(nrow(BostonHousing), mean = 0, sd = 1e-06)
BostonHousing[["b_corr"]] <- BostonHousing[["b"]] + 10   # `b` shifted by a constant
BostonHousing[["lstat"]] <- 5465.1230                    # constant column

# Predictors only: column 14 (the target, `medv`) is dropped.
dataset <- BostonHousing[, -14]

# Fit a caret model of type "rpart" that predicts `medv` from every other
# column of the global `BostonHousing` data frame. The "zv", "nzv" and "corr"
# pre-processing steps are requested, in that order.
#
# Takes no arguments; returns the object produced by caret::train().
create_model <- function() {
    train(
        medv ~ .,
        data = BostonHousing,
        method = "rpart",
        preProcess = c("zv", "nzv", "corr")
    )
}


In [3]:
head(dataset)

crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,b_corr
0.00632,1230,2.31,1,0.538,6.575,65.2,4.09,1,296,15.3,1.403801e-07,5465.123,10.0
0.02731,1230,7.07,1,0.469,6.421,78.9,4.9671,2,242,17.8,-3.703998e-07,5465.123,10.0
0.02729,1230,7.07,1,0.469,7.185,61.1,4.9671,2,242,17.8,-3.183261e-06,5465.123,9.999997
0.03237,1230,2.18,1,0.458,6.998,45.8,6.0622,3,222,18.7,-1.134654e-06,5465.123,9.999999
0.06905,1230,2.18,1,0.458,7.147,54.2,6.0622,3,222,18.7,5.064509e-07,5465.123,10.000001
0.02985,1230,2.18,1,0.458,6.43,58.7,6.0622,3,222,18.7,-4.918303e-07,5465.123,10.0


In [4]:
# Train once and keep the fitted object for every later cell.
model <- create_model()
# cat(model$feature_names)
# print(model)

“There were missing values in resampled performance measures.”

In [5]:
# In-sample predictions on the predictor columns (column 14, `medv`, removed).
pred_labels <- predict(model, BostonHousing[, -14], type = "raw")

# Actual target, predicted value and signed error side by side.
df <- data.frame(medv = BostonHousing[, 14])
df[["Estimator"]] <- pred_labels
df[["Error"]] <- df[["Estimator"]] - df[["medv"]]

# Mean absolute percentage error, expressed as a fraction (not multiplied by 100).
MAPE <- mean(abs(df[["Error"]] / df[["medv"]]))
summary(df)
MAPE

      medv         Estimator         Error         
 Min.   : 5.00   Min.   :13.74   Min.   :-28.2619  
 1st Qu.:17.02   1st Qu.:21.74   1st Qu.: -2.6616  
 Median :21.20   Median :21.74   Median :  0.5381  
 Mean   :22.53   Mean   :22.53   Mean   :  0.0000  
 3rd Qu.:25.00   3rd Qu.:21.74   3rd Qu.:  3.3381  
 Max.   :50.00   Max.   :37.24   Max.   : 26.8382  

# SQL Code Generation

In [6]:

# Ask the sklearn2sql web service to translate a fitted model into SQL.
#
# The model is serialized with serialize(), base64-encoded and POSTed as a
# JSON body together with the target dialect ("postgresql") and the mode
# ("caret").
#
# Args:
#   mod: the fitted model object to translate (this notebook passes the
#        result of create_model()).
#
# Returns:
#   The `SQL` field of the first entry of `model$SQLGenrationResult` in the
#   service response.
#
# Raises:
#   An error if the HTTP request returns a failure status, or if the response
#   does not contain the expected SQL field.
test_ws_sql_gen = function(mod) {
    # Hosted endpoint, kept for reference: "https://sklearn2sql.herokuapp.com/model"
    WS_URL <- "http://localhost:1888/model"

    model_serialized <- serialize(mod, NULL)
    b64_data <- base64encode(model_serialized)

    # NOTE(review): the Name says "xgboost" although the notebook sends an
    # rpart model; left unchanged because it is part of the request payload.
    payload <- list(
        Name = "xgboost_test_model",
        SerializedModel = b64_data,
        SQLDialect = "postgresql",
        Mode = "caret"
    )

    r <- POST(WS_URL, body = payload, encode = "json")
    # Surface HTTP failures here, with the status code, instead of failing
    # later with an unrelated subscript error.
    stop_for_status(r)

    # Named `response` so the local does not shadow httr::content().
    response <- content(r)
    # print(response)

    # "SQLGenrationResult" (sic) is the key the service response is read by.
    results <- if (is.list(response)) response$model$SQLGenrationResult else NULL
    if (length(results) == 0 || is.null(results[[1]]$SQL)) {
        stop("Web service response does not contain generated SQL", call. = FALSE)
    }

    lSQL <- results[[1]]$SQL
    return(lSQL)
}

In [7]:
# Generate the SQL for the fitted model and print it verbatim.
lModelSQL <- test_ws_sql_gen(model)
cat(lModelSQL)


WITH "ADS_pre_1_OUT" AS 
(SELECT "ADS"."KEY" AS "KEY", "ADS"."Feature_0" AS crim, "ADS"."Feature_2" AS indus, "ADS"."Feature_3" AS chas, "ADS"."Feature_4" AS nox, "ADS"."Feature_5" AS rm, "ADS"."Feature_6" AS age, "ADS"."Feature_7" AS dis, "ADS"."Feature_8" AS rad, "ADS"."Feature_10" AS ptratio, "ADS"."Feature_11" AS b 
FROM "INPUT_DATA" AS "ADS"), 
"DT_node_lookup" AS 
(SELECT "ADS_pre_1_OUT"."KEY" AS "KEY", CASE WHEN ("ADS_pre_1_OUT".rm < 6.941) THEN CASE WHEN ("ADS_pre_1_OUT".nox >= 0.6695) THEN 4 ELSE 5 END ELSE 3 END AS node_id_2 
FROM "ADS_pre_1_OUT"), 
"DT_node_data" AS 
(SELECT "Values".nid AS nid, "Values"."E" AS "E" 
FROM (SELECT 3 AS nid, 37.23815789473685 AS "E" UNION ALL SELECT 4 AS nid, 13.73917525773196 AS "E" UNION ALL SELECT 5 AS nid, 21.738138138138147 AS "E") AS "Values"), 
"DT_Output" AS 
(SELECT "DT_node_lookup"."KEY" AS "KEY", "DT_node_lookup".node_id_2 AS node_id_2, "DT_node_data".nid AS nid, "DT_node_data"."E" AS "E" 
FROM "DT_node_lookup" LEFT OUTER JOIN "DT_no

# Execute the SQL Code

In [8]:
# Open an ODBC channel to the "pgsql" DSN. case = "nochange" is passed so the
# mixed-case identifiers used later ("INPUT_DATA", "KEY", "Feature_0", ...) are
# not case-converted by RODBC.
# NOTE(review): credentials are hard-coded; acceptable only for a local test DB.
library(RODBC)
conn <- odbcConnect(
    "pgsql",
    uid = "db",
    pwd = "db",
    case = "nochange"
)
odbcSetAutoCommit(conn, autoCommit = TRUE)

In [9]:
# Build the table the generated SQL reads from: predictors renamed to
# Feature_0 .. Feature_{p-1} (the generated SQL refers to "Feature_<i>" columns)
# plus a 1-based integer KEY used later to join the SQL output back to the R
# predictions.
df_sql <- dataset
# seq_len() instead of 0:(n - 1) / seq.int(n): those count backwards when n is 0.
names(df_sql) <- sprintf("Feature_%d", seq_len(ncol(df_sql)) - 1L)
df_sql$KEY <- seq_len(nrow(dataset))

# Replace any previous copy of the table; errors = FALSE so a missing table
# is not treated as a failure.
sqlDrop(conn, "INPUT_DATA", errors = FALSE)
sqlSave(conn, df_sql, tablename = "INPUT_DATA", verbose = FALSE)

# df_sql

In [10]:
colnames(df_sql)
# odbcGetInfo(conn)
# sqlTables(conn)

In [11]:
# Run the generated SQL against the database and preview the result in KEY order.
df_sql_out <- sqlQuery(conn, lModelSQL)
head(df_sql_out[with(df_sql_out, order(KEY)), ])

KEY,Estimator
1,21.73814
2,21.73814
3,37.23816
4,37.23816
5,37.23816
6,21.73814


In [12]:
df_sql_out

KEY,Estimator
1,21.73814
2,21.73814
3,37.23816
4,37.23816
5,37.23816
6,21.73814
7,21.73814
8,21.73814
9,21.73814
10,21.73814


# R RPART Output

In [13]:
# R-side reference predictions for the same rows that were uploaded to the DB.
estimator <- predict(model, dataset, type = "raw")
df_r_out <- data.frame(Estimator = estimator)

# Same 1-based KEY as the uploaded table, so both outputs can be merged on it.
df_r_out[["KEY"]] <- seq.int(nrow(dataset))
head(df_r_out)


Estimator,KEY
21.73814,1
21.73814,2
37.23816,3
37.23816,4
37.23816,5
21.73814,6


# Compare R and SQL output

In [14]:
# Full outer join of the R predictions (suffix _1) and the SQL predictions
# (suffix _2) on KEY; all = TRUE keeps rows that are missing on either side.
# The original call carried a stray empty argument (`all = TRUE, , suffixes`),
# which is removed here.
df_merge <- merge(
    x = df_r_out,
    y = df_sql_out,
    by = "KEY",
    all = TRUE,
    suffixes = c("_1", "_2")
)
head(df_merge)

KEY,Estimator_1,Estimator_2
1,21.73814,21.73814
2,21.73814,21.73814
3,37.23816,37.23816
4,37.23816,37.23816
5,37.23816,37.23816
6,21.73814,21.73814


In [15]:
# Signed and absolute difference between the R (_1) and SQL (_2) estimators.
df_merge[["Error"]] <- with(df_merge, Estimator_1 - Estimator_2)
df_merge[["AbsError"]] <- abs(df_merge[["Error"]])
head(df_merge)


KEY,Estimator_1,Estimator_2,Error,AbsError
1,21.73814,21.73814,7.105427e-15,7.105427e-15
2,21.73814,21.73814,7.105427e-15,7.105427e-15
3,37.23816,37.23816,0.0,0.0
4,37.23816,37.23816,0.0,0.0
5,37.23816,37.23816,0.0,0.0
6,21.73814,21.73814,7.105427e-15,7.105427e-15


In [16]:
# Rows where R and SQL disagree by more than 1e-4. A row whose AbsError is NA
# is selected explicitly: a plain `AbsError > 0.0001` index would return an
# all-NA placeholder row for it, hiding the KEY. The number of selected rows
# is the same either way.
is_mismatch <- is.na(df_merge$AbsError) | df_merge$AbsError > 0.0001
df_merge_largest_errors <- df_merge[is_mismatch, ]
head(df_merge_largest_errors)

KEY,Estimator_1,Estimator_2,Error,AbsError


In [17]:
# Abort the notebook unless df_merge_largest_errors is empty, i.e. no
# R/SQL estimator pair was flagged in the previous cell.
stopifnot(nrow(df_merge_largest_errors) == 0)

In [18]:
summary(df_sql_out)

      KEY          Estimator    
 Min.   :  1.0   Min.   :13.74  
 1st Qu.:127.2   1st Qu.:21.74  
 Median :253.5   Median :21.74  
 Mean   :253.5   Mean   :22.53  
 3rd Qu.:379.8   3rd Qu.:21.74  
 Max.   :506.0   Max.   :37.24  

In [19]:
summary(df_r_out)

   Estimator          KEY       
 Min.   :13.74   Min.   :  1.0  
 1st Qu.:21.74   1st Qu.:127.2  
 Median :21.74   Median :253.5  
 Mean   :22.53   Mean   :253.5  
 3rd Qu.:21.74   3rd Qu.:379.8  
 Max.   :37.24   Max.   :506.0  

In [20]:
summary(df_merge)

      KEY         Estimator_1     Estimator_2        Error          
 Min.   :  1.0   Min.   :13.74   Min.   :13.74   Min.   :0.000e+00  
 1st Qu.:127.2   1st Qu.:21.74   1st Qu.:21.74   1st Qu.:3.553e-15  
 Median :253.5   Median :21.74   Median :21.74   Median :7.105e-15  
 Mean   :253.5   Mean   :22.53   Mean   :22.53   Mean   :5.357e-15  
 3rd Qu.:379.8   3rd Qu.:21.74   3rd Qu.:21.74   3rd Qu.:7.105e-15  
 Max.   :506.0   Max.   :37.24   Max.   :37.24   Max.   :7.105e-15  
    AbsError        
 Min.   :0.000e+00  
 1st Qu.:3.553e-15  
 Median :7.105e-15  
 Mean   :5.357e-15  
 3rd Qu.:7.105e-15  
 Max.   :7.105e-15  

In [21]:
model$finalModel

n= 506 

node), split, n, deviance, yval
      * denotes terminal node

1) root 506 42716.300 22.53281  
  2) rm< 6.941 430 17317.320 19.93372  
    4) nox>=0.6695 97  2214.391 13.73918 *
    5) nox< 0.6695 333 10296.590 21.73814 *
  3) rm>=6.941 76  6059.419 37.23816 *

In [22]:
prep = model$preProcess

In [23]:
model

CART 

506 samples
 14 predictor

Pre-processing: remove (4) 
Resampling: Bootstrapped (25 reps) 
Summary of sample sizes: 506, 506, 506, 506, 506, 506, ... 
Resampling results across tuning parameters:

  cp          RMSE      Rsquared   MAE     
  0.07165784  6.238739  0.5470006  4.330039
  0.11251782  6.804286  0.4640030  4.861857
  0.45274420  8.103048  0.3823074  5.877665

RMSE was used to select the optimal model using the smallest value.
The final value used for the model was cp = 0.07165784.

In [24]:
prep$method$remove

In [25]:
class(prep$method$remove)

In [26]:
attributes(prep$method)$names

In [27]:
prep

Created from 506 samples and 4 variables

Pre-processing:
  - ignored (0)
  - removed (4)


In [28]:
prep$method