In [1]:
library(caret, quiet = TRUE)
library(base64enc)
library(httr, quiet = TRUE)

library(mlbench)


Attaching package: ‘httr’

The following object is masked from ‘package:caret’:

    progress



# Build a Model

In [2]:

## Regression on the BostonHousing data set; `medv` (column 14) is the target.

data(BostonHousing)
# Convert `chas` to numeric via as.character(): calling as.numeric() directly
# on a factor returns its internal level codes (1, 2, ...) rather than the
# values spelled by the labels. as.character() is a no-op detour when the
# column is already numeric, so this line is safe in either case.
# NOTE(review): assumes the labels of `chas` are numeric strings ("0"/"1") --
# confirm against the mlbench documentation.
BostonHousing$chas <- as.numeric(as.character(BostonHousing$chas))

# Seed the RNG before any model fitting so later cells are repeatable.
set.seed(1960)

# Predictors only: drop column 14 (the `medv` response).
dataset <- BostonHousing[, -14]

# Fit a caret model with method "ctree" that predicts `medv` from every other
# column of the global `BostonHousing` data frame.
# Takes no arguments and returns the object produced by train().
create_model <- function() {
    train(medv ~ ., data = BostonHousing, method = "ctree")
}


In [3]:
# Train once; the fitted object is reused by the cells below.
model <- create_model()
# cat(model$feature_names)
# print(model)

In [4]:
# In-sample check: predict on the training predictors (all columns but the
# 14th) and compare with the observed `medv` values.
pred_labels <- predict(model, BostonHousing[, -14], type = "raw")
df <- data.frame(medv = BostonHousing[, 14])
df$Estimator <- pred_labels
df$Error <- with(df, Estimator - medv)
# Mean absolute percentage error, expressed as a fraction (not scaled by 100).
MAPE <- with(df, mean(abs(Error / medv)))
summary(df)
MAPE

      medv         Estimator          Error          
 Min.   : 5.00   Min.   : 8.039   Min.   :-11.28571  
 1st Qu.:17.02   1st Qu.:16.700   1st Qu.: -1.58643  
 Median :21.20   Median :20.816   Median :  0.06795  
 Mean   :22.53   Mean   :22.533   Mean   :  0.00000  
 3rd Qu.:25.00   3rd Qu.:25.118   3rd Qu.:  1.31250  
 Max.   :50.00   Max.   :48.300   Max.   : 18.97143  

# SQL Code Generation

In [5]:

# Send a fitted model to the SQL generation web service and return the SQL
# text it produces.
#
# Arguments:
#   mod     Fitted model object; it is serialize()d and base64-encoded before
#           being posted.
#   ws_url  Endpoint of the service. Defaults to the local instance, which is
#           the URL the original code ended up using; the hosted alternative
#           is "https://sklearn2sql.herokuapp.com/model".
#
# Returns the SQL found in the first entry of the response's
# `model$SQLGenrationResult` list.
# Stops with an error when the HTTP request fails or the response carries no
# SQL, instead of handing back NULL or failing later on a missing field.
test_ws_sql_gen <- function(mod, ws_url = "http://localhost:1888/model") {
    b64_data <- base64encode(serialize(mod, NULL))

    # NOTE(review): the model name still says "xgboost" although this notebook
    # fits a ctree; left unchanged because it is part of the request payload.
    payload <- list(
        Name = "xgboost_test_model",
        SerializedModel = b64_data,
        SQLDialect = "postgresql",
        Mode = "caret"
    )

    response <- POST(ws_url, body = payload, encode = "json")
    # Turn 4xx/5xx answers into an R error right away.
    stop_for_status(response, task = "generate SQL from the serialized model")

    parsed <- content(response)
    # "SQLGenrationResult" (sic) is the key the original code reads; it is a
    # runtime field name, so the spelling must not be corrected here.
    results <- if (is.list(parsed)) parsed$model$SQLGenrationResult else NULL
    if (length(results) == 0 || is.null(results[[1]]$SQL)) {
        stop("SQL generation service response contains no SQL", call. = FALSE)
    }

    results[[1]]$SQL
}

In [6]:
# Ask the web service for the SQL translation of the fitted model and show it.
lModelSQL <- test_ws_sql_gen(model)
cat(lModelSQL)


WITH "DT_node_lookup" AS 
(SELECT "ADS"."KEY" AS "KEY", CASE WHEN ("ADS"."Feature_12" <= 9.71) THEN CASE WHEN ("ADS"."Feature_5" <= 7.42) THEN CASE WHEN ("ADS"."Feature_0" <= 4.55587) THEN CASE WHEN ("ADS"."Feature_5" <= 6.718) THEN CASE WHEN ("ADS"."Feature_12" <= 7.6) THEN CASE WHEN ("ADS"."Feature_7" <= 6.2196) THEN CASE WHEN ("ADS"."Feature_12" <= 5.68) THEN 8 ELSE CASE WHEN ("ADS"."Feature_8" <= 4.0) THEN 10 ELSE 11 END END ELSE CASE WHEN ("ADS"."Feature_5" <= 6.345) THEN 13 ELSE 14 END END ELSE CASE WHEN ("ADS"."Feature_5" <= 6.12) THEN CASE WHEN ("ADS"."Feature_10" <= 19.1) THEN 17 ELSE 18 END ELSE CASE WHEN ("ADS"."Feature_9" <= 293.0) THEN 20 ELSE 21 END END END ELSE CASE WHEN ("ADS"."Feature_5" <= 6.939) THEN CASE WHEN ("ADS"."Feature_10" <= 17.6) THEN 24 ELSE 25 END ELSE CASE WHEN ("ADS"."Feature_0" <= 0.0686) THEN CASE WHEN ("ADS"."Feature_6" <= 49.3) THEN 28 ELSE 29 END ELSE 30 END END END ELSE 31 END ELSE CASE WHEN ("ADS"."Feature_10" <= 17.4) THEN CASE WHEN ("ADS"."Featu

# Execute the SQL Code

In [7]:
library(RODBC)

# Open the ODBC data source named "pgsql"; case = "nochange" is passed so
# identifiers are sent as written (the generated SQL quotes mixed-case names).
conn <- odbcConnect(dsn = "pgsql", uid = "db", pwd = "db", case = "nochange")
odbcSetAutoCommit(conn, autoCommit = TRUE)

In [8]:
# Build the table the generated SQL reads: predictors renamed to
# Feature_0 .. Feature_{p-1} (zero-based, as referenced by the SQL text) plus
# a 1-based row identifier in KEY.
df_sql <- dataset
# seq_len() instead of 0:(n - 1) / seq.int(n): both of those count backwards
# (c(0, -1) / c(1, 0)) when the data frame has no columns or rows.
names(df_sql) <- sprintf("Feature_%d", seq_len(ncol(df_sql)) - 1L)
df_sql$KEY <- seq_len(nrow(dataset))

# Replace any previous copy of the table; errors = FALSE is passed so that a
# failing drop (presumably: table not there yet) does not halt the notebook.
sqlDrop(conn, "INPUT_DATA", errors = FALSE)
sqlSave(conn, df_sql, tablename = "INPUT_DATA", verbose = FALSE)

# df_sql

In [9]:
# Show the column names that were uploaded to INPUT_DATA.
names(df_sql)
# odbcGetInfo(conn)
# sqlTables(conn)

In [10]:
# Run the generated SQL in the database and fetch its result set.
df_sql_out <- sqlQuery(conn, lModelSQL)
# With its default error handling sqlQuery() does not raise on failure; it
# returns the error messages as a character vector. Surface that here instead
# of failing on `$KEY` below with an unrelated message.
if (!is.data.frame(df_sql_out)) {
    stop(
        "SQL execution failed: ", paste(df_sql_out, collapse = "\n"),
        call. = FALSE
    )
}
head(df_sql_out[order(df_sql_out$KEY), ])

KEY,Estimator
1,27.76154
2,25.14
3,30.61429
4,33.46923
5,34.33889
6,27.76154


In [11]:
# df_sql_out

# R Caret ctree Output

In [12]:
# Reference predictions computed in R on the same predictors, keyed by the
# 1-based row number so they can be joined with the SQL result.
estimator <- predict(model, dataset, type = "raw")
df_r_out <- data.frame(Estimator = estimator)
df_r_out[["KEY"]] <- seq.int(nrow(dataset))
head(df_r_out)


Estimator,KEY
27.76154,1
25.14,2
30.61429,3
33.46923,4
34.33889,5
27.76154,6


# Compare R and SQL output

In [13]:
# Full outer join of the R and SQL predictions on KEY; the clashing
# `Estimator` columns become Estimator_1 (R) and Estimator_2 (SQL).
# The stray empty argument of the original call (", ,") has been removed.
df_merge <- merge(
    x = df_r_out, y = df_sql_out,
    by = "KEY", all = TRUE,
    suffixes = c("_1", "_2")
)
head(df_merge)

KEY,Estimator_1,Estimator_2
1,27.76154,27.76154
2,25.14,25.14
3,30.61429,30.61429
4,33.46923,33.46923
5,34.33889,34.33889
6,27.76154,27.76154


In [14]:
# Signed and absolute difference between the R and the SQL prediction.
df_merge[["Error"]] <- df_merge[["Estimator_1"]] - df_merge[["Estimator_2"]]
df_merge[["AbsError"]] <- abs(df_merge[["Error"]])
head(df_merge)


KEY,Estimator_1,Estimator_2,Error,AbsError
1,27.76154,27.76154,-3.552714e-14,3.552714e-14
2,25.14,25.14,-3.552714e-15,3.552714e-15
3,30.61429,30.61429,2.131628e-14,2.131628e-14
4,33.46923,33.46923,-2.842171e-14,2.842171e-14
5,34.33889,34.33889,-7.105427e-15,7.105427e-15
6,27.76154,27.76154,-3.552714e-14,3.552714e-14


In [15]:
# Rows where R and SQL disagree by more than the 0.0001 tolerance.
df_merge_largest_errors <- df_merge[df_merge[["AbsError"]] > 0.0001, ]
df_merge_largest_errors

KEY,Estimator_1,Estimator_2,Error,AbsError


In [16]:
# Display the number of rows exceeding the tolerance, then abort the notebook
# unless that number is zero.
nrow(df_merge_largest_errors)
stopifnot(nrow(df_merge_largest_errors) == 0)


In [17]:
# Column-wise summary of the result set returned by the SQL query.
summary(df_sql_out)

      KEY          Estimator     
 Min.   :  1.0   Min.   : 8.039  
 1st Qu.:127.2   1st Qu.:16.700  
 Median :253.5   Median :20.816  
 Mean   :253.5   Mean   :22.533  
 3rd Qu.:379.8   3rd Qu.:25.118  
 Max.   :506.0   Max.   :48.300  

In [18]:
# Column-wise summary of the predictions computed in R.
summary(df_r_out)

   Estimator           KEY       
 Min.   : 8.039   Min.   :  1.0  
 1st Qu.:16.700   1st Qu.:127.2  
 Median :20.816   Median :253.5  
 Mean   :22.533   Mean   :253.5  
 3rd Qu.:25.118   3rd Qu.:379.8  
 Max.   :48.300   Max.   :506.0  

In [19]:
# Column-wise summary of the joined table, including the error columns.
summary(df_merge)

      KEY         Estimator_1      Estimator_2         Error           
 Min.   :  1.0   Min.   : 8.039   Min.   : 8.039   Min.   :-4.263e-14  
 1st Qu.:127.2   1st Qu.:16.700   1st Qu.:16.700   1st Qu.:-2.132e-14  
 Median :253.5   Median :20.816   Median :20.816   Median :-3.553e-15  
 Mean   :253.5   Mean   :22.533   Mean   :22.533   Mean   :-2.422e-15  
 3rd Qu.:379.8   3rd Qu.:25.118   3rd Qu.:25.118   3rd Qu.: 1.243e-14  
 Max.   :506.0   Max.   :48.300   Max.   :48.300   Max.   : 4.619e-14  
    AbsError        
 Min.   :0.000e+00  
 1st Qu.:3.553e-15  
 Median :1.421e-14  
 Mean   :1.743e-14  
 3rd Qu.:2.842e-14  
 Max.   :4.619e-14  

In [20]:
# Print the final model stored in the caret object (the tree whose splits the
# generated SQL encodes).
model$finalModel


	 Conditional inference tree with 40 terminal nodes

Response:  .outcome 
Inputs:  crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, b, lstat 
Number of observations:  506 

1) lstat <= 9.71; criterion = 1, statistic = 274.794
  2) rm <= 7.42; criterion = 1, statistic = 104.692
    3) crim <= 4.55587; criterion = 1, statistic = 39.897
      4) rm <= 6.718; criterion = 1, statistic = 84.26
        5) lstat <= 7.6; criterion = 0.998, statistic = 14.073
          6) dis <= 6.2196; criterion = 0.984, statistic = 10.39
            7) lstat <= 5.68; criterion = 0.939, statistic = 7.939
              8)*  weights = 13 
            7) lstat > 5.68
              9) rad <= 4; criterion = 0.027, statistic = 1.366
                10)*  weights = 17 
              9) rad > 4
                11)*  weights = 13 
          6) dis > 6.2196
            12) rm <= 6.345; criterion = 0.956, statistic = 8.564
              13)*  weights = 10 
            12) rm > 6.345
              14)*  weight