In [1]:
library(caret, quiet = TRUE)
library(base64enc)
library(httr, quiet = TRUE)

library(mlbench)


Attaching package: ‘httr’

The following object is masked from ‘package:caret’:

    progress



# Build a Model

In [2]:

# Values accepted by caret's `preProcess` argument: a character vector
# specifying the type of processing. Possible values are
# "BoxCox", "YeoJohnson", "expoTrans", "center", "scale", "range", 
# "knnImpute", "bagImpute", "medianImpute", "pca", "ica", "spatialSign", 
# "corr", "zv", "nzv", and "conditionalX".
# Only "medianImpute" is used in this notebook.

# Load the BostonHousing data set and store `chas` as a numeric column.
# NOTE(review): `chas` looks like a factor at this point, in which case
# as.numeric() yields the level codes (the printed summary shows 1..2)
# rather than the original 0/1 values -- confirm this is intended.
data(BostonHousing)
BostonHousing[["chas"]] <- as.numeric(BostonHousing[["chas"]])

set.seed(1960)

# Blank out high `crim` (> 0.5) and high `age` (> 93) values as NA,
# presumably so that the median-imputation step has something to fill.
BostonHousing$crim[BostonHousing$crim > 0.5] <- NA
BostonHousing$age[BostonHousing$age > 93] <- NA

# Predictor-only view: every column except the 14th (used as `medv` below).
dataset <- BostonHousing[, -14]

# Fit an rpart regression model for `medv` on the global `BostonHousing`
# data frame through caret::train().
#
# Missing predictor values are passed through to caret (na.pass) and filled
# by the "medianImpute" pre-processing step.
#
# Returns the object produced by train().
create_model <- function() {
    train(medv ~ .,
          data = BostonHousing,
          method = "rpart",
          preProcess = c("medianImpute"),
          na.action = na.pass)
}


In [3]:
summary(BostonHousing)

      crim               zn             indus            chas      
 Min.   :0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :1.000  
 1st Qu.:0.04992   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:1.000  
 Median :0.09338   Median :  0.00   Median : 9.69   Median :1.000  
 Mean   :0.13063   Mean   : 11.36   Mean   :11.14   Mean   :1.069  
 3rd Qu.:0.17714   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:1.000  
 Max.   :0.49298   Max.   :100.00   Max.   :27.74   Max.   :2.000  
 NA's   :212                                                       
      nox               rm             age             dis        
 Min.   :0.3850   Min.   :3.561   Min.   : 2.90   Min.   : 1.130  
 1st Qu.:0.4490   1st Qu.:5.886   1st Qu.:35.85   1st Qu.: 2.100  
 Median :0.5380   Median :6.208   Median :59.70   Median : 3.207  
 Mean   :0.5547   Mean   :6.285   Mean   :57.74   Mean   : 3.795  
 3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.:82.53   3rd Qu.: 5.188  
 Max.   :0.8710   Max.   :8.780   Max.   :93.00   Max.

In [4]:
# Fit the model; caret may warn about missing values in the resampled
# performance measures (see the warning printed below this cell).
model <- create_model()
# cat(model$feature_names)
# print(model)

“There were missing values in resampled performance measures.”

In [5]:
# In-sample check: predict on the training predictors (all columns but the
# 14th) and compare against the observed target stored in column 14.
pred_labels <- predict(model, BostonHousing[, -14], type = "raw",
                       na.action = na.pass)

df <- data.frame(medv = BostonHousing[, 14])
df$Estimator <- pred_labels
df$Error <- df$Estimator - df$medv

# Mean absolute percentage error, expressed as a fraction (not multiplied by 100).
MAPE <- mean(abs(df$Error / df$medv))

summary(df)
MAPE

      medv         Estimator         Error         
 Min.   : 5.00   Min.   :14.96   Min.   :-26.6502  
 1st Qu.:17.02   1st Qu.:14.96   1st Qu.: -2.6985  
 Median :21.20   Median :23.35   Median :  0.6529  
 Mean   :22.53   Mean   :22.53   Mean   :  0.0000  
 3rd Qu.:25.00   3rd Qu.:23.35   3rd Qu.:  3.1560  
 Max.   :50.00   Max.   :37.24   Max.   : 26.8382  

# SQL Code Generation

In [6]:

# Ask the SQL-generation web service to translate a fitted model into SQL.
#
# The model is serialized with serialize(), base64-encoded and POSTed as JSON
# together with the target dialect ("postgresql") and mode ("caret").
#
# Args:
#   mod:    the fitted model object to translate.
#   ws_url: endpoint of the web service. Defaults to the local instance that
#           was effectively used before; the public instance is
#           "https://sklearn2sql.herokuapp.com/model".
#
# Returns the value found at model$SQLGenrationResult[[1]]$SQL in the parsed
# response (the generated SQL text).
#
# Stops with an error if the HTTP request fails or the response carries no SQL,
# instead of silently returning NULL.
test_ws_sql_gen <- function(mod, ws_url = "http://localhost:1888/model") {
    model_serialized <- serialize(mod, NULL)
    b64_data <- base64encode(model_serialized)
    payload <- list(Name = "xgboost_test_model",
                    SerializedModel = b64_data,
                    SQLDialect = "postgresql",
                    Mode = "caret")
    r <- POST(ws_url, body = payload, encode = "json")
    # Turn HTTP 4xx/5xx responses into an R error right here; otherwise the
    # failure only surfaces later as an empty SQL string.
    stop_for_status(r)
    parsed <- content(r)
    # "SQLGenrationResult" (sic) is the key name used by the service response.
    lSQL <- parsed$model$SQLGenrationResult[[1]]$SQL
    if (is.null(lSQL)) {
        stop("web service response contains no generated SQL", call. = FALSE)
    }
    lSQL
}

In [7]:
# Generate the SQL for the fitted model and print it verbatim.
lModelSQL <- test_ws_sql_gen(model)
cat(lModelSQL)


WITH "ADS_pre_1_OUT" AS 
(SELECT "ADS"."KEY" AS "KEY", CASE WHEN ("ADS"."Feature_0" IS NULL) THEN 0.093385 ELSE "ADS"."Feature_0" END AS crim, CASE WHEN ("ADS"."Feature_1" IS NULL) THEN 0.0 ELSE "ADS"."Feature_1" END AS zn, CASE WHEN ("ADS"."Feature_2" IS NULL) THEN 9.69 ELSE "ADS"."Feature_2" END AS indus, CASE WHEN ("ADS"."Feature_3" IS NULL) THEN 1.0 ELSE "ADS"."Feature_3" END AS chas, CASE WHEN ("ADS"."Feature_4" IS NULL) THEN 0.538 ELSE "ADS"."Feature_4" END AS nox, CASE WHEN ("ADS"."Feature_5" IS NULL) THEN 6.2085 ELSE "ADS"."Feature_5" END AS rm, CASE WHEN ("ADS"."Feature_6" IS NULL) THEN 59.7 ELSE "ADS"."Feature_6" END AS age, CASE WHEN ("ADS"."Feature_7" IS NULL) THEN 3.20745 ELSE "ADS"."Feature_7" END AS dis, CASE WHEN ("ADS"."Feature_8" IS NULL) THEN 5.0 ELSE "ADS"."Feature_8" END AS rad, CASE WHEN ("ADS"."Feature_9" IS NULL) THEN 330.0 ELSE "ADS"."Feature_9" END AS tax, CASE WHEN ("ADS"."Feature_10" IS NULL) THEN 19.05 ELSE "ADS"."Feature_10" END AS ptratio, CASE WHEN ("ADS

# Execute the SQL Code

In [8]:
# Open an ODBC connection to the "pgsql" DSN (identifier case left unchanged)
# and switch the connection to auto-commit.
# NOTE(review): credentials are hard-coded here -- fine for a local test
# database only.
library(RODBC)
conn <- odbcConnect(dsn = "pgsql", uid = "db", pwd = "db", case = "nochange")
odbcSetAutoCommit(conn, autoCommit = TRUE)

In [9]:
# Build the table the generated SQL reads from: the predictor columns renamed
# to Feature_0, Feature_1, ... (zero-based) plus a 1-based KEY row identifier.
df_sql <- dataset
colnames(df_sql) <- sprintf("Feature_%d", 0:(ncol(df_sql) - 1))
df_sql[["KEY"]] <- seq.int(nrow(dataset))

# Drop any previous INPUT_DATA table (errors ignored) and upload the new one.
sqlDrop(conn, "INPUT_DATA", errors = FALSE)
sqlSave(conn, df_sql, tablename = "INPUT_DATA", verbose = FALSE)

# df_sql

In [10]:
colnames(df_sql)
# odbcGetInfo(conn)
# sqlTables(conn)

In [11]:
# Run the generated SQL in the database and show the first rows ordered by KEY.
df_sql_out <- sqlQuery(conn, lModelSQL)
head(df_sql_out[order(df_sql_out$KEY), ])

KEY,Estimator
1,23.3498
2,23.3498
3,37.23816
4,37.23816
5,37.23816
6,23.3498


In [12]:
#df_sql_out

# R RPART Output

In [13]:
# Reference predictions computed in R on the same predictor table, keyed by
# the same 1-based row number that was uploaded as KEY.
estimator <- predict(model, dataset, type = "raw", na.action = na.pass)
df_r_out <- data.frame(Estimator = estimator)
df_r_out$KEY <- seq.int(nrow(dataset))

head(df_r_out)


Estimator,KEY
23.3498,1
23.3498,2
37.23816,3
37.23816,4
37.23816,5
23.3498,6


# Compare R and SQL output

In [14]:
# Full outer join of the R and SQL predictions on KEY. The shared `Estimator`
# column becomes Estimator_1 (R) and Estimator_2 (SQL).
# The stray empty argument that used to sit between `all` and `suffixes` was
# removed: it only worked because it landed on a formal with a default.
df_merge <- merge(x = df_r_out, y = df_sql_out, by = "KEY", all = TRUE,
                  suffixes = c("_1", "_2"))
head(df_merge)

KEY,Estimator_1,Estimator_2
1,23.3498,23.3498
2,23.3498,23.3498
3,37.23816,37.23816
4,37.23816,37.23816
5,37.23816,37.23816
6,23.3498,23.3498


In [15]:
# Signed and absolute difference between the R and SQL predictions.
df_merge[["Error"]] <- df_merge$Estimator_1 - df_merge$Estimator_2
df_merge[["AbsError"]] <- abs(df_merge[["Error"]])
head(df_merge)


KEY,Estimator_1,Estimator_2,Error,AbsError
1,23.3498,23.3498,0.0,0.0
2,23.3498,23.3498,0.0,0.0
3,37.23816,37.23816,-7.105427e-15,7.105427e-15
4,37.23816,37.23816,-7.105427e-15,7.105427e-15
5,37.23816,37.23816,-7.105427e-15,7.105427e-15
6,23.3498,23.3498,0.0,0.0


In [16]:
# Keep the rows where R and SQL disagree by more than 1e-4.
# Rows whose AbsError is NA are kept as all-NA rows by logical indexing.
is_large_error <- df_merge$AbsError > 1e-4
df_merge_largest_errors <- df_merge[is_large_error, ]
df_merge_largest_errors

KEY,Estimator_1,Estimator_2,Error,AbsError


In [17]:
# Abort the notebook if any row was selected as a large R-vs-SQL discrepancy.
stopifnot(nrow(df_merge_largest_errors) == 0)

In [18]:
summary(df_sql_out)

      KEY          Estimator    
 Min.   :  1.0   Min.   :14.96  
 1st Qu.:127.2   1st Qu.:14.96  
 Median :253.5   Median :23.35  
 Mean   :253.5   Mean   :22.53  
 3rd Qu.:379.8   3rd Qu.:23.35  
 Max.   :506.0   Max.   :37.24  

In [19]:
summary(df_r_out)

   Estimator          KEY       
 Min.   :14.96   Min.   :  1.0  
 1st Qu.:14.96   1st Qu.:127.2  
 Median :23.35   Median :253.5  
 Mean   :22.53   Mean   :253.5  
 3rd Qu.:23.35   3rd Qu.:379.8  
 Max.   :37.24   Max.   :506.0  

In [20]:
summary(df_merge)

      KEY         Estimator_1     Estimator_2        Error           
 Min.   :  1.0   Min.   :14.96   Min.   :14.96   Min.   :-7.105e-15  
 1st Qu.:127.2   1st Qu.:14.96   1st Qu.:14.96   1st Qu.:-1.776e-15  
 Median :253.5   Median :23.35   Median :23.35   Median : 0.000e+00  
 Mean   :253.5   Mean   :22.53   Mean   :22.53   Mean   :-1.682e-15  
 3rd Qu.:379.8   3rd Qu.:23.35   3rd Qu.:23.35   3rd Qu.: 0.000e+00  
 Max.   :506.0   Max.   :37.24   Max.   :37.24   Max.   : 0.000e+00  
    AbsError        
 Min.   :0.000e+00  
 1st Qu.:0.000e+00  
 Median :0.000e+00  
 Mean   :1.682e-15  
 3rd Qu.:1.776e-15  
 Max.   :7.105e-15  

In [21]:
model$finalModel

n= 506 

node), split, n, deviance, yval
      * denotes terminal node

1) root 506 42716.300 22.53281  
  2) rm< 6.941 430 17317.320 19.93372  
    4) lstat>=14.4 175  3373.251 14.95600 *
    5) lstat< 14.4 255  6632.217 23.34980 *
  3) rm>=6.941 76  6059.419 37.23816 *

In [22]:
prep = model$preProcess

In [23]:
model

CART 

506 samples
 13 predictor

Pre-processing: median imputation (13) 
Resampling: Bootstrapped (25 reps) 
Summary of sample sizes: 506, 506, 506, 506, 506, 506, ... 
Resampling results across tuning parameters:

  cp          RMSE      Rsquared   MAE     
  0.07165784  5.696930  0.6193436  4.071699
  0.17117244  6.858945  0.4444407  5.098236
  0.45274420  7.704934  0.3871354  5.649512

RMSE was used to select the optimal model using the smallest value.
The final value used for the model was cp = 0.07165784.

In [24]:
prep$method

In [25]:
prep$median