In [1]:
library(caret, quiet = TRUE)
library(base64enc)
library(httr, quiet = TRUE)

library(mlbench)


Attaching package: ‘httr’

The following object is masked from ‘package:caret’:

    progress



# Build a Model

In [2]:

# a character vector specifying the type of processing. 
# Possible values are 
# "BoxCox", "YeoJohnson", "expoTrans", "center", "scale", "range", 
# "knnImpute", "bagImpute", "medianImpute", "pca", "ica", "spatialSign", 
# "corr", "zv", "nzv", and "conditionalX"

data(BostonHousing)
BostonHousing$chas = as.numeric(BostonHousing$chas)

set.seed(1960)

dataset = BostonHousing[, -14] 

create_model  =  function() {

    model <- train(medv ~ ., data = BostonHousing, method = "rpart", preProcess=c("range"))    

    return(model)
}


In [3]:
model = create_model()
# cat(model$feature_names)
# print(model)

“There were missing values in resampled performance measures.”

In [4]:
pred_labels <- predict(model, BostonHousing[, -14] , type="raw")
df = data.frame(BostonHousing[,14])
names(df) = c("medv")
df$Estimator = pred_labels
df$Error = df$Estimator - df$medv
MAPE = mean(abs(df$Error / df$medv))
summary(df)
MAPE

      medv         Estimator         Error         
 Min.   : 5.00   Min.   :14.96   Min.   :-26.6502  
 1st Qu.:17.02   1st Qu.:14.96   1st Qu.: -2.6985  
 Median :21.20   Median :23.35   Median :  0.6529  
 Mean   :22.53   Mean   :22.53   Mean   :  0.0000  
 3rd Qu.:25.00   3rd Qu.:23.35   3rd Qu.:  3.1560  
 Max.   :50.00   Max.   :37.24   Max.   : 26.8382  

# SQL Code Generation

In [5]:

test_ws_sql_gen = function(mod) {
    WS_URL = "https://sklearn2sql.herokuapp.com/model"
    WS_URL = "http://localhost:1888/model"
    model_serialized <- serialize(mod, NULL)
    b64_data = base64encode(model_serialized)
    data = list(Name = "xgboost_test_model", SerializedModel = b64_data , SQLDialect = "postgresql" , Mode="caret")
    r = POST(WS_URL, body = data, encode = "json")
    # print(r)
    content = content(r)
    # print(content)
    lSQL = content$model$SQLGenrationResult[[1]]$SQL # content["model"]["SQLGenrationResult"][0]["SQL"]
    return(lSQL);
}

In [6]:
lModelSQL = test_ws_sql_gen(model)
cat(lModelSQL)


WITH "ADS_pre_1_OUT" AS 
(SELECT "ADS"."KEY" AS "KEY", CAST("ADS"."Feature_0" AS FLOAT) * 0.01123975889368402 + -7.1035276208083e-05 AS crim, CAST("ADS"."Feature_1" AS FLOAT) * 0.01 + -0.0 AS zn, CAST("ADS"."Feature_2" AS FLOAT) * 0.036656891495601175 + -0.016862170087976542 AS indus, CAST("ADS"."Feature_3" AS FLOAT) * 1.0 + -1.0 AS chas, CAST("ADS"."Feature_4" AS FLOAT) * 2.05761316872428 + -0.7921810699588477 AS nox, CAST("ADS"."Feature_5" AS FLOAT) * 0.19160758766047137 + -0.6823146196589386 AS rm, CAST("ADS"."Feature_6" AS FLOAT) * 0.010298661174047374 + -0.029866117404737384 AS age, CAST("ADS"."Feature_7" AS FLOAT) * 0.09093471796597223 + -0.10271985741436222 AS dis, CAST("ADS"."Feature_8" AS FLOAT) * 0.043478260869565216 + -0.043478260869565216 AS rad, CAST("ADS"."Feature_9" AS FLOAT) * 0.0019083969465648854 + -0.3568702290076336 AS tax, CAST("ADS"."Feature_10" AS FLOAT) * 0.10638297872340426 + -1.3404255319148937 AS ptratio, CAST("ADS"."Feature_11" AS FLOAT) * 0.0025215593322910

# Execute the SQL Code

In [7]:
library(RODBC)
conn = odbcConnect("pgsql", uid="db", pwd="db", case="nochange")
odbcSetAutoCommit(conn , autoCommit = TRUE)

In [8]:
df_sql = dataset
names(df_sql) = sprintf("Feature_%d",0:(ncol(df_sql)-1))
df_sql$KEY = seq.int(nrow(dataset))

sqlDrop(conn , "INPUT_DATA" , errors = FALSE)
sqlSave(conn, df_sql, tablename = "INPUT_DATA", verbose = FALSE)

# df_sql

In [9]:
colnames(df_sql)
# odbcGetInfo(conn)
# sqlTables(conn)

In [10]:
df_sql_out = sqlQuery(conn, lModelSQL)
head(df_sql_out[order(df_sql_out$KEY),])

KEY,Estimator
1,23.3498
2,23.3498
3,37.23816
4,37.23816
5,37.23816
6,23.3498


In [11]:
# df_sql_out

# R RPART Output

In [12]:
estimator  =  predict(model, dataset, type = "raw")
df_r_out = data.frame(estimator)
names(df_r_out) = c("Estimator")

df_r_out$KEY = seq.int(nrow(dataset))
head(df_r_out)


Estimator,KEY
23.3498,1
23.3498,2
37.23816,3
37.23816,4
37.23816,5
23.3498,6


# Compare R and SQL output

In [13]:
df_merge = merge(x = df_r_out, y = df_sql_out, by = "KEY", all = TRUE, , suffixes = c("_1","_2"))
head(df_merge)

KEY,Estimator_1,Estimator_2
1,23.3498,23.3498
2,23.3498,23.3498
3,37.23816,37.23816
4,37.23816,37.23816
5,37.23816,37.23816
6,23.3498,23.3498


In [14]:
df_merge$Error = df_merge$Estimator_1 - df_merge$Estimator_2
df_merge$AbsError = abs(df_merge$Error)
head(df_merge)


KEY,Estimator_1,Estimator_2,Error,AbsError
1,23.3498,23.3498,-7.105427e-15,7.105427e-15
2,23.3498,23.3498,-7.105427e-15,7.105427e-15
3,37.23816,37.23816,0.0,0.0
4,37.23816,37.23816,0.0,0.0
5,37.23816,37.23816,0.0,0.0
6,23.3498,23.3498,-7.105427e-15,7.105427e-15


In [15]:
df_merge_largest_errors = df_merge[df_merge$AbsError > 0.0001,]
df_merge_largest_errors

KEY,Estimator_1,Estimator_2,Error,AbsError


In [16]:
stopifnot(nrow(df_merge_largest_errors) == 0)

In [17]:
summary(df_sql_out)

      KEY          Estimator    
 Min.   :  1.0   Min.   :14.96  
 1st Qu.:127.2   1st Qu.:14.96  
 Median :253.5   Median :23.35  
 Mean   :253.5   Mean   :22.53  
 3rd Qu.:379.8   3rd Qu.:23.35  
 Max.   :506.0   Max.   :37.24  

In [18]:
summary(df_r_out)

   Estimator          KEY       
 Min.   :14.96   Min.   :  1.0  
 1st Qu.:14.96   1st Qu.:127.2  
 Median :23.35   Median :253.5  
 Mean   :22.53   Mean   :253.5  
 3rd Qu.:23.35   3rd Qu.:379.8  
 Max.   :37.24   Max.   :506.0  

In [19]:
summary(df_merge)

      KEY         Estimator_1     Estimator_2        Error           
 Min.   :  1.0   Min.   :14.96   Min.   :14.96   Min.   :-7.105e-15  
 1st Qu.:127.2   1st Qu.:14.96   1st Qu.:14.96   1st Qu.:-7.105e-15  
 Median :253.5   Median :23.35   Median :23.35   Median :-7.105e-15  
 Mean   :253.5   Mean   :22.53   Mean   :22.53   Mean   :-4.195e-15  
 3rd Qu.:379.8   3rd Qu.:23.35   3rd Qu.:23.35   3rd Qu.:-1.776e-15  
 Max.   :506.0   Max.   :37.24   Max.   :37.24   Max.   : 0.000e+00  
    AbsError        
 Min.   :0.000e+00  
 1st Qu.:1.776e-15  
 Median :7.105e-15  
 Mean   :4.195e-15  
 3rd Qu.:7.105e-15  
 Max.   :7.105e-15  

In [20]:
model$finalModel

n= 506 

node), split, n, deviance, yval
      * denotes terminal node

1) root 506 42716.300 22.53281  
  2) rm< 0.6476336 430 17317.320 19.93372  
    4) lstat>=0.3496137 175  3373.251 14.95600 *
    5) lstat< 0.3496137 255  6632.217 23.34980 *
  3) rm>=0.6476336 76  6059.419 37.23816 *

In [21]:
prep = model$preProcess

In [22]:
data.frame(t(prep$ranges))

Unnamed: 0,X1,X2
crim,0.00632,88.9762
zn,0.0,100.0
indus,0.46,27.74
chas,1.0,2.0
nox,0.385,0.871
rm,3.561,8.78
age,2.9,100.0
dis,1.1296,12.1265
rad,1.0,24.0
tax,187.0,711.0


In [23]:
colnames(prep$ranges)

In [24]:
prep$method$range