In [1]:
library(caret, quiet=TRUE);
library(base64enc)
library(httr, quiet=TRUE)



Attaching package: ‘httr’

The following object is masked from ‘package:caret’:

    progress



# Build a Model

In [2]:
set.seed(1960)

iris$Sepal.Length <- ifelse(iris$Sepal.Length > 5.3, NA, iris$Sepal.Length)

create_model  =  function() {

    model  <- preProcess(iris[, -5], method = c("medianImpute"))
    
    return(model)
}

In [3]:
# dataset
model = create_model()

In [4]:
pred <- predict(model, as.matrix(iris[, -5]))
head(pred)

Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
5.1,3.5,1.4,0.2
4.9,3.0,1.4,0.2
4.7,3.2,1.3,0.2
4.6,3.1,1.5,0.2
5.0,3.6,1.4,0.2
5.0,3.9,1.7,0.4


In [5]:
print(model)

Created from 46 samples and 4 variables

Pre-processing:
  - ignored (0)
  - median imputation (4)



In [6]:
cbind(model$median)

0,1
Sepal.Length,5.0
Sepal.Width,3.0
Petal.Length,4.35
Petal.Width,1.3


In [7]:
model$method

# SQL Code Generation

In [8]:

test_ws_sql_gen = function(mod) {
    WS_URL = "https://sklearn2sql.herokuapp.com/model"
    WS_URL = "http://localhost:1888/model"
    model_serialized <- serialize(mod, NULL)
    b64_data = base64encode(model_serialized)
    data = list(Name = "caret_rpart_test_model", SerializedModel = b64_data , SQLDialect = "postgresql" , Mode="caret")
    r = POST(WS_URL, body = data, encode = "json")
    # print(r)
    content = content(r)
    # print(content)
    lSQL = content$model$SQLGenrationResult[[1]]$SQL # content["model"]["SQLGenrationResult"][0]["SQL"]
    return(lSQL);
}

In [9]:
lModelSQL = test_ws_sql_gen(model)

In [10]:
cat(lModelSQL)


SELECT "ADS"."KEY" AS "KEY", CASE WHEN ("ADS"."Feature_0" IS NULL) THEN 1.3 ELSE "ADS"."Feature_0" END AS "Feature_0", CASE WHEN ("ADS"."Feature_1" IS NULL) THEN 5.0 ELSE "ADS"."Feature_1" END AS "Feature_1", CASE WHEN ("ADS"."Feature_2" IS NULL) THEN 4.35 ELSE "ADS"."Feature_2" END AS "Feature_2", CASE WHEN ("ADS"."Feature_3" IS NULL) THEN 3.0 ELSE "ADS"."Feature_3" END AS "Feature_3" 
FROM "INPUT_DATA" AS "ADS"

# Execute the SQL Code

In [11]:
library(RODBC)
conn = odbcConnect("pgsql", uid="db", pwd="db", case="nochange")
odbcSetAutoCommit(conn , autoCommit = TRUE)

In [12]:
dataset = iris[,-5]

df_sql = as.data.frame(dataset)
names(df_sql) = sprintf("Feature_%d",0:(ncol(df_sql)-1))
df_sql$KEY = seq.int(nrow(dataset))

sqlDrop(conn , "INPUT_DATA" , errors = FALSE)
sqlSave(conn, df_sql, tablename = "INPUT_DATA", verbose = FALSE)

head(df_sql)

Feature_0,Feature_1,Feature_2,Feature_3,KEY
5.1,3.5,1.4,0.2,1
4.9,3.0,1.4,0.2,2
4.7,3.2,1.3,0.2,3
4.6,3.1,1.5,0.2,4
5.0,3.6,1.4,0.2,5
,3.9,1.7,0.4,6


In [13]:
# colnames(df_sql)
# odbcGetInfo(conn)
# sqlTables(conn)

In [14]:
df_sql_out = sqlQuery(conn, lModelSQL)
head(df_sql_out)

KEY,Feature_0,Feature_1,Feature_2,Feature_3
1,5.1,3.5,1.4,0.2
2,4.9,3.0,1.4,0.2
3,4.7,3.2,1.3,0.2
4,4.6,3.1,1.5,0.2
5,5.0,3.6,1.4,0.2
6,1.3,3.9,1.7,0.4


# R Preprocess Output

In [15]:
preprocessed  =  predict(model, iris[,-5])
df_r_out = data.frame(preprocessed)
names(df_r_out) = sprintf("Feature_%d",0:(ncol(df_r_out) - 1))

df_r_out$KEY = seq.int(nrow(dataset))

head(df_r_out)



Feature_0,Feature_1,Feature_2,Feature_3,KEY
5.1,3.5,1.4,0.2,1
4.9,3.0,1.4,0.2,2
4.7,3.2,1.3,0.2,3
4.6,3.1,1.5,0.2,4
5.0,3.6,1.4,0.2,5
5.0,3.9,1.7,0.4,6


# Compare R and SQL output

In [16]:
df_merge = merge(x = df_r_out, y = df_sql_out, by = "KEY", all = TRUE, , suffixes = c("_R","_SQL"))
head(df_merge)

KEY,Feature_0_R,Feature_1_R,Feature_2_R,Feature_3_R,Feature_0_SQL,Feature_1_SQL,Feature_2_SQL,Feature_3_SQL
1,5.1,3.5,1.4,0.2,5.1,3.5,1.4,0.2
2,4.9,3.0,1.4,0.2,4.9,3.0,1.4,0.2
3,4.7,3.2,1.3,0.2,4.7,3.2,1.3,0.2
4,4.6,3.1,1.5,0.2,4.6,3.1,1.5,0.2
5,5.0,3.6,1.4,0.2,5.0,3.6,1.4,0.2
6,5.0,3.9,1.7,0.4,1.3,3.9,1.7,0.4


In [17]:
diffs_df = df_merge[df_merge$Feature_1_R != df_merge$Feature_1_SQL,]
head(diffs_df)

KEY,Feature_0_R,Feature_1_R,Feature_2_R,Feature_3_R,Feature_0_SQL,Feature_1_SQL,Feature_2_SQL,Feature_3_SQL


In [18]:
stopifnot(nrow(diffs_df) == 0)

In [19]:
summary(df_sql_out)

      KEY           Feature_0       Feature_1       Feature_2    
 Min.   :  1.00   Min.   :1.300   Min.   :2.000   Min.   :1.000  
 1st Qu.: 38.25   1st Qu.:1.300   1st Qu.:2.800   1st Qu.:1.600  
 Median : 75.50   Median :1.300   Median :3.000   Median :4.350  
 Mean   : 75.50   Mean   :2.403   Mean   :3.057   Mean   :3.758  
 3rd Qu.:112.75   3rd Qu.:4.600   3rd Qu.:3.300   3rd Qu.:5.100  
 Max.   :150.00   Max.   :5.300   Max.   :4.400   Max.   :6.900  
   Feature_3    
 Min.   :0.100  
 1st Qu.:0.300  
 Median :1.300  
 Mean   :1.199  
 3rd Qu.:1.800  
 Max.   :2.500  

In [20]:
summary(df_r_out)

   Feature_0       Feature_1       Feature_2       Feature_3    
 Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
 1st Qu.:5.000   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
 Median :5.000   Median :3.000   Median :4.350   Median :1.300  
 Mean   :4.968   Mean   :3.057   Mean   :3.758   Mean   :1.199  
 3rd Qu.:5.000   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
 Max.   :5.300   Max.   :4.400   Max.   :6.900   Max.   :2.500  
      KEY        
 Min.   :  1.00  
 1st Qu.: 38.25  
 Median : 75.50  
 Mean   : 75.50  
 3rd Qu.:112.75  
 Max.   :150.00  

In [21]:
prep = model

In [22]:
prep

Created from 46 samples and 4 variables

Pre-processing:
  - ignored (0)
  - median imputation (4)


In [23]:
prep$dim

In [24]:
prep$median

In [25]:
attributes(prep)

In [26]:
prep$method$medianImpute