In [1]:
library(caret, quiet=TRUE);
library(base64enc)
library(httr, quiet=TRUE)



Attaching package: ‘httr’

The following object is masked from ‘package:caret’:

    progress



# Build a Model

In [2]:
set.seed(1960)

# Fit a caret pre-processing model that performs PCA on the iris data.
#
# Takes no arguments. Returns the object produced by preProcess(), which is
# later applied to data with predict().
create_model <- function() {
    # Column 5 of iris is excluded; only the remaining columns are used.
    preProcess(iris[, -5], method = c("pca"))
}

In [3]:
# Fit the PCA pre-processing model (on iris, see create_model).
model <- create_model()

In [4]:
# Project the iris columns (all but the 5th) with the fitted model and
# preview the first rows.
pred <- predict(object = model, newdata = as.matrix(iris[, -5]))
head(pred, n = 6L)

PC1,PC2
-2.257141,-0.4784238
-2.074013,0.6718827
-2.356335,0.3407664
-2.291707,0.5953999
-2.381863,-0.6446757
-2.068701,-1.4842053


# SQL Code Generation

In [9]:

# Ask the SQL-generation web service to translate a fitted model into SQL.
#
# The model is serialized with serialize(), base64-encoded and POSTed as JSON.
# The SQL text is read from the first entry of the response's
# "SQLGenrationResult" list (the key is spelled exactly like that).
#
# Args:
#   mod:     fitted model object to translate.
#   ws_url:  service endpoint. Defaults to a locally running instance; the
#            hosted alternative is "https://sklearn2sql.herokuapp.com/model".
#   dialect: SQL dialect requested from the service.
#
# Returns the generated SQL.
# Stops with an error if the HTTP request fails or the response holds no SQL.
test_ws_sql_gen <- function(mod,
                            ws_url = "http://localhost:1888/model",
                            dialect = "postgresql") {
    payload <- list(
        Name = "caret_rpart_test_model",
        SerializedModel = base64encode(serialize(mod, NULL)),
        SQLDialect = dialect,
        Mode = "caret"
    )
    response <- POST(ws_url, body = payload, encode = "json")
    # Fail loudly on an HTTP error status instead of parsing an error page.
    stop_for_status(response)

    parsed <- content(response)
    if (!is.list(parsed)) {
        stop("Unexpected response from SQL generation service", call. = FALSE)
    }
    results <- parsed$model$SQLGenrationResult
    sql <- if (length(results) > 0) results[[1]]$SQL else NULL
    if (is.null(sql)) {
        stop("SQL generation service returned no SQL for the model",
             call. = FALSE)
    }
    sql
}

In [10]:
# Generate the SQL for the fitted model via the web service.
lModelSQL <- test_ws_sql_gen(model)

In [11]:
cat(lModelSQL)


WITH "CenteredDataForPCA" AS 
(SELECT "ADS"."KEY" AS "KEY", (CAST("ADS"."Feature_0" AS FLOAT) - 5.843333333333334) / 0.8280661279778629 AS "Feature_0", (CAST("ADS"."Feature_1" AS FLOAT) - 3.0573333333333332) / 0.4358662849366982 AS "Feature_1", (CAST("ADS"."Feature_2" AS FLOAT) - 3.758) / 1.7652982332594664 AS "Feature_2", (CAST("ADS"."Feature_3" AS FLOAT) - 1.199333333333333) / 0.7622376689603465 AS "Feature_3" 
FROM "INPUT_DATA" AS "ADS")
 SELECT "CenteredDataForPCA"."KEY" AS "KEY", "CenteredDataForPCA"."Feature_0" * 0.5210659146701195 + "CenteredDataForPCA"."Feature_1" * -0.2693474425059429 + "CenteredDataForPCA"."Feature_2" * 0.5804130957962945 + "CenteredDataForPCA"."Feature_3" * 0.5648565357793613 AS pca_1, "CenteredDataForPCA"."Feature_0" * -0.3774176155645673 + "CenteredDataForPCA"."Feature_1" * -0.9232956595407152 + "CenteredDataForPCA"."Feature_2" * -0.024491609085586008 + "CenteredDataForPCA"."Feature_3" * -0.06694198696805842 AS pca_2 
FROM "CenteredDataForPCA"

# Execute the SQL Code

In [12]:
library(RODBC)

# Open the ODBC data source named "pgsql". case = "nochange" is presumably
# needed so the upper-case, quoted identifiers in the generated SQL match the
# table and column names created later -- confirm against the DSN setup.
conn <- odbcConnect("pgsql", uid = "db", pwd = "db", case = "nochange")
# odbcConnect() reports failure by returning -1 (with a warning) instead of
# raising an error, so stop here rather than failing later on a bad handle.
if (!inherits(conn, "RODBC")) {
    stop("Could not open ODBC data source 'pgsql'", call. = FALSE)
}
odbcSetAutoCommit(conn, autoCommit = TRUE)

In [13]:
# Data to score: iris without its 5th column.
dataset <- iris[, -5]

# Shape the data for the generated SQL: columns Feature_0 .. Feature_<n-1>
# plus a 1-based integer KEY identifying each row.
df_sql <- as.data.frame(dataset)
names(df_sql) <- paste0("Feature_", seq_len(ncol(df_sql)) - 1L)
df_sql[["KEY"]] <- seq_len(nrow(dataset))

# Recreate INPUT_DATA from scratch. errors = FALSE is presumably there so a
# failed drop (e.g. the table does not exist yet) does not stop the script.
sqlDrop(conn, "INPUT_DATA", errors = FALSE)
sqlSave(conn, df_sql, tablename = "INPUT_DATA", verbose = FALSE)

head(df_sql)

Feature_0,Feature_1,Feature_2,Feature_3,KEY
5.1,3.5,1.4,0.2,1
4.9,3.0,1.4,0.2,2
4.7,3.2,1.3,0.2,3
4.6,3.1,1.5,0.2,4
5.0,3.6,1.4,0.2,5
5.4,3.9,1.7,0.4,6


In [14]:
# colnames(df_sql)
# odbcGetInfo(conn)
# sqlTables(conn)

In [15]:
# Run the generated SQL against the database.
df_sql_out <- sqlQuery(conn, lModelSQL)
# On failure sqlQuery() returns the error messages as a character vector
# instead of a data frame; surface them here rather than letting them flow
# into the comparison below.
if (!is.data.frame(df_sql_out)) {
    stop("SQL execution failed:\n", paste(df_sql_out, collapse = "\n"),
         call. = FALSE)
}
head(df_sql_out)

KEY,pca_1,pca_2
1,-2.257141,-0.4784238
2,-2.074013,0.6718827
3,-2.356335,0.3407664
4,-2.291707,0.5953999
5,-2.381863,-0.6446757
6,-2.068701,-1.4842053


# R Caret PCA Preprocessing Output

In [16]:
# Apply the fitted pre-processing model in R to get the reference output.
preprocessed <- predict(model, iris[, -5])

# Rename the output columns to pca_1, pca_2, ... (the aliases used in the
# generated SQL) and attach a 1-based KEY, one per row of `dataset`.
df_r_out <- data.frame(preprocessed)
names(df_r_out) <- paste0("pca_", seq_len(ncol(df_r_out)))
df_r_out[["KEY"]] <- seq_len(nrow(dataset))

head(df_r_out)

pca_1,pca_2,KEY
-2.257141,-0.4784238,1
-2.074013,0.6718827,2
-2.356335,0.3407664,3
-2.291707,0.5953999,4
-2.381863,-0.6446757,5
-2.068701,-1.4842053,6


# Compare R and SQL output

In [17]:
# Full outer join on KEY (all = TRUE), so a row missing from either side
# shows up with NA values instead of being dropped. Columns that exist on
# both sides get the suffixes _R and _SQL.
df_merge <- merge(x = df_r_out, y = df_sql_out, by = "KEY", all = TRUE,
                  suffixes = c("_R", "_SQL"))
head(df_merge)

KEY,pca_1_R,pca_2_R,pca_1_SQL,pca_2_SQL
1,-2.257141,-0.4784238,-2.257141,-0.4784238
2,-2.074013,0.6718827,-2.074013,0.6718827
3,-2.356335,0.3407664,-2.356335,0.3407664
4,-2.291707,0.5953999,-2.291707,0.5953999
5,-2.381863,-0.6446757,-2.381863,-0.6446757
6,-2.068701,-1.4842053,-2.068701,-1.4842053


In [18]:
# Signed difference between the R and SQL results for each component, and
# the sum of their absolute values per row.
df_merge[["Error1"]] <- df_merge[["pca_1_R"]] - df_merge[["pca_1_SQL"]]
df_merge[["Error2"]] <- df_merge[["pca_2_R"]] - df_merge[["pca_2_SQL"]]
df_merge[["AbsError"]] <- abs(df_merge[["Error1"]]) + abs(df_merge[["Error2"]])
head(df_merge)

KEY,pca_1_R,pca_2_R,pca_1_SQL,pca_2_SQL,Error1,Error2,AbsError
1,-2.257141,-0.4784238,-2.257141,-0.4784238,0.0,1.110223e-16,1.110223e-16
2,-2.074013,0.6718827,-2.074013,0.6718827,-4.440892e-16,0.0,4.440892e-16
3,-2.356335,0.3407664,-2.356335,0.3407664,-4.440892e-16,5.5511150000000004e-17,4.996004e-16
4,-2.291707,0.5953999,-2.291707,0.5953999,0.0,0.0,0.0
5,-2.381863,-0.6446757,-2.381863,-0.6446757,0.0,1.110223e-16,1.110223e-16
6,-2.068701,-1.4842053,-2.068701,-1.4842053,-4.440892e-16,2.220446e-16,6.661338e-16


In [19]:
# Rows where the R and SQL results differ by more than the tolerance.
# Plain logical indexing is used on purpose: a row whose AbsError is NA is
# kept (as an NA row) rather than silently dropped.
diffs_df <- df_merge[df_merge[["AbsError"]] > 0.00001, ]
head(diffs_df)

KEY,pca_1_R,pca_2_R,pca_1_SQL,pca_2_SQL,Error1,Error2,AbsError


In [20]:
stopifnot(nrow(diffs_df) == 0)

In [21]:
summary(df_sql_out)

      KEY             pca_1             pca_2         
 Min.   :  1.00   Min.   :-2.7651   Min.   :-2.67732  
 1st Qu.: 38.25   1st Qu.:-2.0957   1st Qu.:-0.59205  
 Median : 75.50   Median : 0.4169   Median :-0.01744  
 Mean   : 75.50   Mean   : 0.0000   Mean   : 0.00000  
 3rd Qu.:112.75   3rd Qu.: 1.3385   3rd Qu.: 0.59649  
 Max.   :150.00   Max.   : 3.2996   Max.   : 2.64521  

In [22]:
summary(df_r_out)

     pca_1             pca_2               KEY        
 Min.   :-2.7651   Min.   :-2.67732   Min.   :  1.00  
 1st Qu.:-2.0957   1st Qu.:-0.59205   1st Qu.: 38.25  
 Median : 0.4169   Median :-0.01744   Median : 75.50  
 Mean   : 0.0000   Mean   : 0.00000   Mean   : 75.50  
 3rd Qu.: 1.3385   3rd Qu.: 0.59649   3rd Qu.:112.75  
 Max.   : 3.2996   Max.   : 2.64521   Max.   :150.00  