In [1]:
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pickle, json, requests, base64


## Build a scikit-learn model

In [2]:
from sklearn import datasets

# Load the iris dataset bundled with scikit-learn and unpack it into the
# feature matrix (X) and the target vector (Y) used by the cells below.
iris = datasets.load_iris()
X, Y = iris.data, iris.target
# Tip: print(iris.DESCR) shows the dataset description.

In [3]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import RobustScaler, MaxAbsScaler

from sklearn.pipeline import FeatureUnion

# `sklearn.preprocessing.Imputer` no longer exists in current scikit-learn
# releases; its replacement is `sklearn.impute.SimpleImputer`. Try the new
# location first and fall back to the legacy one so this cell runs on both,
# while keeping the name `Imputer` in scope for the rest of the notebook.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Building blocks as (name, transformer) pairs. The names are kept exactly as
# they were because they show up as prefixes in the generated SQL columns
# (e.g. "std_scaler_output_1").
imputers = [('imputer', Imputer())]
scalers = [('std_scaler', StandardScaler()), ('minmax_scaler', MinMaxScaler()),
           ('maxabs_scaler', MaxAbsScaler()), ('robust_scaler', RobustScaler())]
pcas = [('pca_4comps', PCA(n_components=4))]
random_state = np.random.RandomState(0)

# Only the scalers and the PCA are combined here; `imputers` and
# `random_state` are defined above but are not passed to the union.
clf = FeatureUnion(scalers + pcas)

# Last expression of the cell: fit the union and display its repr.
clf.fit(X, Y)

FeatureUnion(n_jobs=1,
       transformer_list=[('std_scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('minmax_scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('maxabs_scaler', MaxAbsScaler(copy=True)), ('robust_scaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('pca_4comps', PCA(copy=True, iterated_power='auto', n_components=4, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False))],
       transformer_weights=None)

In [4]:
#clf.__dict__

## Generate SQL Code from the Model

In [5]:

def test_ws_sql_gen(pickle_data):
    WS_URL="http://192.168.88.88:1888/model" # "https://sklearn2sql.herokuapp.com/"
    b64_data = base64.b64encode(pickle_data)
    data={"Name":"model1", "PickleData":b64_data , "SQLDialect":"postgresql"}
    r = requests.post(WS_URL, json=data)
    content = r.json()
    # print(content)
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL;


In [6]:
# Serialize the fitted transformer and hand it to the web service.
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)

# Print at most the first 12000 characters of the generated SQL.
print(lSQL[:12000])

WITH "std_scaler_CTE" AS 
(SELECT "ADS"."KEY" AS join_key_0, (CAST("ADS"."Feature_0" AS FLOAT) - 5.84333333333) / 0.825301291785 AS std_scaler_output_1, (CAST("ADS"."Feature_1" AS FLOAT) - 3.054) / 0.432146580071 AS std_scaler_output_2, (CAST("ADS"."Feature_2" AS FLOAT) - 3.75866666667) / 1.75852918341 AS std_scaler_output_3, (CAST("ADS"."Feature_3" AS FLOAT) - 1.19866666667) / 0.760612618588 AS std_scaler_output_4 
FROM "INPUT_DATA" AS "ADS"), 
"minmax_scaler_CTE" AS 
(SELECT "ADS"."KEY" AS join_key_1, "ADS"."Feature_0" * 0.277777777778 + -1.19444444444 AS minmax_scaler_output_1, "ADS"."Feature_1" * 0.416666666667 + -0.833333333333 AS minmax_scaler_output_2, "ADS"."Feature_2" * 0.169491525424 + -0.169491525424 AS minmax_scaler_output_3, "ADS"."Feature_3" * 0.416666666667 + -0.0416666666667 AS minmax_scaler_output_4 
FROM "INPUT_DATA" AS "ADS"), 
"maxabs_scaler_CTE" AS 
(SELECT "ADS"."KEY" AS join_key_2, CAST("ADS"."Feature_0" AS FLOAT) / 7.9 AS maxabs_scaler_output_1, CAST("ADS"."Feat

## Execute the SQL Code

In [7]:
# Store the iris features in a database table so the generated SQL can be
# executed against them.

# SQLite alternative to the PostgreSQL engine below:
# engine = sa.create_engine('sqlite://' , echo=False)
engine = sa.create_engine("postgresql://db:db@localhost/db?port=5432", echo=False)
conn = engine.connect()

# One column per feature, plus a row number used as the key.
lTable = pd.DataFrame(X, columns=['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3'])
lTable['KEY'] = range(len(lTable))
lTable.to_sql("INPUT_DATA", conn, if_exists='replace', index=False)


In [8]:
# Execute the generated SQL, order the rows by KEY and renumber the index
# from zero so that the index position equals the KEY value.
sql_output = (
    pd.read_sql(lSQL, conn)
    .sort_values(by='KEY')
    .reset_index(drop=True)
)

In [9]:
# Spot-check a random sample of the SQL results; the fixed random_state makes
# the selection reproducible.
sql_output.sample(12, random_state=1960)

Unnamed: 0,KEY,std_scaler_output_1,std_scaler_output_2,std_scaler_output_3,std_scaler_output_4,minmax_scaler_output_1,minmax_scaler_output_2,minmax_scaler_output_3,minmax_scaler_output_4,maxabs_scaler_output_1,...,maxabs_scaler_output_3,maxabs_scaler_output_4,robust_scaler_output_1,robust_scaler_output_2,robust_scaler_output_3,robust_scaler_output_4,pca_4comps_output_1,pca_4comps_output_2,pca_4comps_output_3,pca_4comps_output_4
114,114,-0.052506,-0.587764,0.762759,1.579429,0.416667,0.333333,0.694915,0.958333,0.734177,...,0.73913,0.96,0.0,-0.4,0.214286,0.733333,1.585267,-0.539307,0.63057,0.327455
74,74,0.674501,-0.356361,0.307833,0.133226,0.583333,0.375,0.559322,0.5,0.810127,...,0.623188,0.52,0.461538,-0.2,-0.014286,0.0,0.714008,0.150379,-0.320372,0.042941
9,9,-1.143017,0.106445,-1.284407,-1.44445,0.166667,0.458333,0.084746,0.0,0.620253,...,0.217391,0.04,-0.692308,0.2,-0.814286,-0.8,-2.673845,-0.106692,-0.191533,-0.055891
88,88,-0.294842,-0.124958,0.194102,0.133226,0.361111,0.416667,0.525424,0.5,0.708861,...,0.594203,0.52,-0.153846,0.0,-0.071429,0.0,0.245195,-0.266728,0.189562,-0.147328
25,25,-1.021849,-0.124958,-1.227541,-1.312977,0.194444,0.416667,0.101695,0.041667,0.632911,...,0.231884,0.08,-0.615385,0.0,-0.785714,-0.733333,-2.507917,-0.139056,-0.247116,0.035384
5,5,-0.537178,1.957669,-1.170675,-1.050031,0.305556,0.791667,0.118644,0.125,0.683544,...,0.246377,0.16,-0.307692,1.8,-0.757143,-0.6,-2.279897,0.747783,0.174326,-0.027147
48,48,-0.658345,1.494863,-1.284407,-1.312977,0.277778,0.708333,0.084746,0.041667,0.670886,...,0.217391,0.08,-0.384615,1.4,-0.814286,-0.733333,-2.542686,0.586281,-0.011175,-0.048334
117,117,2.249683,1.726266,1.67261,1.316483,0.944444,0.75,0.966102,0.875,0.974684,...,0.971014,0.88,1.461538,1.6,0.671429,0.6,3.488765,1.171545,0.12932,-0.31163
83,83,0.18983,-0.819166,0.762759,0.527645,0.472222,0.291667,0.694915,0.625,0.759494,...,0.73913,0.64,0.153846,-0.6,0.214286,0.2,1.378737,-0.421205,0.01548,-0.177581
105,105,2.128516,-0.124958,1.615744,1.18501,0.916667,0.416667,0.949153,0.833333,0.962025,...,0.956522,0.84,1.384615,0.0,0.642857,0.533333,3.39688,0.547168,-0.351873,-0.11122


In [10]:
# Summary statistics for every column returned by the generated SQL.
sql_output.describe()

Unnamed: 0,KEY,std_scaler_output_1,std_scaler_output_2,std_scaler_output_3,std_scaler_output_4,minmax_scaler_output_1,minmax_scaler_output_2,minmax_scaler_output_3,minmax_scaler_output_4,maxabs_scaler_output_1,...,maxabs_scaler_output_3,maxabs_scaler_output_4,robust_scaler_output_1,robust_scaler_output_2,robust_scaler_output_3,robust_scaler_output_4,pca_4comps_output_1,pca_4comps_output_2,pca_4comps_output_3,pca_4comps_output_4
count,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,...,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0
mean,74.5,4.038062e-12,5.447494e-16,-1.895539e-12,-4.38168e-12,0.4287037,0.4391667,0.467571,0.457778,0.739662,...,0.544734,0.479467,0.033333,0.108,-0.1689524,-0.067556,-2.844634e-12,3.023324e-12,-4.008334e-12,1.526192e-13
std,43.445368,1.00335,1.00335,1.00335,1.00335,0.2300184,0.1806643,0.299054,0.317984,0.104818,...,0.255713,0.305264,0.636974,0.867189,0.5041201,0.508774,2.055442,0.4921825,0.2802212,0.1538929
min,0.0,-1.870024,-2.438987,-1.568735,-1.44445,5.399903e-12,9.999779e-13,0.0,0.0,0.544304,...,0.144928,0.04,-1.153846,-2.0,-0.9571429,-0.8,-3.2252,-1.262492,-0.6919416,-0.5035295
25%,37.25,-0.9006812,-0.5877635,-1.227541,-1.181504,0.2222222,0.3333333,0.101695,0.083333,0.64557,...,0.231884,0.12,-0.538462,-0.4,-0.7857143,-0.666667,-2.530159,-0.3235986,-0.202723,-0.07618597
50%,74.5,-0.05250608,-0.1249576,0.3362659,0.1332259,0.4166667,0.4166667,0.567797,0.5,0.734177,...,0.630435,0.52,0.0,0.0,1.500536e-16,0.0,0.553329,-0.03251102,0.01557634,-0.000432201
75%,111.75,0.6745011,0.5692513,0.7627586,0.7905908,0.5833333,0.5416667,0.694915,0.708333,0.810127,...,0.73913,0.72,0.461538,0.6,0.2142857,0.333333,1.549463,0.3288601,0.1780586,0.08895186
max,149.0,2.492019,3.114684,1.786341,1.710902,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.615385,2.8,0.7285714,0.8,3.794687,1.370524,0.7584587,0.504095


## Scikit-learn Prediction

In [11]:
# NOTE(review): `skl_outputs` is not referenced by any other visible cell.
skl_outputs = pd.DataFrame()

# Row numbers 0..n-1 act as the key, mirroring the KEY column of INPUT_DATA.
skl_output_key = pd.DataFrame({'KEY': list(range(X.shape[0]))})
# Apply the fitted FeatureUnion directly in scikit-learn.
skl_output_FU = pd.DataFrame(clf.transform(X))
skl_output = pd.concat([skl_output_key, skl_output_FU], axis=1)

# The transformed array carries no column names, so the labels of the SQL
# result are reused. This assumes both frames list their columns in the same
# order (KEY first) -- TODO confirm if the SQL generator changes.
skl_output.columns = sql_output.columns
skl_output.sample(12, random_state=1960)


Unnamed: 0,KEY,std_scaler_output_1,std_scaler_output_2,std_scaler_output_3,std_scaler_output_4,minmax_scaler_output_1,minmax_scaler_output_2,minmax_scaler_output_3,minmax_scaler_output_4,maxabs_scaler_output_1,...,maxabs_scaler_output_3,maxabs_scaler_output_4,robust_scaler_output_1,robust_scaler_output_2,robust_scaler_output_3,robust_scaler_output_4,pca_4comps_output_1,pca_4comps_output_2,pca_4comps_output_3,pca_4comps_output_4
114,114,-0.052506,-0.587764,0.762759,1.579429,0.416667,0.333333,0.694915,0.958333,0.734177,...,0.73913,0.96,0.0,-0.4,0.214286,0.733333,1.585267,-0.539307,0.63057,0.327455
74,74,0.674501,-0.356361,0.307833,0.133226,0.583333,0.375,0.559322,0.5,0.810127,...,0.623188,0.52,0.461538,-0.2,-0.014286,0.0,0.714008,0.150379,-0.320372,0.042941
9,9,-1.143017,0.106445,-1.284407,-1.44445,0.166667,0.458333,0.084746,0.0,0.620253,...,0.217391,0.04,-0.692308,0.2,-0.814286,-0.8,-2.673845,-0.106692,-0.191533,-0.055891
88,88,-0.294842,-0.124958,0.194102,0.133226,0.361111,0.416667,0.525424,0.5,0.708861,...,0.594203,0.52,-0.153846,0.0,-0.071429,0.0,0.245195,-0.266728,0.189562,-0.147328
25,25,-1.021849,-0.124958,-1.227541,-1.312977,0.194444,0.416667,0.101695,0.041667,0.632911,...,0.231884,0.08,-0.615385,0.0,-0.785714,-0.733333,-2.507917,-0.139056,-0.247116,0.035384
5,5,-0.537178,1.957669,-1.170675,-1.050031,0.305556,0.791667,0.118644,0.125,0.683544,...,0.246377,0.16,-0.307692,1.8,-0.757143,-0.6,-2.279897,0.747783,0.174326,-0.027147
48,48,-0.658345,1.494863,-1.284407,-1.312977,0.277778,0.708333,0.084746,0.041667,0.670886,...,0.217391,0.08,-0.384615,1.4,-0.814286,-0.733333,-2.542686,0.586281,-0.011175,-0.048334
117,117,2.249683,1.726266,1.67261,1.316483,0.944444,0.75,0.966102,0.875,0.974684,...,0.971014,0.88,1.461538,1.6,0.671429,0.6,3.488765,1.171545,0.12932,-0.31163
83,83,0.18983,-0.819166,0.762759,0.527645,0.472222,0.291667,0.694915,0.625,0.759494,...,0.73913,0.64,0.153846,-0.6,0.214286,0.2,1.378737,-0.421205,0.01548,-0.177581
105,105,2.128516,-0.124958,1.615744,1.18501,0.916667,0.416667,0.949153,0.833333,0.962025,...,0.956522,0.84,1.384615,0.0,0.642857,0.533333,3.39688,0.547168,-0.351873,-0.11122


## Comparing the SQL and Scikit-learn Predictions

In [12]:
# DataFrame.join with `on='KEY'` matches the KEY column of skl_output against
# the *index* of sql_output (not against its KEY column). The two line up
# because sql_output was sorted by KEY and re-indexed from zero when it was
# loaded. Column names present in both frames receive the `_skl` / `_sql`
# suffixes, which the error computation later relies on.
sql_skl_join = skl_output.join(sql_output , how='left', on='KEY', lsuffix='_skl', rsuffix='_sql')

In [13]:
# Fixed-seed sample of the joined frame, with the scikit-learn (`_skl`) and
# SQL (`_sql`) columns side by side.
sql_skl_join.sample(12, random_state=1960)

Unnamed: 0,KEY_skl,std_scaler_output_1_skl,std_scaler_output_2_skl,std_scaler_output_3_skl,std_scaler_output_4_skl,minmax_scaler_output_1_skl,minmax_scaler_output_2_skl,minmax_scaler_output_3_skl,minmax_scaler_output_4_skl,maxabs_scaler_output_1_skl,...,maxabs_scaler_output_3_sql,maxabs_scaler_output_4_sql,robust_scaler_output_1_sql,robust_scaler_output_2_sql,robust_scaler_output_3_sql,robust_scaler_output_4_sql,pca_4comps_output_1_sql,pca_4comps_output_2_sql,pca_4comps_output_3_sql,pca_4comps_output_4_sql
114,114,-0.052506,-0.587764,0.762759,1.579429,0.416667,0.333333,0.694915,0.958333,0.734177,...,0.73913,0.96,0.0,-0.4,0.214286,0.733333,1.585267,-0.539307,0.63057,0.327455
74,74,0.674501,-0.356361,0.307833,0.133226,0.583333,0.375,0.559322,0.5,0.810127,...,0.623188,0.52,0.461538,-0.2,-0.014286,0.0,0.714008,0.150379,-0.320372,0.042941
9,9,-1.143017,0.106445,-1.284407,-1.44445,0.166667,0.458333,0.084746,0.0,0.620253,...,0.217391,0.04,-0.692308,0.2,-0.814286,-0.8,-2.673845,-0.106692,-0.191533,-0.055891
88,88,-0.294842,-0.124958,0.194102,0.133226,0.361111,0.416667,0.525424,0.5,0.708861,...,0.594203,0.52,-0.153846,0.0,-0.071429,0.0,0.245195,-0.266728,0.189562,-0.147328
25,25,-1.021849,-0.124958,-1.227541,-1.312977,0.194444,0.416667,0.101695,0.041667,0.632911,...,0.231884,0.08,-0.615385,0.0,-0.785714,-0.733333,-2.507917,-0.139056,-0.247116,0.035384
5,5,-0.537178,1.957669,-1.170675,-1.050031,0.305556,0.791667,0.118644,0.125,0.683544,...,0.246377,0.16,-0.307692,1.8,-0.757143,-0.6,-2.279897,0.747783,0.174326,-0.027147
48,48,-0.658345,1.494863,-1.284407,-1.312977,0.277778,0.708333,0.084746,0.041667,0.670886,...,0.217391,0.08,-0.384615,1.4,-0.814286,-0.733333,-2.542686,0.586281,-0.011175,-0.048334
117,117,2.249683,1.726266,1.67261,1.316483,0.944444,0.75,0.966102,0.875,0.974684,...,0.971014,0.88,1.461538,1.6,0.671429,0.6,3.488765,1.171545,0.12932,-0.31163
83,83,0.18983,-0.819166,0.762759,0.527645,0.472222,0.291667,0.694915,0.625,0.759494,...,0.73913,0.64,0.153846,-0.6,0.214286,0.2,1.378737,-0.421205,0.01548,-0.177581
105,105,2.128516,-0.124958,1.615744,1.18501,0.916667,0.416667,0.949153,0.833333,0.962025,...,0.956522,0.84,1.384615,0.0,0.642857,0.533333,3.39688,0.547168,-0.351873,-0.11122


In [14]:
# Per-column difference between the scikit-learn and the SQL results.
# KEY is part of sql_output.columns, so a KEY_Error column is produced as
# well; it should be zero everywhere when the join is aligned.
errors_df = pd.DataFrame()
for col in sql_output.columns:
    skl_name = col + '_skl'
    sql_name = col + '_sql'
    errors_df[col + '_Error'] = sql_skl_join[skl_name] - sql_skl_join[sql_name]

# Differences should be at floating-point noise level.
errors_df.describe()


Unnamed: 0,KEY_Error,std_scaler_output_1_Error,std_scaler_output_2_Error,std_scaler_output_3_Error,std_scaler_output_4_Error,minmax_scaler_output_1_Error,minmax_scaler_output_2_Error,minmax_scaler_output_3_Error,minmax_scaler_output_4_Error,maxabs_scaler_output_1_Error,...,maxabs_scaler_output_3_Error,maxabs_scaler_output_4_Error,robust_scaler_output_1_Error,robust_scaler_output_2_Error,robust_scaler_output_3_Error,robust_scaler_output_4_Error,pca_4comps_output_1_Error,pca_4comps_output_2_Error,pca_4comps_output_3_Error,pca_4comps_output_4_Error
count,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,...,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0
mean,0.0,-4.039568e-12,-2.21693e-15,1.894043e-12,4.380969e-12,-5.742985e-12,-1.351365e-12,-7.481284e-13,-3.661886e-13,3.9227880000000004e-17,...,3.4231880000000004e-17,-5.551115e-18,-2.137179e-17,-1.554312e-17,9.598803e-18,-4.8664780000000007e-17,2.841861e-12,-3.024171e-12,4.008147e-12,-1.519275e-13
std,0.0,1.719557e-13,1.059488e-12,2.556504e-12,2.264527e-13,1.840046e-13,1.445515e-13,4.784936e-13,2.54371e-13,2.833115e-16,...,2.852798e-16,4.7587640000000006e-17,1.084674e-15,2.76557e-16,2.586577e-16,2.938771e-16,3.892743e-13,4.075556e-13,2.284275e-13,4.026851e-13
min,0.0,-4.463097e-12,-2.581935e-12,-2.103429e-12,3.998135e-12,-6.199929e-12,-1.799894e-12,-1.600053e-12,-8.000267e-13,-4.440892e-16,...,-4.440892e-16,-1.110223e-16,-5.107026e-15,-8.881784e-16,-5.551115e-16,-4.440892e-16,2.137401e-12,-3.991807e-12,3.363976e-12,-1.062095e-12
25%,0.0,-4.155232e-12,-6.230572e-13,-1.229239e-12,4.202749e-12,-5.866529e-12,-1.43352e-12,-1.111888e-12,-5.666578e-13,-2.220446e-16,...,-3.330669e-16,0.0,-4.57967e-16,-2.220446e-16,-1.665335e-16,-3.885781e-16,2.549128e-12,-3.392038e-12,3.857432e-12,-4.567093e-13
50%,0.0,-4.031046e-12,-1.343509e-13,2.750911e-12,4.351602e-12,-5.733469e-12,-1.333267e-12,-9.084955e-13,-3.999023e-13,1.110223e-16,...,8.326673e-17,0.0,-1.387779e-17,0.0,-1.0408340000000001e-17,-1.387779e-17,2.721628e-12,-2.978465e-12,3.997318e-12,-1.337038e-13
75%,0.0,-3.885892e-12,5.996315e-13,3.837486e-12,4.648282e-12,-5.577899e-12,-1.266764e-12,-1.627032e-13,-6.666889e-14,3.330669e-16,...,2.63678e-16,0.0,3.330669e-16,2.081668e-16,2.220446e-16,3.053113e-16,3.210654e-12,-2.722336e-12,4.189968e-12,2.191199e-13
max,0.0,-3.720135e-12,3.284484e-12,6.446621e-12,4.71112e-12,-5.399903e-12,-9.999779e-13,0.0,0.0,4.440892e-16,...,4.996004e-16,1.110223e-16,3.996803e-15,8.881784e-16,4.996004e-16,3.330669e-16,3.800515e-12,-1.986189e-12,4.579198e-12,5.906768e-13
