In [1]:
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pickle, json, requests, base64


## Build a scikit-learn model

In [2]:
from sklearn import datasets

# Load scikit-learn's bundled iris dataset and split it into the
# feature array (X) and the class labels (Y) used by the rest of the notebook.
iris = datasets.load_iris()
X, Y = iris.data, iris.target
# Uncomment to read the dataset description:
# print(iris.DESCR)

In [3]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes classifier on the whole dataset; the fitted
# estimator is the cell's last expression so its repr is displayed.
clf = GaussianNB()
clf.fit(X=X, y=Y)

GaussianNB(priors=None)

In [4]:
# Uncomment to inspect the fitted model's attributes:
# clf.__dict__

## Generate SQL Code from the Model

In [5]:

def test_ws_sql_gen(pickle_data):
    WS_URL="http://192.168.88.88:1888/model"
    b64_data = base64.b64encode(pickle_data)
    data={"Name":"model1", "PickleData":b64_data , "SQLDialect":"postgresql"}
    r = requests.post(WS_URL, json=data)
    content = r.json()
    # print(content)
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL;


In [6]:
# Serialize the fitted classifier, ask the web service to translate it to SQL,
# and preview the first 2000 characters of the generated statement.
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)
print(lSQL[:2000])

WITH "RawScores" AS 
(SELECT "ADS"."KEY" AS "KEY", CAST(NULL AS FLOAT(53)) AS "LogProba_0", CAST(NULL AS FLOAT(53)) AS "Proba_0", -1.09861228867 + (-0.5 * -0.267793442091 - (0.5 * ("ADS"."Feature_0" - 5.006) * ("ADS"."Feature_0" - 5.006)) / 0.121764003092) + (-0.5 * -0.112109357731 - (0.5 * ("ADS"."Feature_1" - 3.418) * ("ADS"."Feature_1" - 3.418)) / 0.142276003092) + (-0.5 * -1.68535226038 - (0.5 * ("ADS"."Feature_2" - 1.464) * ("ADS"."Feature_2" - 1.464)) / 0.0295040030924) + (-0.5 * -2.64826613862 - (0.5 * ("ADS"."Feature_3" - 0.244) * ("ADS"."Feature_3" - 0.244)) / 0.0112640030924) AS "Score_0", CAST(NULL AS FLOAT(53)) AS "LogProba_1", CAST(NULL AS FLOAT(53)) AS "Proba_1", -1.09861228867 + (-0.5 * 0.495040594659 - (0.5 * ("ADS"."Feature_0" - 5.936) * ("ADS"."Feature_0" - 5.936)) / 0.261104003092) + (-0.5 * -0.500335172182 - (0.5 * ("ADS"."Feature_1" - 2.77) * ("ADS"."Feature_1" - 2.77)) / 0.0965000030924) + (-0.5 * 0.30725034869 - (0.5 * ("ADS"."Feature_2" - 4.26) * ("ADS"."Feature

## Execute the SQL Code

In [7]:
# Store the feature matrix in the database table "INPUT_DATA".
# The Feature_* / KEY column names match those referenced by the generated SQL.
# NOTE(review): the connection URL below carries hard-coded credentials.

#engine = sa.create_engine('sqlite://' , echo=False)
engine = sa.create_engine("postgresql://db:db@localhost/db?port=5432", echo=False)
conn = engine.connect()

feature_names = ['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3']
lTable = pd.DataFrame(X, columns=feature_names)
# KEY is simply the row number; it is used later to line up the predictions.
lTable['KEY'] = range(len(lTable))
lTable.to_sql("INPUT_DATA", conn, if_exists='replace', index=False)


In [8]:
# Run the generated SQL on the open connection and load the result as a DataFrame.
sql_output = pd.read_sql(sql=lSQL, con=conn)


In [9]:
# Show 12 randomly chosen rows of the SQL result (unseeded, so they vary per run).
sql_output.sample(n=12)

Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
98,98,,,,4.128729e-30,0.9999998,2.313169e-07,-67.65958,-2.313169e-07,-15.27948,1
31,31,,,,1.0,2.195901e-14,6.176507e-22,-2.198242e-14,-31.4496,-48.83612,0
142,142,,,,5.008456e-151,0.02501216,0.9749878,-346.0792,-3.688393,-0.02533028,2
91,91,,,,3.3087269999999996e-100,0.9893715,0.01062853,-229.0619,-0.01068542,-4.544213,1
140,140,,,,5.79668e-220,5.014466e-09,1.0,-504.8114,-19.11094,-5.014466e-09,2
29,29,,,,1.0,4.5976850000000004e-17,1.258399e-24,0.0,-37.61839,-55.0322,0
148,148,,,,5.586495e-197,2.460204e-07,0.9999998,-451.8889,-15.21785,-2.460205e-07,2
121,121,,,,2.825259e-146,0.0137626,0.9862374,-335.1388,-4.285801,-0.01385818,2
120,120,,,,2.9429950000000003e-219,1.171169e-08,1.0,-503.1867,-18.26268,-1.171169e-08,2
130,130,,,,8.555802999999999e-221,9.062384e-07,0.9999991,-506.7247,-13.91396,-9.062389e-07,2


In [10]:
# Count how many rows the SQL model assigned to each class.
sql_output["Decision"].value_counts()

2    50
1    50
0    50
Name: Decision, dtype: int64

## Scikit-learn Prediction

In [11]:
# Assemble scikit-learn's predictions in the same column layout as the SQL output.
skl_output_key = pd.DataFrame(list(range(X.shape[0])), columns=['KEY'])
# No score values are computed here: the frame has columns but no rows, so
# Score_* come out as NaN after the column-wise concat.
skl_output_score = pd.DataFrame(columns=['Score_0', 'Score_1', 'Score_2'])
skl_output_proba = pd.DataFrame(clf.predict_proba(X), columns=['Proba_0', 'Proba_1', 'Proba_2'])
skl_output_log_proba = pd.DataFrame(clf.predict_log_proba(X), columns=['LogProba_0', 'LogProba_1', 'LogProba_2'])
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])
skl_output = pd.concat(
    [skl_output_key, skl_output_score, skl_output_proba, skl_output_log_proba, skl_output_decision],
    axis=1,
)
skl_output.sample(12)


Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
131,131,,,,1.247227e-250,2.967305e-10,1.0,-575.425351,-21.9382,-2.967306e-10,2
33,33,,,,1.0,3.197019e-20,1.428817e-26,0.0,-44.88948,-59.51037,0
29,29,,,,1.0,4.5976850000000004e-17,1.258399e-24,0.0,-37.61839,-55.0322,0
32,32,,,,1.0,6.530873e-21,3.1188770000000003e-27,0.0,-46.47775,-61.03232,0
73,73,,,,4.79428e-97,0.9981565,0.001843481,-221.78333,-0.001845182,-6.2961,1
68,68,,,,5.48395e-102,0.9946971,0.005302892,-233.161854,-0.005317002,-5.239503,1
54,54,,,,9.049382e-107,0.9524418,0.04755819,-244.173908,-0.04872626,-3.045801,1
57,57,,,,1.197348e-34,0.9999998,2.332069e-07,-78.107784,-2.332069e-07,-15.27134,1
108,108,,,,2.1058910000000001e-190,0.0004929018,0.9995071,-436.746429,-7.615201,-0.0004930233,2
19,19,,,,1.0,8.944938000000001e-18,7.106919e-25,0.0,-39.25544,-55.60356,0


## Comparing the SQL and Scikit-learn Predictions

In [12]:
# DataFrame.join matches the caller's `on` column against the *index* of the
# other frame. read_sql returns a positional index, and SQL does not guarantee
# rows come back in KEY order, so index the SQL result by its KEY values first.
# The KEY column itself is kept, so KEY_skl and KEY_sql both stay in the result.
sql_skl_join = skl_output.join(
    sql_output.set_index(sql_output['KEY'].values),
    how='left', on='KEY', lsuffix='_skl', rsuffix='_sql',
)

In [13]:
# Show 12 randomly chosen rows of the joined scikit-learn / SQL predictions.
sql_skl_join.sample(n=12)

Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_0_sql,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql
129,129,,,,1.613376e-181,0.0007018057,0.999298,-416.289573,-7.261854,-0.0007020521,...,,,,1.613376e-181,0.0007018057,0.999298,-416.289573,-7.261854,-0.0007020521,2
113,113,,,,4.082205e-152,0.01288071,0.987119,-348.586297,-4.352025,-0.01296438,...,,,,4.082205e-152,0.01288071,0.987119,-348.586297,-4.352025,-0.01296438,2
52,52,,,,1.5282179999999998e-122,0.4561513,0.543849,-280.491279,-0.784931,-0.6090842,...,,,,1.5282179999999998e-122,0.4561513,0.543849,-280.491279,-0.784931,-0.6090842,2
95,95,,,,1.585577e-73,0.9998499,0.00015,-167.627763,-0.00015,-8.804041,...,,,,1.585577e-73,0.9998499,0.00015,-167.627763,-0.00015,-8.804041,1
117,117,,,,3.753023e-285,1.645749e-12,1.0,-654.91419,-27.132825,-1.644906e-12,...,,,,3.753023e-285,1.645749e-12,1.0,-654.91419,-27.132825,-1.645795e-12,2
96,96,,,,1.026621e-77,0.9997149,0.000285,-177.27278,-0.000285,-8.162834,...,,,,1.026621e-77,0.9997149,0.000285,-177.27278,-0.000285,-8.162834,1
82,82,,,,6.635630999999999e-63,0.9999723,2.8e-05,-143.170407,-2.8e-05,-10.4958,...,,,,6.635630999999999e-63,0.9999723,2.8e-05,-143.170407,-2.8e-05,-10.4958,1
81,81,,,,2.140911e-48,0.9999987,1e-06,-109.762853,-1e-06,-13.51597,...,,,,2.140911e-48,0.9999987,1e-06,-109.762853,-1e-06,-13.51597,1
67,67,,,,2.3153449999999998e-63,0.9999837,1.6e-05,-144.223302,-1.6e-05,-11.02206,...,,,,2.3153449999999998e-63,0.9999837,1.6e-05,-144.223302,-1.6e-05,-11.02206,1
149,149,,,,9.138633999999999e-145,0.05600501,0.943995,-331.662328,-2.882314,-0.05763442,...,,,,9.138633999999999e-145,0.05600501,0.943995,-331.662328,-2.882314,-0.05763442,2


In [14]:
# Rows where the SQL decision differs from scikit-learn's; an empty result
# means both implementations agree on every row.
condition = sql_skl_join["Decision_sql"].ne(sql_skl_join["Decision_skl"])
sql_skl_join.loc[condition]


Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_0_sql,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql
