In [1]:
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pickle, json, requests, base64


## Build a scikit-learn model

In [2]:
from sklearn import datasets

# Load the iris dataset bundled with scikit-learn and split it into the
# feature matrix (X) and the class labels (Y).
iris = datasets.load_iris()
X, Y = iris.data, iris.target
# iris.DESCR holds the textual description of the dataset if it is needed.

In [3]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, trained on the whole dataset.
# fit() is left as the last expression so the cell displays the estimator.
clf = GaussianNB()
clf.fit(X, y=Y)

GaussianNB(priors=None)

In [4]:
#clf.__dict__

## Generate SQL Code from the Model

In [5]:

def test_ws_sql_gen(pickle_data, ws_url="http://192.168.88.88:1888/model", timeout=None):
    """Send a pickled model to the SQL-generation web service and return the SQL.

    Parameters
    ----------
    pickle_data : bytes
        The pickled model, as produced by ``pickle.dumps``.
    ws_url : str, optional
        Endpoint the request is posted to. Defaults to the address that was
        previously hard-coded in this function.
    timeout : float or None, optional
        Passed to ``requests.post``. ``None`` (the default) keeps the former
        behaviour of waiting indefinitely; pass a number of seconds to avoid
        hanging the notebook when the service is unreachable.

    Returns
    -------
    The value found at ``["model"]["SQLGenrationResult"][0]["SQL"]`` in the
    JSON document returned by the service.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    """
    # b64encode() returns bytes on Python 3 and bytes are not JSON
    # serialisable, so turn the (pure ASCII) result into text first.
    b64_data = base64.b64encode(pickle_data).decode("ascii")
    data = {"Name": "model1", "PickleData": b64_data, "SQLDialect": "postgresql"}
    r = requests.post(ws_url, json=data, timeout=timeout)
    # Fail with a clear HTTP error instead of a KeyError/JSON error further down.
    r.raise_for_status()
    content = r.json()
    # "SQLGenrationResult" (sic) is the key exactly as the service spells it.
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL


In [6]:
# Serialise the fitted classifier and hand it to the web service helper,
# which returns the generated SQL.
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)

# Only the first 2000 characters are printed to keep the output readable.
print(lSQL[:2000])

WITH "RawScores" AS 
(SELECT "ADS"."KEY" AS "KEY", CAST(NULL AS FLOAT(53)) AS "LogProba_0", CAST(NULL AS FLOAT(53)) AS "Proba_0", -1.09861228867 + (-0.5 * -0.267793442091 - (0.5 * ("ADS"."Feature_0" - 5.006) * ("ADS"."Feature_0" - 5.006)) / 0.121764003092) + (-0.5 * -0.112109357731 - (0.5 * ("ADS"."Feature_1" - 3.418) * ("ADS"."Feature_1" - 3.418)) / 0.142276003092) + (-0.5 * -1.68535226038 - (0.5 * ("ADS"."Feature_2" - 1.464) * ("ADS"."Feature_2" - 1.464)) / 0.0295040030924) + (-0.5 * -2.64826613862 - (0.5 * ("ADS"."Feature_3" - 0.244) * ("ADS"."Feature_3" - 0.244)) / 0.0112640030924) AS "Score_0", CAST(NULL AS FLOAT(53)) AS "LogProba_1", CAST(NULL AS FLOAT(53)) AS "Proba_1", -1.09861228867 + (-0.5 * 0.495040594659 - (0.5 * ("ADS"."Feature_0" - 5.936) * ("ADS"."Feature_0" - 5.936)) / 0.261104003092) + (-0.5 * -0.500335172182 - (0.5 * ("ADS"."Feature_1" - 2.77) * ("ADS"."Feature_1" - 2.77)) / 0.0965000030924) + (-0.5 * 0.30725034869 - (0.5 * ("ADS"."Feature_2" - 4.26) * ("ADS"."Feature

## Execute the SQL Code

In [7]:
# Store the feature matrix in the database table that the generated SQL reads.

# Kept for reference, an in-memory SQLite engine was used previously:
#engine = sa.create_engine('sqlite://' , echo=False)
# NOTE(review): the connection URL embeds a user name and password; consider
# reading it from the environment before sharing this notebook.
engine = sa.create_engine("postgresql://db:db@localhost/db?port=5432", echo=False)
conn = engine.connect()

# One column per feature plus a running integer KEY identifying each row.
lTable = pd.DataFrame(
    X, columns=['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3']
).assign(KEY=range(X.shape[0]))
lTable.to_sql("INPUT_DATA", con=conn, if_exists='replace', index=False)


In [8]:
# Run the generated SQL on the open connection and collect the result rows.
sql_output = pd.read_sql(sql=lSQL, con=conn)


In [9]:
# Peek at 12 randomly chosen rows of the SQL result.
sql_output.sample(n=12)

Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
135,135,-581.027263,-26.97011,-3.837318,2.135952e-251,8.985786e-11,1.0,-577.1899,-23.132792,-8.98579e-11,2
82,82,-144.27545,-1.105071,-11.600841,6.635630999999999e-63,0.9999723,2.765239e-05,-143.1704,-2.8e-05,-10.4958,1
131,131,-583.416359,-29.929205,-7.991009,1.247227e-250,2.967305e-10,1.0,-575.4254,-21.938197,-2.967304e-10,2
130,130,-509.578491,-16.767758,-2.853796,8.555802999999999e-221,9.062384e-07,0.9999991,-506.7247,-13.913963,-9.062389e-07,2
1,1,0.442626,-38.307886,-56.290206,1.0,1.4820620000000002e-17,2.29744e-25,0.0,-38.750512,-56.73283,0
100,100,-580.273435,-27.224121,-3.74474,4.137795e-251,6.35381e-11,1.0,-576.5287,-23.479381,-6.353806e-11,2
122,122,-631.371544,-26.18387,-5.18385,1.122379e-272,7.582404e-10,1.0,-626.1877,-21.000021,-7.582404e-10,2
38,38,-1.405606,-42.115238,-60.143319,1.0,2.089448e-18,3.094109e-26,0.0,-40.709632,-58.73771,0
51,51,-233.118091,-2.436088,-5.283208,6.188548e-101,0.9451696,0.05483036,-230.7384,-0.056391,-2.903511,1
35,35,-0.176078,-41.349871,-58.973029,1.0,1.313557e-18,2.916143e-26,0.0,-41.173793,-58.79695,0


In [10]:
# Number of rows assigned to each class by the SQL code.
sql_output["Decision"].value_counts()

2    50
1    50
0    50
Name: Decision, dtype: int64

## Scikit-learn Prediction

In [11]:
# Assemble the scikit-learn predictions using the same column names as the
# SQL result: KEY, Score_*, Proba_*, LogProba_*, Decision.
skl_outputs = pd.DataFrame()  # not read by any of the cells shown here
skl_output_key = pd.DataFrame({'KEY': list(range(X.shape[0]))})
# No scores are computed on the scikit-learn side: the frame has the three
# column names but no rows, so these columns come out as NaN after concat.
skl_output_score = pd.DataFrame(columns=['Score_0', 'Score_1', 'Score_2'])
skl_output_proba = pd.DataFrame(
    clf.predict_proba(X),
    columns=['Proba_0', 'Proba_1', 'Proba_2'],
)
skl_output_log_proba = pd.DataFrame(
    clf.predict_log_proba(X),
    columns=['LogProba_0', 'LogProba_1', 'LogProba_2'],
)
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])

# Glue the pieces together side by side (column-wise).
skl_output = pd.concat(
    [
        skl_output_key,
        skl_output_score,
        skl_output_proba,
        skl_output_log_proba,
        skl_output_decision,
    ],
    axis=1,
)
skl_output.sample(n=12)


Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
141,141,,,,1.610931e-184,4.677981e-07,0.9999995,-423.198845,-14.57523,-4.677982e-07,2
9,9,,,,1.0,3.203442e-18,1.129895e-25,0.0,-40.28231,-57.4425,0
64,64,,,,1.725091e-55,0.9999754,2.461264e-05,-126.0969,-2.461294e-05,-10.61225,1
7,7,,,,1.0,6.576153e-18,2.79021e-25,0.0,-39.56308,-56.53851,0
84,84,,,,3.716474e-98,0.9924766,0.007523362,-224.340564,-0.007551805,-4.889742,1
102,102,,,,1.0494170000000001e-218,1.679154e-07,0.9999998,-501.915316,-15.59981,-1.679154e-07,2
30,30,,,,1.0,1.050324e-16,2.326775e-24,0.0,-36.79226,-54.41756,0
145,145,,,,6.477323e-189,5.85508e-07,0.9999994,-433.320275,-14.35079,-5.855081e-07,2
122,122,,,,1.122379e-272,7.582404e-10,1.0,-626.187694,-21.00002,-7.582406e-10,2
93,93,,,,6.269125e-35,0.9999998,2.024879e-07,-78.754841,-2.024879e-07,-15.41259,1


## Comparing the SQL and Scikit-learn Predictions

In [12]:
# DataFrame.join(..., on='KEY') matches skl_output['KEY'] against the *index*
# of the other frame, not against its 'KEY' column. The index of sql_output
# is just the row position returned by read_sql, and the database gives no
# guarantee about row order, so index the SQL rows by their KEY value before
# joining. The resulting column names (…_skl / …_sql) are unchanged.
sql_output_by_key = sql_output.copy()
sql_output_by_key.index = sql_output['KEY'].values
sql_skl_join = skl_output.join(sql_output_by_key, how='left', on='KEY', lsuffix='_skl', rsuffix='_sql')

In [13]:
# Peek at 12 randomly chosen rows of the joined predictions.
sql_skl_join.sample(n=12)

Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_0_sql,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql
122,122,,,,1.122379e-272,7.582404e-10,1.0,-626.187694,-21.000021,-7.582406e-10,...,-631.371544,-26.18387,-5.18385,1.122379e-272,7.582404e-10,1.0,-626.187694,-21.000021,-7.582404e-10,2
90,90,,,,3.927107e-82,0.9998737,0.0001263203,-187.444075,-0.000126,-8.97669,...,-188.747817,-1.303869,-10.280432,3.927107e-82,0.9998737,0.0001263203,-187.444075,-0.000126,-8.97669,1
62,62,,,,6.401655e-61,0.999994,6.016325e-06,-138.601134,-6e-06,-12.02103,...,-142.372819,-3.771691,-15.792719,6.401655e-61,0.999994,6.016325e-06,-138.601134,-6e-06,-12.02103,1
141,141,,,,1.610931e-184,4.677981e-07,0.9999995,-423.198845,-14.575229,-4.677982e-07,...,-425.512466,-16.888851,-2.313622,1.610931e-184,4.677981e-07,0.9999995,-423.198845,-14.575229,-4.677982e-07,2
46,46,,,,1.0,2.396423e-18,3.1190920000000003e-25,0.0,-40.572554,-56.42709,...,0.309658,-40.262896,-56.117427,1.0,2.396423e-18,3.1190920000000003e-25,0.0,-40.572554,-56.42709,0
76,76,,,,4.950122e-113,0.9128444,0.08715556,-258.592703,-0.09119,-2.440061,...,-261.218561,-2.717048,-5.065919,4.950122e-113,0.9128444,0.08715556,-258.592703,-0.09119,-2.440061,1
86,86,,,,1.057867e-111,0.7992948,0.2007052,-255.530691,-0.224026,-1.605918,...,-258.368669,-3.062004,-4.443897,1.057867e-111,0.7992948,0.2007052,-255.530691,-0.224026,-1.605918,1
78,78,,,,4.1970269999999997e-100,0.9864803,0.01351973,-228.824133,-0.013612,-4.303605,...,-229.971706,-1.161185,-5.451178,4.1970269999999997e-100,0.9864803,0.01351973,-228.824133,-0.013612,-4.303605,1
48,48,,,,1.0,2.704142e-18,2.8649280000000002e-25,0.0,-40.451747,-56.51208,...,0.515844,-39.935903,-55.99624,1.0,2.704142e-18,2.8649280000000002e-25,0.0,-40.451747,-56.51208,0
143,143,,,,2.540294e-231,4.42556e-09,1.0,-530.964877,-19.235869,-4.42556e-09,...,-533.245692,-21.516685,-2.280816,2.540294e-231,4.42556e-09,1.0,-530.964877,-19.235869,-4.42556e-09,2


In [14]:
# Rows on which the SQL decision and the scikit-learn decision disagree.
condition = sql_skl_join["Decision_sql"].ne(sql_skl_join["Decision_skl"])
sql_skl_join.loc[condition]


Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_0_sql,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql
