In [1]:
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pickle, json, requests, base64


## Build a scikit-learn model

In [2]:
from sklearn import datasets

# Load the iris dataset bundled with scikit-learn and keep its feature
# matrix (X) and class labels (Y) for the following cells.
iris = datasets.load_iris()
X, Y = iris.data, iris.target

In [3]:
from sklearn.naive_bayes import GaussianNB
# Fit a Gaussian naive Bayes classifier on the full iris data (no train/test split here).
clf = GaussianNB()
# Left as the last expression so the notebook displays the fitted estimator's repr.
clf.fit(X, Y)

GaussianNB(priors=None)

In [4]:
#clf.__dict__

## Generate SQL Code from the Model

In [5]:

def test_ws_sql_gen(pickle_data, sql_dialect="postgresql", timeout=120):
    """Ask the sklearn2sql web service to translate a pickled model into SQL.

    Parameters
    ----------
    pickle_data : bytes
        The model serialized with ``pickle.dumps``.
    sql_dialect : str, optional
        Value sent to the service as ``SQLDialect``. Defaults to
        ``"postgresql"``, the dialect this function originally hard-coded.
    timeout : float, optional
        Seconds to wait for the service before ``requests`` gives up, so a
        stalled service cannot block the notebook forever.

    Returns
    -------
    The ``"SQL"`` entry of the first ``"SQLGenrationResult"`` item in the
    service's JSON reply.

    Raises
    ------
    requests.HTTPError
        If the service answers with an HTTP error status.
    requests.Timeout
        If no answer arrives within ``timeout`` seconds.
    """
    WS_URL = "https://sklearn2sql.herokuapp.com/model"
    # b64encode returns bytes on Python 3, which the JSON encoder used by
    # requests cannot serialize; send the base64 payload as ASCII text instead.
    b64_data = base64.b64encode(pickle_data).decode("ascii")
    data = {"Name": "model1", "PickleData": b64_data, "SQLDialect": sql_dialect}
    r = requests.post(WS_URL, json=data, timeout=timeout)
    # Fail with the HTTP status rather than an unrelated KeyError / JSON error below.
    r.raise_for_status()
    content = r.json()
    # print(content)
    # The key spelling "SQLGenrationResult" is kept exactly as the original lookup had it.
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL


In [6]:
# Serialize the fitted classifier, send it to the SQL-generation service,
# and preview the first 2000 characters of the SQL it returns.
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)
print(lSQL[:2000])

WITH "RawScores" AS 
(SELECT "ADS"."KEY" AS "KEY", CAST(NULL AS FLOAT) AS "LogProba_0", CAST(NULL AS FLOAT) AS "Proba_0", -1.09861228867 + (-0.5 * -0.267793442091 - (0.5 * (CAST("ADS"."Feature_0" AS FLOAT) - 5.006) * (CAST("ADS"."Feature_0" AS FLOAT) - 5.006)) / 0.121764003092) + (-0.5 * -0.112109357731 - (0.5 * (CAST("ADS"."Feature_1" AS FLOAT) - 3.418) * (CAST("ADS"."Feature_1" AS FLOAT) - 3.418)) / 0.142276003092) + (-0.5 * -1.68535226038 - (0.5 * (CAST("ADS"."Feature_2" AS FLOAT) - 1.464) * (CAST("ADS"."Feature_2" AS FLOAT) - 1.464)) / 0.0295040030924) + (-0.5 * -2.64826613862 - (0.5 * (CAST("ADS"."Feature_3" AS FLOAT) - 0.244) * (CAST("ADS"."Feature_3" AS FLOAT) - 0.244)) / 0.0112640030924) AS "Score_0", CAST(NULL AS FLOAT) AS "LogProba_1", CAST(NULL AS FLOAT) AS "Proba_1", -1.09861228867 + (-0.5 * 0.495040594659 - (0.5 * (CAST("ADS"."Feature_0" AS FLOAT) - 5.936) * (CAST("ADS"."Feature_0" AS FLOAT) - 5.936)) / 0.261104003092) + (-0.5 * -0.500335172182 - (0.5 * (CAST("ADS"."Featur

## Execute the SQL Code

In [7]:
# Store the iris features in a database table so the generated SQL has
# something to run against.

#engine = sa.create_engine('sqlite://' , echo=False)
# NOTE(review): the connection URL embeds a user/password pair; presumably a
# throwaway local database — confirm before sharing this notebook.
engine = sa.create_engine("postgresql://db:db@localhost/db?port=5432", echo=False)
conn = engine.connect()

# One column per feature plus a sequential KEY column (0 .. n_rows - 1).
lTable = pd.DataFrame(X, columns=['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3'])
lTable['KEY'] = range(len(lTable))
lTable.to_sql("INPUT_DATA", conn, if_exists='replace', index=False)


In [8]:
# Run the generated SQL on the open connection. The connection is closed in
# a `finally` clause so it is released even when the query raises.
try:
    sql_output = pd.read_sql(lSQL, conn)
finally:
    conn.close()
# Order rows by KEY and renumber the index 0..n-1.
sql_output = sql_output.sort_values(by='KEY').reset_index(drop=True)

In [9]:
# Show a fixed-seed random sample of 12 rows from the SQL results.
sql_output.sample(n=12, random_state=1960)

Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision,DecisionProba
114,114,,,,2.75153e-187,1.042537e-06,0.999999,-429.5713,-13.773854,-1.042537e-06,2,0.999999
74,74,,,,2.7170780000000002e-84,0.9984608,0.001539184,-192.4176,-0.00154,-6.476503,1,0.998461
9,9,,,,1.0,3.203442e-18,1.129895e-25,0.0,-40.282306,-57.4425,0,1.0
88,88,,,,3.078949e-73,0.9997963,0.0002037301,-166.9641,-0.000204,-8.498714,1,0.999796
25,25,,,,1.0,3.378233e-16,6.395328e-24,-4.440892e-16,-35.624009,-53.40647,0,1.0
5,5,,,,1.0,1.525989e-14,1.798834e-21,-1.532108e-14,-31.813548,-47.76715,0,1.0
48,48,,,,1.0,2.704142e-18,2.8649280000000002e-25,0.0,-40.451747,-56.51208,0,1.0
117,117,,,,3.753023e-285,1.645749e-12,1.0,-654.9142,-27.132825,-1.645795e-12,2,1.0
83,83,,,,2.611248e-134,0.6121598,0.3878402,-307.5866,-0.490762,-0.947162,1,0.61216
105,105,,,,4.516547e-271,2.40977e-10,1.0,-622.4928,-22.14632,-2.40977e-10,2,1.0


In [10]:
# Count how many rows the SQL query assigned to each class.
sql_output['Decision'].value_counts()

2    50
1    50
0    50
Name: Decision, dtype: int64

## Scikit-learn Prediction

In [11]:
# Build a frame of scikit-learn predictions using the same column names as
# the SQL output, so the two can be compared side by side.
# NOTE(review): `skl_outputs` is not read by any cell visible here — possibly a leftover.
skl_outputs = pd.DataFrame()
skl_output_key = pd.DataFrame({'KEY': list(range(X.shape[0]))})
# Empty frame: after the column-wise concat below, the Score_* columns hold only missing values.
skl_output_score = pd.DataFrame(columns=['Score_0', 'Score_1', 'Score_2'])
skl_output_proba = pd.DataFrame(
    clf.predict_proba(X),
    columns=['Proba_0', 'Proba_1', 'Proba_2'],
)
skl_output_log_proba = pd.DataFrame(
    clf.predict_log_proba(X),
    columns=['LogProba_0', 'LogProba_1', 'LogProba_2'],
)
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])
skl_output = pd.concat(
    [
        skl_output_key,
        skl_output_score,
        skl_output_proba,
        skl_output_log_proba,
        skl_output_decision,
    ],
    axis=1,
)
skl_output.sample(n=12, random_state=1960)


Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
114,114,,,,2.75153e-187,1.042537e-06,0.999999,-429.5713,-13.773854,-1.042537e-06,2
74,74,,,,2.7170780000000002e-84,0.9984608,0.001539184,-192.4176,-0.00154,-6.476503,1
9,9,,,,1.0,3.203442e-18,1.129895e-25,0.0,-40.282306,-57.4425,0
88,88,,,,3.078949e-73,0.9997963,0.0002037301,-166.9641,-0.000204,-8.498714,1
25,25,,,,1.0,3.378233e-16,6.395328e-24,-4.440892e-16,-35.624009,-53.40647,0
5,5,,,,1.0,1.525989e-14,1.798834e-21,-1.509903e-14,-31.813548,-47.76715,0
48,48,,,,1.0,2.704142e-18,2.8649280000000002e-25,0.0,-40.451747,-56.51208,0
117,117,,,,3.753023e-285,1.645749e-12,1.0,-654.9142,-27.132825,-1.644906e-12,2
83,83,,,,2.611248e-134,0.6121598,0.3878402,-307.5866,-0.490762,-0.947162,1
105,105,,,,4.516547e-271,2.40977e-10,1.0,-622.4928,-22.14632,-2.40977e-10,2


## Comparing the SQL and Scikit-learn Predictions

In [12]:
# Left-join the SQL results onto the scikit-learn results.
# With `on='KEY'`, DataFrame.join matches skl_output's KEY column against
# sql_output's *index* labels (not its KEY column). Overlapping column names
# receive the `_skl` / `_sql` suffixes.
sql_skl_join = skl_output.join(
    sql_output,
    on='KEY',
    how='left',
    lsuffix='_skl',
    rsuffix='_sql',
)

In [13]:
# Show a random sample of 12 joined rows. The seed is fixed to the same value
# as the other sampling cells so a re-run displays the same rows.
sql_skl_join.sample(12, random_state=1960)

Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql,DecisionProba
129,129,,,,1.613376e-181,0.0007018057,0.9992982,-416.2896,-7.261854,-0.0007020521,...,,,1.613376e-181,0.0007018057,0.9992982,-416.2896,-7.261854,-0.0007020521,2,0.999298
137,137,,,,1.308683e-169,0.001902276,0.9980977,-388.8679,-6.264704,-0.001904088,...,,,1.308683e-169,0.001902276,0.9980977,-388.8679,-6.264704,-0.001904088,2,0.998098
107,107,,,,2.221915e-227,1.340181e-06,0.9999987,-521.8884,-13.522706,-1.340182e-06,...,,,2.221915e-227,1.340181e-06,0.9999987,-521.8884,-13.522706,-1.340182e-06,2,0.999999
21,21,,,,1.0,6.397262e-16,2.980661e-23,-6.661338e-16,-34.985491,-51.86731,...,,,1.0,6.397262e-16,2.980661e-23,-6.661338e-16,-34.985491,-51.86731,0,1.0
92,92,,,,8.315456e-67,0.9999662,3.377132e-05,-152.1551,-3.4e-05,-10.2959,...,,,8.315456e-67,0.9999662,3.377132e-05,-152.1551,-3.4e-05,-10.2959,1,0.999966
54,54,,,,9.049382e-107,0.9524418,0.04755819,-244.1739,-0.048726,-3.045801,...,,,9.049382e-107,0.9524418,0.04755819,-244.1739,-0.048726,-3.045801,1,0.952442
100,100,,,,4.137795e-251,6.35381e-11,1.0,-576.5287,-23.479381,-6.353806e-11,...,,,4.137795e-251,6.35381e-11,1.0,-576.5287,-23.479381,-6.353806e-11,2,1.0
23,23,,,,1.0,1.883706e-11,3.4769459999999996e-19,-1.883693e-11,-24.695195,-42.50296,...,,,1.0,1.883706e-11,3.4769459999999996e-19,-1.883715e-11,-24.695195,-42.50296,0,1.0
142,142,,,,5.008456e-151,0.02501216,0.9749878,-346.0792,-3.688393,-0.02533028,...,,,5.008456e-151,0.02501216,0.9749878,-346.0792,-3.688393,-0.02533028,2,0.974988
84,84,,,,3.716474e-98,0.9924766,0.007523362,-224.3406,-0.007552,-4.889742,...,,,3.716474e-98,0.9924766,0.007523362,-224.3406,-0.007552,-4.889742,1,0.992477


In [14]:
# Select the rows where the SQL decision and the scikit-learn decision differ;
# an empty result means the two agree on every row.
condition = sql_skl_join['Decision_sql'].ne(sql_skl_join['Decision_skl'])
sql_skl_join.loc[condition]


Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql,DecisionProba
