In [1]:
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pickle, json, requests, base64


## Build a scikit-learn model

In [2]:
from sklearn import datasets

# Load the iris dataset bundled with scikit-learn and split it into the
# inputs (X = iris.data) and the targets (Y = iris.target) used for fitting.
iris = datasets.load_iris()
X, Y = iris.data, iris.target
# print(iris.DESCR)

In [3]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression with the library's default settings on all of X/Y.
# `fit` returns the estimator itself, so `clf` is the fitted model; the bare
# name on the last line keeps the estimator repr as the cell output.
clf = LogisticRegression().fit(X, Y)
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [4]:
#clf.__dict__

## Generate SQL Code from the Model

In [5]:

def test_ws_sql_gen(pickle_data):
    WS_URL="http://192.168.88.88:1888/model"
    b64_data = base64.b64encode(pickle_data)
    data={"Name":"model1", "PickleData":b64_data , "SQLDialect":"postgresql"}
    r = requests.post(WS_URL, json=data)
    content = r.json()
    # print(content)
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL;


In [6]:
# Serialize the fitted model and ask the web service for the equivalent SQL.
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)
# The statement is long; show only its first 2000 characters.
print(lSQL[:2000])

WITH linear_model_cte AS 
(SELECT "ADS"."KEY" AS "KEY", 0.41498832829570037 * "ADS"."Feature_0" + 1.461297388562228 * "ADS"."Feature_1" + -2.2621411772020728 * "ADS"."Feature_2" + -1.02909509924489 * "ADS"."Feature_3" + 0.265606167976 AS "Score_0", 0.4166396855952165 * "ADS"."Feature_0" + -1.6008331852575775 * "ADS"."Feature_1" + 0.5776576286775883 * "ADS"."Feature_2" + -1.3855384286634425 * "ADS"."Feature_3" + 1.08542374239 AS "Score_1", -1.7075251538239065 * "ADS"."Feature_0" + -1.534268339988975 * "ADS"."Feature_1" + 2.470971680772018 * "ADS"."Feature_2" + 2.5553821129820897 * "ADS"."Feature_3" + -1.21471457808 AS "Score_2" 
FROM "INPUT_DATA" AS "ADS"), 
orig_cte AS 
(SELECT linear_model_cte."KEY" AS "KEY", linear_model_cte."Score_0" AS "Score_0", linear_model_cte."Score_1" AS "Score_1", linear_model_cte."Score_2" AS "Score_2", CAST(NULL AS FLOAT) AS "Proba_0", CAST(NULL AS FLOAT) AS "Proba_1", CAST(NULL AS FLOAT) AS "Proba_2", CAST(NULL AS FLOAT) AS "LogProba_0", CAST(NULL AS FLOAT

## Execute the SQL Code

In [7]:
# Store the dataset in the INPUT_DATA table that the generated SQL selects from.

# Alternative backend (in-memory SQLite):
#engine = sa.create_engine('sqlite://' , echo=False)
# NOTE(review): the connection URL embeds the database credentials; acceptable
# for a local demo database, but consider reading it from the environment.
engine = sa.create_engine("postgresql://db:db@localhost/db?port=5432", echo=False)
conn = engine.connect()

# One column per feature plus a sequential KEY column identifying each row.
lTable = pd.DataFrame(X, columns=['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3'])
lTable['KEY'] = range(len(lTable))
lTable.to_sql(name="INPUT_DATA", con=conn, if_exists='replace', index=False)


In [8]:
# Execute the generated SQL on the open connection and load the rows into a DataFrame.
sql_output = pd.read_sql(sql=lSQL, con=conn)


In [9]:
# Peek at 12 randomly chosen rows of the SQL result.
sql_output.sample(n=12)

Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
120,120,-7.455946,-1.056519,2.055621,7.1e-05,0.042606,0.9573229,-9.555181,-3.155754,-0.043615,2
87,87,-5.050228,0.768831,-1.306668,0.002632,0.88616,0.1112076,-5.939917,-0.120858,-2.196357,1
63,63,-5.037999,-0.240253,-0.888894,0.005388,0.653166,0.3414463,-5.22367,-0.425924,-1.074565,1
54,54,-4.894829,-0.109834,-1.410037,0.006522,0.780743,0.2127342,-5.032504,-0.247509,-1.547712,1
83,83,-6.482433,-0.007795,1.088177,0.000386,0.250399,0.7492153,-7.859339,-1.384701,-0.288729,2
36,36,4.51598,-1.752127,-12.252703,0.998108,0.001892,5.207521e-08,-0.001894,-6.270001,-16.770577,0
103,103,-7.402567,-0.191249,2.015628,7.3e-05,0.099127,0.9007994,-9.522667,-2.31135,-0.104473,2
3,3,3.105544,-1.371238,-9.608028,0.988755,0.011242,2.976148e-06,-0.011309,-4.48809,-12.724881,0
101,101,-6.874159,-0.506785,2.196297,0.000108,0.062785,0.9371072,-9.135413,-2.768039,-0.064958,2
25,25,2.899195,-0.986733,-9.890514,0.979882,0.020116,2.733212e-06,-0.020324,-3.906252,-12.810033,0


In [10]:
# How many rows the SQL code assigned to each value of the Decision column.
sql_output["Decision"].value_counts()

2    54
0    50
1    46
Name: Decision, dtype: int64

## Scikit-learn Prediction

In [12]:
# Compute with scikit-learn the same columns that the generated SQL returns
# (key, per-class scores, probabilities, log-probabilities and final decision)
# so the two result sets can be compared.
# NOTE(review): `skl_outputs` is never read in the visible cells (the frame used
# afterwards is `skl_output`); the binding is kept so the namespace is unchanged.
skl_outputs = pd.DataFrame()
skl_output_key = pd.DataFrame(list(range(X.shape[0])), columns=['KEY'])
skl_output_score = pd.DataFrame(clf.decision_function(X), columns=['Score_0', 'Score_1', 'Score_2'])
skl_output_proba = pd.DataFrame(clf.predict_proba(X), columns=['Proba_0', 'Proba_1', 'Proba_2'])
skl_output_log_proba = pd.DataFrame(clf.predict_log_proba(X), columns=['LogProba_0', 'LogProba_1', 'LogProba_2'])
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])
# Column-wise concatenation: every frame above has one row per row of X.
skl_output = pd.concat(
    [skl_output_key, skl_output_score, skl_output_proba, skl_output_log_proba, skl_output_decision],
    axis=1,
)
skl_output.sample(12)


Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
95,95,-3.720975,-0.578714,-2.105873,0.048104,0.73104,0.220856,-3.034388,-0.313287,-1.510245,1
59,59,-3.894035,-0.757189,-1.022045,0.033049,0.528709,0.438242,-3.409752,-0.637318,-0.824984,1
62,62,-4.107269,0.988521,-2.395987,0.019538,0.879698,0.100764,-3.935403,-0.128177,-2.294972,1
125,125,-7.495544,-0.06546,1.006964,0.000456,0.397528,0.602016,-7.692178,-0.922491,-0.507471,2
124,124,-7.186995,-1.022822,1.732622,0.000678,0.237204,0.762118,-7.296589,-1.438835,-0.271654,2
64,64,-2.654229,-0.945443,-3.008739,0.167435,0.712756,0.119809,-1.787161,-0.338616,-2.121857,1
148,148,-6.775536,-1.84163,2.202743,0.001098,0.131786,0.867117,-6.814449,-2.026579,-0.142582,2
112,112,-7.131457,-0.61644,1.527956,0.000681,0.29886,0.700459,-7.291904,-1.207781,-0.356019,2
107,107,-8.571077,0.629751,2.037783,0.000123,0.424394,0.575483,-9.001314,-0.857094,-0.552545,2
18,18,4.029601,-2.05654,-11.810561,0.896526,0.103468,7e-06,-0.109228,-2.268497,-11.902172,0


## Comparing the SQL and Scikit-learn Predictions

In [13]:
# Align the two result sets on the KEY value itself. DataFrame.join matches the
# caller's `on` column against the *index* of the other frame, and the generated
# query has no ORDER BY, so the positional index of sql_output is not guaranteed
# to equal KEY. Re-indexing the SQL frame by its KEY values makes the match
# explicit; the KEY column itself is kept, so the result still carries both
# KEY_skl and KEY_sql.
sql_skl_join = skl_output.join(
    sql_output.set_index(sql_output['KEY'].values),
    how='left', on='KEY', lsuffix='_skl', rsuffix='_sql',
)

In [14]:
# Peek at 12 randomly chosen rows of the joined SQL / scikit-learn results.
sql_skl_join.sample(n=12)

Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_0_sql,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql
123,123,-6.111327,-0.275443,0.592801,0.002053,0.400422,0.597525,-6.188457,-0.915237,-0.514959,...,-6.111327,-0.275443,0.592801,0.000863,0.295365,0.7037725,-7.055429,-1.219544,-0.3513,2
5,5,3.948325,-2.480169,-11.196192,0.926987,0.073,1.3e-05,-0.075816,-2.61729,-11.252919,...,3.948325,-2.480169,-11.196192,0.998387,0.001612,2.643132e-07,-0.001614,-6.430108,-15.146131,0
76,76,-5.119851,0.269244,-1.683638,0.008145,0.777157,0.214698,-4.810364,-0.252113,-1.538522,...,-5.119851,0.269244,-1.683638,0.003983,0.872272,0.1237447,-5.525748,-0.136653,-2.089535,1
132,132,-7.918836,-0.543717,3.020455,0.000275,0.278027,0.721697,-8.197722,-1.280036,-0.326149,...,-7.918836,-0.543717,3.020455,1.7e-05,0.02754,0.9724428,-10.967235,-3.592116,-0.027944,2
32,32,4.918743,-2.583533,-12.42235,0.933948,0.066048,4e-06,-0.068334,-2.717378,-12.483406,...,4.918743,-2.583533,-12.42235,0.999448,0.000552,2.941841e-08,-0.000552,-7.502829,-17.341645,0
148,148,-6.775536,-1.84163,2.202743,0.001098,0.131786,0.867117,-6.814449,-2.026579,-0.142582,...,-6.775536,-1.84163,2.202743,0.000124,0.017217,0.9826592,-8.995772,-4.061866,-0.017493,2
43,43,3.218206,-2.341365,-9.635496,0.916309,0.083629,6.2e-05,-0.087402,-2.481366,-9.683716,...,3.218206,-2.341365,-9.635496,0.996162,0.003836,2.606383e-06,-0.003846,-5.563416,-12.857547,0
79,79,-2.516176,-0.065633,-3.732923,0.128473,0.831362,0.040165,-2.052036,-0.18469,-3.214752,...,-2.516176,-0.065633,-3.732923,0.077574,0.899449,0.02297697,-2.556517,-0.105973,-3.773263,1
35,35,4.096311,-1.537963,-11.185757,0.847611,0.152378,1.2e-05,-0.165334,-1.881394,-11.334607,...,4.096311,-1.537963,-11.185757,0.996439,0.003561,2.298969e-07,-0.003567,-5.637841,-15.285635,0
99,99,-3.88993,-0.454867,-1.790579,0.036349,0.704234,0.259417,-3.314591,-0.350644,-1.349319,...,-3.88993,-0.454867,-1.790579,0.024879,0.772085,0.2030359,-3.693724,-0.258661,-1.594373,1


In [15]:
# Rows where the SQL decision and the scikit-learn decision disagree
# (an empty frame means the two implementations pick the same class everywhere).
condition = sql_skl_join["Decision_sql"].ne(sql_skl_join["Decision_skl"])
sql_skl_join.loc[condition]


Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_0_sql,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql
