In [1]:
import base64
import json
import os
import pickle

import numpy as np
import pandas as pd
import requests
import sqlalchemy as sa


## Build a scikit-learn model

In [2]:
from sklearn import datasets

# Load the iris data set and split it into the feature matrix (X) and the
# class labels (Y). `iris.DESCR` holds the textual description if needed.
iris = datasets.load_iris()
X, Y = iris.data, iris.target

In [3]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression with default hyper-parameters on the whole data
# set. fit() hands back the estimator itself, so `clf` is the fitted model;
# the bare `clf` on the last line displays its parameters.
clf = LogisticRegression().fit(X, Y)
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [4]:
#clf.__dict__

## Generate SQL Code from the Model

In [5]:

def test_ws_sql_gen(pickle_data):
    WS_URL="http://192.168.88.88:1888/model"
    b64_data = base64.b64encode(pickle_data)
    data={"Name":"model1", "PickleData":b64_data , "SQLDialect":"postgresql"}
    r = requests.post(WS_URL, json=data)
    content = r.json()
    # print(content)
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL;


In [6]:
# Serialize the fitted model, let the web service turn it into SQL, and show
# the first 2000 characters of the generated query.
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)
print(lSQL[:2000])

WITH linear_model_cte AS 
(SELECT "ADS"."KEY" AS "KEY", 0.41498832829570037 * "ADS"."Feature_0" + 1.461297388562228 * "ADS"."Feature_1" + -2.2621411772020728 * "ADS"."Feature_2" + -1.02909509924489 * "ADS"."Feature_3" + 0.265606167976 AS "Score_0", 0.4166396855952165 * "ADS"."Feature_0" + -1.6008331852575775 * "ADS"."Feature_1" + 0.5776576286775883 * "ADS"."Feature_2" + -1.3855384286634425 * "ADS"."Feature_3" + 1.08542374239 AS "Score_1", -1.7075251538239065 * "ADS"."Feature_0" + -1.534268339988975 * "ADS"."Feature_1" + 2.470971680772018 * "ADS"."Feature_2" + 2.5553821129820897 * "ADS"."Feature_3" + -1.21471457808 AS "Score_2" 
FROM "INPUT_DATA" AS "ADS"), 
orig_cte AS 
(SELECT linear_model_cte."KEY" AS "KEY", linear_model_cte."Score_0" AS "Score_0", linear_model_cte."Score_1" AS "Score_1", linear_model_cte."Score_2" AS "Score_2", (1.0 / (1.0 + exp(-linear_model_cte."Score_0"))) / (1.0 / (1.0 + exp(-linear_model_cte."Score_0")) + 1.0 / (1.0 + exp(-linear_model_cte."Score_1")) + 1.0 / (

## Execute the SQL Code

In [7]:
# save the dataset in a database table

# The connection URL can be supplied through the DATABASE_URL environment
# variable so that real credentials never have to live in the notebook. The
# fallback is the local development database originally hard-coded here.
DB_URL = os.environ.get("DATABASE_URL", "postgresql://db:db@localhost/db?port=5432")
engine = sa.create_engine(DB_URL, echo=False)
conn = engine.connect()

# One column per feature, named the way the generated SQL refers to them
# ("Feature_0" ... "Feature_3"), plus a KEY column numbering the rows from 0.
FEATURE_COLUMNS = ['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3']
lTable = pd.DataFrame(X, columns=FEATURE_COLUMNS)
lTable['KEY'] = range(lTable.shape[0])
# if_exists='replace' drops any previous INPUT_DATA table, so the cell can be re-run.
lTable.to_sql("INPUT_DATA", conn, if_exists='replace', index=False)


In [8]:
# Execute the generated query on the open connection and load the result set.
sql_output = pd.read_sql(sql=lSQL, con=conn)


In [9]:
# Peek at a random subset of the rows computed by the database.
sql_output.sample(n=12)

Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
132,132,-7.918836,-0.543717,3.020455,0.000275,0.278027,0.721697,-8.197722,-1.280036,-0.326149,2
25,25,2.899195,-0.986733,-9.890514,0.777263,0.222695,4.2e-05,-0.251976,-1.501951,-10.088936,0
148,148,-6.775536,-1.84163,2.202743,0.001098,0.131786,0.867117,-6.814449,-2.026579,-0.142582,2
116,116,-6.947225,-0.32577,1.273599,0.000799,0.34893,0.650271,-7.131831,-1.052884,-0.430366,2
48,48,4.272814,-2.04009,-11.723857,0.89552,0.104473,7e-06,-0.110351,-2.258827,-11.82037,0
112,112,-7.131457,-0.61644,1.527956,0.000681,0.29886,0.700459,-7.291904,-1.207781,-0.356019,2
82,82,-3.439223,-0.230097,-2.557637,0.056972,0.811251,0.131777,-2.865187,-0.209178,-2.026648,1
19,19,4.233036,-2.422055,-11.28024,0.923616,0.076373,1.2e-05,-0.079459,-2.572131,-11.345308,0
125,125,-7.495544,-0.06546,1.006964,0.000456,0.397528,0.602016,-7.692178,-0.922491,-0.507471,2
98,98,-1.883138,-0.582916,-3.534928,0.254575,0.690791,0.054634,-1.368161,-0.369917,-2.907098,1


In [10]:
# Number of rows assigned to each class by the SQL code.
sql_output["Decision"].value_counts()

2    54
0    50
1    46
Name: Decision, dtype: int64

## Scikit-learn Prediction

In [11]:
# Reference predictions computed directly by scikit-learn, laid out with the
# same column names as the SQL result so that both tables can be compared.
skl_output_key = pd.DataFrame({'KEY': range(X.shape[0])})
skl_output_score = pd.DataFrame(clf.decision_function(X), columns=['Score_0', 'Score_1', 'Score_2'])
skl_output_proba = pd.DataFrame(clf.predict_proba(X), columns=['Proba_0', 'Proba_1', 'Proba_2'])
skl_output_log_proba = pd.DataFrame(clf.predict_log_proba(X), columns=['LogProba_0', 'LogProba_1', 'LogProba_2'])
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])

# Glue the pieces together column-wise; every piece has one row per sample of X.
skl_output = pd.concat(
    [skl_output_key, skl_output_score, skl_output_proba, skl_output_log_proba, skl_output_decision],
    axis=1,
)
skl_output.sample(12)


Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
80,80,-3.672985,0.205949,-2.087734,0.03609,0.803217,0.160692,-3.321733,-0.21913,-1.828264,1
20,20,3.423495,-1.402645,-10.940134,0.830668,0.169316,1.5e-05,-0.185525,-1.775986,-11.093599,0
133,133,-6.108897,0.095667,0.166954,0.002077,0.490653,0.50727,-6.176679,-0.712019,-0.678712,2
45,45,3.165716,-1.324146,-9.787665,0.82031,0.179642,4.8e-05,-0.198073,-1.716787,-9.944476,0
98,98,-1.883138,-0.582916,-3.534928,0.254575,0.690791,0.054634,-1.368161,-0.369917,-2.907098,1
44,44,3.22527,-2.329546,-10.036314,0.915519,0.084439,4.2e-05,-0.088264,-2.471723,-10.085646,0
66,66,-4.749845,-0.862742,-0.427215,0.012253,0.423869,0.563877,-4.401982,-0.85833,-0.572918,2
1,1,3.310124,-1.143928,-10.213956,0.799706,0.200263,3e-05,-0.223511,-1.608122,-10.401643,0
47,47,3.477887,-1.589087,-10.008552,0.851214,0.148746,3.9e-05,-0.161091,-1.905515,-10.139283,0
76,76,-5.119851,0.269244,-1.683638,0.008145,0.777157,0.214698,-4.810364,-0.252113,-1.538522,1


## Comparing the SQL and Scikit-learn Predictions

In [12]:
# Pair each scikit-learn row with the SQL row that has the same KEY value.
# DataFrame.join(on='KEY') would match skl_output.KEY against the positional
# *index* of sql_output, which only equals KEY when the database happens to
# return the rows in KEY order -- an ordering a query result does not guarantee.
# Suffixing both frames first keeps the column names used downstream
# (KEY_skl ... Decision_skl, KEY_sql ... Decision_sql).
sql_skl_join = pd.merge(
    skl_output.add_suffix('_skl'),
    sql_output.add_suffix('_sql'),
    how='left',
    left_on='KEY_skl',
    right_on='KEY_sql',
)

In [13]:
# Peek at a random subset of the side-by-side comparison table.
sql_skl_join.sample(n=12)

Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_0_sql,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql
132,132,-7.918836,-0.543717,3.020455,0.000275,0.278027,0.721697,-8.197722,-1.280036,-0.326149,...,-7.918836,-0.543717,3.020455,0.000275,0.278027,0.721697,-8.197722,-1.280036,-0.326149,2
44,44,3.22527,-2.329546,-10.036314,0.915519,0.084439,4.2e-05,-0.088264,-2.471723,-10.085646,...,3.22527,-2.329546,-10.036314,0.915519,0.084439,4.2e-05,-0.088264,-2.471723,-10.085646,0
56,56,-4.576302,-1.074366,-1.33303,0.02152,0.537733,0.440747,-3.83877,-0.620393,-0.819284,...,-4.576302,-1.074366,-1.33303,0.02152,0.537733,0.440747,-3.83877,-0.620393,-0.819284,1
69,69,-3.711571,0.145295,-2.164816,0.035978,0.808752,0.15527,-3.324843,-0.212263,-1.862592,...,-3.711571,0.145295,-2.164816,0.035978,0.808752,0.15527,-3.324843,-0.212263,-1.862592,1
105,105,-9.287822,0.352295,2.880005,6e-05,0.382744,0.617195,-9.715869,-0.960388,-0.48257,...,-9.287822,0.352295,2.880005,6e-05,0.382744,0.617195,-9.715869,-0.960388,-0.48257,2
121,121,-6.461508,-1.004281,2.145719,0.001339,0.230143,0.768518,-6.615729,-1.469053,-0.263292,...,-6.461508,-1.004281,2.145719,0.001339,0.230143,0.768518,-6.615729,-1.469053,-0.263292,2
84,84,-4.832843,-0.94607,-0.08571,0.010312,0.365035,0.624653,-4.574414,-1.007763,-0.470559,...,-4.832843,-0.94607,-0.08571,0.010312,0.365035,0.624653,-4.574414,-1.007763,-0.470559,2
108,108,-8.273519,0.731272,2.44052,0.00016,0.423196,0.576644,-8.740776,-0.85992,-0.55053,...,-8.273519,0.731272,2.44052,0.00016,0.423196,0.576644,-8.740776,-0.85992,-0.55053,2
55,55,-4.794787,-0.223804,-0.80219,0.010767,0.583013,0.40622,-4.531274,-0.539545,-0.900861,...,-4.794787,-0.223804,-0.80219,0.010767,0.583013,0.40622,-4.531274,-0.539545,-0.900861,1
20,20,3.423495,-1.402645,-10.940134,0.830668,0.169316,1.5e-05,-0.185525,-1.775986,-11.093599,...,3.423495,-1.402645,-10.940134,0.830668,0.169316,1.5e-05,-0.185525,-1.775986,-11.093599,0


In [14]:
# Rows on which the SQL decision and the scikit-learn decision disagree;
# an empty result means both implementations predict the same class everywhere.
condition = sql_skl_join["Decision_sql"].ne(sql_skl_join["Decision_skl"])
sql_skl_join.loc[condition]


Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_0_sql,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql
