In [1]:
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pickle, json, requests, base64


## Build a scikit-learn model

In [2]:
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data  
Y = iris.target
# print(iris.DESCR)

In [3]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X, Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [4]:
#clf.__dict__

## Generate SQL Code from the Model

In [5]:

def test_ws_sql_gen(pickle_data):
    WS_URL="https://sklearn2sql.herokuapp.com/model"
    b64_data = base64.b64encode(pickle_data)
    data={"Name":"model1", "PickleData":b64_data , "SQLDialect":"postgresql"}
    r = requests.post(WS_URL, json=data)
    content = r.json()
    # print(content)
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL;


In [6]:
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)
print(lSQL[0:2000])

WITH linear_model_cte AS 
(SELECT "ADS"."KEY" AS "KEY", 0.41498832829570037 * "ADS"."Feature_0" + 1.461297388562228 * "ADS"."Feature_1" + -2.2621411772020728 * "ADS"."Feature_2" + -1.02909509924489 * "ADS"."Feature_3" + 0.265606167976 AS "Score_0", 0.4166396855952165 * "ADS"."Feature_0" + -1.6008331852575775 * "ADS"."Feature_1" + 0.5776576286775883 * "ADS"."Feature_2" + -1.3855384286634425 * "ADS"."Feature_3" + 1.08542374239 AS "Score_1", -1.7075251538239065 * "ADS"."Feature_0" + -1.534268339988975 * "ADS"."Feature_1" + 2.470971680772018 * "ADS"."Feature_2" + 2.5553821129820897 * "ADS"."Feature_3" + -1.21471457808 AS "Score_2" 
FROM "INPUT_DATA" AS "ADS"), 
orig_cte AS 
(SELECT linear_model_cte."KEY" AS "KEY", linear_model_cte."Score_0" AS "Score_0", linear_model_cte."Score_1" AS "Score_1", linear_model_cte."Score_2" AS "Score_2", (1.0 / (1.0 + exp(-linear_model_cte."Score_0"))) / (1.0 / (1.0 + exp(-linear_model_cte."Score_0")) + 1.0 / (1.0 + exp(-linear_model_cte."Score_1")) + 1.0 / (

## Execute the SQL Code

In [7]:
# save the dataset in a database table

#engine = sa.create_engine('sqlite://' , echo=False)
engine = sa.create_engine("postgresql://db:db@localhost/db?port=5432", echo=False)
conn = engine.connect()

lTable = pd.DataFrame(X);
lTable.columns = ['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3']
lTable['KEY'] = range(lTable.shape[0])
lTable.to_sql("INPUT_DATA" , conn,   if_exists='replace', index=False)


In [8]:
sql_output = pd.read_sql(lSQL , conn);
sql_output = sql_output.sort_values(by='KEY').reset_index(drop=True)
conn.close()

In [9]:
sql_output.sample(12, random_state=1960)

Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision,DecisionProba
114,114,-7.242577,-1.359637,3.320561,0.000611,0.174594,0.824795,-7.400417,-1.745294,-0.19262,2,0.824795
74,74,-3.905737,-0.207771,-2.645079,0.036927,0.83899,0.124082,-3.298801,-0.175556,-2.086809,1,0.83899
9,9,3.33295,-1.107692,-10.375824,0.795422,0.204553,2.6e-05,-0.228883,-1.586929,-10.569673,0,0.795422
88,88,-3.639169,-0.816697,-1.92668,0.055754,0.667411,0.276835,-2.886803,-0.404349,-1.284334,1,0.667411
25,25,2.899195,-0.986733,-9.890514,0.777263,0.222695,4.2e-05,-0.251976,-1.501951,-10.088936,0,0.777263
5,5,3.948325,-2.480169,-11.196192,0.926987,0.073,1.3e-05,-0.075816,-2.61729,-11.252919,0,0.926987
48,48,4.272814,-2.04009,-11.723857,0.89552,0.104473,7e-06,-0.110351,-2.258827,-11.82037,0,0.89552
117,117,-8.406409,-0.967495,1.984473,0.000193,0.238474,0.761333,-8.550525,-1.433496,-0.272685,2,0.761333
83,83,-6.482433,-0.007795,1.088177,0.001225,0.399202,0.599574,-6.705197,-0.918288,-0.511537,2,0.599574
105,105,-9.287822,0.352295,2.880005,6e-05,0.382744,0.617195,-9.715869,-0.960388,-0.48257,2,0.617195


In [10]:
sql_output.Decision.value_counts()

2    54
0    50
1    46
Name: Decision, dtype: int64

## Scikit-learn Prediction

In [11]:
skl_outputs = pd.DataFrame()
skl_output_key = pd.DataFrame(list(range(X.shape[0])), columns=['KEY']);
skl_output_score = pd.DataFrame(clf.decision_function(X), columns=['Score_0', 'Score_1', 'Score_2']);
skl_output_proba = pd.DataFrame(columns=['Proba_0', 'Proba_1', 'Proba_2'])
skl_output_proba = pd.DataFrame(clf.predict_proba(X), columns=['Proba_0', 'Proba_1', 'Proba_2'])
# skl_output_log_proba = pd.DataFrame(columns=['LogProba_0', 'LogProba_1', 'LogProba_2'])
skl_output_log_proba = pd.DataFrame(clf.predict_log_proba(X), columns=['LogProba_0', 'LogProba_1', 'LogProba_2'])
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])
skl_output = pd.concat([skl_output_key, skl_output_score, skl_output_proba, skl_output_log_proba, skl_output_decision] , axis=1)
skl_output.sample(12, random_state=1960)


Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
114,114,-7.242577,-1.359637,3.320561,0.000611,0.174594,0.824795,-7.400417,-1.745294,-0.19262,2
74,74,-3.905737,-0.207771,-2.645079,0.036927,0.83899,0.124082,-3.298801,-0.175556,-2.086809,1
9,9,3.33295,-1.107692,-10.375824,0.795422,0.204553,2.6e-05,-0.228883,-1.586929,-10.569673,0
88,88,-3.639169,-0.816697,-1.92668,0.055754,0.667411,0.276835,-2.886803,-0.404349,-1.284334,1
25,25,2.899195,-0.986733,-9.890514,0.777263,0.222695,4.2e-05,-0.251976,-1.501951,-10.088936,0
5,5,3.948325,-2.480169,-11.196192,0.926987,0.073,1.3e-05,-0.075816,-2.61729,-11.252919,0
48,48,4.272814,-2.04009,-11.723857,0.89552,0.104473,7e-06,-0.110351,-2.258827,-11.82037,0
117,117,-8.406409,-0.967495,1.984473,0.000193,0.238474,0.761333,-8.550525,-1.433496,-0.272685,2
83,83,-6.482433,-0.007795,1.088177,0.001225,0.399202,0.599574,-6.705197,-0.918288,-0.511537,2
105,105,-9.287822,0.352295,2.880005,6e-05,0.382744,0.617195,-9.715869,-0.960388,-0.48257,2


## Comparing the SQL and Scikit-learn Predictions

In [12]:
sql_skl_join = skl_output.join(sql_output , how='left', on='KEY', lsuffix='_skl', rsuffix='_sql')

In [13]:
sql_skl_join.sample(12)

Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql,DecisionProba
124,124,-7.186995,-1.022822,1.732622,0.000678,0.237204,0.762118,-7.296589,-1.438835,-0.271654,...,-1.022822,1.732622,0.000678,0.237204,0.762118,-7.296589,-1.438835,-0.271654,2,0.762118
46,46,4.109732,-2.225735,-11.288681,0.909856,0.090133,1.2e-05,-0.094469,-2.406472,-11.366884,...,-2.225735,-11.288681,0.909856,0.090133,1.2e-05,-0.094469,-2.406472,-11.366884,0,0.909856
5,5,3.948325,-2.480169,-11.196192,0.926987,0.073,1.3e-05,-0.075816,-2.61729,-11.252919,...,-2.480169,-11.196192,0.926987,0.073,1.3e-05,-0.075816,-2.61729,-11.252919,0,0.926987
146,146,-6.73271,-0.036064,1.402291,0.000919,0.37922,0.619861,-6.992197,-0.969638,-0.478261,...,-0.036064,1.402291,0.000919,0.37922,0.619861,-6.992197,-0.969638,-0.478261,2,0.619861
103,103,-7.402567,-0.191249,2.015628,0.000456,0.338732,0.660812,-7.692385,-1.082545,-0.414287,...,-0.191249,2.015628,0.000456,0.338732,0.660812,-7.692385,-1.082545,-0.414287,2,0.660812
36,36,4.51598,-1.752127,-12.252703,0.870019,0.129976,4e-06,-0.13924,-2.040403,-12.381073,...,-1.752127,-12.252703,0.870019,0.129976,4e-06,-0.13924,-2.040403,-12.381073,0,0.870019
60,60,-3.683447,0.603219,-1.617094,0.029312,0.772718,0.197971,-3.529765,-0.257842,-1.619637,...,0.603219,-1.617094,0.029312,0.772718,0.197971,-3.529765,-0.257842,-1.619637,1,0.772718
116,116,-6.947225,-0.32577,1.273599,0.000799,0.34893,0.650271,-7.131831,-1.052884,-0.430366,...,-0.32577,1.273599,0.000799,0.34893,0.650271,-7.131831,-1.052884,-0.430366,2,0.650271
76,76,-5.119851,0.269244,-1.683638,0.008145,0.777157,0.214698,-4.810364,-0.252113,-1.538522,...,0.269244,-1.683638,0.008145,0.777157,0.214698,-4.810364,-0.252113,-1.538522,1,0.777157
67,67,-3.685833,0.162542,-2.574519,0.038475,0.850175,0.111349,-3.257737,-0.162313,-2.195084,...,0.162542,-2.574519,0.038475,0.850175,0.111349,-3.257737,-0.162313,-2.195084,1,0.850175


In [14]:
condition = (sql_skl_join.Decision_sql != sql_skl_join.Decision_skl)
sql_skl_join[condition]


Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql,DecisionProba
