In [1]:
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pickle, json, requests, base64


## Build a scikit-learn model

In [2]:
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data  
Y = iris.target
# print(iris.DESCR)

In [3]:

from sklearn.neural_network import MLPClassifier
clf = MLPClassifier()
clf.fit(X, Y)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [4]:
#clf.__dict__

## Generate SQL Code from the Model

In [5]:

def test_ws_sql_gen(pickle_data):
    WS_URL="https://sklearn2sql.herokuapp.com/model"
    b64_data = base64.b64encode(pickle_data)
    data={"Name":"model1", "PickleData":b64_data , "SQLDialect":"postgresql"}
    r = requests.post(WS_URL, json=data)
    content = r.json()
    # print(content)
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL;


In [6]:
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)
print(lSQL[0:2000])

WITH "IL" AS 
(SELECT "ADS"."KEY" AS "KEY", CAST("ADS"."Feature_0" AS FLOAT) AS "Feature_0", CAST("ADS"."Feature_1" AS FLOAT) AS "Feature_1", CAST("ADS"."Feature_2" AS FLOAT) AS "Feature_2", CAST("ADS"."Feature_3" AS FLOAT) AS "Feature_3" 
FROM "INPUT_DATA" AS "ADS"), 
"HL_BA_1" AS 
(SELECT "IL"."KEY" AS "KEY", -0.0770229138622 * "IL"."Feature_0" + -0.0606941304091 * "IL"."Feature_1" + 0.0607145117983 * "IL"."Feature_2" + -0.0105272911151 * "IL"."Feature_3" + 0.225571028374 AS "NEUR_1_1", 0.133643010103 * "IL"."Feature_0" + -0.141615733768 * "IL"."Feature_1" + -0.227713500237 * "IL"."Feature_2" + -0.248843259355 * "IL"."Feature_3" + 0.0802821322299 AS "NEUR_1_2", -0.0524075207684 * "IL"."Feature_0" + -0.0405493356622 * "IL"."Feature_1" + 0.0413828344281 * "IL"."Feature_2" + -0.0578047047708 * "IL"."Feature_3" + 0.195346843511 AS "NEUR_1_3", 0.0249078555029 * "IL"."Feature_0" + -0.146945705923 * "IL"."Feature_1" + 0.272482686976 * "IL"."Feature_2" + 0.2379966754 * "IL"."Feature_3" + -0.

## Execute the SQL Code

In [7]:
# save the dataset in a database table

#engine = sa.create_engine('sqlite://' , echo=False)
engine = sa.create_engine("postgresql://db:db@localhost/db?port=5432", echo=False)
conn = engine.connect()

lTable = pd.DataFrame(X);
lTable.columns = ['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3']
lTable['KEY'] = range(lTable.shape[0])
lTable.to_sql("INPUT_DATA" , conn,   if_exists='replace', index=False)


In [8]:
sql_output = pd.read_sql(lSQL , conn);
sql_output = sql_output.sort_values(by='KEY').reset_index(drop=True)
conn.close()

In [9]:
sql_output.sample(12, random_state=1960)

Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision,DecisionProba
114,114,,,,0.000437,0.049308,0.950255,-7.734775,-3.009675,-0.051025,2,0.950255
74,74,,,,0.030341,0.86327,0.106389,-3.495259,-0.147027,-2.240656,1,0.86327
9,9,,,,0.949152,0.050672,0.000176,-0.052187,-2.98238,-8.643568,0,0.949152
88,88,,,,0.044851,0.764878,0.190271,-3.104402,-0.268039,-1.659308,1,0.764878
25,25,,,,0.923997,0.075687,0.000316,-0.079047,-2.581147,-8.059591,0,0.923997
5,5,,,,0.96821,0.031699,9.1e-05,-0.032307,-3.451467,-9.301013,0,0.96821
48,48,,,,0.975679,0.024269,5.2e-05,-0.024621,-3.71856,-9.866035,0,0.975679
117,117,,,,0.000404,0.137852,0.861745,-7.815114,-1.981576,-0.148796,2,0.861745
83,83,,,,0.003446,0.316985,0.67957,-5.670593,-1.148902,-0.386296,2,0.67957
105,105,,,,0.000178,0.107715,0.892107,-8.631532,-2.228267,-0.11417,2,0.892107


In [10]:
sql_output.Decision.value_counts()

2    53
0    50
1    47
Name: Decision, dtype: int64

## Scikit-learn Prediction

In [11]:
skl_outputs = pd.DataFrame()
skl_output_key = pd.DataFrame(list(range(X.shape[0])), columns=['KEY']);
skl_output_score = pd.DataFrame(columns=['Score_0', 'Score_1', 'Score_2']);
skl_output_proba = pd.DataFrame(clf.predict_proba(X), columns=['Proba_0', 'Proba_1', 'Proba_2'])
skl_output_log_proba = pd.DataFrame(clf.predict_log_proba(X), columns=['LogProba_0', 'LogProba_1', 'LogProba_2'])
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])
skl_output = pd.concat([skl_output_key, skl_output_score, skl_output_proba, skl_output_log_proba, skl_output_decision] , axis=1)
skl_output.sample(12, random_state=1960)


Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
114,114,,,,0.000437,0.049308,0.950255,-7.734775,-3.009675,-0.051025,2
74,74,,,,0.030341,0.86327,0.106389,-3.495259,-0.147027,-2.240656,1
9,9,,,,0.949152,0.050672,0.000176,-0.052187,-2.98238,-8.643568,0
88,88,,,,0.044851,0.764878,0.190271,-3.104402,-0.268039,-1.659308,1
25,25,,,,0.923997,0.075687,0.000316,-0.079047,-2.581147,-8.059591,0
5,5,,,,0.96821,0.031699,9.1e-05,-0.032307,-3.451467,-9.301013,0
48,48,,,,0.975679,0.024269,5.2e-05,-0.024621,-3.71856,-9.866035,0
117,117,,,,0.000404,0.137852,0.861745,-7.815114,-1.981576,-0.148796,2
83,83,,,,0.003446,0.316985,0.67957,-5.670593,-1.148902,-0.386296,2
105,105,,,,0.000178,0.107715,0.892107,-8.631532,-2.228267,-0.11417,2


## Comparing the SQL and Scikit-learn Predictions

In [12]:
sql_skl_join = skl_output.join(sql_output , how='left', on='KEY', lsuffix='_skl', rsuffix='_sql')

In [13]:
sql_skl_join.sample(12)

Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql,DecisionProba
101,101,,,,0.001413,0.140819,0.857768,-6.56207,-1.96028,-0.153422,...,,,0.001413,0.140819,0.857768,-6.56207,-1.96028,-0.153422,2,0.857768
107,107,,,,0.000509,0.205526,0.793965,-7.583403,-1.582182,-0.230716,...,,,0.000509,0.205526,0.793965,-7.583403,-1.582182,-0.230716,2,0.793965
136,136,,,,0.00053,0.063172,0.936298,-7.543243,-2.761891,-0.065821,...,,,0.00053,0.063172,0.936298,-7.543243,-2.761891,-0.065821,2,0.936298
66,66,,,,0.017611,0.537843,0.444546,-4.039221,-0.620189,-0.810702,...,,,0.017611,0.537843,0.444546,-4.039221,-0.620189,-0.810702,1,0.537843
21,21,,,,0.968974,0.030919,0.000107,-0.031518,-3.476375,-9.144066,...,,,0.968974,0.030919,0.000107,-0.031518,-3.476375,-9.144066,0,0.968974
65,65,,,,0.030781,0.875237,0.093982,-3.480853,-0.133261,-2.364648,...,,,0.030781,0.875237,0.093982,-3.480853,-0.133261,-2.364648,1,0.875237
132,132,,,,0.000368,0.075329,0.924304,-7.908687,-2.585893,-0.078715,...,,,0.000368,0.075329,0.924304,-7.908687,-2.585893,-0.078715,2,0.924304
45,45,,,,0.936902,0.062792,0.000306,-0.065177,-2.767922,-8.092992,...,,,0.936902,0.062792,0.000306,-0.065177,-2.767922,-8.092992,0,0.936902
76,76,,,,0.011683,0.804355,0.183963,-4.449661,-0.217715,-1.693022,...,,,0.011683,0.804355,0.183963,-4.449661,-0.217715,-1.693022,1,0.804355
139,139,,,,0.001963,0.272909,0.725128,-6.233077,-1.298617,-0.321408,...,,,0.001963,0.272909,0.725128,-6.233077,-1.298617,-0.321408,2,0.725128


In [14]:
condition = (sql_skl_join.Decision_sql != sql_skl_join.Decision_skl)
sql_skl_join[condition]


Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql,DecisionProba
