In [1]:
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pickle, json, requests, base64


# Scikit-Learn Model Deployment Use Case

In this use case, we build a model (here, an MLP classifier) on the Iris dataset (not very original!) and generate SQL code for deployment using the web service.

We then execute the SQL code on a local database (PostgreSQL) and compare the SQL execution results with the results of scikit-learn's `predict`, `predict_proba` and `predict_log_proba`.

Both results are stored in pandas dataframes. 


## Build a scikit-learn model

In [2]:
from sklearn import datasets

# Load the dataset once, then expose the feature matrix and the target vector
# under the names the following cells use.
iris = datasets.load_iris()
X, Y = iris.data, iris.target
# Uncomment to read the dataset description:
# print(iris.DESCR)

In [3]:

from sklearn.neural_network import MLPClassifier

# fit() returns the estimator itself, so construction and training can be chained.
# random_state is fixed so that the trained model is reproducible.
clf = MLPClassifier(random_state=1960).fit(X, Y)
clf



In [4]:
#clf.__dict__

## Generate SQL Code from the Model

In [5]:

def test_ws_sql_gen(pickle_data):
    WS_URL="https://sklearn2sql.herokuapp.com/model"
    b64_data = base64.b64encode(pickle_data).decode('utf-8')
    data={"Name":"model1", "PickleData":b64_data , "SQLDialect":"postgresql"}
    r = requests.post(WS_URL, json=data)
    #print(r.__dict__)
    content = r.json()
    # print(content)
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL;


In [6]:
# Serialize the fitted classifier and ask the web service to translate it to SQL.
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)

# The generated statement is long; show only its first 2000 characters.
print(lSQL[:2000])

WITH "IL" AS 
(SELECT "ADS"."KEY" AS "KEY", CAST("ADS"."Feature_0" AS FLOAT) AS "Feature_0", CAST("ADS"."Feature_1" AS FLOAT) AS "Feature_1", CAST("ADS"."Feature_2" AS FLOAT) AS "Feature_2", CAST("ADS"."Feature_3" AS FLOAT) AS "Feature_3" 
FROM "INPUT_DATA" AS "ADS"), 
"HL_BA_1" AS 
(SELECT "IL"."KEY" AS "KEY", -0.0015728749851640802 * "IL"."Feature_0" + 0.007738492836496263 * "IL"."Feature_1" + -0.10834230254441445 * "IL"."Feature_2" + 0.016128851363088125 * "IL"."Feature_3" + -0.0839798413234357 AS "NEUR_1_1", 0.0031119285993132335 * "IL"."Feature_0" + -0.0239251031949356 * "IL"."Feature_1" + -0.02458011484953259 * "IL"."Feature_2" + -0.08175952564419868 * "IL"."Feature_3" + 0.02754451873665828 AS "NEUR_1_2", 0.24895621146508548 * "IL"."Feature_0" + -0.055288663322814596 * "IL"."Feature_1" + 0.21204946241500228 * "IL"."Feature_2" + 0.1369475017682187 * "IL"."Feature_3" + -0.13481963860377522 AS "NEUR_1_3", -0.04711121223553422 * "IL"."Feature_0" + -0.055848670768729534 * "IL"."Featur

## Execute the SQL Code

In [7]:
# Store the feature matrix in the database table read by the generated SQL.

#engine = sa.create_engine('sqlite://' , echo=False)
engine = sa.create_engine("postgresql://db:db@localhost/db?port=5432", echo=False)
conn = engine.connect()

# One column per feature, plus a KEY column holding the row number.
lTable = pd.DataFrame(X, columns=['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3'])
lTable['KEY'] = range(len(lTable))

# Any table of the same name left over from a previous run is replaced.
lTable.to_sql("INPUT_DATA", conn, if_exists='replace', index=False)


In [8]:
# Run the generated SQL and load its result set into a DataFrame.
# The connection is closed even when the query fails, so it is not leaked.
try:
    sql_output = pd.read_sql(lSQL, conn)
finally:
    conn.close()

In [9]:
# Reproducible random sample of the SQL result (fixed seed).
sql_output.sample(n=12, random_state=1960)

Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision,DecisionProba
114,114,,,,0.000819,0.062837,0.936344,-7.107648,-2.767214,-0.065772,2,0.936344
74,74,,,,0.035002,0.853701,0.111297,-3.352343,-0.158174,-2.195556,1,0.853701
9,9,,,,0.939008,0.060175,0.000817,-0.062931,-2.810502,-7.11016,0,0.939008
88,88,,,,0.055523,0.730194,0.214283,-2.89095,-0.314445,-1.540459,1,0.730194
25,25,,,,0.908572,0.090141,0.001287,-0.095881,-2.406382,-6.655265,0,0.908572
5,5,,,,0.963711,0.035854,0.000434,-0.036963,-3.328292,-7.741599,0,0.963711
48,48,,,,0.970557,0.029151,0.000292,-0.029885,-3.535249,-8.139519,0,0.970557
117,117,,,,0.0007,0.146018,0.853282,-7.26488,-1.924024,-0.158665,2,0.853282
83,83,,,,0.004844,0.302749,0.692408,-5.3301,-1.194852,-0.36758,2,0.692408
105,105,,,,0.000309,0.115297,0.884395,-8.083683,-2.160248,-0.122852,2,0.884395


In [10]:
# Number of rows assigned to each class by the SQL code.
sql_output['Decision'].value_counts()

2    53
0    50
1    47
Name: Decision, dtype: int64

## Scikit-learn Prediction

In [11]:
# NOTE(review): `skl_outputs` is not referenced in the visible cells; kept in case
# other cells rely on it.
skl_outputs = pd.DataFrame()

score_columns = ['Score_0', 'Score_1', 'Score_2']
proba_columns = ['Proba_0', 'Proba_1', 'Proba_2']
log_proba_columns = ['LogProba_0', 'LogProba_1', 'LogProba_2']

# One frame per group of output columns, using the same column names as the SQL result.
skl_output_key = pd.DataFrame({'KEY': list(range(X.shape[0]))})
# No scores are produced here: the frame has no rows, so these columns come out empty (NaN).
skl_output_score = pd.DataFrame(columns=score_columns)
skl_output_proba = pd.DataFrame(clf.predict_proba(X), columns=proba_columns)
skl_output_log_proba = pd.DataFrame(clf.predict_log_proba(X), columns=log_proba_columns)
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])

# Put the pieces side by side.
skl_output_parts = [
    skl_output_key,
    skl_output_score,
    skl_output_proba,
    skl_output_log_proba,
    skl_output_decision,
]
skl_output = pd.concat(skl_output_parts, axis=1)
skl_output.sample(n=12, random_state=1960)


Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
114,114,,,,0.000819,0.062837,0.936344,-7.107648,-2.767214,-0.065772,2
74,74,,,,0.035002,0.853701,0.111297,-3.352343,-0.158174,-2.195556,1
9,9,,,,0.939008,0.060175,0.000817,-0.062931,-2.810502,-7.11016,0
88,88,,,,0.055523,0.730194,0.214283,-2.89095,-0.314445,-1.540459,1
25,25,,,,0.908572,0.090141,0.001287,-0.095881,-2.406382,-6.655265,0
5,5,,,,0.963711,0.035854,0.000434,-0.036963,-3.328292,-7.741599,0
48,48,,,,0.970557,0.029151,0.000292,-0.029885,-3.535249,-8.139519,0
117,117,,,,0.0007,0.146018,0.853282,-7.26488,-1.924024,-0.158665,2
83,83,,,,0.004844,0.302749,0.692408,-5.3301,-1.194852,-0.36758,2
105,105,,,,0.000309,0.115297,0.884395,-8.083683,-2.160248,-0.122852,2


## Comparing the SQL and Scikit-learn Predictions

In [12]:
# DataFrame.join(other, on='KEY') matches the caller's KEY column against the
# *index* of `other`, not against other's KEY column. Index the SQL result by
# its own KEY (keeping the column, so KEY_sql still appears in the result) so
# that rows are paired by key even if the database returned them in a
# different order.
sql_output_by_key = sql_output.set_index('KEY', drop=False)
sql_skl_join = skl_output.join(sql_output_by_key, how='left', on='KEY', lsuffix='_skl', rsuffix='_sql')

In [13]:
# Reproducible random sample of the joined predictions (fixed seed).
sql_skl_join.sample(n=12, random_state=1960)

Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql,DecisionProba
114,114,,,,0.000819,0.062837,0.936344,-7.107648,-2.767214,-0.065772,...,,,0.000819,0.062837,0.936344,-7.107648,-2.767214,-0.065772,2,0.936344
74,74,,,,0.035002,0.853701,0.111297,-3.352343,-0.158174,-2.195556,...,,,0.035002,0.853701,0.111297,-3.352343,-0.158174,-2.195556,1,0.853701
9,9,,,,0.939008,0.060175,0.000817,-0.062931,-2.810502,-7.11016,...,,,0.939008,0.060175,0.000817,-0.062931,-2.810502,-7.11016,0,0.939008
88,88,,,,0.055523,0.730194,0.214283,-2.89095,-0.314445,-1.540459,...,,,0.055523,0.730194,0.214283,-2.89095,-0.314445,-1.540459,1,0.730194
25,25,,,,0.908572,0.090141,0.001287,-0.095881,-2.406382,-6.655265,...,,,0.908572,0.090141,0.001287,-0.095881,-2.406382,-6.655265,0,0.908572
5,5,,,,0.963711,0.035854,0.000434,-0.036963,-3.328292,-7.741599,...,,,0.963711,0.035854,0.000434,-0.036963,-3.328292,-7.741599,0,0.963711
48,48,,,,0.970557,0.029151,0.000292,-0.029885,-3.535249,-8.139519,...,,,0.970557,0.029151,0.000292,-0.029885,-3.535249,-8.139519,0,0.970557
117,117,,,,0.0007,0.146018,0.853282,-7.26488,-1.924024,-0.158665,...,,,0.0007,0.146018,0.853282,-7.26488,-1.924024,-0.158665,2,0.853282
83,83,,,,0.004844,0.302749,0.692408,-5.3301,-1.194852,-0.36758,...,,,0.004844,0.302749,0.692408,-5.3301,-1.194852,-0.36758,2,0.692408
105,105,,,,0.000309,0.115297,0.884395,-8.083683,-2.160248,-0.122852,...,,,0.000309,0.115297,0.884395,-8.083683,-2.160248,-0.122852,2,0.884395


In [14]:
# Rows on which the SQL decision and the scikit-learn decision differ.
condition = sql_skl_join['Decision_sql'].ne(sql_skl_join['Decision_skl'])
sql_skl_join.loc[condition]


Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql,DecisionProba
