In [1]:
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pickle, json, requests, base64

import time
from datetime import datetime


# Scikit-Learn Model Deployment Use Case

In this case, we will build a very large ensemble model (here, a Random Forest with 512 trees) on the digits dataset (not very original!) and generate SQL code for deployment using the web service.

We then execute the SQL code on a local database (an in-memory SQLite database) and compare the SQL execution result with the scikit-learn predict/predict_proba/predict_log_proba results.

Both results are stored in pandas dataframes. 


## Build a scikit-learn model

In [2]:
from sklearn import datasets

# Load the scikit-learn digits dataset; it is used both to train the model
# and, later in the notebook, as the table the generated SQL is run against.
digits = datasets.load_digits()
# Feature matrix (the printed shape below is (1797, 64)).
X = digits.data
# Number of target classes, used to name the per-class output columns.
n_classes = len(digits.target_names)
print(X.shape)

(1797, 64)


In [3]:
from sklearn.ensemble import RandomForestClassifier

# Deliberately large forest: the point of the notebook is to show that a big
# ensemble can still be translated to SQL. The fixed random_state keeps the
# fitted trees (and therefore the generated SQL) reproducible.
clf = RandomForestClassifier(
    n_estimators=512,
    max_depth=7,
    min_samples_leaf=30,
    random_state=1960,
)
clf.fit(digits.data, digits.target)

In [4]:
#clf.__dict__

## Generate SQL Code from the Model

In [5]:


def test_ws_sql_gen(pickle_data):
    WS_URL="http://localhost:1888/model"
    b64_data = base64.b64encode(pickle_data).decode('utf-8')
    data={"Name":"model1", "PickleData":b64_data , "SQLDialect":"sqlite"}
    r = requests.post(WS_URL, json=data)
    #print(r.__dict__)
    content = r.json()
    # print(content)
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL;



In [6]:
# Serialise the fitted model, have the web service translate it to SQL,
# and keep a copy of the generated SQL on disk.
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)
with open('data/digits_64_features_RF_512_SQLite.sql', 'w') as f:
    f.write(lSQL)

# The SQL is far too long to show in full: print P characters from the
# start, the middle and the end, separated by "...".
N = len(lSQL)
P = 4000
print("...".join((lSQL[:P], lSQL[N // 2:N // 2 + P], lSQL[-P:])))

WITH "RF_0" AS 
(WITH "DT_node_lookup" AS 
(SELECT "ADS"."KEY" AS "KEY", CASE WHEN ("ADS"."Feature_43" <= 2.5) THEN CASE WHEN ("ADS"."Feature_26" <= 3.5) THEN CASE WHEN ("ADS"."Feature_19" <= 6.5) THEN CASE WHEN ("ADS"."Feature_28" <= 15.5) THEN 4 ELSE 5 END ELSE 6 END ELSE CASE WHEN ("ADS"."Feature_22" <= 0.5) THEN CASE WHEN ("ADS"."Feature_54" <= 2.5) THEN CASE WHEN ("ADS"."Feature_36" <= 7.5) THEN 10 ELSE 11 END ELSE CASE WHEN ("ADS"."Feature_33" <= 0.5) THEN 13 ELSE 14 END END ELSE CASE WHEN ("ADS"."Feature_34" <= 7.5) THEN CASE WHEN ("ADS"."Feature_44" <= 1.5) THEN 17 ELSE 18 END ELSE CASE WHEN ("ADS"."Feature_35" <= 2.5) THEN CASE WHEN ("ADS"."Feature_37" <= 8.5) THEN 21 ELSE 22 END ELSE 23 END END END END ELSE CASE WHEN ("ADS"."Feature_54" <= 1.5) THEN CASE WHEN ("ADS"."Feature_38" <= 0.5) THEN CASE WHEN ("ADS"."Feature_20" <= 13.5) THEN CASE WHEN ("ADS"."Feature_61" <= 0.5) THEN CASE WHEN ("ADS"."Feature_20" <= 0.5) THEN 29 ELSE 30 END ELSE 31 END ELSE CASE WHEN ("ADS"."Feature

## Execute the SQL Code

In [7]:
# save the dataset in a database table

def null_protected_function_1_arg(f , x):
    """Apply the one-argument function ``f`` to ``x``, propagating NULL.

    A SQL NULL reaches a Python callback as ``None``. In that case ``None`` is
    returned without calling ``f`` at all, so the SQL function yields NULL
    instead of raising inside the callback.

    Parameters
    ----------
    f : callable
        Function of one argument.
    x : object or None
        The value passed by the database, or ``None`` for NULL.

    Returns
    -------
    object or None
        ``f(x)``, or ``None`` when ``x`` is ``None``.
    """
    # Compare with `is None` so that falsy values such as 0 are still evaluated.
    if x is None:
        return None
    return f(x)

def null_protected_function_2_args(f , x , n):
    """Apply the two-argument function ``f`` to ``(x, n)``, propagating NULL.

    A SQL NULL reaches a Python callback as ``None``. If either argument is
    ``None`` the result is ``None`` and ``f`` is not called, so the SQL
    function yields NULL instead of raising inside the callback.

    Parameters
    ----------
    f : callable
        Function of two arguments.
    x, n : object or None
        The values passed by the database, or ``None`` for NULL.

    Returns
    -------
    object or None
        ``f(x, n)``, or ``None`` when ``x`` or ``n`` is ``None``.
    """
    # Compare with `is None` so that falsy values such as 0 are still evaluated.
    if x is None or n is None:
        return None
    return f(x , n)

def declareAdditionalFunctionsForSQLite(conn):
    """Register extra math functions on the connection behind ``conn``.

    The SQL functions ``exp``, ``ln``, ``tanh``, ``power``, ``sign`` and
    ``sqrt`` are registered through ``create_function`` on
    ``conn.connection.connection`` (presumably the raw sqlite3 connection
    wrapped by SQLAlchemy -- confirm against the SQLAlchemy version in use).
    Each one delegates to the matching numpy function through the
    ``null_protected_function_*`` helpers.

    Parameters
    ----------
    conn : object
        Connection object exposing ``connection.connection.create_function``.
    """
    import numpy as np

    def _unary(func):
        # Wrap a one-argument numpy function in the NULL-protection helper.
        return lambda x: null_protected_function_1_arg(func, x)

    def _binary(func):
        # Wrap a two-argument numpy function in the NULL-protection helper.
        return lambda x, n: null_protected_function_2_args(func, x, n)

    # (SQL function name, number of arguments, Python callback)
    registrations = (
        ("exp", 1, _unary(np.exp)),
        ("ln", 1, _unary(np.log)),
        ("tanh", 1, _unary(np.tanh)),
        ("power", 2, _binary(np.power)),
        ("sign", 1, _unary(np.sign)),
        ("sqrt", 1, _unary(np.sqrt)),
    )
    for sql_name, n_args, callback in registrations:
        conn.connection.connection.create_function(sql_name, n_args, callback)

# In-memory SQLite database, with the math functions the generated SQL may
# call registered on its connection.
engine = sa.create_engine('sqlite://', echo=False)
conn = engine.connect()
declareAdditionalFunctionsForSQLite(conn)

# One column per feature ("Feature_0", "Feature_1", ...) plus a row
# identifier "KEY"; these are the column names the generated SQL refers to.
lTable = pd.DataFrame(
    digits.data,
    columns=['Feature_' + str(c) for c in range(digits.data.shape[1])],
)
lTable['KEY'] = range(len(lTable))
lTable.to_csv("data/digits.csv", index=False)
lTable.to_sql("INPUT_DATA", conn, if_exists='replace', index=False)


In [8]:
# Run the generated SQL, then order the rows by KEY and renumber the index
# so that row i of the result corresponds to KEY i.
sql_output = (
    pd.read_sql(lSQL, conn)
    .sort_values(by='KEY')
    .reset_index(drop=True)
)
conn.close()

In [9]:
# Fixed seed so the same rows are shown here and for the scikit-learn output.
sql_output.sample(n=12, random_state=1960)

Unnamed: 0,KEY,Score_0,Score_1,Score_2,Score_3,Score_4,Score_5,Score_6,Score_7,Score_8,...,LogProba_2,LogProba_3,LogProba_4,LogProba_5,LogProba_6,LogProba_7,LogProba_8,LogProba_9,Decision,DecisionProba
1197,1197,,,,,,,,,,...,-3.271224,-2.185342,-3.074713,-1.532286,-3.913458,-2.400581,-1.469503,-2.246424,8,0.23004
913,913,,,,,,,,,,...,-2.49632,-2.097181,-3.883939,-2.617019,-3.047992,-3.786127,-0.820853,-2.183164,8,0.440056
893,893,,,,,,,,,,...,-2.764154,-2.486125,-4.001539,-0.71849,-4.412884,-3.060126,-2.164634,-2.968205,5,0.487488
604,604,,,,,,,,,,...,-3.325013,-4.182968,-2.009558,-1.538349,-1.123425,-3.581279,-2.308723,-4.636519,6,0.325164
743,743,,,,,,,,,,...,-5.167743,-5.425719,-0.615005,-3.479554,-4.037963,-1.379316,-3.507207,-3.300613,4,0.540638
556,556,,,,,,,,,,...,-1.689932,-2.412699,-3.587378,-4.484135,-2.86901,-3.602473,-0.967115,-3.418996,8,0.380178
664,664,,,,,,,,,,...,-3.111373,-2.387851,-3.237153,-2.289746,-2.790281,-3.369948,-0.969122,-2.358257,8,0.379416
195,195,,,,,,,,,,...,-3.639786,-4.971824,-3.079982,-4.108785,-0.226434,-5.803139,-3.301036,-5.121011,6,0.797372
692,692,,,,,,,,,,...,-3.625768,-3.617052,-3.603747,-0.359531,-3.612624,-3.345042,-2.542779,-3.533635,5,0.698004
1589,1589,,,,,,,,,,...,-3.479113,-4.513059,-3.068721,-0.702351,-2.655061,-2.595765,-2.939838,-3.16815,5,0.495419


In [10]:
# Number of rows assigned to each class by the SQL execution.
sql_output["Decision"].value_counts()

7    197
9    186
1    184
4    180
6    179
0    178
5    176
2    175
3    175
8    167
Name: Decision, dtype: int64

## Scikit-learn Prediction

In [11]:
# Build a frame with the same column layout as the SQL result:
# KEY, Score_*, Proba_*, LogProba_*, Decision.
skl_outputs = pd.DataFrame()
skl_output_key = pd.DataFrame(list(range(X.shape[0])), columns=['KEY'])
# No score values are computed here: this frame only contributes empty Score_* columns.
skl_output_score = pd.DataFrame(
    columns=['Score_{}'.format(c) for c in range(n_classes)]
)
skl_output_proba = pd.DataFrame(
    clf.predict_proba(X),
    columns=['Proba_{}'.format(c) for c in range(n_classes)],
)
skl_output_log_proba = pd.DataFrame(
    clf.predict_log_proba(X),
    columns=['LogProba_{}'.format(c) for c in range(n_classes)],
)
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])
skl_output = pd.concat(
    [
        skl_output_key,
        skl_output_score,
        skl_output_proba,
        skl_output_log_proba,
        skl_output_decision,
    ],
    axis=1,
)
skl_output.sample(n=12, random_state=1960)


Unnamed: 0,KEY,Score_0,Score_1,Score_2,Score_3,Score_4,Score_5,Score_6,Score_7,Score_8,...,LogProba_1,LogProba_2,LogProba_3,LogProba_4,LogProba_5,LogProba_6,LogProba_7,LogProba_8,LogProba_9,Decision
1197,1197,,,,,,,,,,...,-2.239128,-3.271224,-2.185342,-3.074713,-1.532286,-3.913458,-2.400581,-1.469503,-2.246424,8
913,913,,,,,,,,,,...,-3.128394,-2.49632,-2.097181,-3.883939,-2.617019,-3.047992,-3.786127,-0.820853,-2.183164,8
893,893,,,,,,,,,,...,-2.129655,-2.764154,-2.486125,-4.001539,-0.71849,-4.412884,-3.060126,-2.164634,-2.968205,5
604,604,,,,,,,,,,...,-2.750753,-3.325013,-4.182968,-2.009558,-1.538349,-1.123425,-3.581279,-2.308723,-4.636519,6
743,743,,,,,,,,,,...,-3.187762,-5.167743,-5.425719,-0.615005,-3.479554,-4.037963,-1.379316,-3.507207,-3.300613,4
556,556,,,,,,,,,,...,-1.72456,-1.689932,-2.412699,-3.587378,-4.484135,-2.86901,-3.602473,-0.967115,-3.418996,8
664,664,,,,,,,,,,...,-2.673481,-3.111373,-2.387851,-3.237153,-2.289746,-2.790281,-3.369948,-0.969122,-2.358257,8
195,195,,,,,,,,,,...,-3.013571,-3.639786,-4.971824,-3.079982,-4.108785,-0.226434,-5.803139,-3.301036,-5.121011,6
692,692,,,,,,,,,,...,-3.312214,-3.625768,-3.617052,-3.603747,-0.359531,-3.612624,-3.345042,-2.542779,-3.533635,5
1589,1589,,,,,,,,,,...,-3.708405,-3.479113,-4.513059,-3.068721,-0.702351,-2.655061,-2.595765,-2.939838,-3.16815,5


## Comparing the SQL and Scikit-learn Predictions

In [12]:
# DataFrame.join(on='KEY') matches skl_output['KEY'] against the *index* of
# sql_output, not against its 'KEY' column. The comparison below is therefore
# only meaningful while the index of sql_output equals its KEY values, so
# check that invariant explicitly instead of relying on it silently.
assert (sql_output['KEY'].values == sql_output.index.values).all(), \
    "sql_output must be indexed by KEY (sort by KEY and reset the index) before joining"
sql_skl_join = skl_output.join(sql_output , how='left', on='KEY', lsuffix='_skl', rsuffix='_sql')

In [13]:
# Same seed as the earlier samples, so the same rows are displayed side by side.
sql_skl_join.sample(n=12, random_state=1960)

Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Score_3_skl,Score_4_skl,Score_5_skl,Score_6_skl,Score_7_skl,Score_8_skl,...,LogProba_2_sql,LogProba_3_sql,LogProba_4_sql,LogProba_5_sql,LogProba_6_sql,LogProba_7_sql,LogProba_8_sql,LogProba_9_sql,Decision_sql,DecisionProba
1197,1197,,,,,,,,,,...,-3.271224,-2.185342,-3.074713,-1.532286,-3.913458,-2.400581,-1.469503,-2.246424,8,0.23004
913,913,,,,,,,,,,...,-2.49632,-2.097181,-3.883939,-2.617019,-3.047992,-3.786127,-0.820853,-2.183164,8,0.440056
893,893,,,,,,,,,,...,-2.764154,-2.486125,-4.001539,-0.71849,-4.412884,-3.060126,-2.164634,-2.968205,5,0.487488
604,604,,,,,,,,,,...,-3.325013,-4.182968,-2.009558,-1.538349,-1.123425,-3.581279,-2.308723,-4.636519,6,0.325164
743,743,,,,,,,,,,...,-5.167743,-5.425719,-0.615005,-3.479554,-4.037963,-1.379316,-3.507207,-3.300613,4,0.540638
556,556,,,,,,,,,,...,-1.689932,-2.412699,-3.587378,-4.484135,-2.86901,-3.602473,-0.967115,-3.418996,8,0.380178
664,664,,,,,,,,,,...,-3.111373,-2.387851,-3.237153,-2.289746,-2.790281,-3.369948,-0.969122,-2.358257,8,0.379416
195,195,,,,,,,,,,...,-3.639786,-4.971824,-3.079982,-4.108785,-0.226434,-5.803139,-3.301036,-5.121011,6,0.797372
692,692,,,,,,,,,,...,-3.625768,-3.617052,-3.603747,-0.359531,-3.612624,-3.345042,-2.542779,-3.533635,5,0.698004
1589,1589,,,,,,,,,,...,-3.479113,-4.513059,-3.068721,-0.702351,-2.655061,-2.595765,-2.939838,-3.16815,5,0.495419


In [14]:
# Rows where the SQL decision differs from the scikit-learn decision;
# an empty frame means both implementations agree on every row.
condition = sql_skl_join.Decision_sql.ne(sql_skl_join.Decision_skl)
sql_skl_join.loc[condition]


Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Score_3_skl,Score_4_skl,Score_5_skl,Score_6_skl,Score_7_skl,Score_8_skl,...,LogProba_2_sql,LogProba_3_sql,LogProba_4_sql,LogProba_5_sql,LogProba_6_sql,LogProba_7_sql,LogProba_8_sql,LogProba_9_sql,Decision_sql,DecisionProba
