In [1]:
import base64
import json
import os
import pickle

import numpy as np
import pandas as pd
import requests
import sqlalchemy as sa


## Build a scikit-learn model

In [2]:
from sklearn import datasets

# Load the Iris dataset and split it into the feature matrix (X) and the
# class labels (Y). Uncomment the last line to read the dataset description.
iris = datasets.load_iris()
X, Y = iris.data, iris.target
# print(iris.DESCR)

In [3]:
import xgboost as xgb

# Small gradient-boosted classifier: 5 estimators, depth 3, fixed seed.
clf = xgb.XGBClassifier(
    n_estimators=5,
    min_child_weight=10,
    max_depth=3,
    seed=1960,
)

# Fit on the full dataset; the fitted estimator is the cell's displayed value.
clf.fit(X, Y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=10, missing=None, n_estimators=5,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1960,
       silent=True, subsample=1)

In [4]:
#clf.__dict__

## Generate SQL Code from the Model

In [5]:

def test_ws_sql_gen(pickle_data):
    WS_URL="https://sklearn2sql.herokuapp.com/model"
    b64_data = base64.b64encode(pickle_data)
    data={"Name":"model1", "PickleData":b64_data , "SQLDialect":"postgresql"}
    r = requests.post(WS_URL, json=data)
    #print(r)
    content = r.json()
    #print(content)
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL;


In [6]:
# Pickle the fitted classifier, send it to the web service, and print the
# SQL text that comes back.
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)
print(lSQL)

WITH "DT_node_lookup" AS 
(SELECT "ADS"."KEY" AS "KEY", CASE WHEN ("ADS"."Feature_2" < 2.450000047683716) THEN 1 ELSE 2 END AS node_id_2 
FROM "INPUT_DATA" AS "ADS"), 
"DT_node_data" AS 
(SELECT "Values".node_id AS node_id, "Values".feature AS feature, "Values".threshold AS threshold, "Values".depth AS depth, "Values".parent_id AS parent_id, "Values"."Score" AS "Score" 
FROM (SELECT 0 AS node_id, 'Feature_2' AS feature, 2.450000047683716 AS threshold, 0 AS depth, CAST(NULL AS INTEGER) AS parent_id, 0.0 AS "Score" UNION ALL SELECT 1 AS node_id, CAST(NULL AS VARCHAR(256)) AS feature, CAST(NULL AS FLOAT) AS threshold, 1 AS depth, 0 AS parent_id, 0.1435406655073166 AS "Score" UNION ALL SELECT 2 AS node_id, CAST(NULL AS VARCHAR(256)) AS feature, CAST(NULL AS FLOAT) AS threshold, 1 AS depth, 0 AS parent_id, -0.07334963977336884 AS "Score") AS "Values"), 
"DT_Output" AS 
(SELECT "DT_node_lookup"."KEY" AS "KEY", "DT_node_lookup".node_id_2 AS node_id_2, "DT_node_data".node_id AS node_id, "DT_no

## Execute the SQL Code

In [7]:
# Save the dataset in a database table so the generated SQL can be run on it.

# The connection URL can be overridden through the SKLEARN2SQL_DB_URL
# environment variable, which keeps real credentials out of the notebook.
# The fallback is the local database URL this notebook originally used.
engine = sa.create_engine(
    os.environ.get("SKLEARN2SQL_DB_URL", "postgresql://db:db@localhost/db?port=5432"),
    echo=False,
)
conn = engine.connect()  # stays open: the query cell that follows uses it, then closes it

# One "Feature_<i>" column per column of X, plus a row-number "KEY" column;
# these are the identifiers the generated SQL refers to.
lTable = pd.DataFrame(X, columns=['Feature_%d' % i for i in range(X.shape[1])])
lTable['KEY'] = range(lTable.shape[0])
# if_exists='replace' drops and recreates the table, so re-running is safe.
lTable.to_sql("INPUT_DATA", conn, if_exists='replace', index=False)


In [8]:
# Run the generated SQL against the uploaded table. The connection is closed
# in `finally` so it is released even when the query fails.
try:
    sql_output = pd.read_sql(lSQL, conn)
finally:
    conn.close()

# Order the rows by KEY and renumber the index from 0.
sql_output = sql_output.sort_values(by='KEY').reset_index(drop=True)

In [9]:
# Reproducible 12-row peek at the SQL results (fixed random_state).
sql_output.sample(n=12, random_state=1960)

Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision,DecisionProba
114,114,,,,0.238787,0.248941,0.512272,-1.432183,-1.390541,-0.668899,2,0.512272
74,74,,,,0.226981,0.538565,0.234453,-1.482887,-0.618846,-1.4505,1,0.538565
9,9,,,,0.554091,0.22302,0.222889,-0.590427,-1.500493,-1.501082,0,0.554091
88,88,,,,0.228278,0.541641,0.230081,-1.477193,-0.613152,-1.469322,1,0.541641
25,25,,,,0.554091,0.22302,0.222889,-0.590427,-1.500493,-1.501082,0,0.554091
5,5,,,,0.554091,0.22302,0.222889,-0.590427,-1.500493,-1.501082,0,0.554091
48,48,,,,0.554091,0.22302,0.222889,-0.590427,-1.500493,-1.501082,0,0.554091
117,117,,,,0.221793,0.231224,0.546982,-1.506009,-1.464367,-0.603339,2,0.546982
83,83,,,,0.207686,0.415291,0.377024,-1.571729,-0.878777,-0.975447,1,0.415291
105,105,,,,0.221793,0.231224,0.546982,-1.506009,-1.464367,-0.603339,2,0.546982


In [10]:
# Number of rows the SQL code assigned to each class.
sql_output['Decision'].value_counts()

1    51
0    50
2    49
Name: Decision, dtype: int64

## Scikit-learn Prediction

In [11]:
# Build a frame from the fitted classifier that mirrors the SQL output's
# columns (except DecisionProba). Only KEY, Proba_* and Decision are filled;
# Score_* and LogProba_* are empty placeholder columns, so they show as NaN.
skl_output_key = pd.DataFrame(list(range(X.shape[0])), columns=['KEY'])
skl_output_score = pd.DataFrame(columns=['Score_0', 'Score_1', 'Score_2'])
skl_output_proba = pd.DataFrame(clf.predict_proba(X), columns=['Proba_0', 'Proba_1', 'Proba_2'])
skl_output_log_proba = pd.DataFrame(columns=['LogProba_0', 'LogProba_1', 'LogProba_2'])
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])
skl_output = pd.concat(
    [skl_output_key, skl_output_score, skl_output_proba, skl_output_log_proba, skl_output_decision],
    axis=1,
)
# Same sample size and seed as the SQL-side preview.
skl_output.sample(12, random_state=1960)


Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
114,114,,,,0.238787,0.248941,0.512272,,,,2
74,74,,,,0.226981,0.538566,0.234453,,,,1
9,9,,,,0.554091,0.22302,0.222889,,,,0
88,88,,,,0.228278,0.541641,0.230081,,,,1
25,25,,,,0.554091,0.22302,0.222889,,,,0
5,5,,,,0.554091,0.22302,0.222889,,,,0
48,48,,,,0.554091,0.22302,0.222889,,,,0
117,117,,,,0.221793,0.231224,0.546982,,,,2
83,83,,,,0.207686,0.415291,0.377024,,,,1
105,105,,,,0.221793,0.231224,0.546982,,,,2


## Comparing the SQL and Scikit-learn Predictions

In [12]:
# DataFrame.join(on='KEY') matches the caller's KEY column against the *index*
# of the other frame, not against its KEY column. Index sql_output by KEY
# explicitly rather than relying on its positional index happening to equal KEY.
# drop=False keeps the KEY column, so the result still has KEY_skl and KEY_sql.
sql_skl_join = skl_output.join(
    sql_output.set_index('KEY', drop=False).rename_axis(None),
    how='left',
    on='KEY',
    lsuffix='_skl',
    rsuffix='_sql',
)

In [13]:
# Fixed seed so the preview shows the same rows on every run, matching the
# other sample cells in this notebook.
sql_skl_join.sample(12, random_state=1960)

Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql,DecisionProba
47,47,,,,0.554091,0.22302,0.222889,,,,...,,,0.554091,0.22302,0.222889,-0.590427,-1.500493,-1.501082,0,0.554091
58,58,,,,0.228858,0.525042,0.2461,,,,...,,,0.228858,0.525042,0.2461,-1.474653,-0.644277,-1.402017,1,0.525042
122,122,,,,0.221793,0.231224,0.546982,,,,...,,,0.221793,0.231224,0.546982,-1.506009,-1.464367,-0.603339,2,0.546982
34,34,,,,0.554091,0.22302,0.222889,,,,...,,,0.554091,0.22302,0.222889,-0.590427,-1.500493,-1.501082,0,0.554091
26,26,,,,0.554091,0.22302,0.222889,,,,...,,,0.554091,0.22302,0.222889,-0.590427,-1.500493,-1.501082,0,0.554091
38,38,,,,0.554091,0.22302,0.222889,,,,...,,,0.554091,0.22302,0.222889,-0.590427,-1.500493,-1.501082,0,0.554091
107,107,,,,0.221793,0.231224,0.546982,,,,...,,,0.221793,0.231224,0.546982,-1.506009,-1.464367,-0.603339,2,0.546982
2,2,,,,0.554091,0.22302,0.222889,,,,...,,,0.554091,0.22302,0.222889,-0.590427,-1.500493,-1.501082,0,0.554091
149,149,,,,0.238787,0.248941,0.512272,,,,...,,,0.238787,0.248941,0.512272,-1.432183,-1.390541,-0.668899,2,0.512272
143,143,,,,0.221793,0.231224,0.546982,,,,...,,,0.221793,0.231224,0.546982,-1.506009,-1.464367,-0.603339,2,0.546982


In [14]:
# Rows where the SQL decision and the scikit-learn decision disagree;
# the displayed frame is empty when the two agree on every row.
condition = sql_skl_join['Decision_sql'].ne(sql_skl_join['Decision_skl'])
sql_skl_join.loc[condition]


Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql,DecisionProba
