In [1]:
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pickle, json, requests, base64


## Build a scikit-learn model

In [2]:
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data  
Y = iris.target
# print(iris.DESCR)

In [3]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(X, Y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [4]:
#clf.__dict__

## Generate SQL Code from the Model

In [5]:

def test_ws_sql_gen(pickle_data):
    WS_URL="http://192.168.88.88:1888/model"
    b64_data = base64.b64encode(pickle_data)
    data={"Name":"model1", "PickleData":b64_data , "SQLDialect":"postgresql"}
    r = requests.post(WS_URL, json=data)
    content = r.json()
    # print(content)
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL;


In [6]:
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)
print(lSQL[0:2000])

WITH "DT_node_data" AS 
(SELECT "Values".node_id AS node_id, "Values".feature AS feature, "Values".threshold AS threshold, "Values".count AS count, "Values".depth AS depth, "Values"."Proba_0" AS "Proba_0", "Values"."LogProba_0" AS "LogProba_0", "Values"."Proba_1" AS "Proba_1", "Values"."LogProba_1" AS "LogProba_1", "Values"."Proba_2" AS "Proba_2", "Values"."LogProba_2" AS "LogProba_2", "Values"."Decision" AS "Decision" 
FROM (SELECT 0 AS node_id, 'Feature_2' AS feature, 2.450000047683716 AS threshold, 150 AS count, 0 AS depth, 0.3333333333333333 AS "Proba_0", -1.09861228867 AS "LogProba_0", 0.3333333333333333 AS "Proba_1", -1.09861228867 AS "LogProba_1", 0.3333333333333333 AS "Proba_2", -1.09861228867 AS "LogProba_2", 0 AS "Decision" UNION ALL SELECT 1 AS node_id, CAST(NULL AS VARCHAR(256)) AS feature, CAST(NULL AS FLOAT) AS threshold, 50 AS count, 1 AS depth, 1.0 AS "Proba_0", 0.0 AS "LogProba_0", 0.0 AS "Proba_1", -1.79769313486231e+308 AS "LogProba_1", 0.0 AS "Proba_2", -1.797693134

## Execute the SQL Code

In [7]:
# save the dataset in a database table

#engine = sa.create_engine('sqlite://' , echo=False)
engine = sa.create_engine("postgresql://db:db@localhost/db?port=5432", echo=False)
conn = engine.connect()

lTable = pd.DataFrame(X);
lTable.columns = ['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3']
lTable['KEY'] = range(lTable.shape[0])
lTable.to_sql("INPUT_DATA" , conn,   if_exists='replace', index=False)


In [8]:
sql_output = pd.read_sql(lSQL , conn);


In [9]:
sql_output.sample(12)

Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
128,128,,,,0.0,0.0,1.0,-1.797693e+308,-1.797693e+308,0.0,2
45,45,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
87,87,,,,0.0,1.0,0.0,-1.797693e+308,0.0,-1.797693e+308,1
78,78,,,,0.0,1.0,0.0,-1.797693e+308,0.0,-1.797693e+308,1
16,16,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
0,0,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
112,112,,,,0.0,0.0,1.0,-1.797693e+308,-1.797693e+308,0.0,2
9,9,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
13,13,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
59,59,,,,0.0,1.0,0.0,-1.797693e+308,0.0,-1.797693e+308,1


In [10]:
sql_output.Decision.value_counts()

2    50
1    50
0    50
Name: Decision, dtype: int64

## Scikit-learn Prediction

In [11]:
skl_outputs = pd.DataFrame()
skl_output_key = pd.DataFrame(list(range(X.shape[0])), columns=['KEY']);
skl_output_score = pd.DataFrame(columns=['Score_0', 'Score_1', 'Score_2']);
skl_output_proba = pd.DataFrame(clf.predict_proba(X), columns=['Proba_0', 'Proba_1', 'Proba_2'])
skl_output_log_proba = pd.DataFrame(clf.predict_log_proba(X), columns=['LogProba_0', 'LogProba_1', 'LogProba_2'])
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])
skl_output = pd.concat([skl_output_key, skl_output_score, skl_output_proba, skl_output_log_proba, skl_output_decision] , axis=1)
skl_output.sample(12)


  return np.log(proba)


Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
33,33,,,,1.0,0.0,0.0,0.0,-inf,-inf,0
125,125,,,,0.0,0.0,1.0,-inf,-inf,0.0,2
92,92,,,,0.0,1.0,0.0,-inf,0.0,-inf,1
135,135,,,,0.0,0.0,1.0,-inf,-inf,0.0,2
95,95,,,,0.0,1.0,0.0,-inf,0.0,-inf,1
102,102,,,,0.0,0.0,1.0,-inf,-inf,0.0,2
148,148,,,,0.0,0.0,1.0,-inf,-inf,0.0,2
59,59,,,,0.0,1.0,0.0,-inf,0.0,-inf,1
46,46,,,,1.0,0.0,0.0,0.0,-inf,-inf,0
110,110,,,,0.0,0.0,1.0,-inf,-inf,0.0,2


## Comparing the SQL and Scikit-learn Predictions

In [12]:
sql_skl_join = skl_output.join(sql_output , how='left', on='KEY', lsuffix='_skl', rsuffix='_sql')

In [13]:
sql_skl_join.sample(12)

Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_0_sql,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql
85,85,,,,0.0,1.0,0.0,-inf,0.0,-inf,...,,,,0.0,1.0,0.0,-1.797693e+308,0.0,-1.797693e+308,1
72,72,,,,0.0,1.0,0.0,-inf,0.0,-inf,...,,,,0.0,1.0,0.0,-1.797693e+308,0.0,-1.797693e+308,1
123,123,,,,0.0,0.0,1.0,-inf,-inf,0.0,...,,,,0.0,0.0,1.0,-1.797693e+308,-1.797693e+308,0.0,2
5,5,,,,1.0,0.0,0.0,0.0,-inf,-inf,...,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
16,16,,,,1.0,0.0,0.0,0.0,-inf,-inf,...,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
105,105,,,,0.0,0.0,1.0,-inf,-inf,0.0,...,,,,0.0,0.0,1.0,-1.797693e+308,-1.797693e+308,0.0,2
76,76,,,,0.0,1.0,0.0,-inf,0.0,-inf,...,,,,0.0,1.0,0.0,-1.797693e+308,0.0,-1.797693e+308,1
119,119,,,,0.0,0.0,1.0,-inf,-inf,0.0,...,,,,0.0,0.0,1.0,-1.797693e+308,-1.797693e+308,0.0,2
49,49,,,,1.0,0.0,0.0,0.0,-inf,-inf,...,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
29,29,,,,1.0,0.0,0.0,0.0,-inf,-inf,...,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0


In [14]:
condition = (sql_skl_join.Decision_sql != sql_skl_join.Decision_skl)
sql_skl_join[condition]


Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_0_sql,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql
