In [1]:
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pickle, json, requests, base64


## Build a scikit-learn model

In [2]:
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data  
Y = iris.target
# print(iris.DESCR)

In [3]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X, Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [4]:
#clf.__dict__

## Generate SQL Code from the Model

In [5]:

def test_ws_sql_gen(pickle_data):
    WS_URL="http://192.168.88.88:1888/model" # "https://sklearn2sql.herokuapp.com/"
    b64_data = base64.b64encode(pickle_data)
    data={"Name":"model1", "PickleData":b64_data , "SQLDialect":"postgresql"}
    r = requests.post(WS_URL, json=data)
    content = r.json()
    # print(content)
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL;


In [6]:
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)
print(lSQL[0:2000])

WITH "DT_node_lookup" AS 
(SELECT "ADS"."KEY" AS "KEY", CASE WHEN ("ADS"."Feature_0" <= 5.550000190734863) THEN CASE WHEN ("ADS"."Feature_3" <= 0.800000011920929) THEN 2 ELSE 3 END ELSE CASE WHEN ("ADS"."Feature_2" <= 5.050000190734863) THEN CASE WHEN ("ADS"."Feature_1" <= 3.5999999046325684) THEN CASE WHEN ("ADS"."Feature_3" <= 1.75) THEN 7 ELSE 8 END ELSE 9 END ELSE CASE WHEN ("ADS"."Feature_0" <= 6.099999904632568) THEN CASE WHEN ("ADS"."Feature_3" <= 1.7000000476837158) THEN 12 ELSE 13 END ELSE 14 END END END AS node_id_2 
FROM "INPUT_DATA" AS "ADS"), 
"DT_node_data" AS 
(SELECT "Values".node_id AS node_id, "Values".feature AS feature, "Values".threshold AS threshold, "Values".count AS count, "Values".depth AS depth, "Values".parent_id AS parent_id, "Values"."Proba_0.0" AS "Proba_0.0", "Values"."LogProba_0.0" AS "LogProba_0.0", "Values"."Proba_1.0" AS "Proba_1.0", "Values"."LogProba_1.0" AS "LogProba_1.0", "Values"."Proba_2.0" AS "Proba_2.0", "Values"."LogProba_2.0" AS "LogProba_2.

## Execute the SQL Code

In [7]:
# save the dataset in a database table

#engine = sa.create_engine('sqlite://' , echo=False)
engine = sa.create_engine("postgresql://db:db@localhost/db?port=5432", echo=False)
conn = engine.connect()

lTable = pd.DataFrame(X);
lTable.columns = ['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3']
lTable['KEY'] = range(lTable.shape[0])
lTable.to_sql("INPUT_DATA" , conn,   if_exists='replace', index=False)


In [8]:
sql_output = pd.read_sql(lSQL , conn);


In [9]:
sql_output.sample(12, random_state=1960)

Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
114,114,,,,0.0,0.0,1.0,-1.797693e+308,-1.797693e+308,0.0,2
74,74,,,,0.0,1.0,0.0,-1.797693e+308,0.0,-1.797693e+308,1
9,9,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
88,88,,,,0.0,1.0,0.0,-1.797693e+308,0.0,-1.797693e+308,1
25,25,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
5,5,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
48,48,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
117,117,,,,0.0,0.0,1.0,-1.797693e+308,-1.797693e+308,0.0,2
83,83,,,,0.0,0.7,0.3,-1.797693e+308,-0.3566749,-1.203973,1
105,105,,,,0.0,0.0,1.0,-1.797693e+308,-1.797693e+308,0.0,2


In [10]:
sql_output.Decision.value_counts()

2    51
0    50
1    49
Name: Decision, dtype: int64

## Scikit-learn Prediction

In [11]:
skl_outputs = pd.DataFrame()
skl_output_key = pd.DataFrame(list(range(X.shape[0])), columns=['KEY']);
skl_output_score = pd.DataFrame(columns=['Score_0', 'Score_1', 'Score_2']);
skl_output_proba = pd.DataFrame(clf.predict_proba(X), columns=['Proba_0', 'Proba_1', 'Proba_2'])
skl_output_log_proba = pd.DataFrame(clf.predict_log_proba(X), columns=['LogProba_0', 'LogProba_1', 'LogProba_2'])
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])
skl_output = pd.concat([skl_output_key, skl_output_score, skl_output_proba, skl_output_log_proba, skl_output_decision] , axis=1)
skl_output.sample(12, random_state=1960)


  return np.log(proba)


Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
114,114,,,,0.0,0.0,1.0,-inf,-inf,0.0,2
74,74,,,,0.0,1.0,0.0,-inf,0.0,-inf,1
9,9,,,,1.0,0.0,0.0,0.0,-inf,-inf,0
88,88,,,,0.0,1.0,0.0,-inf,0.0,-inf,1
25,25,,,,1.0,0.0,0.0,0.0,-inf,-inf,0
5,5,,,,1.0,0.0,0.0,0.0,-inf,-inf,0
48,48,,,,1.0,0.0,0.0,0.0,-inf,-inf,0
117,117,,,,0.0,0.0,1.0,-inf,-inf,0.0,2
83,83,,,,0.0,0.7,0.3,-inf,-0.356675,-1.203973,1
105,105,,,,0.0,0.0,1.0,-inf,-inf,0.0,2


## Comparing the SQL and Scikit-learn Predictions

In [12]:
sql_skl_join = skl_output.join(sql_output , how='left', on='KEY', lsuffix='_skl', rsuffix='_sql')

In [13]:
sql_skl_join.sample(12)

Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_0_sql,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql
111,111,,,,0.0,0.0,1.0,-inf,-inf,0.0,...,,,,0.0,0.0,1.0,-1.797693e+308,-1.797693e+308,0.0,2
122,122,,,,0.0,0.0,1.0,-inf,-inf,0.0,...,,,,0.0,0.0,1.0,-1.797693e+308,-1.797693e+308,0.0,2
75,75,,,,0.0,1.0,0.0,-inf,0.0,-inf,...,,,,0.0,1.0,0.0,-1.797693e+308,0.0,-1.797693e+308,1
101,101,,,,0.0,0.0,1.0,-inf,-inf,0.0,...,,,,0.0,0.0,1.0,-1.797693e+308,-1.797693e+308,0.0,2
124,124,,,,0.0,0.0,1.0,-inf,-inf,0.0,...,,,,0.0,0.0,1.0,-1.797693e+308,-1.797693e+308,0.0,2
0,0,,,,1.0,0.0,0.0,0.0,-inf,-inf,...,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
104,104,,,,0.0,0.0,1.0,-inf,-inf,0.0,...,,,,0.0,0.0,1.0,-1.797693e+308,-1.797693e+308,0.0,2
24,24,,,,1.0,0.0,0.0,0.0,-inf,-inf,...,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
146,146,,,,0.0,0.0,1.0,-inf,-inf,0.0,...,,,,0.0,0.0,1.0,-1.797693e+308,-1.797693e+308,0.0,2
71,71,,,,0.0,1.0,0.0,-inf,0.0,-inf,...,,,,0.0,1.0,0.0,-1.797693e+308,0.0,-1.797693e+308,1


In [14]:
condition = (sql_skl_join.Decision_sql != sql_skl_join.Decision_skl)
sql_skl_join[condition]


Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_0_sql,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql
