In [1]:
import base64
import json
import os
import pickle

import numpy as np
import pandas as pd
import requests
import sqlalchemy as sa


## Build a scikit-learn model

In [2]:
from sklearn import datasets

# Load the iris dataset once; X is what the classifier is fitted on,
# Y holds the corresponding targets.
iris = datasets.load_iris()
X, Y = iris.data, iris.target
# print(iris.DESCR)

In [3]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

# Seeded RNG handed to the underlying SVC.
random_state = np.random.RandomState(0)
clf = OneVsRestClassifier(
    estimator=SVC(probability=False, random_state=random_state),
)

# Fit on the whole dataset; as the cell's last expression, the fitted
# estimator is what the notebook displays.
clf.fit(X, Y)

OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False,
  random_state=<mtrand.RandomState object at 0x7ff6c7c0c558>,
  shrinking=True, tol=0.001, verbose=False),
          n_jobs=1)

In [4]:
#clf.__dict__

## Generate SQL Code from the Model

In [5]:

def test_ws_sql_gen(pickle_data):
    WS_URL="https://sklearn2sql.herokuapp.com/model"
    b64_data = base64.b64encode(pickle_data)
    data={"Name":"model1", "PickleData":b64_data , "SQLDialect":"postgresql"}
    r = requests.post(WS_URL, json=data)
    content = r.json()
    # print(content)
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL;


In [6]:
# Serialize the fitted classifier, send it to the web service and keep
# the SQL text it returns.
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)
# Only the first 4000 characters are printed.
print(lSQL[:4000])

WITH "SV_data" AS 
(SELECT "Values".sv_idx AS sv_idx, "Values".dual_coeff AS dual_coeff, "Values".sv_0 AS sv_0, "Values".sv_1 AS sv_1, "Values".sv_2 AS sv_2, "Values".sv_3 AS sv_3 
FROM (SELECT 0 AS sv_idx, -0.213687020099 AS dual_coeff, 7.0 AS sv_0, 3.2 AS sv_1, 4.7 AS sv_2, 1.4 AS sv_3 UNION ALL SELECT 1 AS sv_idx, -0.421524254498 AS dual_coeff, 4.9 AS sv_0, 2.4 AS sv_1, 3.3 AS sv_2, 1.0 AS sv_3 UNION ALL SELECT 2 AS sv_idx, -1.0 AS dual_coeff, 5.1 AS sv_0, 2.5 AS sv_1, 3.0 AS sv_2, 1.1 AS sv_3 UNION ALL SELECT 3 AS sv_idx, -0.202228133014 AS dual_coeff, 6.3 AS sv_0, 3.3 AS sv_1, 6.0 AS sv_2, 2.5 AS sv_3 UNION ALL SELECT 4 AS sv_idx, -0.0696490664691 AS dual_coeff, 5.8 AS sv_0, 2.8 AS sv_1, 5.1 AS sv_2, 2.4 AS sv_3 UNION ALL SELECT 5 AS sv_idx, -0.00453627448611 AS dual_coeff, 7.7 AS sv_0, 3.8 AS sv_1, 6.7 AS sv_2, 2.2 AS sv_3 UNION ALL SELECT 6 AS sv_idx, -0.37172099945 AS dual_coeff, 7.7 AS sv_0, 2.6 AS sv_1, 6.9 AS sv_2, 2.3 AS sv_3 UNION ALL SELECT 7 AS sv_idx, -0.289211353716 AS

## Execute the SQL Code

In [7]:
# save the dataset in a database table

# The connection URL is read from the SKLEARN2SQL_DB_URL environment variable
# when it is set, so credentials do not have to be written into the notebook.
# The fallback is the URL this cell used before.
# NOTE(review): lSQL is requested with the "postgresql" dialect by default, so
# an overriding URL presumably needs to point at a PostgreSQL server - confirm.
#engine = sa.create_engine('sqlite://' , echo=False)
DB_URL = os.environ.get("SKLEARN2SQL_DB_URL", "postgresql://db:db@localhost/db?port=5432")
engine = sa.create_engine(DB_URL, echo=False)
conn = engine.connect()

# One column per feature plus a KEY column numbering the rows from 0.
lTable = pd.DataFrame(X, columns=['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3'])
lTable['KEY'] = range(lTable.shape[0])
# if_exists='replace' overwrites an existing INPUT_DATA table.
lTable.to_sql("INPUT_DATA", conn, if_exists='replace', index=False)


In [8]:
# Execute the generated SQL, order the rows by KEY with a fresh 0..n-1 index,
# then close the connection opened earlier.
sql_output = (
    pd.read_sql(lSQL, conn)
    .sort_values(by='KEY')
    .reset_index(drop=True)
)
conn.close()

In [9]:
# Fixed seed so the same rows are drawn as from the scikit-learn frame.
sql_output.sample(n=12, random_state=1960)

Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision,DecisionProba
114,114,-1.000584,-1.649375,1.539308,,,,,,,2,
74,74,-1.103844,1.700273,-1.735287,,,,,,,1,
9,9,1.164682,-1.196761,-1.159309,,,,,,,0,
88,88,-1.178317,1.559258,-1.571135,,,,,,,1,
25,25,1.068107,-1.075068,-1.200005,,,,,,,0,
5,5,1.077149,-1.089247,-1.195726,,,,,,,0,
48,48,1.193945,-1.243836,-1.158205,,,,,,,0,
117,117,-1.000132,-1.277026,1.116914,,,,,,,2,
83,83,-1.105858,-0.425507,0.476761,,,,,,,2,
105,105,-1.093245,-1.545382,1.386168,,,,,,,2,


In [10]:
# Number of rows assigned to each class by the SQL code.
sql_output['Decision'].value_counts()

2    52
0    50
1    48
Name: Decision, dtype: int64

## Scikit-learn Prediction

In [11]:
# Build a frame with the same column names as the SQL output, computed
# directly with scikit-learn.
skl_outputs = pd.DataFrame()  # NOTE(review): not referenced by any visible cell
skl_output_key = pd.DataFrame({'KEY': list(range(X.shape[0]))})
skl_output_score = pd.DataFrame(
    clf.decision_function(X),
    columns=['Score_0', 'Score_1', 'Score_2'],
)
# The SVC was created with probability=False, so the probability and
# log-probability frames carry column names only and no rows.
skl_output_proba = pd.DataFrame(columns=['Proba_0', 'Proba_1', 'Proba_2'])
#skl_output_log_proba = pd.DataFrame(clf.predict_log_proba(X), columns=['LogProba_0', 'LogProba_1', 'LogProba_2'])
skl_output_log_proba = pd.DataFrame(columns=['LogProba_0', 'LogProba_1', 'LogProba_2'])
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])
skl_output = pd.concat(
    [
        skl_output_key,
        skl_output_score,
        skl_output_proba,
        skl_output_log_proba,
        skl_output_decision,
    ],
    axis=1,
)
skl_output.sample(n=12, random_state=1960)


Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
114,114,-1.000584,-1.649375,1.539308,,,,,,,2
74,74,-1.103844,1.700273,-1.735287,,,,,,,1
9,9,1.164682,-1.196761,-1.159309,,,,,,,0
88,88,-1.178317,1.559258,-1.571135,,,,,,,1
25,25,1.068107,-1.075068,-1.200005,,,,,,,0
5,5,1.077149,-1.089247,-1.195726,,,,,,,0
48,48,1.193945,-1.243836,-1.158205,,,,,,,0
117,117,-1.000132,-1.277026,1.116914,,,,,,,2
83,83,-1.105858,-0.425507,0.476761,,,,,,,2
105,105,-1.093245,-1.545382,1.386168,,,,,,,2


## Comparing the SQL and Scikit-learn Predictions

In [12]:
# DataFrame.join with `on` matches skl_output['KEY'] against sql_output's
# *index*; the rows line up because sql_output was sorted by KEY and
# re-indexed from 0. Overlapping column names get the _skl / _sql suffixes.
sql_skl_join = skl_output.join(
    sql_output,
    on='KEY',
    how='left',
    lsuffix='_skl',
    rsuffix='_sql',
)

In [13]:
# Same seed as the earlier samples, so the same rows are shown side by side.
sql_skl_join.sample(n=12, random_state=1960)

Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql,DecisionProba
114,114,-1.000584,-1.649375,1.539308,,,,,,,...,-1.649375,1.539308,,,,,,,2,
74,74,-1.103844,1.700273,-1.735287,,,,,,,...,1.700273,-1.735287,,,,,,,1,
9,9,1.164682,-1.196761,-1.159309,,,,,,,...,-1.196761,-1.159309,,,,,,,0,
88,88,-1.178317,1.559258,-1.571135,,,,,,,...,1.559258,-1.571135,,,,,,,1,
25,25,1.068107,-1.075068,-1.200005,,,,,,,...,-1.075068,-1.200005,,,,,,,0,
5,5,1.077149,-1.089247,-1.195726,,,,,,,...,-1.089247,-1.195726,,,,,,,0,
48,48,1.193945,-1.243836,-1.158205,,,,,,,...,-1.243836,-1.158205,,,,,,,0,
117,117,-1.000132,-1.277026,1.116914,,,,,,,...,-1.277026,1.116914,,,,,,,2,
83,83,-1.105858,-0.425507,0.476761,,,,,,,...,-0.425507,0.476761,,,,,,,2,
105,105,-1.093245,-1.545382,1.386168,,,,,,,...,-1.545382,1.386168,,,,,,,2,


In [14]:
# Rows on which the SQL decision differs from the scikit-learn decision;
# an empty frame means the two agree on every row.
condition = sql_skl_join['Decision_sql'] != sql_skl_join['Decision_skl']
sql_skl_join[condition]


Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql,DecisionProba
