In [1]:
import base64
import json
import os
import pickle

import numpy as np
import pandas as pd
import requests
import sqlalchemy as sa


## Build a scikit-learn model

In [2]:
from sklearn import datasets

# Load the diabetes dataset bundled with scikit-learn and split it into the
# feature matrix (X) and the regression target (Y) used by every later cell.
ds = datasets.load_diabetes()
X = ds.data  
Y = ds.target
# Uncomment to read the dataset description:
# print(ds.DESCR)

In [3]:
from sklearn.svm import SVR

# NOTE(review): random_state is created here but is not passed to SVR() and is
# not referenced anywhere else in the visible notebook - it looks like a
# leftover; confirm before removing.
random_state = np.random.RandomState(0)
# Support-vector regressor with all-default hyper-parameters.
clf = SVR()


# Fit on the full dataset (no train/test split): the notebook only compares
# scikit-learn predictions with the SQL translation of the same fitted model.
clf.fit(X, Y)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [4]:
# clf.__dict__


## Generate SQL Code from the Model

In [5]:

def test_ws_sql_gen(pickle_data):
    WS_URL="https://sklearn2sql.herokuapp.com/model"
    b64_data = base64.b64encode(pickle_data)
    data={"Name":"model1", "PickleData":b64_data , "SQLDialect":"postgresql"}
    r = requests.post(WS_URL, json=data)
    content = r.json()
    # print(content)
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL;


In [6]:
# Serialize the fitted model; the pickle bytes are what test_ws_sql_gen posts
# to the external web service.
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)
# The generated statement is long; show only its first 2000 characters.
print(lSQL[0:2000])

WITH "SV_data" AS 
(SELECT "Values".sv_idx AS sv_idx, "Values".dual_coeff AS dual_coeff, "Values".sv_0 AS sv_0, "Values".sv_1 AS sv_1, "Values".sv_2 AS sv_2, "Values".sv_3 AS sv_3, "Values".sv_4 AS sv_4, "Values".sv_5 AS sv_5, "Values".sv_6 AS sv_6, "Values".sv_7 AS sv_7, "Values".sv_8 AS sv_8, "Values".sv_9 AS sv_9 
FROM (SELECT 0 AS sv_idx, 1.0 AS dual_coeff, 0.0380759064334 AS sv_0, 0.0506801187398 AS sv_1, 0.0616962065187 AS sv_2, 0.021872354995 AS sv_3, -0.0442234984244 AS sv_4, -0.0348207628377 AS sv_5, -0.043400845652 AS sv_6, -0.00259226199818 AS sv_7, 0.0199084208763 AS sv_8, -0.0176461251598 AS sv_9 UNION ALL SELECT 1 AS sv_idx, -1.0 AS dual_coeff, -0.00188201652779 AS sv_0, -0.044641636507 AS sv_1, -0.0514740612388 AS sv_2, -0.0263278347174 AS sv_3, -0.00844872411122 AS sv_4, -0.0191633397482 AS sv_5, 0.0744115640788 AS sv_6, -0.0394933828741 AS sv_7, -0.0683297436244 AS sv_8, -0.0922040496268 AS sv_9 UNION ALL SELECT 2 AS sv_idx, 1.0 AS dual_coeff, 0.0852989062967 AS sv_0, 

## Execute the SQL Code

In [7]:
# Save the dataset in a database table so the generated SQL can be run on it.

# The connection URL can be overridden with the SKLEARN2SQL_DB_URL environment
# variable so that real credentials never have to be written into the
# notebook; the fallback is the local demo database used originally.
DB_URL = os.environ.get("SKLEARN2SQL_DB_URL",
                        "postgresql://db:db@localhost/db?port=5432")
engine = sa.create_engine(DB_URL, echo=False)
# Kept open on purpose: the following cell queries through this connection
# and closes it.
conn = engine.connect()

# One column per feature plus a KEY column holding the row number, which is
# later used to line the SQL predictions up with the scikit-learn ones.
lTable = pd.DataFrame(X)
lTable.columns = ['Feature_' + str(c) for c in range(X.shape[1])]
lTable['KEY'] = range(lTable.shape[0])
lTable.to_sql("INPUT_DATA", conn, if_exists='replace', index=False)


In [8]:
# Run the generated SQL and order the result by KEY with a fresh 0..n-1 index.
# The connection is closed in `finally` so it is released even when the query
# fails.
try:
    sql_output = (
        pd.read_sql(lSQL, conn)
        .sort_values(by='KEY')
        .reset_index(drop=True)
    )
finally:
    conn.close()


In [9]:
# Fixed seed so the same rows are shown on every run.
sql_output.sample(12, random_state=1960)

Unnamed: 0,KEY,Estimator
417,417,140.544419
411,411,140.378884
296,296,140.236823
64,64,140.473991
74,74,140.801534
292,292,140.145841
251,251,141.512181
351,351,140.201124
305,305,140.710223
382,382,141.060614


In [10]:
# Summary statistics of the predictions computed by the database.
sql_output.describe()

Unnamed: 0,KEY,Estimator
count,442.0,442.0
mean,220.5,140.695142
std,127.738666,0.368948
min,0.0,139.925357
25%,110.25,140.414653
50%,220.5,140.708991
75%,330.75,140.97554
max,441.0,141.644468


## Scikit-learn Prediction

In [11]:
# Reference predictions computed by scikit-learn itself, keyed by row number
# in the same way as the INPUT_DATA table.
skl_output_key = pd.DataFrame({'KEY': list(range(X.shape[0]))})
skl_output = pd.DataFrame(
    {'KEY': skl_output_key['KEY'], 'Estimator': clf.predict(X)},
    columns=['KEY', 'Estimator'],
)
# Same sample size and seed as used for the SQL output.
skl_output.sample(12, random_state=1960)


Unnamed: 0,KEY,Estimator
417,417,140.544419
411,411,140.378884
296,296,140.236823
64,64,140.473991
74,74,140.801534
292,292,140.145841
251,251,141.512181
351,351,140.201124
305,305,140.710223
382,382,141.060614


## Comparing the SQL and Scikit-learn Predictions

In [12]:
# Line the two prediction sets up on KEY.
# DataFrame.join(on=...) matches the caller's column against the *index* of
# the other frame, so index sql_output by KEY explicitly instead of relying on
# its positional index happening to equal KEY. drop=False keeps the KEY column
# so the result still contains both KEY_skl and KEY_sql.
sql_skl_join = skl_output.join(
    sql_output.set_index('KEY', drop=False),
    how='left', on='KEY', lsuffix='_skl', rsuffix='_sql'
)
# Signed difference between the database and the scikit-learn predictions.
sql_skl_join['Error'] = sql_skl_join.Estimator_sql - sql_skl_join.Estimator_skl

In [13]:
# First rows of the side-by-side comparison, including the Error column.
sql_skl_join.head(12)

Unnamed: 0,KEY_skl,Estimator_skl,KEY_sql,Estimator_sql,Error
0,0,140.901365,0,140.901365,3.317666e-10
1,1,140.123226,1,140.123226,3.317382e-10
2,2,140.79657,2,140.79657,3.313687e-10
3,3,140.716535,3,140.716535,3.315108e-10
4,4,140.53684,4,140.53684,3.31454e-10
5,5,140.062743,5,140.062743,3.311129e-10
6,6,140.278201,6,140.278201,3.316813e-10
7,7,140.88511,7,140.88511,3.313119e-10
8,8,140.797221,8,140.797221,3.314256e-10
9,9,140.794286,9,140.794286,3.315108e-10


In [14]:
# Summary statistics of the comparison; the Error column shows how far the SQL
# predictions are from the scikit-learn ones.
sql_skl_join.describe()


Unnamed: 0,KEY_skl,Estimator_skl,KEY_sql,Estimator_sql,Error
count,442.0,442.0,442.0,442.0,442.0
mean,220.5,140.695142,220.5,140.695142,3.312856e-10
std,127.738666,0.368948,127.738666,0.368948,2.844894e-13
min,0.0,139.925357,0.0,139.925357,3.307719e-10
25%,110.25,140.414653,110.25,140.414653,3.310276e-10
50%,220.5,140.708991,220.5,140.708991,3.313119e-10
75%,330.75,140.97554,330.75,140.97554,3.315392e-10
max,441.0,141.644468,441.0,141.644468,3.318235e-10
