In [1]:
import base64
import json
import os
import pickle

import numpy as np
import pandas as pd
import requests
import sqlalchemy as sa


## Build a scikit-learn model

In [2]:
from sklearn import datasets

# Load the diabetes regression dataset and split it into features and target.
ds = datasets.load_diabetes()
X, Y = ds.data, ds.target
# Uncomment to read the dataset description:
# print(ds.DESCR)

In [3]:
from sklearn.linear_model import Ridge

# Fit a ridge regression with default hyper-parameters on the full dataset.
# No RandomState is created here: the previous one was never passed to the
# estimator, so it had no effect on the fit.
clf = Ridge()

clf.fit(X, Y)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [4]:
# Show the fitted estimator's attributes (hyper-parameters and learned values).
vars(clf)


{'alpha': 1.0,
 'coef_': array([  29.46574564,  -83.15488546,  306.35162706,  201.62943384,
           5.90936896,  -29.51592665, -152.04046539,  117.31171538,
         262.94499533,  111.878718  ]),
 'copy_X': True,
 'fit_intercept': True,
 'intercept_': 152.13348416289622,
 'max_iter': None,
 'n_iter_': None,
 'normalize': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.001}

## Generate SQL Code from the Model

In [5]:

def test_ws_sql_gen(pickle_data):
    WS_URL="https://sklearn2sql.herokuapp.com/model"
    b64_data = base64.b64encode(pickle_data)
    data={"Name":"model1", "PickleData":b64_data , "SQLDialect":"postgresql"}
    r = requests.post(WS_URL, json=data)
    content = r.json()
    # print(content)
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL;


In [6]:
# Serialize the fitted model and have the web service translate it to SQL.
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)

# Only the first 2000 characters are printed.
print(lSQL[:2000])

WITH linear_model_cte AS 
(SELECT "ADS"."KEY" AS "KEY", 29.465745644229408 * "ADS"."Feature_0" + -83.15488546324998 * "ADS"."Feature_1" + 306.3516270563724 * "ADS"."Feature_2" + 201.62943383870245 * "ADS"."Feature_3" + 5.909368961868564 * "ADS"."Feature_4" + -29.515926646857697 * "ADS"."Feature_5" + -152.04046539318395 * "ADS"."Feature_6" + 117.3117153820285 * "ADS"."Feature_7" + 262.944995326854 * "ADS"."Feature_8" + 111.87871800112701 * "ADS"."Feature_9" + 152.133484163 AS "Estimator" 
FROM "INPUT_DATA" AS "ADS")
 SELECT linear_model_cte."KEY" AS "KEY", linear_model_cte."Estimator" AS "Estimator" 
FROM linear_model_cte


## Execute the SQL Code

In [7]:
# Save the dataset in a database table so the generated SQL can be run on it.

# The connection URL can be overridden with the SKLEARN2SQL_DB_URL environment
# variable, which keeps real credentials out of the notebook. The fallback is
# the local development database this notebook was written against.
DB_URL = os.environ.get("SKLEARN2SQL_DB_URL", "postgresql://db:db@localhost/db?port=5432")
engine = sa.create_engine(DB_URL, echo=False)
conn = engine.connect()

# One column per feature, named Feature_0 .. Feature_{n-1} to match the
# column names referenced by the generated SQL.
feature_names = ['Feature_' + str(c) for c in range(X.shape[1])]
lTable = pd.DataFrame(X, columns=feature_names)
# KEY is the 0-based row position; it identifies each row in the SQL output.
lTable['KEY'] = range(lTable.shape[0])
lTable.to_sql("INPUT_DATA", conn, if_exists='replace', index=False)


In [8]:
# Run the generated SQL, order the rows by KEY with a fresh 0..n-1 index,
# then release the database connection.
sql_output = (
    pd.read_sql(lSQL, conn)
    .sort_values(by='KEY')
    .reset_index(drop=True)
)
conn.close()

In [9]:
# Fixed seed so the same rows are drawn on every run.
sql_output.sample(n=12, random_state=1960)

Unnamed: 0,KEY,Estimator
417,417,145.911386
411,411,138.544138
296,296,100.400942
64,64,127.389828
74,74,158.327138
292,292,103.504063
251,251,241.260154
351,351,104.225791
305,305,142.345257
382,382,210.97894


In [10]:
# Summary statistics of the SQL-side results.
sql_output.describe()

Unnamed: 0,KEY,Estimator
count,442.0,442.0
mean,220.5,152.133484
std,127.738666,38.663073
min,0.0,73.293741
25%,110.25,123.096865
50%,220.5,151.681327
75%,330.75,178.741098
max,441.0,257.555305


## Scikit-learn Prediction

In [11]:
# Predict with scikit-learn on the same rows. KEY is the 0-based row position
# in X, so each prediction can be matched to a row by KEY.
# The frame is built in a single constructor call; `columns` pins the column
# order explicitly.
skl_output = pd.DataFrame(
    {'KEY': range(X.shape[0]), 'Estimator': clf.predict(X)},
    columns=['KEY', 'Estimator'],
)
# Fixed seed so the same rows are drawn on every run.
skl_output.sample(12, random_state=1960)


Unnamed: 0,KEY,Estimator
417,417,145.911386
411,411,138.544138
296,296,100.400942
64,64,127.389828
74,74,158.327138
292,292,103.504063
251,251,241.260154
351,351,104.225791
305,305,142.345257
382,382,210.97894


## Comparing the SQL and Scikit-learn Predictions

In [12]:
# DataFrame.join(on='KEY') matches this frame's KEY column against the *index*
# of the other frame, not against its KEY column. Index the SQL results by KEY
# explicitly so the match does not depend on their row order.
# drop=False keeps KEY as a column, so the result still has KEY_skl and KEY_sql;
# rename_axis(None) leaves the index unnamed so 'KEY' only ever means the column.
sql_by_key = sql_output.set_index('KEY', drop=False).rename_axis(None)
sql_skl_join = skl_output.join(sql_by_key, how='left', on='KEY', lsuffix='_skl', rsuffix='_sql')
# Signed difference between the SQL and scikit-learn predictions.
sql_skl_join['Error'] = sql_skl_join['Estimator_sql'] - sql_skl_join['Estimator_skl']

In [13]:
# First 12 rows of the side-by-side comparison, including the Error column.
sql_skl_join.head(12)

Unnamed: 0,KEY_skl,Estimator_skl,KEY_sql,Estimator_sql,Error
0,0,182.673573,0,182.673573,1.032845e-10
1,1,90.999027,1,90.999027,1.037819e-10
2,2,166.114007,2,166.114007,1.037108e-10
3,3,156.035951,3,156.035951,1.039098e-10
4,4,133.658649,4,133.658649,1.041087e-10
5,5,101.814532,5,101.814532,1.03654e-10
6,6,103.814655,6,103.814655,1.034834e-10
7,7,149.490088,7,149.490088,1.034266e-10
8,8,161.090676,8,161.090676,1.035687e-10
9,9,179.766558,9,179.766558,1.035687e-10


In [14]:
# Summary statistics of the comparison; the Error column shows how far the
# SQL predictions are from the scikit-learn ones.
sql_skl_join.describe()


Unnamed: 0,KEY_skl,Estimator_skl,KEY_sql,Estimator_sql,Error
count,442.0,442.0,442.0,442.0,442.0
mean,220.5,152.133484,220.5,152.133484,1.037598e-10
std,127.738666,38.663073,127.738666,38.663073,2.716532e-13
min,0.0,73.293741,0.0,73.293741,1.032703e-10
25%,110.25,123.096865,110.25,123.096865,1.035403e-10
50%,220.5,151.681327,220.5,151.681327,1.037677e-10
75%,330.75,178.741098,330.75,178.741098,1.039666e-10
max,441.0,257.555305,441.0,257.555305,1.042508e-10
