In [1]:
import base64
import json
import os
import pickle

import numpy as np
import pandas as pd
import requests
import sqlalchemy as sa


## Build a scikit-learn model

In [2]:
from sklearn import datasets

# Load the Iris data set shipped with scikit-learn and split it into the
# feature matrix (X) and the class labels (Y).
# A textual description of the data set is available as iris.DESCR.
iris = datasets.load_iris()
X, Y = iris.data, iris.target

In [3]:
from sklearn.linear_model import RidgeClassifier

# Train a ridge classifier with default hyper-parameters on the whole data set.
# fit() returns the estimator itself, so it can be chained onto construction.
clf = RidgeClassifier().fit(X, Y)
# Leave the fitted estimator as the last expression so its repr is displayed.
clf

RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='auto',
        tol=0.001)

In [4]:
#clf.__dict__

## Generate SQL Code from the Model

In [5]:

def test_ws_sql_gen(pickle_data, ws_url="http://192.168.88.88:1888/model",
                    name="model1", dialect="postgresql"):
    """Send a pickled model to the SQL-generation web service and return the SQL.

    Parameters
    ----------
    pickle_data : bytes
        The pickled model, e.g. the result of ``pickle.dumps(clf)``.
    ws_url : str, optional
        Endpoint the request is POSTed to.
    name : str, optional
        Value sent as the ``"Name"`` field of the request.
    dialect : str, optional
        Value sent as the ``"SQLDialect"`` field of the request.

    Returns
    -------
    The value found at ``["model"]["SQLGenrationResult"][0]["SQL"]`` in the
    JSON response (the generated SQL text).

    Raises
    ------
    requests.HTTPError
        If the service answers with a 4xx/5xx status code.
    """
    # b64encode() returns bytes on Python 3, which the JSON encoder used by
    # requests rejects; decode to plain ASCII text before building the payload.
    b64_data = base64.b64encode(pickle_data).decode("ascii")
    data = {"Name": name, "PickleData": b64_data, "SQLDialect": dialect}
    r = requests.post(ws_url, json=data)
    # Fail loudly on an HTTP error instead of with an opaque KeyError below.
    r.raise_for_status()
    content = r.json()
    # NOTE: "SQLGenrationResult" (sic) is the key name this code has always
    # read from the response; do not "fix" the spelling here.
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL


In [6]:
# Serialize the fitted classifier and ask the web service to translate it to SQL.
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data=pickle_data)
# The generated statement is long; print only its first 2000 characters.
print(lSQL[:2000])

WITH linear_model_cte AS 
(SELECT "ADS"."KEY" AS "KEY", 0.12705283360921124 * "ADS"."Feature_0" + 0.4695089303373806 * "ADS"."Feature_1" + -0.4427640409866786 * "ADS"."Feature_2" + -0.130707226701817 * "ADS"."Feature_3" + -0.688748826512 AS "Score_0", -0.030416160173984446 * "ADS"."Feature_0" + -0.8686439569055142 * "ADS"."Feature_1" + 0.36914586521603815 * "ADS"."Feature_2" + -0.8145271451057391 * "ADS"."Feature_3" + 2.08608735288 AS "Score_1", -0.09663667343522389 * "ADS"."Feature_0" + 0.39913502656813316 * "ADS"."Feature_1" + 0.07361817577063601 * "ADS"."Feature_2" + 0.9452343718075641 * "ADS"."Feature_3" + -2.39733852637 AS "Score_2" 
FROM "INPUT_DATA" AS "ADS"), 
orig_cte AS 
(SELECT linear_model_cte."KEY" AS "KEY", linear_model_cte."Score_0" AS "Score_0", linear_model_cte."Score_1" AS "Score_1", linear_model_cte."Score_2" AS "Score_2", CAST(NULL AS FLOAT) AS "Proba_0", CAST(NULL AS FLOAT) AS "Proba_1", CAST(NULL AS FLOAT) AS "Proba_2", CAST(NULL AS FLOAT) AS "LogProba_0", CAST(NU

## Execute the SQL Code

In [7]:
# Save the dataset in a database table ("INPUT_DATA") so that the generated
# SQL can be executed against it.

# The connection URL (which embeds credentials) can be supplied through the
# SKLEARN2SQL_DB_URL environment variable; the fallback is the local demo
# database this notebook was originally written against.
# An in-memory alternative is: sa.create_engine('sqlite://', echo=False)
engine = sa.create_engine(
    os.environ.get("SKLEARN2SQL_DB_URL", "postgresql://db:db@localhost/db?port=5432"),
    echo=False,
)
conn = engine.connect()

# One column per feature, named the way the generated SQL refers to them,
# plus a KEY column holding the row number.
lTable = pd.DataFrame(X, columns=['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3'])
lTable['KEY'] = range(lTable.shape[0])
lTable.to_sql("INPUT_DATA", conn, if_exists='replace', index=False)


In [8]:
# Run the generated statement on the open connection and collect the rows.
sql_output = pd.read_sql(sql=lSQL, con=conn)


In [9]:
# Spot-check a random selection of rows produced by the generated SQL.
sql_output.sample(12)

Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
121,121,-1.093586,-0.336686,0.430272,0.129508,0.276066,0.594427,-2.044016,-1.287116,-0.520158,2
79,79,-0.424206,0.131724,-0.707519,0.285972,0.49861,0.215418,-1.251861,-0.695931,-1.535174,1
60,60,-0.794848,0.674202,-0.879354,0.159641,0.693655,0.146704,-1.834831,-0.36578,-1.919336,1
43,43,0.80295,-1.00433,-0.79862,0.732236,0.12016,0.147604,-0.311652,-2.118932,-1.913222,0
10,10,1.044232,-0.901329,-1.142903,0.796721,0.113857,0.089422,-0.227251,-2.172812,-2.414386,0
8,8,0.585848,-0.212912,-1.372936,0.628569,0.282784,0.088647,-0.46431,-1.263071,-2.423094,0
67,67,-0.630208,0.263306,-0.633098,0.225184,0.550282,0.224534,-1.490838,-0.597324,-1.493728,1
53,53,-0.851063,0.338616,-0.487552,0.174691,0.574039,0.25127,-1.744737,-0.555058,-1.381226,1
46,46,1.008791,-0.942154,-1.066637,0.788866,0.112129,0.099005,-0.237159,-2.188104,-2.312586,0
48,48,1.031527,-0.898288,-1.133239,0.793681,0.115223,0.091096,-0.231074,-2.160888,-2.395839,0


In [10]:
# How many rows the SQL code assigned to each class.
sql_output["Decision"].value_counts()

2    62
0    50
1    38
Name: Decision, dtype: int64

## Scikit-learn Prediction

In [11]:
# Build, directly from scikit-learn, a frame with the same column layout as
# the SQL output: KEY, scores, (empty) probabilities, (empty) log-probabilities
# and the predicted class.
skl_outputs = pd.DataFrame()

score_columns = ['Score_0', 'Score_1', 'Score_2']
proba_columns = ['Proba_0', 'Proba_1', 'Proba_2']
log_proba_columns = ['LogProba_0', 'LogProba_1', 'LogProba_2']

skl_output_key = pd.DataFrame({'KEY': list(range(X.shape[0]))})
skl_output_score = pd.DataFrame(clf.decision_function(X), columns=score_columns)
# No probabilities are computed on the scikit-learn side; the empty frames only
# reserve the columns, which end up as NaN after the concatenation below.
skl_output_proba = pd.DataFrame(columns=proba_columns)
skl_output_log_proba = pd.DataFrame(columns=log_proba_columns)
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])

skl_output_parts = [
    skl_output_key,
    skl_output_score,
    skl_output_proba,
    skl_output_log_proba,
    skl_output_decision,
]
skl_output = pd.concat(skl_output_parts, axis=1)
skl_output.sample(12)


Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
58,58,-0.695258,0.005459,-0.310201,,,,,,,1
129,129,-1.142605,0.098962,0.043643,,,,,,,1
10,10,1.044232,-0.901329,-1.142903,,,,,,,0
104,104,-1.309966,-0.368463,0.678429,,,,,,,2
45,45,0.67055,-0.393396,-1.277154,,,,,,,0
13,13,0.765994,-0.326026,-1.439968,,,,,,,0
106,106,-1.107058,0.041899,0.06516,,,,,,,2
127,127,-0.910017,-0.362717,0.272734,,,,,,,2
68,68,-1.056601,0.425856,-0.369255,,,,,,,1
6,6,0.832943,-0.73477,-1.098172,,,,,,,0


## Comparing the SQL and Scikit-learn Predictions

In [12]:
# DataFrame.join(other, on='KEY') matches the caller's KEY column against the
# *index* of `other`, not against its KEY column. sql_output carries a purely
# positional index and the SQL result has no guaranteed row order, so index the
# SQL frame by its KEY values first. The KEY column itself is kept
# (drop=False) and the index name is cleared so the resulting *_skl / *_sql
# column names are exactly the same as before.
sql_output_by_key = sql_output.set_index('KEY', drop=False).rename_axis(None)
sql_skl_join = skl_output.join(sql_output_by_key, how='left', on='KEY', lsuffix='_skl', rsuffix='_sql')

In [13]:
# Spot-check a random selection of joined rows (scikit-learn columns end in
# _skl, SQL columns end in _sql).
sql_skl_join.sample(12)

Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_0_sql,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql
94,94,-0.739107,0.061946,-0.322838,,,,,,,...,-0.739107,0.061946,-0.322838,0.210785,0.469604,0.319611,-1.556918,-0.755866,-1.140649,1
148,148,-0.996243,-0.935907,0.93215,,,,,,,...,-0.996243,-0.935907,0.93215,0.111849,0.118805,0.769346,-2.190608,-2.130272,-0.262214,2
37,37,0.712071,-0.283482,-1.428589,,,,,,,...,0.712071,-0.283482,-1.428589,0.672451,0.248484,0.079065,-0.396825,-1.392378,-2.537485,0
119,119,-1.303393,0.616512,-0.313119,,,,,,,...,-1.303393,0.616512,-0.313119,0.095127,0.648795,0.256079,-2.352545,-0.432639,-1.362271,1
59,59,-0.67017,-0.118084,-0.211746,,,,,,,...,-0.67017,-0.118084,-0.211746,0.231565,0.402198,0.366238,-1.462897,-0.910811,-1.004473,1
109,109,-1.111365,-1.044555,1.15592,,,,,,,...,-1.111365,-1.044555,1.15592,0.085308,0.091202,0.82349,-2.461489,-2.394679,-0.194204,2
83,83,-1.125986,0.137652,-0.011666,,,,,,,...,-1.125986,0.137652,-0.011666,0.131826,0.466436,0.401739,-2.026273,-0.762635,-0.911954,1
7,7,0.852558,-0.62857,-1.223989,,,,,,,...,0.852558,-0.62857,-1.223989,0.739238,0.168089,0.092673,-0.302135,-1.783262,-2.378682,0
54,54,-0.781056,-0.067541,-0.151404,,,,,,,...,-0.781056,-0.067541,-0.151404,0.20333,0.415028,0.381642,-1.592925,-0.879409,-0.963272,1
138,138,-0.878445,-0.39659,0.275036,,,,,,,...,-0.878445,-0.39659,0.275036,0.172763,0.279716,0.547521,-1.755835,-1.27398,-0.602354,2


In [14]:
# Select the rows on which the SQL decision and the scikit-learn decision
# disagree; an empty frame means the two implementations agree everywhere.
condition = sql_skl_join["Decision_sql"].ne(sql_skl_join["Decision_skl"])
sql_skl_join.loc[condition]


Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_0_sql,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql
