In [1]:
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pickle, json, requests, base64


## Build a scikit-learn model

In [2]:
from sklearn import datasets

# Load the bundled iris dataset and split it into the feature matrix (X)
# and the target vector (Y). iris.DESCR holds the dataset description.
iris = datasets.load_iris()
X, Y = iris.data, iris.target

In [3]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# Seeded generator handed to the SVC so every run starts from the same random state.
random_state = np.random.RandomState(0)

# One-vs-rest wrapper around an SVC; probability=True because predict_proba
# is called on the fitted classifier further down in the notebook.
clf = OneVsRestClassifier(
    estimator=SVC(probability=True, random_state=random_state)
)

clf.fit(X, Y)

OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True,
  random_state=<mtrand.RandomState object at 0x7f38e1aaaee8>,
  shrinking=True, tol=0.001, verbose=False),
          n_jobs=1)

In [4]:
#clf.__dict__

## Generate SQL Code from the Model

In [5]:

def test_ws_sql_gen(pickle_data):
    WS_URL="http://192.168.88.88:1888/model"
    b64_data = base64.b64encode(pickle_data)
    data={"Name":"model1", "PickleData":b64_data , "SQLDialect":"postgresql"}
    r = requests.post(WS_URL, json=data)
    content = r.json()
    # print(content)
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL;


In [6]:
# Serialize the fitted classifier, ask the web service for the matching SQL,
# then preview the first 2000 characters of the generated statement.
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)
print(lSQL[:2000])

WITH "SV_data" AS 
(SELECT "Values".sv_idx AS sv_idx, "Values".dual_coeff AS dual_coeff, "Values".sv_0 AS sv_0, "Values".sv_1 AS sv_1, "Values".sv_2 AS sv_2, "Values".sv_3 AS sv_3 
FROM (SELECT 0 AS sv_idx, -0.213687020099 AS dual_coeff, 7.0 AS sv_0, 3.2 AS sv_1, 4.7 AS sv_2, 1.4 AS sv_3 UNION ALL SELECT 1 AS sv_idx, -0.421524254498 AS dual_coeff, 4.9 AS sv_0, 2.4 AS sv_1, 3.3 AS sv_2, 1.0 AS sv_3 UNION ALL SELECT 2 AS sv_idx, -1.0 AS dual_coeff, 5.1 AS sv_0, 2.5 AS sv_1, 3.0 AS sv_2, 1.1 AS sv_3 UNION ALL SELECT 3 AS sv_idx, -0.202228133014 AS dual_coeff, 6.3 AS sv_0, 3.3 AS sv_1, 6.0 AS sv_2, 2.5 AS sv_3 UNION ALL SELECT 4 AS sv_idx, -0.0696490664691 AS dual_coeff, 5.8 AS sv_0, 2.8 AS sv_1, 5.1 AS sv_2, 2.4 AS sv_3 UNION ALL SELECT 5 AS sv_idx, -0.00453627448611 AS dual_coeff, 7.7 AS sv_0, 3.8 AS sv_1, 6.7 AS sv_2, 2.2 AS sv_3 UNION ALL SELECT 6 AS sv_idx, -0.37172099945 AS dual_coeff, 7.7 AS sv_0, 2.6 AS sv_1, 6.9 AS sv_2, 2.3 AS sv_3 UNION ALL SELECT 7 AS sv_idx, -0.289211353716 AS

## Execute the SQL Code

In [7]:
# Save the dataset in a database table named INPUT_DATA
# (presumably the table the generated SQL reads from -- confirm against the full SQL).
import os

# The connection URL can be supplied through the SQL_GEN_DB_URL environment variable
# so real credentials need not live in the notebook; the fallback is the original
# local test database.
DB_URL = os.environ.get("SQL_GEN_DB_URL", "postgresql://db:db@localhost/db?port=5432")

# Alternative in-memory engine: sa.create_engine('sqlite://', echo=False)
engine = sa.create_engine(DB_URL, echo=False)
conn = engine.connect()

# One row per sample: the four feature columns plus a sequential KEY column.
lTable = pd.DataFrame(X, columns=['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3'])
lTable['KEY'] = range(lTable.shape[0])
lTable.to_sql("INPUT_DATA", conn, if_exists='replace', index=False)


In [8]:
# Run the generated SQL on the open connection and collect the result as a DataFrame.
sql_output = pd.read_sql(sql=lSQL, con=conn)


In [9]:
# Display a random selection of 12 rows of the SQL result.
sql_output.sample(n=12)

Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
23,23,1.000064,-0.999911,-1.260724,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,0
137,137,-1.097838,-1.108564,1.183407,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,2
91,91,-1.142207,1.074786,-1.023586,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,1
146,146,-1.091793,-0.72736,0.724121,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,2
92,92,-1.224239,1.715417,-1.705135,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,1
108,108,-1.053504,-1.607066,1.567271,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,2
20,20,1.048555,-1.033764,-1.248957,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,0
119,119,-1.058431,-0.345043,0.324045,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,2
99,99,-1.220833,1.570906,-1.548526,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,1
58,58,-1.075951,1.292075,-1.311027,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,1


In [10]:
# How many rows the SQL code assigned to each class.
sql_output["Decision"].value_counts()

2    52
0    50
1    48
Name: Decision, dtype: int64

## Scikit-learn Prediction

In [11]:
# Assemble the scikit-learn predictions with the same column layout as the SQL output:
# KEY, Score_*, Proba_*, LogProba_*, Decision.
n_rows = X.shape[0]
skl_output_key = pd.DataFrame(list(range(n_rows)), columns=['KEY'])

# Scores are not computed on the scikit-learn side; keep the columns as explicit NaN
# placeholders aligned on the KEY frame's index instead of relying on the concat of an
# empty frame to create them.
# NOTE(review): clf.decision_function(X) could presumably populate these -- confirm.
skl_output_score = pd.DataFrame(np.nan, index=skl_output_key.index,
                                columns=['Score_0', 'Score_1', 'Score_2'])

skl_output_proba = pd.DataFrame(clf.predict_proba(X), columns=['Proba_0', 'Proba_1', 'Proba_2'])

# The original cell had a clf.predict_log_proba(X) call commented out (presumably
# because the classifier does not provide it -- confirm), so the log-probability
# columns are NaN placeholders as well.
skl_output_log_proba = pd.DataFrame(np.nan, index=skl_output_key.index,
                                    columns=['LogProba_0', 'LogProba_1', 'LogProba_2'])

skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])

skl_output = pd.concat(
    [skl_output_key, skl_output_score, skl_output_proba, skl_output_log_proba, skl_output_decision],
    axis=1,
)
skl_output.sample(12)


Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
70,70,,,,0.009292,0.487246,0.503462,,,,1
1,1,,,,0.974982,0.010369,0.014648,,,,0
3,3,,,,0.973226,0.01042,0.016354,,,,0
134,134,,,,0.013879,0.021499,0.964622,,,,2
141,141,,,,0.011312,0.021559,0.96713,,,,2
45,45,,,,0.974456,0.010673,0.014871,,,,0
10,10,,,,0.976073,0.009699,0.014228,,,,0
83,83,,,,0.009084,0.148097,0.842819,,,,2
31,31,,,,0.97453,0.013797,0.011673,,,,0
28,28,,,,0.977814,0.008279,0.013907,,,,0


## Comparing the SQL and Scikit-learn Predictions

In [12]:
# DataFrame.join(on='KEY') matches the caller's KEY column against the *index* of the
# other frame, not against its KEY column. Index the SQL result by its own KEY values
# first; otherwise rows pair up correctly only when the database happens to return
# them in KEY order. The KEY column itself is kept, so the joined frame still carries
# both KEY_skl and KEY_sql.
sql_output_by_key = sql_output.set_index(sql_output['KEY'].values)
sql_skl_join = skl_output.join(sql_output_by_key, how='left', on='KEY', lsuffix='_skl', rsuffix='_sql')

In [13]:
# Show the first 12 rows of the side-by-side comparison.
sql_skl_join.head(n=12)

Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_0_sql,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql
0,0,,,,0.978103,0.007267,0.014629,,,,...,1.234933,-1.297494,-1.152149,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,0
1,1,,,,0.974982,0.010369,0.014648,,,,...,1.160136,-1.200607,-1.151997,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,0
2,2,,,,0.975701,0.007172,0.017127,,,,...,1.222472,-1.300574,-1.109277,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,0
3,3,,,,0.973226,0.01042,0.016354,,,,...,1.142706,-1.199078,-1.1222,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,0
4,4,,,,0.977579,0.006596,0.015824,,,,...,1.246043,-1.323655,-1.130768,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,0
5,5,,,,0.971892,0.015626,0.012481,,,,...,1.077149,-1.089247,-1.195726,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,0
6,6,,,,0.974794,0.00734,0.017866,,,,...,1.191883,-1.294421,-1.098079,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,0
7,7,,,,0.977791,0.008683,0.013526,,,,...,1.206204,-1.249178,-1.173493,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,0
8,8,,,,0.968398,0.011365,0.020236,,,,...,1.098639,-1.174836,-1.064382,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,0
9,9,,,,0.975229,0.010516,0.014255,,,,...,1.164682,-1.196761,-1.159309,,,,-1.797693e+308,-1.797693e+308,-1.797693e+308,0


In [14]:
# Rows on which the SQL decision and the scikit-learn decision disagree;
# an empty result means both sides predict the same class everywhere.
condition = sql_skl_join["Decision_sql"].ne(sql_skl_join["Decision_skl"])
sql_skl_join.loc[condition]


Unnamed: 0,KEY_skl,Score_0_skl,Score_1_skl,Score_2_skl,Proba_0_skl,Proba_1_skl,Proba_2_skl,LogProba_0_skl,LogProba_1_skl,LogProba_2_skl,...,Score_0_sql,Score_1_sql,Score_2_sql,Proba_0_sql,Proba_1_sql,Proba_2_sql,LogProba_0_sql,LogProba_1_sql,LogProba_2_sql,Decision_sql
