# In-database Scikit-Learn

## Preparing the dataset

In [1]:
import os

import numpy as np
import pandas as pd
import sqlalchemy as sa
from sklearn import datasets

def populate_table(conn):
    """Copy the scikit-learn iris dataset into a SQL table named "iris".

    The table holds an integer row identifier ``KEY`` (0 .. N-1), the four
    measurement columns and the integer class label ``TGT``. An existing
    "iris" table is replaced and the DataFrame index is not written.

    Parameters
    ----------
    conn
        Database connection handed to ``DataFrame.to_sql``.

    Returns
    -------
    None
    """
    iris = datasets.load_iris()
    # NOTE: 'sepal_width _cm' contains a stray space. It is kept verbatim
    # because the model metadata refers to the column by this exact name.
    feature_names = ['sepal_length_cm', 'sepal_width _cm', 'petal_length_cm', 'petal_width_cm']
    lTable = pd.DataFrame(iris.data, columns=feature_names)
    # Create the integer columns with their final dtype instead of stacking
    # them into a float matrix and casting back element by element.
    lTable.insert(0, 'KEY', np.arange(len(lTable), dtype='int64'))
    lTable['TGT'] = np.asarray(iris.target).astype('int64')
    lTable.to_sql("iris", conn, if_exists='replace', index=False)



In [2]:
# Database connection string. Export SKLEARN2SQL_DSN before starting the
# kernel to point the demo at another database (e.g. "sqlite:///"); the
# fallback is the local PostgreSQL demo instance. The SQL generated later in
# this notebook targets the "postgresql" dialect, so keep the two in sync.
lDSN = os.environ.get("SKLEARN2SQL_DSN", "postgresql://db:db@localhost/db?port=5432")
engine = sa.create_engine(lDSN, echo=False)
conn = engine.connect()

# For this demo, copy the iris dataset into a table called 'iris'.
populate_table(conn)

In [3]:
# Read the whole 'iris' table into pandas, then preview a reproducible
# random sample of its rows.
df = pd.read_sql(sql="iris", con=conn)
df.sample(n=12, random_state=1960)

Unnamed: 0,KEY,sepal_length_cm,sepal_width _cm,petal_length_cm,petal_width_cm,TGT
114,114,5.8,2.8,5.1,2.4,2
74,74,6.4,2.9,4.3,1.3,1
9,9,4.9,3.1,1.5,0.1,0
88,88,5.6,3.0,4.1,1.3,1
25,25,5.0,3.0,1.6,0.2,0
5,5,5.4,3.9,1.7,0.4,0
48,48,5.3,3.7,1.5,0.2,0
117,117,7.7,3.8,6.7,2.2,2
83,83,6.0,2.7,5.1,1.6,1
105,105,7.6,3.0,6.6,2.1,2


## Training a Model

In [4]:
# Description of the training table (key column, feature columns, target
# column(s), table name); it is passed to the SQL code generator below.
metadata = {
    "primary_key": "KEY",
    "features": ['sepal_length_cm', 'sepal_width _cm', 'petal_length_cm', 'petal_width_cm'],
    "targets": ["TGT"],
    "table": "iris",
}

# train any scikit model on the iris dataset
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(max_depth=2, min_samples_leaf=30, random_state=1960)
# metadata lists a single target column: flatten it to shape (n_samples,)
# so that estimators expecting a 1-D y do not receive a column vector.
clf.fit(df[metadata['features']].values, df[metadata['targets']].values.ravel())


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=30,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1960, splitter='best')

## Deploying the Model

In [5]:
from sklearn2sql import PyCodeGenerator as codegen

cg1 = codegen.cAbstractCodeGenerator()

# This API call takes the trained model (clf), the table metadata and a SQL
# dialect; the first element of what it returns is kept as the SQL text.
lSQL = cg1.generateCodeWithMetadata(
    clf,
    metadata,
    dialect="postgresql",
)[0]


cGenerationWrapperFactory::createWrapper() <class 'sklearn.tree.tree.DecisionTreeClassifier'>
cClassifierMixin_CodeGenWrapper::setObject <class 'sklearn.tree.tree.DecisionTreeClassifier'>
CREATING_DATABASE_BACKEND_DSN_DIALECT 1.1.6 None postgresql


In [6]:
# Show the generated SQL text.
print(lSQL)

WITH "DT_node_lookup" AS 
(SELECT "ADS"."KEY" AS "KEY", CASE WHEN ("ADS".petal_width_cm <= 0.800000011920929) THEN 1 ELSE CASE WHEN ("ADS".petal_width_cm <= 1.75) THEN 3 ELSE 4 END END AS node_id_2 
FROM iris AS "ADS"), 
"DT_node_data" AS 
(SELECT "Values".node_id AS node_id, "Values".feature AS feature, "Values".threshold AS threshold, "Values".count AS count, "Values".depth AS depth, "Values".parent_id AS parent_id, "Values"."Proba_0" AS "Proba_0", "Values"."LogProba_0" AS "LogProba_0", "Values"."Proba_1" AS "Proba_1", "Values"."LogProba_1" AS "LogProba_1", "Values"."Proba_2" AS "Proba_2", "Values"."LogProba_2" AS "LogProba_2", "Values"."Decision" AS "Decision" 
FROM (SELECT 0 AS node_id, 'petal_width_cm' AS feature, 0.800000011920929 AS threshold, 150 AS count, 0 AS depth, CAST(NULL AS INTEGER) AS parent_id, 0.3333333333333333 AS "Proba_0", -1.09861228867 AS "LogProba_0", 0.3333333333333333 AS "Proba_1", -1.09861228867 AS "LogProba_1", 0.3333333333333333 AS "Proba_2", -1.09861228867

In [7]:
# Execute the generated query over the open connection and load its result
# set into a DataFrame.
sql_output = pd.read_sql(sql=lSQL, con=conn)

In [8]:
# Reproducible random preview of the query result.
sql_output.sample(n=12, random_state=1960)

Unnamed: 0,KEY,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
114,114,,,,0.0,0.021739,0.978261,-1.797693e+308,-3.828641,-0.02197891,2
74,74,,,,0.0,0.907407,0.092593,-1.797693e+308,-0.09716375,-2.379546,1
9,9,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
88,88,,,,0.0,0.907407,0.092593,-1.797693e+308,-0.09716375,-2.379546,1
25,25,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
5,5,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
48,48,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
117,117,,,,0.0,0.021739,0.978261,-1.797693e+308,-3.828641,-0.02197891,2
83,83,,,,0.0,0.907407,0.092593,-1.797693e+308,-0.09716375,-2.379546,1
105,105,,,,0.0,0.021739,0.978261,-1.797693e+308,-3.828641,-0.02197891,2


In [9]:
# Join the SQL output back onto the source rows on the declared primary key.
# A bare merge() would join on every column name the two frames happen to
# share, which silently changes meaning if another name ever overlaps.
enriched_dataset = df.merge(sql_output, on=metadata["primary_key"])

In [10]:
# Reproducible random preview of the source rows joined with the SQL output.
enriched_dataset.sample(n=12, random_state=1960)

Unnamed: 0,KEY,sepal_length_cm,sepal_width _cm,petal_length_cm,petal_width_cm,TGT,Score_0,Score_1,Score_2,Proba_0,Proba_1,Proba_2,LogProba_0,LogProba_1,LogProba_2,Decision
114,114,5.8,2.8,5.1,2.4,2,,,,0.0,0.021739,0.978261,-1.797693e+308,-3.828641,-0.02197891,2
74,74,6.4,2.9,4.3,1.3,1,,,,0.0,0.907407,0.092593,-1.797693e+308,-0.09716375,-2.379546,1
9,9,4.9,3.1,1.5,0.1,0,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
88,88,5.6,3.0,4.1,1.3,1,,,,0.0,0.907407,0.092593,-1.797693e+308,-0.09716375,-2.379546,1
25,25,5.0,3.0,1.6,0.2,0,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
5,5,5.4,3.9,1.7,0.4,0,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
48,48,5.3,3.7,1.5,0.2,0,,,,1.0,0.0,0.0,0.0,-1.797693e+308,-1.797693e+308,0
117,117,7.7,3.8,6.7,2.2,2,,,,0.0,0.021739,0.978261,-1.797693e+308,-3.828641,-0.02197891,2
83,83,6.0,2.7,5.1,1.6,1,,,,0.0,0.907407,0.092593,-1.797693e+308,-0.09716375,-2.379546,1
105,105,7.6,3.0,6.6,2.1,2,,,,0.0,0.021739,0.978261,-1.797693e+308,-3.828641,-0.02197891,2
