# 10 minutes to gators

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import copy
import numpy as np
import pandas as pd
from pandas.testing import assert_frame_equal
from xgboost import XGBClassifier
import treelite
import treelite_runtime
import dill

In [3]:
# imputers
from gators.imputers import (
    FloatImputer, 
    IntImputer, 
    ObjectImputer,
)
# encoders
from gators.encoders import WOEEncoder
# pipeline
from gators.pipeline import Pipeline
# model building
from gators.model_building import XGBBoosterBuilder

## end-to-end simple workflow

The workflow is as follows:

1- create a pipeline to take care of the data preprocessing.

2- train the pipeline on a *pandas* or *koalas* dataframe.

3- generate the preprocessed data.

4- train a decision tree based model on the preprocessed data.

5- use *treelite* to compile the model in C.
    
The pipeline and the compiled model can then be deployed in production.

**Notes:**
    
* *koalas* and/or *pandas* are used offline,
by means of the `fit` and `transform` methods.
* In production, *numpy* is used with `transform_numpy`.

The pipeline will be composed of only the following four transformers:

* ObjectImputer
* WOEEncoder
* FloatImputer
* IntImputer

### with pandas

In [4]:
# Read the Titanic dataset with pandas, then separate the `Survived` target
# from the remaining feature columns and preview the first two feature rows.
data = pd.read_parquet('data/titanic.parquet')
X = data.drop(columns=['Survived'])
y = data['Survived']
X.head(2)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


#### pipeline

In [5]:
# Preprocessing steps, in the order they are handed to the pipeline:
# impute object columns with the constant 'MISSING', WOE-encode, impute float
# columns with the 'mean' strategy, impute int columns with the constant -1.
prepro_steps = [
    ObjectImputer(strategy='constant', value='MISSING'),
    WOEEncoder(),
    FloatImputer(strategy='mean'),
    IntImputer(strategy='constant', value=-1),
]
pipe = Pipeline(steps=prepro_steps)
# Dataframe code path: fit the pipeline on (X, y) and transform X in one call.
X_prepro = pipe.fit_transform(X, y)
# Numpy code path on the same raw data; compared with `X_prepro` further down.
X_prepro_np = pipe.transform_numpy(X.to_numpy())

In [6]:
# Preview the first two preprocessed rows.
X_prepro.head(2)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3.0,0.0,-0.983833,22.0,1.0,0.0,0.0,7.25,-0.374703,-0.203599
2,1.0,0.0,1.529877,38.0,1.0,0.0,0.0,71.2833,0.0,0.688399


##### check `transform` and `transform_numpy` output

In [7]:
# The numpy code path must reproduce the dataframe code path: first the same
# shape, then the same content once the array is wrapped with the labels of
# the dataframe result.
assert X_prepro_np.shape == X_prepro.shape
X_prepro_np_pd = pd.DataFrame(
    data=X_prepro_np,
    columns=X_prepro.columns,
    index=X_prepro.index,
)
assert_frame_equal(X_prepro, X_prepro_np_pd)

### model building

In [8]:
# Small XGBoost classifier reused by the cells below: 10 estimators with a
# maximum depth of 2 and a fixed random_state.
model = XGBClassifier(
    max_depth=2,
    n_estimators=10,
    random_state=0, 
    eval_metric='mlogloss', 
    use_label_encoder=False)

#### pandas model

In [9]:
# Shallow-copy the classifier so the model fitted on the pandas dataframe is a
# separate object from `model`, which is fitted on numpy arrays in the next cell.
model_pd = copy.copy(model)
# The fit result is bound to `_` only to keep the estimator repr out of the output.
_ = model_pd.fit(X_prepro, y)

#### numpy model

In [10]:
# Fit the classifier on the numpy versions of the preprocessed features and target.
model = model.fit(X_prepro.to_numpy(), y.to_numpy())

#### treelite model

In [11]:
# Build a booster from the classifier and the preprocessed numpy data.
# NOTE(review): presumably an xgboost Booster, since the result is passed to
# `treelite.Model.from_xgboost` below -- confirm against gators.model_building.
xgb_booster = XGBBoosterBuilder.train(
    model=model, 
    X_train=X_prepro_np, 
    y_train=y.to_numpy(),
)

# Convert the booster to a treelite model and compile it with gcc into a
# shared library; `parallel_comp: 4` splits the trees into 4 translation units
# (see the compiler log printed below).
treelite_model = treelite.Model.from_xgboost(xgb_booster)
treelite_model.export_lib(
    toolchain='gcc', 
    libpath='./models/treelite_simple_xgb.so', 
    params={'parallel_comp': 4},
    verbose=True)
# Load the compiled library back as the predictor used for scoring.
model_tl = treelite_runtime.Predictor(
    './models/treelite_simple_xgb.so', verbose=False)

[11:41:25] ../src/compiler/ast_native.cc:45: Using ASTNativeCompiler
[11:41:25] ../src/compiler/ast/split.cc:31: Parallel compilation enabled; member trees will be divided into 4 translation units.
[11:41:25] ../src/c_api/c_api.cc:121: Code generation finished. Writing code to files...
[11:41:25] ../src/c_api/c_api.cc:126: Writing file recipe.json...
[11:41:25] ../src/c_api/c_api.cc:126: Writing file tu3.c...
[11:41:25] ../src/c_api/c_api.cc:126: Writing file tu2.c...
[11:41:25] ../src/c_api/c_api.cc:126: Writing file tu1.c...
[11:41:25] ../src/c_api/c_api.cc:126: Writing file tu0.c...
[11:41:25] ../src/c_api/c_api.cc:126: Writing file header.h...
[11:41:25] ../src/c_api/c_api.cc:126: Writing file main.c...
[11:41:25] /Users/cpoli/gators38/lib/python3.8/site-packages/treelite/contrib/util.py:105: Compiling sources files in directory ./models/tmpsdrk142d into object files (*.o)...
[11:41:25] /Users/cpoli/gators38/lib/python3.8/site-packages/treelite/contrib/util.py:134: Generating dynam

##### per-sample model benchmarking

In [12]:
# Single-row inputs: `x` is a one-row dataframe, `xnp` the same row as an array.
x = X.iloc[[0]]
xnp = x.to_numpy()
# Time the four preprocessing + scoring combinations; `-o` makes %timeit
# return its result object so the timings can be reused in the next cell.
# 1) pandas pipeline + XGBoost model fitted on pandas
stats_pd = %timeit -o model_pd.predict_proba(pipe.transform(x))[0][1]
# 2) pandas pipeline + treelite predictor
stats_pd_tl = %timeit -o model_tl.predict(treelite_runtime.DMatrix(pipe.transform(x).to_numpy()))
# 3) numpy pipeline + XGBoost model fitted on numpy
stats_np = %timeit -o model.predict_proba(pipe.transform_numpy(xnp))[0][1]
# 4) numpy pipeline + treelite predictor
stats_np_tl = %timeit -o model_tl.predict(treelite_runtime.DMatrix(pipe.transform_numpy(xnp)))

55.1 ms ± 1.83 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
59.8 ms ± 5.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
114 µs ± 6.14 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
110 µs ± 6.57 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


Overall speed-up

In [13]:
# Ratio of the mean per-call runtimes of the slowest path (pandas pipeline +
# XGBoost) and the fastest one (numpy pipeline + treelite).
# `TimeitResult.average` is expressed in seconds for both results, so no unit
# conversion is needed. Parsing the printed repr instead would silently break
# as soon as the two timings are not displayed in ms and µs respectively.
speedup = stats_pd.average / stats_np_tl.average
f'Speed-up Pandas VS Numpy&Treelite x{round(speedup)}'

'Speed-up Pandas VS Numpy&Treelite x500.91'

#### check model predictions

In [14]:
# Score the full dataset through three prediction paths and check they agree:
# pandas pipeline + pandas-fitted model, numpy pipeline + numpy-fitted model,
# numpy pipeline + compiled treelite predictor.
X_np = X.to_numpy()
# `[:, 1]` keeps the second column of `predict_proba`.
y_pred_pd = model_pd.predict_proba(pipe.transform(X))[:, 1]
y_pred_np = model.predict_proba(pipe.transform_numpy(X_np))[:, 1]
# The preprocessed array is cast to float before being wrapped in a treelite DMatrix.
y_pred_tl = model_tl.predict(treelite_runtime.DMatrix(pipe.transform_numpy(X_np).astype(float)))
assert np.allclose(y_pred_np, y_pred_pd)
assert np.allclose(y_pred_np, y_pred_tl)

#### dumping both model and pipeline 

In [15]:
# Serialize the fitted model and the fitted preprocessing pipeline with dill,
# each to its own file. `open` does not create missing directories, so
# `models/` and `pipelines/` must already exist.
model_path = 'models/simple_xgb.dill'
with open(model_path, 'wb') as file:
    dill.dump(model, file)
# The pipeline file must contain `pipe`, not a second copy of the model.
pipeline_path = 'pipelines/simple_pipeline.dill'
with open(pipeline_path, 'wb') as file:
    dill.dump(pipe, file)

### with koalas

In [16]:
import databricks.koalas as ks
from gators.converter import KoalasToPandas

In [17]:
# Same dataset, read with koalas this time. Here `PassengerId` is dropped as
# a column together with the `Survived` target.
# Note: this rebinds `data`, which held the pandas dataframe until now.
data = ks.read_parquet('data/titanic.parquet')
y_ks = data['Survived']
X_ks = data.drop(['Survived', 'PassengerId'], axis=1)
X_ks.head(2)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [18]:
# Refit the same `pipe` object, this time on the koalas dataframe and series.
X_prepro_ks = pipe.fit_transform(X_ks, y_ks)
# NOTE(review): the numpy path is fed the pandas `X` from the pandas section,
# not `X_ks`; this cell therefore relies on that earlier variable.
X_prepro_ks_np = pipe.transform_numpy(X.to_numpy())

In [19]:
# Convert the koalas results with the KoalasToPandas transformer
# (presumably to pandas objects, as the name suggests -- confirm in gators.converter).
X_prepro_ks_pd, y_ks_pd = KoalasToPandas().transform(X_prepro_ks, y_ks)
# Reuse the index of the pandas result so the frame comparison in the next
# cell is not affected by differing index labels.
X_prepro_ks_pd.index = X_prepro.index

##### check `pandas` and `koalas` output

In [20]:
# The pipeline fitted on koalas must give the same dataframe as the one fitted
# on pandas, and its numpy output must match the pandas result numerically.
assert_frame_equal(X_prepro_ks_pd, X_prepro)
assert np.allclose(X_prepro_ks_np, X_prepro)

Since we now have a pandas dataframe, the same steps as in the pandas section can be followed.

## Create your own transformers: example with log10

### imports

In [21]:
from typing import List, Union
from math import log10
import numpy as np
import pandas as pd
import databricks.koalas as ks
from gators.util import util
from gators.transformers import Transformer

### Inplace transformer on the whole dataframe

In [22]:
class Log10Inplace(Transformer):
    """Apply a base-10 logarithm to every value of the input.

    The dataframe path maps ``math.log10`` over each element with
    ``applymap``; the numpy path uses the vectorized ``np.log10``.
    """

    def __init__(self):
        """Create the transformer; there is nothing to configure."""

    def fit(self,
            X: Union[pd.DataFrame, ks.DataFrame],
            y: Union[pd.Series, ks.Series] = None) -> 'Log10Inplace':
        """Validate the input dataframe; nothing is learnt from the data.

        Parameters
        ----------
        X : Union[pd.DataFrame, ks.DataFrame]
            Input dataframe, checked with ``check_dataframe``.
        y : Union[pd.Series, ks.Series], default None
            Unused.

        Returns
        -------
        Log10Inplace
            The transformer itself.
        """
        self.check_dataframe(X)
        return self

    def transform(
        self, X: Union[pd.DataFrame, ks.DataFrame]
    ) -> Union[pd.DataFrame, ks.DataFrame]:
        """Return the element-wise base-10 logarithm of a dataframe.

        Parameters
        ----------
        X : Union[pd.DataFrame, ks.DataFrame]
            Input dataframe, checked with ``check_dataframe``.

        Returns
        -------
        Union[pd.DataFrame, ks.DataFrame]
            Result of ``X.applymap(log10)``.
        """
        self.check_dataframe(X)
        X_log10 = X.applymap(log10)
        return X_log10

    def transform_numpy(self, X: np.ndarray) -> np.ndarray:
        """Return the element-wise base-10 logarithm of an array.

        Parameters
        ----------
        X : np.ndarray
            Input array, checked with ``check_array``.

        Returns
        -------
        np.ndarray
            Result of ``np.log10(X)``.
        """
        self.check_array(X)
        X_log10 = np.log10(X)
        return X_log10

**Notes:**

If your use case does not need koalas, the `transform` method can be replaced by:
 
`return pd.DataFrame(np.log10(X.to_numpy()), columns=X.columns, index=X.index)`

which is significantly faster.

In [23]:
# 10x10 dataframe of absolute values of standard normal draws (non-negative,
# so the logarithm is defined).
# Note: this rebinds `X`, which held the Titanic features until now.
X = pd.DataFrame(
    np.abs(np.random.randn(10, 10)), columns=list('ABCDEFGHIJ'))
# Element-wise Python `log10` through applymap ...
%timeit _ = X.applymap(log10)
# ... versus vectorized `np.log10` on the underlying array, re-wrapped with the same labels.
%timeit _ = pd.DataFrame(np.log10(X.to_numpy()), columns=X.columns, index=X.index)

2.09 ms ± 192 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
45.2 µs ± 1.99 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


### Inplace transformer on the selected columns

Transforming only the given columns can be useful in the following cases:

* Only a few columns need to be transformed.
* Only a given datatype should be transformed.
* The transformation should not be applied to the encoded columns, and the names of the base columns are obtained before the transformation.

In [24]:
class Log10ColumnsInplace(Transformer):
    """Replace the selected columns by their base-10 logarithm, in place.

    Both ``transform`` and ``transform_numpy`` overwrite the selected columns
    of the object they receive and return that same object. Pass a copy when
    the original values are still needed.

    Parameters
    ----------
    columns : List[str]
        Names of the columns to transform. Must be a non-empty list.
    """

    def __init__(self, columns: List[str]):
        if not isinstance(columns, list):
            raise TypeError('`columns` should be a list.')
        if not columns:
            raise ValueError('`columns` should not be empty.')
        self.columns = columns

    def fit(self,
            X: Union[pd.DataFrame, ks.DataFrame],
            y: Union[pd.Series, ks.Series] = None) -> 'Log10ColumnsInplace':
        """Validate the dataframe and store the indices of the selected columns.

        Parameters
        ----------
        X : Union[pd.DataFrame, ks.DataFrame]
            Input dataframe, checked with ``check_dataframe``.
        y : Union[pd.Series, ks.Series], default None
            Unused.

        Returns
        -------
        Log10ColumnsInplace
            The fitted transformer itself.
        """
        self.check_dataframe(X)
        # Stored for `transform_numpy`, which indexes the array columns with
        # `idx_columns` since an array carries no column names.
        self.idx_columns = util.get_idx_columns(
            columns=X.columns,
            selected_columns=self.columns
        )
        return self

    def transform(
        self, X: Union[pd.DataFrame, ks.DataFrame]
    ) -> Union[pd.DataFrame, ks.DataFrame]:
        """Overwrite the selected dataframe columns with their base-10 log.

        Parameters
        ----------
        X : Union[pd.DataFrame, ks.DataFrame]
            Input dataframe; its selected columns are modified.

        Returns
        -------
        Union[pd.DataFrame, ks.DataFrame]
            The same dataframe object, after modification.
        """
        self.check_dataframe(X)
        X[self.columns] = X[self.columns].applymap(log10)
        return X

    def transform_numpy(self, X: np.ndarray) -> np.ndarray:
        """Overwrite the selected array columns with their base-10 log.

        Requires ``fit`` to have been called, since it reads ``idx_columns``.

        Parameters
        ----------
        X : np.ndarray
            Input array; its selected columns are modified.

        Returns
        -------
        np.ndarray
            The same array object, after modification.
        """
        self.check_array(X)
        X[:, self.idx_columns] = np.log10(X[:, self.idx_columns])
        return X
    

### transformer creating new columns

Creating new columns can be interesting if

* the raw data are needed for other transformations. 
* the raw data still contains some meaningful predictive information. 

In [25]:
class Log10Columns(Transformer):
    """Append the base-10 logarithm of the selected columns as new columns.

    The input columns are kept. For each selected column ``c`` a new column
    named ``c__log10`` is added after the existing ones.

    Parameters
    ----------
    columns : List[str]
        Names of the columns to transform. Must be a non-empty list.

    Attributes
    ----------
    column_names : List[str]
        Names of the created columns.
    column_mapping : Dict[str, str]
        Maps each created column name to the column it is computed from.
    colum_mapping : Dict[str, str]
        Former, misspelled name of ``column_mapping``; it refers to the same
        dictionary and is kept so existing code reading it keeps working.
    """

    def __init__(self, columns: List[str]):
        if not isinstance(columns, list):
            raise TypeError('`columns` should be a list.')
        if not columns:
            raise ValueError('`columns` should not be empty.')
        self.columns = columns
        self.column_names = [f'{c}__log10' for c in self.columns]
        self.column_mapping = dict(zip(self.column_names, self.columns))
        # Backward-compatible alias of the original misspelled attribute.
        # NOTE(review): confirm which spelling the pipeline clean-up reads.
        self.colum_mapping = self.column_mapping

    def fit(self,
            X: Union[pd.DataFrame, ks.DataFrame],
            y: Union[pd.Series, ks.Series] = None) -> 'Log10Columns':
        """Validate the dataframe and store the indices of the selected columns.

        Parameters
        ----------
        X : Union[pd.DataFrame, ks.DataFrame]
            Input dataframe, checked with ``check_dataframe``.
        y : Union[pd.Series, ks.Series], default None
            Unused.

        Returns
        -------
        Log10Columns
            The fitted transformer itself.
        """
        self.check_dataframe(X)
        # Stored for `transform_numpy`, which indexes the array columns with
        # `idx_columns` since an array carries no column names.
        self.idx_columns = util.get_idx_columns(
            columns=X.columns,
            selected_columns=self.columns
        )
        return self

    def transform(
        self, X: Union[pd.DataFrame, ks.DataFrame]
    ) -> Union[pd.DataFrame, ks.DataFrame]:
        """Return the dataframe joined with the new ``__log10`` columns.

        Parameters
        ----------
        X : Union[pd.DataFrame, ks.DataFrame]
            Input dataframe, checked with ``check_dataframe``.

        Returns
        -------
        Union[pd.DataFrame, ks.DataFrame]
            ``X`` joined with the log-transformed columns.
        """
        self.check_dataframe(X)
        X_new = X[self.columns].applymap(log10)
        X_new.columns = self.column_names
        return X.join(X_new)

    def transform_numpy(self, X: np.ndarray) -> np.ndarray:
        """Return the array with the log-transformed columns appended.

        Requires ``fit`` to have been called, since it reads ``idx_columns``.

        Parameters
        ----------
        X : np.ndarray
            Input array, checked with ``check_array``.

        Returns
        -------
        np.ndarray
            ``X`` concatenated, along axis 1, with the base-10 logarithm of
            its selected columns.
        """
        self.check_array(X)
        X_new = np.log10(X[:, self.idx_columns])
        return np.concatenate((X, X_new), axis=1)

**Notes**

The class attribute `column_names` will be used to clean up the pipeline.

### tests

In [26]:
# 10x10 frame of absolute values of standard normal draws (non-negative, so
# the logarithm is defined), plus its numpy counterpart.
X = pd.DataFrame(
    data=np.abs(np.random.randn(10, 10)),
    columns=[*'ABCDEFGHIJ'],
)
X_np = X.to_numpy()

In [27]:
# `columns` is not used by Log10Inplace, which takes no arguments.
columns = util.get_datatype_columns(X, float)
# Fit and transform a copy of X so that X itself keeps its original values.
X_new_inplace_all = Log10Inplace().fit_transform(X.copy())

In [28]:
# Select the columns to transform by asking for the `float` datatype.
columns = util.get_datatype_columns(X, float)
# The transformer overwrites the columns of its input, hence the copy.
X_new_inplace_cols = Log10ColumnsInplace(
    columns=columns).fit_transform(X.copy())

In [29]:
# Select the columns to transform by asking for the `float` datatype.
columns = util.get_datatype_columns(X, float)
# Log10Columns.transform returns a joined dataframe rather than assigning
# into X, so no copy is passed here.
X_new = Log10Columns(columns=columns).fit_transform(X)

In [30]:
# The two in-place variants must agree with each other ...
assert np.allclose(
    X_new_inplace_all.to_numpy(),
    X_new_inplace_cols.to_numpy(),
)
# ... and with the `<name>__log10` columns created by Log10Columns.
cols = [f'{name}__log10' for name in 'ABCDEFGHIJ']
assert np.allclose(
    X_new_inplace_all.to_numpy(),
    X_new[cols].to_numpy(),
)

### per-sample benchmarking

In [31]:
# Reproducible single-row, 1000-column frame of non-negative values for the
# per-sample benchmarks, with numpy counterparts of the frame and of the row.
np.random.seed(0)
n_cols = 1000
X = pd.DataFrame(
    data=np.abs(np.random.randn(1, n_cols)),
    columns=['col%d' % i for i in range(n_cols)],
)
X_np = X.to_numpy()
x = X.iloc[[0]]
x_np = x.to_numpy()

In [32]:
# Select every column of the benchmark frame and fit the in-place transformer.
# The fit result is bound to `_` only to keep the repr out of the output.
columns = list(X.columns)
obj = Log10ColumnsInplace(columns=columns)
_ = obj.fit(X)

In [33]:
# Reference result of the pure-numpy `transform_numpy`, saved to be compared
# with the Cython-based `transform_numpy` further down. A copy is passed
# because the transformer overwrites its input.
x_np_new = obj.transform_numpy(x_np.copy())

In [34]:
# Per-sample runtime of the dataframe path versus the numpy path. Each call
# gets a fresh copy because the transformer overwrites its input.
%timeit _ = obj.transform(x.copy())
%timeit _ = obj.transform_numpy(x_np.copy())

223 ms ± 11.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
17 µs ± 714 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


**Notes:**
    
Since the transformation happens inplace, the `.copy()` is 
necessary; however, the `.copy()` runtime is negligible: 

In [35]:
# Cost of the copies alone, to compare with the timings measured above.
%timeit x.copy()
%timeit x_np.copy()

20.8 µs ± 1.46 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
590 ns ± 24.1 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [36]:
# Same benchmark with the transformer that appends new columns.
# Note: this rebinds `obj`, which held the in-place transformer until now.
columns = list(X.columns)
obj = Log10Columns(columns=columns)
_ = obj.fit(X)

In [37]:
# Per-sample runtime of the dataframe path versus the numpy path, called the
# same way as in the in-place benchmark above.
%timeit _ = obj.transform(x.copy())
%timeit _ = obj.transform_numpy(x_np.copy())

152 ms ± 4.37 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
16.7 µs ± 673 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


### Cython

The per-sample runtime of `transform_numpy` is already pretty good.
But in some cases, Cython makes it even faster.  

In [1]:
# Load the Cython IPython extension, which provides the `%%cython` cell magic
# used in the next cell.
%load_ext Cython

In [4]:
%%cython
import cython
import numpy as np
cimport numpy as np
from libc.math cimport log10


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef np.ndarray[np.float64_t, ndim=2] cython_log10(
        np.ndarray[np.float_t, ndim=2] X,
        np.ndarray[np.int64_t, ndim=1] idx_columns,
):
    cdef int i
    cdef int j
    cdef int n_rows = X.shape[0]
    cdef int n_cols = X.shape[1]
    with nogil:
        for i in range(n_rows):
            for j in range(n_cols):
                X[i, j] = log10(X[i, j])
    return X

In [40]:
class Log10ColumnsInplaceWithCython(Transformer):
    def __init__(self, columns: List[str]):
        if not isinstance(columns, list):
            raise TypeError('`columns` should be a list.')
        if not columns:
            raise ValueError('`columns` should not be empty.')
        self.columns = columns
        
    def fit(self,
            X: Union[pd.DataFrame, ks.DataFrame],
            y: Union[pd.Series, ks.Series] = None) -> 'Log10Columns':
        self.check_dataframe(X)
        self.idx_columns = util.get_idx_columns(
            columns=X.columns,
            selected_columns=self.columns
        )
        return self

    def transform(
        self, X: Union[pd.DataFrame, ks.DataFrame]
    ) -> Union[pd.DataFrame, ks.DataFrame]:
        self.check_dataframe(X)
        X[self.columns] = X[self.columns].applymap(log10)
        return X

    def transform_numpy(self, X: np.ndarray) -> np.ndarray:
        self.check_array(X)
        X[:, self.idx_columns] = cython_log10(X, self.idx_columns)
        return X

In [41]:
# Fit the Cython-backed transformer on every column of the benchmark frame.
# Note: this rebinds `obj` once more.
columns = list(X.columns)
obj = Log10ColumnsInplaceWithCython(columns=columns)
_ = obj.fit(X)

In [42]:
# The Cython path must reproduce the pure-numpy result saved in `x_np_new`.
assert np.allclose(obj.transform_numpy(x_np.copy()), x_np_new)

In [43]:
# Per-sample runtime of the Cython-backed numpy path, on a fresh copy per call.
%timeit _ = obj.transform_numpy(x_np.copy())

13.2 µs ± 720 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


A slight runtime improvement is obtained for this transformer.

**Notes:**

In some cases, for example the Encoders, Cython leads to a significant runtime improvement. 