### BUILD FILE

In [None]:
%%writefile default_modeling/default_modeling/interface/trainer.pyx

import argparse
import joblib
import os
import pathlib
import time

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer

import pyximport
pyximport.install()

from ..utils.preproc import CategoricalEncoder
from ..utils.preproc import NumericEncoder
from ..utils.preproc import feature_definition

import warnings
warnings.filterwarnings("ignore")


def train():
    """Train the default-probability pipeline and save it with joblib.

    Settings are read from command-line flags. The location/target flags
    (``--model-dir``, ``--train-folder``, ``--model-name``, ``--target``,
    ``--train-file``) fall back to the ``MODEL_DIR``, ``TRAIN_FOLDER``,
    ``MODEL_NAME``, ``TARGET`` and ``TRAIN_FILE`` environment variables.
    Unrecognised flags are ignored.

    The fitted pipeline (column preprocessing followed by a random forest)
    is written to ``<model-dir>/<model-name>.joblib``; an existing file at
    that path is overwritten.

    Raises:
        SystemExit: (via ``parser.error``) if one of the location/target
            settings is given neither as a flag nor as an environment
            variable.
    """
    print("extracting arguments")
    parser = argparse.ArgumentParser()

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("MODEL_DIR"))
    parser.add_argument("--train-folder", type=str, default=os.environ.get("TRAIN_FOLDER"))
    parser.add_argument("--model-name", type=str, default=os.environ.get("MODEL_NAME"))
    parser.add_argument("--target", type=str, default=os.environ.get("TARGET"))
    parser.add_argument("--train-file", type=str, default=os.environ.get("TRAIN_FILE"))
    # RF parameters
    parser.add_argument("--n-estimators", type=int, default=100)
    parser.add_argument("--min-samples-leaf", type=int, default=10)
    parser.add_argument("--max-depth", type=int, default=10)
    parser.add_argument("--random-state", type=int, default=1234)

    args, _ = parser.parse_known_args()
    print(args)

    # Fail with a readable message instead of a TypeError from os.path.join(None, ...)
    required_settings = {
        "--model-dir": args.model_dir,
        "--train-folder": args.train_folder,
        "--model-name": args.model_name,
        "--target": args.target,
        "--train-file": args.train_file,
    }
    missing = [flag for flag, value in required_settings.items() if not value]
    if missing:
        parser.error("missing required settings: " + ", ".join(missing))

    start_time = time.time()
    train_path = os.path.join(args.train_folder, args.train_file)
    print(f"Training Data at {train_path}")
    train_df = pd.read_csv(train_path)
    y_train = train_df[args.target]

    categories_features, numerics_features = feature_definition()
    # The target must NOT be one of the encoded inputs: feeding the label to
    # the encoders leaks the answer into the model.
    input_features = categories_features + numerics_features
    # The frame given to fit() keeps the same column layout as before
    # (features + target); the ColumnTransformer below only selects the
    # feature columns and drops everything else (remainder="drop").
    all_features = input_features + [args.target]
    print("Total Input Features", len(input_features))

    # Preproc Data
    numeric_transformer = Pipeline(steps=[
        ("numeric_encoder", NumericEncoder(column_list=numerics_features,
                                           bin_width=1)),
    ])
    categorical_transformer = Pipeline(steps=[
        ("categorical_encoder", CategoricalEncoder(column_list=categories_features)),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", categorical_transformer, categories_features),
            ("num", numeric_transformer, numerics_features)],
        remainder="drop")

    # Key the weights by the actual class labels (not by their position) so
    # the mapping is right even when the labels are not exactly 0..n-1.
    classes = np.unique(y_train)
    class_weight_list = class_weight.compute_class_weight(class_weight="balanced",
                                                          classes=classes,
                                                          y=y_train)
    class_weight_dict = dict(zip(classes.tolist(), class_weight_list.tolist()))
    print('class weight', class_weight_dict)

    rf_model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        min_samples_leaf=args.min_samples_leaf,
        max_depth=args.max_depth,
        class_weight=class_weight_dict,
        random_state=args.random_state,
        n_jobs=-1,
    )

    ml_pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", rf_model)
    ])

    ml_pipeline.fit(train_df[all_features], y_train)
    os.makedirs(args.model_dir, exist_ok=True)
    saved_name = f"{os.path.join(args.model_dir, args.model_name)}.joblib"

    if os.path.isfile(saved_name):
        print(f"Found existing model at: {saved_name}.\nOverwriting ...")

    joblib.dump(ml_pipeline, saved_name)
    print(f"Congratulation! Saving model at {saved_name}. Finish after {time.time() - start_time} s")

# Run training when this module is executed as the main program.
if __name__ == "__main__":
    train()

In [None]:
%%writefile default_modeling/default_modeling/interface/predictor.pyx
import argparse
import joblib
import os
import time
import errno

import pandas as pd
from scipy.stats import ks_2samp 
from sklearn import metrics
import numpy as np

import pyximport
pyximport.install()

from ..utils.preproc import CategoricalEncoder
from ..utils.preproc import NumericEncoder
from ..utils.preproc import feature_definition

def predict():
    """Score a CSV file with a saved pipeline and write the result back.

    Settings are read from command-line flags (``--model-dir``,
    ``--test-folder``, ``--model-name``, ``--target``, ``--test-file``) that
    fall back to the ``MODEL_DIR``, ``TEST_FOLDER``, ``MODEL_NAME``,
    ``TARGET`` and ``TEST_FILE`` environment variables. Unrecognised flags
    are ignored. When no target is configured, ``"default"`` is used.

    The pipeline is loaded from ``<model-dir>/<model-name>.joblib``. The
    second column of its ``predict_proba`` output is stored in a new
    ``default_prediction`` column, and the CSV at
    ``<test-folder>/<test-file>`` is overwritten in place with that extra
    column.

    Raises:
        SystemExit: (via ``parser.error``) if a location setting is given
            neither as a flag nor as an environment variable.
        FileNotFoundError: if the model file does not exist.
    """
    print("extracting arguments")
    parser = argparse.ArgumentParser()
    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("MODEL_DIR"))
    parser.add_argument("--test-folder", type=str, default=os.environ.get("TEST_FOLDER"))
    parser.add_argument("--model-name", type=str, default=os.environ.get("MODEL_NAME"))
    parser.add_argument("--target", type=str, default=os.environ.get("TARGET"))
    parser.add_argument("--test-file", type=str, default=os.environ.get("TEST_FILE"))

    args, _ = parser.parse_known_args()

    # Fail with a readable message instead of a TypeError from os.path.join(None, ...)
    required_settings = {
        "--model-dir": args.model_dir,
        "--test-folder": args.test_folder,
        "--model-name": args.model_name,
        "--test-file": args.test_file,
    }
    missing = [flag for flag, value in required_settings.items() if not value]
    if missing:
        parser.error("missing required settings: " + ", ".join(missing))

    model_path = os.path.join(args.model_dir, args.model_name)
    model_file = f"{model_path}.joblib"
    print(args)
    if not os.path.isfile(model_file):
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), model_file)

    print(f"Found model at: {model_file}")

    # NOTE(security): joblib.load unpickles the file and can execute arbitrary
    # code; only load model files that come from a trusted source.
    risk_model = joblib.load(model_file)
    test_df = pd.read_csv(os.path.join(args.test_folder, args.test_file))
    categories_features, numerics_features = feature_definition()

    # Honour --target / TARGET; "default" keeps the previous hard-coded name
    # when no target is configured.
    target = args.target or "default"
    all_features = categories_features + numerics_features
    # Pass the target column along only when the file actually has it, so
    # unlabeled data can be scored by pipelines that do not consume it.
    if target in test_df.columns:
        all_features = all_features + [target]

    print(f"Predicting {args.test_file} ....")
    start_time = time.time()
    y_test_pred = risk_model.predict_proba(test_df[all_features])
    print(f"Finish after {time.time() - start_time} s")
    # Keep the probability column at index 1 of predict_proba's output
    y_test_pred = y_test_pred[:, 1]
    test_df["default_prediction"] = y_test_pred

    # The output path is the input path: the scored file replaces the original
    saved_filed = os.path.join(args.test_folder, f"{args.test_file}")
    print(f"...to csv {saved_filed}")
    test_df.to_csv(saved_filed, index=False)
    
# Run prediction when this module is executed as the main program.
if __name__ == "__main__":
    predict()

In [None]:
%%writefile default_modeling/default_modeling/utils/load.pyx
import logging
import re

from typing import Union
import pathlib

import numpy as np
import pandas as pd

LOGGER = logging.getLogger(__name__)


def load_data(event_data: Union[list, str, pathlib.PurePath]) -> pd.DataFrame:

    """Build a DataFrame from in-memory records or from a CSV file.

    Args:
      event_data: either the records to wrap in a DataFrame (e.g. the
        list of dicts returned by a Cassandra client fetch), or the location
        of a CSV file given as a string or as any ``pathlib`` path object.
    Returns:
      pd.DataFrame. An empty DataFrame is returned (and an error is logged)
      when ``event_data`` is falsy, e.g. an empty list or empty string.
    """

    if not event_data:
        LOGGER.error("event_data is empty")
        return pd.DataFrame()

    # PurePath is the common base of every pathlib flavour (PosixPath,
    # WindowsPath, Pure*Path), so path detection does not depend on the
    # platform the code runs on.
    if isinstance(event_data, (str, pathlib.PurePath)):
        return pd.read_csv(event_data)

    return pd.DataFrame(event_data)


In [None]:
%%writefile default_modeling/default_modeling/utils/preproc.pyx
from typing import Union

import pandas as pd
import numpy as np
from category_encoders.woe import WOEEncoder

def feature_definition():

    """
    Return the column names used by the model as ``(categories, numerics)``.

    categories: 'category_1' ... 'category_15'
    numerics:   'numeric_0' ... 'numeric_22'

    Fresh lists are built on every call, so callers may modify them freely.
    """
    # categories are numbered from 1 to 15
    categorical_columns = [f"category_{idx}" for idx in range(1, 16)]
    # numerics are numbered from 0 to 22
    numeric_columns = [f"numeric_{idx}" for idx in range(23)]

    return categorical_columns, numeric_columns


# ----- PREPROC ------
class NumericEncoder():

    """
    Encode numeric columns by binning them into ranges.

    ``fit`` learns one ascending list of bin edges per column; ``transform``
    replaces every value with the fitted lower edge of the bin it falls
    into. Missing values are treated as -1.
    """

    def __init__(self,
                 column_list: list = None,
                 bin_width: int = None):
        """
        Args:
            column_list: names of the columns to encode
            bin_width: step, in percent, between the percentiles used as
                       automatic bin edges
        """
        self.column_list = column_list
        self.bin_width = bin_width
        # Set by fit(): {column name: ascending list of bin edges}
        self.encoder = None

    def __binning__(self,
                    X: pd.Series,
                    bucket_list: list,
                    bin_width: int) -> list:
        """
        Helper function to compute the bin edges of a series
        Args:
            X: continuous value Series
            bucket_list: explicit bin edges, for features that require
                         specific binning values; None to bin automatically
            bin_width: percentile step (in percent) for automatic binning
            (Either bin_width or bucket_list is used)
        Returns:
            ascending list of unique bin edges; it starts with -1 when X
            holds missing values and nothing smaller
        """
        X = X.copy(deep=True)

        # Missing values are mapped to the sentinel -1, which gets its own edge
        if X.isna().any():
            X = X.fillna(-1)
            bucket_bin = [-1]
        else:
            bucket_bin = []

        if bucket_list is None:
            # Convert once instead of once per percentile
            values = X.astype(np.float32).values
            # Clamp to 100 so a width that does not divide 100 never asks
            # for an out-of-range percentile
            percents = [min(q, 100) for q in range(0, 100 + bin_width, bin_width)]
            candidates = [round(np.percentile(values, q), 3) for q in percents]
        else:
            candidates = list(bucket_list)

        # pd.cut needs unique edges
        for edge in candidates:
            if edge not in bucket_bin:
                bucket_bin.append(edge)

        # pd.cut needs increasing edges; sorting matters when the -1 sentinel
        # or a data-dependent cap is not already in order
        return sorted(bucket_bin)

    def fit(self,
            X: pd.DataFrame,
            y: Union[list, np.ndarray] = None,
            verbose: int = 0):
        """
        Construct encoder as a dictionary
        Args:
            X: pd.DataFrame
            y: np.array Output (unused, accepted for pipeline compatibility)
            verbose: int. for logging info
        Return:
            encoder object
        """
        encode_dict = {}
        for column in self.column_list:
            if column == "age":
                # Specific edges for the age column, capped by the largest
                # observed age (Series.max ignores missing values)
                max_age = X[column].max()
                age_bucket = [0, 18, 24, 40, 57, 75, max_age]
                encode_dict[column] = self.__binning__(X[column], age_bucket, self.bin_width)
            else:
                # Percentile-based edges for every other column
                encode_dict[column] = self.__binning__(X[column], None, self.bin_width)
            if verbose:
                print('\n', column)
                print(encode_dict)
        self.encoder = encode_dict
        return self

    def transform(self,
                  X: pd.DataFrame) -> pd.DataFrame:
        """
        Use built encode to transform data
        Args:
            X: pd.DataFrame
        Return:
            pd.DataFrame with transformed columns
        Raise:
            ValueError if the encoder has not been fitted
        """
        if getattr(self, "encoder", None) is None:
            raise ValueError("NumericEncoder must be fitted before transform")

        X = X.copy(deep=True)

        for col in self.column_list:
            # if boolean, convert to int
            if X[col].dtype == bool:
                X[col] = X[col].astype(int)

            if X[col].isnull().any():
                X[col] = X[col].fillna(-1)

            fitted_bins = self.encoder[col]
            # Labels are always the fitted lower edges, so a value lands on
            # the same encoded number no matter what range this batch covers
            labels = fitted_bins[:-1]
            # Work on a copy: widening the edges for this batch must not
            # alter the fitted encoder
            bucket_bin = list(fitted_bins)

            # Extend bin range if values exceed
            col_max = X[col].max()
            col_min = X[col].min()
            if max(bucket_bin) < col_max:
                bucket_bin[-1] = col_max
            if min(bucket_bin) > col_min:
                bucket_bin[0] = col_min

            X[col] = pd.cut(X[col],
                            bucket_bin,
                            include_lowest=True,
                            labels=labels).astype(float)
        return X

class CategoricalEncoder():

    """
    Weight-of-Evidence encoding of categorical columns,
    delegated to WOEEncoder from the category_encoders library.
    """

    def __init__(self,
                 column_list: list = None):
        self.column_list = column_list
        # Holds the fitted WOEEncoder once fit() has run
        self.encoder = None

    def fit(self,
            X: pd.DataFrame,
            y: Union[list, np.array],
            verbose: int = 0):
        """
        Fit a WOEEncoder on the configured columns
        Args:
            X: pd.DataFrame containing the columns in column_list
            y: np.array Output the encoder is fitted against
            verbose: int. for logging info
        Return:
            encoder object
        """
        frame = X.copy(deep=True)
        selected = frame[self.column_list]
        self.encoder = WOEEncoder(cols=self.column_list, random_state=50).fit(selected, y)
        return self

    def transform(self,
                  X: pd.DataFrame) -> pd.DataFrame:
        """
        Replace the configured columns by their encoded values
        Args:
            X: pd.DataFrame
        Return:
            copy of X with the columns in column_list transformed
        """
        encoded = X.copy(deep=True)
        encoded[self.column_list] = self.encoder.transform(encoded[self.column_list])
        return encoded


### CYTHON INTERFACE

In [None]:
%%writefile default_modeling/default_modeling/interface/launch_trainer.py
# Launcher for the Cython trainer module: training starts as soon as this
# module is run or imported.
import pyximport
# Install the pyximport import hook before importing the .pyx module below.
pyximport.install()
from .trainer import train
train()

In [None]:
%%writefile default_modeling/default_modeling/interface/launch_predictor.py
# Launcher for the Cython predictor module: prediction starts as soon as this
# module is run or imported.
import pyximport
# Install the pyximport import hook before importing the .pyx module below.
pyximport.install()
from .predictor import predict
predict()

### CYTHONIZE SETUP

In [None]:
%%writefile default_modeling/setup.py
"""Build script for default_modeling: compiles the package's .pyx modules with Cython."""
# install_requires / extras_require are setuptools features, and distutils is
# deprecated (removed from the standard library in Python 3.12), so setup()
# is taken from setuptools.
from setuptools import setup
from Cython.Build import cythonize

setup(
    name='default_modeling',
    version='0.0.1',
    description="Default Probability Estimation Library",
    author="Linh, V. Nguyen",
    author_email="linhvietnguyen.ee@gmail.com",
    url="https://github.com/nvlinhvn/default-modeling/default_modeling",
    include_package_data=True,
    install_requires=[
        "pandas>=1.3.4",
        "numpy>=1.21.3",
        "scikit-learn>=1.0.0",
        "category_encoders>=2.3.0",
        "Cython>=0.29.21",
        "scipy>=1.7.0",
        # imported directly by the trainer and predictor modules at runtime
        "joblib",
    ],
    # kept so that installing the "dev" extra continues to work
    extras_require={"dev": ["joblib"]},
    # Paths are relative to the directory the build is launched from
    ext_modules=cythonize(["default_modeling/default_modeling/utils/*.pyx",
                           "default_modeling/default_modeling/interface/*.pyx"],
                          language_level="3"),
)

In [None]:
# Run the setup script's build command, which cythonizes the package's .pyx modules
!python3 -m default_modeling.setup build

### LOCAL RUN

In [None]:
# Discover and run the unit tests under default_modeling
!python3 -m unittest discover default_modeling

In [None]:
# Train locally on train_data/train_set_2.csv; the model is saved as ./model/risk_model.joblib
!python3 -m default_modeling.default_modeling.interface.launch_trainer \
                                                --model-dir ./model \
                                                --model-name risk_model \
                                                --train-folder train_data \
                                                --train-file train_set_2.csv \
                                                --target default

In [None]:
# Score test_data/test_set_1.csv locally with the model saved at ./model/risk_model.joblib
!python3 -m default_modeling.default_modeling.interface.launch_predictor \
                                               --model-dir ./model \
                                               --model-name risk_model \
                                               --test-folder test_data \
                                               --test-file test_set_1.csv \
                                               --target default

In [None]:
%%writefile Dockerfile
FROM python:3.8
WORKDIR /app/

# Directory the trained model is written to (see MODEL_DIR below)
RUN mkdir model

# Defaults read by the trainer/predictor when the matching flag is not passed
ENV TRAIN_FOLDER=./train_data
ENV TEST_FOLDER=./test_data
ENV TRAIN_FILE=train_set.csv
ENV TEST_FILE=test_set.csv
ENV MODEL_DIR=./model
ENV MODEL_NAME=risk_model
ENV TARGET=default

# Install dependencies before copying the sources so this layer is reused
# when only the code changes; --no-cache-dir keeps pip's download cache out
# of the image.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY default_modeling default_modeling
RUN python3 -m default_modeling.setup build

# Containers run "python3 <args>", e.g. "-m unittest discover default_modeling"
ENTRYPOINT ["python3"]

In [None]:
# Build the image tagged default_model from ./Dockerfile without using the layer cache
!docker build --no-cache -t default_model -f Dockerfile .

In [None]:
# Run the unit tests inside the container (the image entrypoint is python3)
!docker run -t default_model:latest -m unittest discover default_modeling

In [None]:
# Train inside the container with the host's train_data and model folders mounted;
# settings not passed as flags come from the image's ENV defaults
!docker run -v /home/jupyter/Cython/train_data:/app/train_data \
            -v /home/jupyter/Cython/model:/app/model \
            default_model:latest -m default_modeling.default_modeling.interface.launch_trainer \
            --train-file train_set_1.csv \
            --n-estimators 200 \
            --max-depth 15 \
            --min-samples-leaf 20

In [None]:
# Predict inside the container with the host's test_data and model folders mounted;
# settings not passed as flags come from the image's ENV defaults
!docker run -v /home/jupyter/Cython/test_data:/app/test_data \
            -v /home/jupyter/Cython/model:/app/model \
            default_model:latest -m default_modeling.default_modeling.interface.launch_predictor \
            --test-file test_set_1.csv