### BUILD FILES (Cython sources for the trainer and predictor)

In [1]:
%%writefile default_modeling/default_modeling/interface/trainer.pyx

import argparse
import joblib
import os
import pathlib
import time

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer

from ..utils.preproc import CategoricalEncoder
from ..utils.preproc import NumericEncoder
from ..utils.preproc import feature_definition

import warnings
warnings.filterwarnings("ignore")


def train():
    """Train the random-forest pipeline on a CSV file and save it with joblib.

    Settings come from the command line. The location options
    (``--model-dir``, ``--train-folder``, ``--model-name``, ``--target``,
    ``--train-file``) default to the ``MODEL_DIR``, ``TRAIN_FOLDER``,
    ``MODEL_NAME``, ``TARGET`` and ``TRAIN_FILE`` environment variables.
    Unrecognised arguments are ignored.

    The fitted pipeline (preprocessing + classifier) is written to
    ``<model-dir>/<model-name>.joblib``; an existing file at that path is
    overwritten.

    Returns:
        None
    """
    print("extracting arguments")
    parser = argparse.ArgumentParser()

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("MODEL_DIR"))
    parser.add_argument("--train-folder", type=str, default=os.environ.get("TRAIN_FOLDER"))
    parser.add_argument("--model-name", type=str, default=os.environ.get("MODEL_NAME"))
    parser.add_argument("--target", type=str, default=os.environ.get("TARGET"))
    parser.add_argument("--train-file", type=str, default=os.environ.get("TRAIN_FILE"))
    # RF parameters
    parser.add_argument("--n-estimators", type=int, default=100)
    parser.add_argument("--min-samples-leaf", type=int, default=10)
    parser.add_argument("--max-depth", type=int, default=10)
    parser.add_argument("--random-state", type=int, default=1234)

    args, _ = parser.parse_known_args()
    print(args)
    start_time = time.time()
    train_path = os.path.join(args.train_folder, args.train_file)
    print(f"Training Data at {train_path}")
    train_df = pd.read_csv(train_path)
    y_train = train_df[args.target]

    base_categories, numerics_features = feature_definition()
    all_features = list(base_categories) + list(numerics_features) + [args.target]
    # Build a new list rather than appending in place, so the list handed out
    # by feature_definition() is never mutated by this function.
    # NOTE(review): the target column is routed through the categorical
    # encoder; presumably CategoricalEncoder needs it for target-based
    # encoding -- confirm it does not leak into the model inputs.
    categories_features = list(base_categories) + [args.target]
    input_features = categories_features + list(numerics_features)
    # Single-argument print: renders the same text whether the module is
    # compiled with Python 2 or Python 3 print semantics.
    print(f"Total Input Features {len(input_features)}")

    # Preproc Data
    numeric_transformer = Pipeline(steps=[
        ('numeric_encoder', NumericEncoder(column_list=numerics_features,
                                           bin_width=1))])

    categorical_transformer = Pipeline(steps=[
        ('categorical_encoder', CategoricalEncoder(column_list=categories_features))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categories_features),
            ('num', numeric_transformer, numerics_features)],
        remainder="drop")

    # Balanced weights keyed by the actual class labels. Keying by position
    # (0, 1, ...) is only correct when the labels happen to be 0..n-1.
    classes = np.unique(y_train)
    class_weight_list = class_weight.compute_class_weight(class_weight='balanced',
                                                          classes=classes,
                                                          y=y_train)
    class_weight_dict = dict(zip(classes.tolist(), class_weight_list))
    print(f"class weight {class_weight_dict}")

    rf_model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        min_samples_leaf=args.min_samples_leaf,
        max_depth=args.max_depth,
        class_weight=class_weight_dict,
        random_state=args.random_state,
        n_jobs=-1,
    )

    ml_pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", rf_model)
    ])

    ml_pipeline.fit(train_df[all_features], y_train)
    os.makedirs(args.model_dir, exist_ok=True)
    saved_name = f"{os.path.join(args.model_dir, args.model_name)}.joblib"

    if os.path.isfile(saved_name):
        print(f"Found existing model at: {saved_name}.\nOverwriting ...")

    joblib.dump(ml_pipeline, saved_name)
    print(f"Congratulation! Saving model at {saved_name}. Finish after {time.time() - start_time} s")


if __name__ == "__main__":
    train()

Overwriting default_modeling/default_modeling/interface/trainer.pyx


In [2]:
%%writefile default_modeling/default_modeling/interface/predictor.pyx
import argparse
import joblib
import os
import time
import pandas as pd
from scipy.stats import ks_2samp 
from sklearn import metrics
import numpy as np

from ..utils.preproc import CategoricalEncoder
from ..utils.preproc import NumericEncoder
from ..utils.preproc import feature_definition

import argparse
import joblib
import os
import time
import errno

import pandas as pd
from scipy.stats import ks_2samp 
from sklearn import metrics
import numpy as np

from ..utils.preproc import CategoricalEncoder
from ..utils.preproc import NumericEncoder
from ..utils.preproc import feature_definition

def predict():
    """
    Score a CSV file with a previously saved model and write the scores back.

    Settings come from the command line; ``--model-dir``, ``--test-folder``,
    ``--model-name``, ``--target`` and ``--test-file`` default to the
    ``MODEL_DIR``, ``TEST_FOLDER``, ``MODEL_NAME``, ``TARGET`` and
    ``TEST_FILE`` environment variables. Unrecognised arguments are ignored.

    The model is loaded from ``<model-dir>/<model-name>.joblib``. The second
    column of ``predict_proba`` is stored in a ``default_prediction`` column
    and the frame is written to the same path the test data was read from,
    i.e. the input CSV is overwritten.

    Args:
    Returns:
    Raise: FileNotFoundError if model hasn't been found
    """

    print("extracting arguments")
    parser = argparse.ArgumentParser()
    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("MODEL_DIR"))
    parser.add_argument("--test-folder", type=str, default=os.environ.get("TEST_FOLDER"))
    parser.add_argument("--model-name", type=str, default=os.environ.get("MODEL_NAME"))
    parser.add_argument("--target", type=str, default=os.environ.get("TARGET"))
    parser.add_argument("--test-file", type=str, default=os.environ.get("TEST_FILE"))

    args, _ = parser.parse_known_args()

    model_path = os.path.join(args.model_dir, args.model_name)
    model_file = f"{model_path}.joblib"
    print(args)
    if not os.path.isfile(model_file):
        raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), model_file)

    print(f"Found model at: {model_file}")

    # NOTE: joblib.load unpickles arbitrary objects; only load model files
    # that come from a trusted source.
    risk_model = joblib.load(model_file)
    test_path = os.path.join(args.test_folder, args.test_file)
    test_df = pd.read_csv(test_path)
    categories_features, numerics_features = feature_definition()
    # Honour --target / TARGET so the selected columns match what the trainer
    # used; fall back to the previously hard-coded "default" when unset.
    target = args.target or "default"
    all_features = list(categories_features) + list(numerics_features) + [target]
    print(f"Predicting {args.test_file} ....")
    start_time = time.time()
    y_test_pred = risk_model.predict_proba(test_df[all_features])
    print(f"Finish after {time.time() - start_time} s")
    # Keep only the second probability column.
    # NOTE(review): assumes that column corresponds to the positive class --
    # confirm against risk_model.classes_.
    test_df["default_prediction"] = y_test_pred[:, 1]

    # The output path is the input path: the test CSV is rewritten in place
    # with the extra prediction column.
    saved_filed = test_path
    print(f"...to csv {saved_filed}")
    test_df.to_csv(saved_filed, index=False)


if __name__ == "__main__":
    predict()

Overwriting default_modeling/default_modeling/interface/predictor.pyx


### CYTHON INTERFACE (plain-Python launchers for the compiled modules)

In [3]:
%%writefile default_modeling/default_modeling/interface/launch_training.py
# Plain-Python entry point meant to be started with `python -m`: it imports
# train() from the sibling trainer module (built from trainer.pyx) and calls
# it immediately. There is no __main__ guard, so merely importing this module
# also starts a training run.
from .trainer import train
train()

Overwriting default_modeling/default_modeling/interface/launch_training.py


In [4]:
%%writefile default_modeling/default_modeling/interface/launch_predicting.py
# Plain-Python entry point meant to be started with `python -m`: it imports
# predict() from the sibling predictor module (built from predictor.pyx) and
# calls it immediately. There is no __main__ guard, so merely importing this
# module also starts a prediction run.
from .predictor import predict
predict()

Overwriting default_modeling/default_modeling/interface/launch_predicting.py


### CYTHONIZE SETUP

In [5]:
%%writefile default_modeling/setup.py
"""Build script that compiles the Cython interface modules of default_modeling.

Run from the directory that contains the outer ``default_modeling`` folder,
e.g. ``python3 -m default_modeling.setup build_ext --inplace``; the glob
below is relative to that working directory.
"""
from Cython.Build import cythonize
from distutils.core import setup

setup(
    name='default_modeling',
    version='0.0.1',
    description="Default Probability Estimation Library",
    author="Linh, V. Nguyen",
    author_email="linhvietnguyen.ee@gmail.com",
    ext_modules=cythonize(
        ["default_modeling/default_modeling/interface/*.pyx"],
        # Compile the .pyx sources with Python 3 semantics. Without an
        # explicit language level Cython falls back to Python 2 rules, where
        # print("a", b) prints a tuple, and emits a warning on every build.
        compiler_directives={"language_level": "3"},
    ),
)

Overwriting default_modeling/setup.py


In [6]:
# Run setup.py as a module to cythonize the interface .pyx files and build
# the resulting extension modules in place, next to their sources.
!python3 -m default_modeling.setup build_ext --inplace

Compiling default_modeling/default_modeling/interface/predictor.pyx because it changed.
Compiling default_modeling/default_modeling/interface/trainer.pyx because it changed.
[1/2] Cythonizing default_modeling/default_modeling/interface/predictor.pyx
  tree = Parsing.p_module(s, pxd, full_module_name)
[2/2] Cythonizing default_modeling/default_modeling/interface/trainer.pyx
  tree = Parsing.p_module(s, pxd, full_module_name)
running build_ext
building 'default_modeling.default_modeling.interface.predictor' extension
gcc -pthread -B /opt/conda/compiler_compat -Wl,--sysroot=/ -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I/opt/conda/include/python3.7m -c default_modeling/default_modeling/interface/predictor.c -o build/temp.linux-x86_64-3.7/default_modeling/default_modeling/interface/predictor.o
gcc -pthread -shared -B /opt/conda/compiler_compat -L/opt/conda/lib -Wl,-rpath=/opt/conda/lib -Wl,--no-as-needed -Wl,--sysroot=/ build/temp.linux-x86_64-3.7/default_modeli

### LOCAL RUN

In [7]:
# Train locally through the Python launcher; every location option is passed
# explicitly, so no environment variables are needed for this run.
!python3 -m default_modeling.default_modeling.interface.launch_training \
                                                --model-dir ./model \
                                                --model-name risk_model \
                                                --train-folder train_data \
                                                --train-file train_set_1.csv \
                                                --target default

extracting arguments
Namespace(max_depth=10, min_samples_leaf=10, model_dir='./model', model_name='risk_model', n_estimators=100, random_state=1234, target='default', train_file='train_set_1.csv', train_folder='train_data')
Training Data at train_data/train_set_1.csv
('Total Input Features', 39)
('class weight', {0: 0.5071993428787708, 1: 35.22539149888143})
Congratulation! Saving model at ./model/risk_model.joblib. Finish after 4.053596019744873 s


In [8]:
# Score the test file locally with the model saved by the training cell.
# The predictor writes its output back to the same CSV it reads.
!python3 -m default_modeling.default_modeling.interface.launch_predicting \
                                               --model-dir ./model \
                                               --model-name risk_model \
                                               --test-folder test_data \
                                               --test-file test_set_1.csv \
                                               --target default

extracting arguments
Namespace(model_dir='./model', model_name='risk_model', target='default', test_file='test_set_1.csv', test_folder='test_data')
Found model at: ./model/risk_model.joblib
Predicting test_set_1.csv ....
Finish after 0.44262242317199707 s
...to csv test_data/test_set_1.csv


In [9]:
%%writefile Dockerfile
# Image that builds the Cython interface modules and exposes python3 as the
# entry point, so training, prediction and tests are chosen at `docker run`.
FROM python:3.8
WORKDIR /app/

# Default location for the saved model (see MODEL_DIR below).
RUN mkdir model

# Defaults picked up by the trainer/predictor argument parsers; each can be
# overridden by the matching command-line flag.
ENV TRAIN_FOLDER=./train_data
ENV TEST_FOLDER=./test_data
ENV TRAIN_FILE=train_set.csv
ENV TEST_FILE=test_set.csv
ENV MODEL_DIR=./model
ENV MODEL_NAME=risk_model
ENV TARGET=default

COPY requirements.txt .

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt
COPY default_modeling default_modeling
# Compile the .pyx interface modules inside the image.
RUN python3 -m default_modeling.setup build_ext --inplace

ENTRYPOINT ["python3"]

Overwriting Dockerfile


In [10]:
# Build the image from ./Dockerfile and tag it default_model; --no-cache
# forces every step to be rebuilt instead of reusing cached layers.
!docker build --no-cache -t default_model -f Dockerfile .

Sending build context to Docker daemon  46.24MB
Step 1/15 : FROM python:3.8
 ---> 79372a158581
Step 2/15 : WORKDIR /app/
 ---> Running in 76046c31486d
Removing intermediate container 76046c31486d
 ---> 74791ab2cde2
Step 3/15 : RUN mkdir model
 ---> Running in 48706c67d0be
Removing intermediate container 48706c67d0be
 ---> dbcf3cc43d04
Step 4/15 : ENV TRAIN_FOLDER=./train_data
 ---> Running in 8e4af9db8dbc
Removing intermediate container 8e4af9db8dbc
 ---> 4e42ae91a8b4
Step 5/15 : ENV TEST_FOLDER=./test_data
 ---> Running in 2be3a2d3fd67
Removing intermediate container 2be3a2d3fd67
 ---> 6556e99e1922
Step 6/15 : ENV TRAIN_FILE=train_set.csv
 ---> Running in cff90288c8c0
Removing intermediate container cff90288c8c0
 ---> 0cd143d353d6
Step 7/15 : ENV TEST_FILE=test_set.csv
 ---> Running in f4dd9031918a
Removing intermediate container f4dd9031918a
 ---> 8146cac52b76
Step 8/15 : ENV MODEL_DIR=./model
 ---> Running in 13117e5d5084
Removing intermediate container 13117e5d5084
 ---> fa28f81e27

In [11]:
# Run unittest discovery on the default_modeling folder inside the container.
# The image's ENTRYPOINT is python3, so everything after the image name is
# passed to the interpreter.
!docker run -t default_model:latest -m unittest discover default_modeling

Found the following test data
default_modeling/tests/data/test_sample_1.csv
..
----------------------------------------------------------------------
Ran 2 tests in 0.640s

OK


In [12]:
# Train inside the container. The host train_data and model folders are
# bind-mounted under /app so the container can read the CSV and the saved
# model ends up on the host. Only --train-file is given; the remaining
# settings come from the ENV defaults baked into the image.
# NOTE(review): the host paths are absolute and machine-specific.
!docker run -v /home/jupyter/Cython/train_data:/app/train_data \
            -v /home/jupyter/Cython/model:/app/model \
            default_model:latest -m default_modeling.default_modeling.interface.launch_training \
            --train-file train_set_1.csv

extracting arguments
Namespace(max_depth=10, min_samples_leaf=10, model_dir='./model', model_name='risk_model', n_estimators=100, random_state=1234, target='default', train_file='train_set_1.csv', train_folder='./train_data')
Training Data at ./train_data/train_set_1.csv
('Total Input Features', 39)
('class weight', {0: 0.5071993428787708, 1: 35.22539149888143})
Found existing model at: ./model/risk_model.joblib.
Overwriting ...
Congratulation! Saving model at ./model/risk_model.joblib. Finish after 4.340307712554932 s


In [21]:
# Score the test file inside the container. The host test_data and model
# folders are bind-mounted under /app; the predictor rewrites the mounted
# test CSV with the prediction column added. Only --test-file is given; the
# remaining settings come from the ENV defaults baked into the image.
# NOTE(review): the host paths are absolute and machine-specific.
!docker run -v /home/jupyter/Cython/test_data:/app/test_data \
            -v /home/jupyter/Cython/model:/app/model \
            default_model:latest -m default_modeling.default_modeling.interface.launch_predicting \
            --test-file test_set_1.csv

extracting arguments
Namespace(model_dir='./model', model_name='risk_model', target='default', test_file='test_set_1.csv', test_folder='./test_data')
Found model at: ./model/risk_model.joblib
Predicting test_set_1.csv ....
Finish after 0.40616917610168457 s
...to csv ./test_data/test_set_1.csv
