# UTILS

#### PREPROC FILES

In [51]:
%%writefile default_modeling/default_modeling/utils/preproc.py

from typing import Union

import pandas as pd
import numpy as np
from category_encoders.woe import WOEEncoder

def feature_definition():
    
    numerics = ['age', 'account_amount_added_12_24m', 'account_days_in_rem_12_24m', 'account_days_in_term_12_24m',
                'account_incoming_debt_vs_paid_0_24m', 'avg_payment_span_0_12m', 'avg_payment_span_0_3m',
                'max_paid_inv_0_12m', 'max_paid_inv_0_24m', 'num_active_div_by_paid_inv_0_12m',
                'num_active_inv', 'num_arch_dc_0_12m', 'num_arch_dc_12_24m', 'num_arch_ok_0_12m', 
                'num_arch_ok_12_24m', 'num_arch_rem_0_12m', 'num_arch_written_off_12_24m',
                'num_unpaid_bills', 'sum_capital_paid_account_0_12m', 'sum_capital_paid_account_12_24m',
                'has_paid',
                'sum_paid_inv_0_12m', 'time_hours']
    categories = ['account_status', 'account_worst_status_0_3m', 'account_worst_status_12_24m',
                  'account_worst_status_3_6m', 'account_worst_status_6_12m', 'status_last_archived_0_24m',
                  'status_2nd_last_archived_0_24m', 'status_3rd_last_archived_0_24m', 'status_max_archived_0_6_months',
                  'status_max_archived_0_12_months', 'status_max_archived_0_24_months',
                  'worst_status_active_inv', 'merchant_category', 'merchant_group', 'name_in_email']
    
    return categories, numerics


# ----- PREPROC ------
class NumericEncoder():
    
    """
    Encode number by binning into different ranges
    """
    
    def __init__(self, 
                 column_list: list = None,
                 bin_width: int = None):
        
        self.column_list = column_list
        self.bin_width = bin_width
        
    def __binning__(self, 
                    X: pd.Series, 
                    bucket_list: list, 
                    bin_width: int) -> list:
        """
        Helper function to bin a series
        Args:
            X: continuous value Series
            bucket_list: list of different value for each bin
                         Some features require specific binning values
            bin_width: auto-bin with width percentage
            (Either bin_width or bucket_list is used)
        Returns:
            list of binned values
        """
        
        X = X.copy(deep=True)
        n_null = X.isna().sum()

        if n_null > 0:
            X = X.fillna(-1)
            bucket_bin = [-1]
        else:
            bucket_bin = []

        if bucket_list is None:
            bucket_list = range(0, 100 + bin_width, bin_width)
            for i, q in enumerate(bucket_list):
                q_quantile = round(np.percentile(X.astype(np.float32).values, q), 3)
                if q_quantile not in bucket_bin:
                    bucket_bin.append(q_quantile)

        else:
            bucket_bin = bucket_bin + list(bucket_list)
            
        return bucket_bin
    
    def fit(self, 
            X: pd.DataFrame, 
            y: Union[list, np.array] = None, 
            verbose: int = 0):
        """
        Construct encoder as a dictionary
        Args:
            X: pd.DataFrame
            y: np.array Output
            verbose: int. for logging info
        Return:
            encoder object
        """
        X = X.copy(deep=True)
        encode_dict = {}
        for column in self.column_list:
            if column != "age":
                # Encode other columns
                encode_dict[column] = self.__binning__(X[column], None, self.bin_width)
            elif column == "age":
                # Specific encoding for age columns
                max_age = max(X[column])
                age_bucket=[0, 18, 24, 40, 57, 75, max_age]
                encode_dict[column] = self.__binning__(X[column], age_bucket, self.bin_width)
            if verbose:
                print('\n', column)
                print(encode_dict)
        self.encoder = encode_dict
        return self
        
    def transform(self, 
                  X: pd.DataFrame) -> pd.DataFrame:
        """
        Use built encode to transform data
        Args:
            X: pd.DataFrame
        Return:
            pd.DataFrame with transformed columns
        """
        X = X.copy(deep=True)
        if "has_paid" in X:
            # has_paid is boolean
            X["has_paid"] = X["has_paid"].astype(int)

        for col in self.column_list:        
            if X[col].isnull().any():
                X[col] = X[col].fillna(-1)
            
            bucket_bin = self.encoder[col]

            # Extend bin range if values exceed
            if max(bucket_bin) < max(X[col]):
                bucket_bin[-1] = max(X[col])
            if  min(bucket_bin) > min(X[col]):
                bucket_bin[0] = min(X[col])

            X[col] = pd.cut(X[col],
                            bucket_bin,
                            include_lowest=True,
                            retbins=True,
                            labels=bucket_bin[:-1])[0].astype(float)
        return X

class CategoricalEncoder():

    """
    Encode categories by Weight of Evidence 
    (from category_encoders library)
    """
    
    def __init__(self, 
                 column_list: list = None):
        self.encoder = None
        self.column_list = column_list
    
    def fit(self, 
            X: pd.DataFrame,
            y: Union[list, np.array],
            verbose: int = 0):
        """
        Construct encoder as a dictionary
        Args:
            X: pd.DataFrame
            y: np.array Output
            verbose: int. for logging info
        Return:
            encoder object
        """
        X = X.copy(deep=True)
        woe_encoder = WOEEncoder(cols=self.column_list, random_state=50)
        woe_encoder = woe_encoder.fit(X[self.column_list], y)
        self.encoder = woe_encoder
        return self
                        
    def transform(self, 
                  X: pd.DataFrame) -> pd.DataFrame:
        """
        Use built encode to transform data
        Args:
            X: pd.DataFrame
        Return:
            pd.DataFrame with transformed columns
        """
        X = X.copy(deep=True)
        X[self.column_list] = self.encoder.transform(X[self.column_list])
        return X


Overwriting default_modeling/default_modeling/utils/preproc.py


#### LOAD FILES

In [28]:
%%writefile default_modeling/default_modeling/utils/load.py 
import logging
import re

from typing import Union
import pathlib

import numpy as np
import pandas as pd

LOGGER = logging.getLogger(__name__)


def load_data(event_data: Union[list, str]) -> pd.DataFrame:
    
    """Takes the data returned from Cassadra queries and converts them into a
    DataFrame that can be digested.

    Args:
      event_data(list[dict] or string): The data returned from sedds Cassandra client fetch method or the name of a csv file
    Returns:
     pd.DataFrame
    """

    if not event_data:
        LOGGER.error("event_data is empty")
        return pd.DataFrame()

    if isinstance(event_data, str) or isinstance(event_data, pathlib.PosixPath):
        data = pd.read_csv(event_data)
    else:
        data = pd.DataFrame(event_data)

    return data

Overwriting default_modeling/default_modeling/utils/load.py


# UNIT TESTS

In [66]:
%%writefile default_modeling/tests/test_case_base.py 
"""Base class for unit tests"""

import copy
import logging
import unittest

import joblib
import pandas as pd
import pathlib

from default_modeling.utils.load import load_data

LOGGER = logging.getLogger(__name__)


class TestWithData(unittest.TestCase):
    raw = dict()
    available_file = None

    @classmethod
    def setUpClass(cls) -> None:
        cls.available_file = cls.get_available_file()

    @classmethod
    def get_available_file(cls) -> list:
        """Returns the list available test data
        e.g. everything stored under tests/data
        Args:
        Returns: List file
        """
        p = pathlib.Path(".")
        print("folder view")
        print([x for x in p.iterdir() if x.is_dir()])
        test_data = pathlib.Path("default_modeling/tests/data/").glob("*.csv")
        test_data = [f for f in test_data]

        print("Found the following test data")
        for f in test_data:
            print(f)

        return test_data

    @classmethod
    def get_raw(cls, file: str) -> pd.DataFrame:
        """Lazy loading for raw data; will return a copy of the df

        Args:
          file: file name of sample test

        Returns:

        """

        if file in cls.raw:
            LOGGER.info("Found raw data for %s", file)
            return cls.raw[file].copy()
        
        df = load_data(file)

        LOGGER.info("Adding raw data for %s", file)
        cls.raw[file] = df

        return df.copy()

Overwriting default_modeling/tests/test_case_base.py


In [53]:
%%writefile default_modeling/tests/test_data_handling.py 

import pandas.api.types as ptypes
from pandas.testing import assert_frame_equal


from tests.test_case_base import TestWithData
from default_modeling.utils.preproc import feature_definition
from default_modeling.utils.preproc import NumericEncoder
from default_modeling.utils.preproc import CategoricalEncoder


class DataHandlingTests(TestWithData):

    def test_load_data(self):
        """Validate that the most important columns are returned by loading function"""
        numerics, categories = feature_definition()
        key_column = numerics + categories + ["default"]
        for file in self.available_file:
            df = self.get_raw(file)
            for column in key_column:
                self.assertIn(column, df)

    def test_preproc_function(self):
        """Validate if the encoding return reasonable values:
        Categorical encoding: Only encode categorical columns
        Numeric encoding: Only encode numerical columns
        Expected result: all columns are numeric        
        """
        categories, numerics  = feature_definition()
        numeric_encoder = NumericEncoder(column_list=numerics, 
                                         bin_width=1)
        categorical_encoder = CategoricalEncoder(column_list=categories)
        
        input_column = numerics + categories

        for file in self.available_file:
            df = self.get_raw(file)
            y = df["default"].values
            categorical_encoder.fit(df, y)
            cat_transform = categorical_encoder.transform(df)
            # all transformed categories must be numerics
            assert all(ptypes.is_numeric_dtype(cat_transform[col]) for col in categories)
            # all numerical columns must be the same
            assert_frame_equal(cat_transform[numerics], df[numerics])
            
            numeric_encoder.fit(df.copy(), y)
            num_transfom = numeric_encoder.transform(df)
            # all transformed numerics must be numerics
            assert all(ptypes.is_numeric_dtype(num_transfom[col]) for col in numerics)
            # all categorical columns must be the same
            assert_frame_equal(num_transfom[categories], df[categories])

Overwriting default_modeling/tests/test_data_handling.py


In [67]:
!python3 -m unittest discover default_modeling

folder view
[PosixPath('src'), PosixPath('.cache'), PosixPath('.ipython'), PosixPath('.jupyter'), PosixPath('.ipynb_checkpoints'), PosixPath('default_modeling'), PosixPath('.config'), PosixPath('.local'), PosixPath('.docker'), PosixPath('tutorials')]
Found the following test data
default_modeling/tests/data/test_sample_1.csv
..
----------------------------------------------------------------------
Ran 2 tests in 0.681s

OK


# INTERFACE

#### TRAIN

In [25]:
%%writefile default_modeling/train.py
#default_modeling/default_modeling/interface/train.py

import argparse
import joblib
import os
import pathlib

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import class_weight

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer

from .default_modeling.utils.preproc import CategoricalEncoder
from .default_modeling.utils.preproc import NumericEncoder
from .default_modeling.utils.preproc import feature_definition

import warnings
warnings.filterwarnings("ignore")


def train():
        
    print("extracting arguments")
    parser = argparse.ArgumentParser()


    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("MODEL_DIR"))
    parser.add_argument("--datafolder", type=str, default="./train_data/")
    parser.add_argument("--model-name", type=str, default=os.environ.get("MODEL_NAME"))
    parser.add_argument("--train-file", type=str, default="train_set.csv")
    parser.add_argument("--n-estimators", type=int, default=100)
    parser.add_argument("--min-samples-leaf", type=int, default=10)
    parser.add_argument("--max-depth", type=int, default=10)  
    parser.add_argument("--random-state", type=int, default=1234)   
    parser.add_argument(
        "--target", type=str, default='default'
    )


    args, _ = parser.parse_known_args()

    print("Training Data Preparation")
    print(os.path.join(args.datafolder, args.train_file))
    train_df = pd.read_csv(os.path.join(args.datafolder, args.train_file))
    y_train = train_df[args.target]
     
    print(args)
    categories_features, numerics_features = feature_definition()
    all_features = categories_features + numerics_features + [args.target]
    categories_features.append(args.target)
    input_features = categories_features + numerics_features
    print("Total Input Features", len(input_features))

    # Preproc Data
    numeric_encoder = NumericEncoder
    numeric_transformer = Pipeline(steps=[
        ('numeric_encoder', numeric_encoder(column_list=numerics_features, 
                                            bin_width=1))])
    
    categorical_encoder = CategoricalEncoder
    categorical_transformer = Pipeline(steps=[
    ('categorical_encoder', categorical_encoder(column_list=categories_features))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categories_features),
            ('num', numeric_transformer, numerics_features)],
        remainder="drop")
    
    class_weight_list = class_weight.compute_class_weight(class_weight='balanced',
                                                          classes=np.unique(y_train),
                                                          y=y_train)
    class_weight_dict = {}
    for i, weight in enumerate(class_weight_list):
        class_weight_dict[i] = weight
    print('class weight', class_weight_dict)

    rf_model = RandomForestClassifier(
                n_estimators=args.n_estimators, 
                min_samples_leaf=args.min_samples_leaf, 
                max_depth=args.max_depth, 
                class_weight=class_weight_dict,
                random_state=args.random_state,
                n_jobs=-1,
    )
    
    ml_pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", rf_model)
    ])
    
    ml_pipeline.fit(train_df[all_features], y_train)
    print("Saving Pipeline ........")
    os.makedirs(args.model_dir, exist_ok=True)
    joblib.dump(ml_pipeline, os.path.join(args.model_dir, args.model_name))
    print("Congratulation! Finish.")    
         

if __name__ == "__main__":
    train()

Overwriting default_modeling/train.py


##### TERMINAL SCRIPT

In [28]:
!python3 -m default_modeling.train --datafolder ./train_data \
                                   --model-dir ./default_modeling/default_modeling/interface/ \
                                   --model-name risk_model.joblib 

extracting arguments
Training Data Preparation
./train_data/train_set.csv
Namespace(datafolder='./train_data', max_depth=10, min_samples_leaf=10, model_dir='./default_modeling/default_modeling/interface/', model_name='risk_model.joblib', n_estimators=100, random_state=1234, target='default', train_file='train_set.csv')
Total Input Features 39
class weight {0: 0.5071993428787708, 1: 35.22539149888143}
Saving Pipeline ........
Congratulation! Finish.


In [71]:
#!python3 default_modeling/default_modeling/interface/__init__.py

In [72]:
#!python3 -m default_modeling.default_modeling.interface.train

#### PREDICTION

In [105]:
%%writefile default_modeling/predict.py
import argparse
import joblib
import os
import time
import pandas as pd
from scipy.stats import ks_2samp 
from sklearn import metrics
import numpy as np

from .default_modeling.utils.preproc import CategoricalEncoder
from .default_modeling.utils.preproc import NumericEncoder
from .default_modeling.utils.preproc import feature_definition

def predict():
    
    print("extracting arguments")
    parser = argparse.ArgumentParser()
    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("MODEL_DIR"))
    parser.add_argument("--datafolder", type=str, default=os.environ.get("TESTING_FOLDER"))
    parser.add_argument("--model-name", type=str, default=os.environ.get("MODEL_NAME"))
    parser.add_argument("--test-file", type=str, default="test_set.csv")
 
    parser.add_argument(
        "--target", type=str, default="default"
    )

    args, _ = parser.parse_known_args()

    print(f"Model path: {os.path.join(args.model_dir, args.model_name)}")
    risk_model = joblib.load(os.path.join(args.model_dir, args.model_name))
    test_df = pd.read_csv(os.path.join(args.datafolder, args.test_file))
    categories_features, numerics_features = feature_definition()
    all_features = categories_features + numerics_features + ["default"]
    print(f"Predicting {args.test_file} ....")
    start_time = time.time()
    y_test_pred = risk_model.predict_proba(test_df[all_features])
    print(f"Finish after {time.time() - start_time} s")
    y_test_pred = y_test_pred[:, 1]
    test_df["default_prediction"] = y_test_pred
    saved_filed = os.path.join(args.datafolder, f"{args.test_file}")
    print(f"...to csv {saved_filed}")
    test_df.to_csv(saved_filed, index=False)
    
if __name__ == "__main__":
    predict()

Overwriting default_modeling/predict.py


##### TERMINAL SCRIPT

In [36]:
!python3 -m default_modeling.predict --model-dir ./default_modeling/default_modeling/interface/ \
                                     --model-name risk_model.joblib  \
                                     --datafolder ./test_data \
                                     --test-file test_set_2.csv

extracting arguments
args.model_dir ./default_modeling/default_modeling/interface/
args.model_name risk_model.joblib
args.test_file test_set_2.csv
Model path: ./default_modeling/default_modeling/interface/risk_model.joblib
Predicting....
Finish after 0.2402658462524414 s
...to csv ./test_data/test_set_2.csv


#### SETUP

In [73]:
%%writefile default_modeling/setup.py

from distutils.core import setup
import setuptools

setup(
    name='default_modeling',
    version='0.0.1',
    description="Default Probability Estimation Library",
    author="Linh, V. Nguyen",
    author_email="linhvietnguyen.ee@gmail.com",
    packages=['default_modeling', 'default_modeling.interface', 'default_modeling.utils'],
    include_package_data=True,
    install_requires=[
        "pandas>=0.25.0",
        "numpy=>1.17.1",
        "scikit-learn>=0.23.0",
        "scipy>=0.18.1",
        "category_encoders>=0.23.0"
    ]
)


Overwriting default_modeling/setup.py


In [111]:
!tree

[01;34m.[00m
├── DockerFile
├── Readme.ipynb
├── WriteFile.ipynb
├── [01;34mdata[00m
├── [01;34mdefault_modeling[00m
│   ├── __init__.py
│   ├── [01;34m__pycache__[00m
│   │   ├── __init__.cpython-37.pyc
│   │   ├── __main__.cpython-37.pyc
│   │   ├── predict.cpython-37.pyc
│   │   └── train.cpython-37.pyc
│   ├── [01;34mdefault_modeling[00m
│   │   ├── __init__.py
│   │   ├── [01;34m__pycache__[00m
│   │   │   ├── __init__.cpython-37.pyc
│   │   │   └── __main__.cpython-37.pyc
│   │   ├── [01;34minterface[00m
│   │   │   ├── __init__.py
│   │   │   ├── [01;34m__pycache__[00m
│   │   │   │   ├── __init__.cpython-37.pyc
│   │   │   │   ├── predict.cpython-37.pyc
│   │   │   │   └── train.cpython-37.pyc
│   │   │   └── risk_model.joblib
│   │   └── [01;34mutils[00m
│   │       ├── __init__.py
│   │       ├── [01;34m__pycache__[00m
│   │       │   ├── __init__.cpython-37.pyc
│   │       │   ├── load.cpython-37.pyc
│   │       │   └── preproc.cpython-37.pyc
│   │       

# DOCKFERFILE

In [113]:
%%writefile DockerFile

FROM python:3.8
RUN pwd
RUN dir
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
ENV TRAINING_FOLDER=./train_data
ENV TESTING_FOLDER=./test_data
ENV MODEL_DIR=./default_modeling/default_modeling/interface/
ENV MODEL_NAME=risk_model.joblib

COPY default_modeling default_modeling
COPY train_data train_data

RUN dir
RUN python3 -m default_modeling.train --datafolder ${TRAINING_FOLDER} \
                                      --model-dir ${MODEL_DIR} \
                                      --model-name ${MODEL_NAME}
ENTRYPOINT ["python3", "-m"]

Overwriting DockerFile


# BUILDING IMAGE

In [114]:
!docker build -t default_model -f DockerFile .

Sending build context to Docker daemon  65.99MB
Step 1/14 : FROM python:3.8
 ---> 79372a158581
Step 2/14 : RUN pwd
 ---> Using cache
 ---> a1eb9c6ecf40
Step 3/14 : RUN dir
 ---> Using cache
 ---> 86f013f58573
Step 4/14 : ADD requirements.txt .
 ---> Using cache
 ---> 802ce470a44d
Step 5/14 : RUN pip install -r requirements.txt
 ---> Using cache
 ---> f48df72b81f6
Step 6/14 : ENV TRAINING_FOLDER=./train_data
 ---> Using cache
 ---> ed426028a565
Step 7/14 : ENV TESTING_FOLDER=./test_data
 ---> Using cache
 ---> cf87e3385b35
Step 8/14 : ENV MODEL_DIR=./default_modeling/default_modeling/interface/
 ---> Using cache
 ---> 5ae07b8e658e
Step 9/14 : ENV MODEL_NAME=risk_model.joblib
 ---> Using cache
 ---> 1a59dc9dd9ca
Step 10/14 : ADD default_modeling default_modeling
 ---> Using cache
 ---> 026b4ed8ea6b
Step 11/14 : ADD train_data train_data
 ---> Using cache
 ---> 6101affba3d0
Step 12/14 : RUN dir
 ---> Using cache
 ---> 2703567123b9
Step 13/14 : RUN python3 -m default_modeling.train --dataf

# Run Unit Test in Image

In [115]:
# !python3 -m unittest discover default_modeling
!docker run -t default_model:latest  unittest discover default_modeling

folder view
[PosixPath('bin'), PosixPath('sys'), PosixPath('dev'), PosixPath('var'), PosixPath('mnt'), PosixPath('opt'), PosixPath('proc'), PosixPath('boot'), PosixPath('usr'), PosixPath('lib'), PosixPath('media'), PosixPath('etc'), PosixPath('sbin'), PosixPath('root'), PosixPath('lib64'), PosixPath('srv'), PosixPath('run'), PosixPath('tmp'), PosixPath('home'), PosixPath('default_modeling'), PosixPath('train_data')]
Found the following test data
default_modeling/tests/data/test_sample_1.csv
..
----------------------------------------------------------------------
Ran 2 tests in 0.716s

OK


## USE IMAGE TO PREDICT NEW DATA 1

In [117]:
!docker run -v /home/jupyter/test_data:/test_data default_model:latest \
                                     default_modeling.predict \
                                     --test-file test_set_1.csv

extracting arguments
Model path: ./default_modeling/default_modeling/interface/risk_model.joblib
Predicting test_set_1.csv ....
Finish after 0.43554115295410156 s
...to csv ./test_data/test_set_1.csv


## USE IMAGE TO PREDICT NEW DATA 2

In [118]:
!docker run -v /home/jupyter/test_data:/test_data default_model:latest \
                                     default_modeling.predict \
                                     --test-file test_set_2.csv

extracting arguments
Model path: ./default_modeling/default_modeling/interface/risk_model.joblib
Predicting test_set_2.csv ....
Finish after 0.23408913612365723 s
...to csv ./test_data/test_set_2.csv


In [1]:
!tar chvfz notebook.tar.gz *

Build_Image.ipynb
Build_Image.md
Build_Image_Test.ipynb
Dockerfile
Prototype_and_Experiment.ipynb
Readme.ipynb
Readme.md
Try Cython.ipynb
build/
build/lib.linux-x86_64-3.7/
build/lib.linux-x86_64-3.7/mytrain.cpython-37m-x86_64-linux-gnu.so
build/temp.linux-x86_64-3.7/
build/temp.linux-x86_64-3.7/mytrain.o
build/temp.linux-x86_64-3.7/default_modeling/
build/temp.linux-x86_64-3.7/default_modeling/train.o
build/temp.linux-x86_64-3.7/default_modeling/predict.o
data/
default_modeling/
default_modeling/setup.py
default_modeling/__pycache__/
default_modeling/__pycache__/predict.cpython-37.pyc
default_modeling/__pycache__/__main__.cpython-37.pyc
default_modeling/__pycache__/train.cpython-37.pyc
default_modeling/__pycache__/__init__.cpython-37.pyc
default_modeling/__pycache__/launch_predicting.cpython-37.pyc
default_modeling/__pycache__/launch_training.cpython-37.pyc
default_modeling/__pycache__/setup.cpython-37.pyc
default_modeling/train.cpython-37m-x86_64-linux-gnu.so
default_modeling/.ipynb_

In [32]:
!jupyter nbconvert --to markdown Build_Image.ipynb

[NbConvertApp] Converting notebook Build_Image.ipynb to markdown
[NbConvertApp] Writing 8954 bytes to Build_Image.md


In [1]:
%%writefile DockerFile

FROM python:3.8
WORKDIR /app/
COPY requirements.txt .
RUN pip install -r requirements.txt

COPY default_modeling default_modeling
COPY train_data train_data

ENTRYPOINT ["python3"]

Overwriting DockerFile


In [2]:
!docker build -t default_model -f DockerFile .

Sending build context to Docker daemon   71.2MB
Step 1/7 : FROM python:3.8
 ---> 79372a158581
Step 2/7 : WORKDIR /app/
 ---> Using cache
 ---> 6927edeea50a
Step 3/7 : COPY requirements.txt .
 ---> Using cache
 ---> be3191584c0c
Step 4/7 : RUN pip install -r requirements.txt
 ---> Using cache
 ---> b4606af6a945
Step 5/7 : COPY default_modeling default_modeling
 ---> 1d3798217e7f
Step 6/7 : COPY train_data train_data
 ---> 3874f13479f2
Step 7/7 : ENTRYPOINT ["python3"]
 ---> Running in 286483df009c
Removing intermediate container 286483df009c
 ---> d952a8249917
Successfully built d952a8249917
Successfully tagged default_model:latest


### RUN TRAINING
- ENV TRAINING_FOLDER=./train_data
- ENV TESTING_FOLDER=./test_data
- ENV MODEL_DIR=./default_modeling/default_modeling/interface/
- ENV MODEL_NAME=risk_model.joblib

In [24]:
%%writefile .env
TRAINING_FOLDER=./train_data
TESTING_FOLDER=./test_data
MODEL_DIR=./default_modeling/default_modeling/interface/
MODEL_NAME=risk_model.joblib
N_ESTIMATORS=5
MIN_SAMPLES_LEAF=5
MAX_DEPTH=5
RANDOM_STATE=123
TARGET=default

Overwriting .env


In [None]:
!docker run --env-file .env -v /home/jupyter/train_data:/app/train_data default_model:latest  \
                                                        -m default_modeling.train \
                                                        --n-estimators .env["N_ESTIMATORS"]

In [3]:
!docker run -v /home/jupyter/test_data:/app/test_data default_model:latest \
                                     default_modeling.predict \
                                     --test-file test_set_1.csv

extracting arguments
Model path: ./default_modeling/default_modeling/interface/risk_model.joblib
Predicting test_set_1.csv ....
Finish after 0.4289968013763428 s
...to csv ./test_data/test_set_1.csv


In [2]:
!tree

[01;34m.[00m
├── Build_Image.ipynb
├── Build_Image.md
├── Build_Image_Test.ipynb
├── Dockerfile
├── Prototype_and_Experiment.ipynb
├── README.md
├── Readme.ipynb
├── Readme.md
├── Try Cython.ipynb
├── [01;34mbuild[00m
│   ├── [01;34mlib.linux-x86_64-3.7[00m
│   │   └── [01;32mmytrain.cpython-37m-x86_64-linux-gnu.so[00m
│   └── [01;34mtemp.linux-x86_64-3.7[00m
│       ├── [01;34mdefault_modeling[00m
│       │   ├── predict.o
│       │   └── train.o
│       └── mytrain.o
├── [01;34mdata[00m
├── [01;34mdefault_modeling[00m
│   ├── __init__.py
│   ├── [01;34m__pycache__[00m
│   │   ├── __init__.cpython-37.pyc
│   │   ├── __main__.cpython-37.pyc
│   │   ├── launch_predicting.cpython-37.pyc
│   │   ├── launch_training.cpython-37.pyc
│   │   ├── predict.cpython-37.pyc
│   │   ├── setup.cpython-37.pyc
│   │   └── train.cpython-37.pyc
│   ├── [01;34mdefault_modeling[00m
│   │   ├── __init__.py
│   │   ├── [01;34minterface[00m
│   │   │   ├── __init__.py
│   │   │   ├── pre

In [None]:
    .
    ├── Dockerfile
    ├── Prototype_and_Experiment.ipynb
    ├── README.md
    ├── default_modeling
    │   ├── __init__.py
    │   ├── default_modeling
    │   │   ├── __init__.py
    │   │   ├── interface
    │   │   │   ├── __init__.py
    │   │   │   ├── predict.py
    │   │   │   └── train.py
    │   │   └── utils
    │   │       ├── __init__.py
    │   │       ├── load.py
    │   │       └── preproc.py
    │   ├── setup.py
    │   ├── tests
    │   │   ├── __init__.py
    │   │   ├── data
    │   │   │   └── test_sample_1.csv
    │   │   ├── test_case_base.py
    │   │   └── test_data_handling.py
    ├── model
    │   └── risk_model.joblib
    ├── requirements.txt
    ├── test_data
    │   ├── test_set_1.csv
    │   └── test_set_2.csv
    └── train_data
        ├── train_set_1.csv
        └── train_set_2.csv