## Environment set-up

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
%store -r

In [60]:
import boto3
import sagemaker

from sagemaker import get_execution_role

role = get_execution_role()

ecr_namespace = 'sagemaker-training-containers/'
prefix = 'catboost-image'

ecr_repository_name = ecr_namespace + prefix
account_id = role.split(':')[4]
region = boto3.Session().region_name
sess = sagemaker.session.Session()
bucket = sess.default_bucket()

print('Account: {}'.format(account_id))
print('Region: {}'.format(region))
print('Role: {}'.format(role))
print('S3 Bucket: {}'.format(bucket))

Account: 806174985048
Region: ap-northeast-2
Role: arn:aws:iam::806174985048:role/service-role/AmazonSageMaker-ExecutionRole-20201218T151409
S3 Bucket: sagemaker-ap-northeast-2-806174985048


In [2]:
import pandas as pd
pd.options.display.max_rows=200
import os

In [3]:
# full_local_path = 'data/full/full-184903890.pkl'

In [4]:
%%time
file_path = full_local_path

raw_df = pd.read_pickle(file_path)

CPU times: user 1min 3s, sys: 41.3 s, total: 1min 44s
Wall time: 1min 44s


In [5]:
raw_df

Unnamed: 0,EVENT_LABEL,str_ip,str_app,str_device,str_os,str_channel,EVENT_TIMESTAMP,EVENT_TIMESTAMP_SIMPLE
0,0,str_83230,str_3,str_1,str_13,str_379,2017-11-06 14:32:21,2017-11-06 14:00:00
1,0,str_17357,str_3,str_1,str_19,str_379,2017-11-06 14:33:34,2017-11-06 14:00:00
2,0,str_35810,str_3,str_1,str_13,str_379,2017-11-06 14:34:12,2017-11-06 14:00:00
3,0,str_45745,str_14,str_1,str_13,str_478,2017-11-06 14:34:52,2017-11-06 14:00:00
4,0,str_161007,str_3,str_1,str_13,str_379,2017-11-06 14:35:08,2017-11-06 14:00:00
...,...,...,...,...,...,...,...,...
184903885,0,str_121312,str_12,str_1,str_10,str_340,2017-11-09 16:00:00,2017-11-09 16:00:00
184903886,0,str_46894,str_3,str_1,str_19,str_211,2017-11-09 16:00:00,2017-11-09 16:00:00
184903887,0,str_320126,str_1,str_1,str_13,str_274,2017-11-09 16:00:00,2017-11-09 16:00:00
184903888,0,str_189286,str_12,str_1,str_37,str_259,2017-11-09 16:00:00,2017-11-09 16:00:00


In [6]:
sampling_rate = 0.01
df = raw_df.sample(frac=sampling_rate, random_state=100)

In [7]:
df

Unnamed: 0,EVENT_LABEL,str_ip,str_app,str_device,str_os,str_channel,EVENT_TIMESTAMP,EVENT_TIMESTAMP_SIMPLE
71816445,0,str_103106,str_3,str_1,str_13,str_371,2017-11-08 00:49:06,2017-11-08 00:00:00
118420665,0,str_83928,str_3,str_1,str_19,str_417,2017-11-08 14:54:59,2017-11-08 14:00:00
2983188,0,str_36150,str_15,str_2,str_49,str_3,2017-11-06 17:27:10,2017-11-06 17:00:00
111499266,0,str_40056,str_26,str_1,str_18,str_477,2017-11-08 13:00:46,2017-11-08 13:00:00
1598682,0,str_39081,str_18,str_1,str_8,str_107,2017-11-06 16:37:52,2017-11-06 16:00:00
...,...,...,...,...,...,...,...,...
110325929,0,str_49431,str_12,str_1,str_13,str_245,2017-11-08 12:40:52,2017-11-08 12:00:00
19714645,0,str_124478,str_3,str_1,str_13,str_137,2017-11-07 03:07:24,2017-11-07 03:00:00
55527020,0,str_43793,str_6,str_1,str_19,str_459,2017-11-07 14:33:58,2017-11-07 14:00:00
76863922,0,str_53454,str_13,str_1,str_19,str_400,2017-11-08 02:24:18,2017-11-08 02:00:00


In [8]:
from IPython.display import display as dp

def change_code_to_string(raw, col, new_col, verbose=False):
    '''
    숫자값에 'str' 를 넣어서 명시적으로 스트링으로 타입 변환
    '''
    df = raw.copy()
    col_val = df[col].unique()
    str_code = df[col].apply(lambda x: 'str_' + str(x))    
    
    index = [ i for i, e in enumerate(df.columns) if e == col]    # 생성 컬럼의 위치를 알기 위해서 임. 해당 컬럼의 옆에 삽입하기 위함.
    df.insert(index[0], column=new_col, value=str_code)
    if verbose:
        dp(col_val)
    
    return df

In [9]:
def create_hour_feature(df, src_col, new_col):
    fdf = df.copy()
    fdf.insert(len(fdf.columns),column=new_col, value=fdf[src_col].apply(lambda x: x.hour))        
    
    return fdf
    
df = create_hour_feature(df, src_col='EVENT_TIMESTAMP_SIMPLE', 
                         new_col='EVENT_HOUR')    
df = change_code_to_string(df, col='EVENT_HOUR', new_col='STR_EVENT_HOUR', verbose=False)


In [10]:
df = drop_column(df, col='EVENT_HOUR')
df.head(10)

Unnamed: 0,EVENT_LABEL,str_ip,str_app,str_device,str_os,str_channel,EVENT_TIMESTAMP,EVENT_TIMESTAMP_SIMPLE,STR_EVENT_HOUR
71816445,0,str_103106,str_3,str_1,str_13,str_371,2017-11-08 00:49:06,2017-11-08 00:00:00,str_0
118420665,0,str_83928,str_3,str_1,str_19,str_417,2017-11-08 14:54:59,2017-11-08 14:00:00,str_14
2983188,0,str_36150,str_15,str_2,str_49,str_3,2017-11-06 17:27:10,2017-11-06 17:00:00,str_17
111499266,0,str_40056,str_26,str_1,str_18,str_477,2017-11-08 13:00:46,2017-11-08 13:00:00,str_13
1598682,0,str_39081,str_18,str_1,str_8,str_107,2017-11-06 16:37:52,2017-11-06 16:00:00,str_16
20861536,0,str_182847,str_3,str_1,str_17,str_280,2017-11-07 03:29:10,2017-11-07 03:00:00,str_3
29404151,0,str_80037,str_29,str_1,str_17,str_210,2017-11-07 05:59:52,2017-11-07 05:00:00,str_5
8242101,0,str_117157,str_9,str_1,str_19,str_134,2017-11-06 23:36:18,2017-11-06 23:00:00,str_23
162685886,0,str_119823,str_18,str_1,str_10,str_107,2017-11-09 09:14:35,2017-11-09 09:00:00,str_9
20207684,0,str_84896,str_64,str_1,str_17,str_459,2017-11-07 03:16:39,2017-11-07 03:00:00,str_3


In [11]:
df = df.sort_values(by= ['str_ip', 'EVENT_TIMESTAMP'])
df

Unnamed: 0,EVENT_LABEL,str_ip,str_app,str_device,str_os,str_channel,EVENT_TIMESTAMP,EVENT_TIMESTAMP_SIMPLE,STR_EVENT_HOUR
125221605,0,str_1,str_2,str_1,str_15,str_477,2017-11-08 17:25:31,2017-11-08 17:00:00,str_17
125222959,0,str_1,str_12,str_1,str_15,str_265,2017-11-08 17:25:35,2017-11-08 17:00:00,str_17
128312967,0,str_1,str_2,str_1,str_2,str_477,2017-11-08 22:03:35,2017-11-08 22:00:00,str_22
167270350,0,str_1,str_3,str_1,str_48,str_182,2017-11-09 10:42:13,2017-11-09 10:00:00,str_10
5683883,0,str_10,str_6,str_1,str_20,str_459,2017-11-06 21:56:10,2017-11-06 21:00:00,str_21
...,...,...,...,...,...,...,...,...,...
46082634,0,str_99998,str_18,str_1,str_13,str_121,2017-11-07 11:33:20,2017-11-07 11:00:00,str_11
46296711,0,str_99998,str_15,str_1,str_11,str_265,2017-11-07 11:37:33,2017-11-07 11:00:00,str_11
46300666,0,str_99998,str_3,str_1,str_11,str_130,2017-11-07 11:37:38,2017-11-07 11:00:00,str_11
46762198,0,str_99998,str_14,str_1,str_11,str_401,2017-11-07 11:46:39,2017-11-07 11:00:00,str_11


In [12]:
def split_data_by_time(df, target_col, label_col, total_samples, split_rate, train_end, test_start, verbose=False):
    '''
    시간 관점으로 번반부튼 훈련, 후반부는 테스트 데이터로 해서 샘블링 함.
    '''
    
    # 훈련 데이터 셋
    train_df = df[df[target_col] <= train_end]   
    train_num = int(total_samples * (1 - split_rate))    # 훈련 샘플 데이터 수
    train_sample = train_df.sample(n = train_num, random_state=100)    # 샘플링    

    print("train sample shape: ", train_sample.shape)
    print("train min time: ", train_sample[target_col].min())
    print("train max time: ", train_sample[target_col].max())
    print("Train fraud ratio: ", round(train_sample[label_col].value_counts()[1] / train_sample.shape[0],5))
    print("# of Train frauds: ", train_sample[label_col].value_counts()[1])     


    # 테스트 데이터 셋    
    test_df = df[df[target_col] >= test_start]    
    test_num = int(total_samples * (split_rate))    # 테스트 샘플 데이터 수
    test_sample = test_df.sample(n = test_num, random_state=100)    
    

    print("\ntest sample shape: ", test_sample.shape)    
    print("test min time: ", test_sample[target_col].min())
    print("test max time: ", test_sample[target_col].max())
    print("Test fraud ratio: ", round(test_sample[label_col].value_counts()[1] / test_sample.shape[0],5))    
    print("# of test frauds: ", test_sample[label_col].value_counts()[1])         
    
    
    return train_sample, test_sample

In [13]:
def save_csv_local(raw_df, preproc_folder, label, file_name):
    '''
    주어진 파일을 저장
    '''
    os.makedirs(preproc_folder, exist_ok=True)
    
    df = raw_df.copy()
    df = pd.concat([df[label], df.drop([label], axis=1)], axis=1)
    file_path = os.path.join(preproc_folder, file_name)
    df.to_csv(file_path, index=False, )

    print(f'{file_path} is saved')

    
    return file_path


In [14]:
train_df, test_df = split_data_by_time(
                       df=df, 
                       target_col='EVENT_TIMESTAMP', 
                       label_col = 'EVENT_LABEL',
                       total_samples=200000, 
                       split_rate=0.1, 
                       train_end='2017-11-08 23:59', 
                       test_start='2017-11-09 00:00',    
                       verbose = True,
                  )

train sample shape:  (180000, 9)
train min time:  2017-11-06 16:00:02
train max time:  2017-11-08 23:59:00
Train fraud ratio:  0.00257
# of Train frauds:  462

test sample shape:  (20000, 9)
test min time:  2017-11-09 00:00:00
test max time:  2017-11-09 15:59:42
Test fraud ratio:  0.00235
# of test frauds:  47


In [15]:
train_df.head()

Unnamed: 0,EVENT_LABEL,str_ip,str_app,str_device,str_os,str_channel,EVENT_TIMESTAMP,EVENT_TIMESTAMP_SIMPLE,STR_EVENT_HOUR
7797164,0,str_44555,str_3,str_1,str_13,str_153,2017-11-06 23:25:11,2017-11-06 23:00:00,str_23
19430508,0,str_46382,str_3,str_1,str_22,str_480,2017-11-07 03:02:01,2017-11-07 03:00:00,str_3
101051316,0,str_215360,str_64,str_1,str_8,str_459,2017-11-08 09:58:13,2017-11-08 09:00:00,str_9
1270380,0,str_196847,str_2,str_1,str_13,str_477,2017-11-06 16:28:55,2017-11-06 16:00:00,str_16
2372129,0,str_53382,str_3,str_1,str_19,str_424,2017-11-06 17:02:09,2017-11-06 17:00:00,str_17


In [16]:
test_df.head()

Unnamed: 0,EVENT_LABEL,str_ip,str_app,str_device,str_os,str_channel,EVENT_TIMESTAMP,EVENT_TIMESTAMP_SIMPLE,STR_EVENT_HOUR
137879907,0,str_46901,str_3,str_1,str_18,str_280,2017-11-09 01:52:06,2017-11-09 01:00:00,str_1
147147685,0,str_99862,str_3,str_1,str_25,str_280,2017-11-09 04:36:49,2017-11-09 04:00:00,str_4
168360052,0,str_80048,str_9,str_1,str_19,str_244,2017-11-09 11:01:24,2017-11-09 11:00:00,str_11
170342105,0,str_37776,str_9,str_1,str_13,str_258,2017-11-09 11:37:11,2017-11-09 11:00:00,str_11
141482677,0,str_18520,str_15,str_1,str_18,str_245,2017-11-09 03:02:16,2017-11-09 03:00:00,str_3


#### CatBoost 알고리즘에 사용되지 않는 Timestamp 는 지우고 데이터 파일을 생성하도록 합니다.

In [17]:
train_df = drop_column(train_df, col='EVENT_TIMESTAMP')
train_df = drop_column(train_df, col='EVENT_TIMESTAMP_SIMPLE')

In [18]:
train_df.head()

Unnamed: 0,EVENT_LABEL,str_ip,str_app,str_device,str_os,str_channel,STR_EVENT_HOUR
7797164,0,str_44555,str_3,str_1,str_13,str_153,str_23
19430508,0,str_46382,str_3,str_1,str_22,str_480,str_3
101051316,0,str_215360,str_64,str_1,str_8,str_459,str_9
1270380,0,str_196847,str_2,str_1,str_13,str_477,str_16
2372129,0,str_53382,str_3,str_1,str_19,str_424,str_17


In [19]:
test_df = drop_column(test_df, col='EVENT_TIMESTAMP')
test_df = drop_column(test_df, col='EVENT_TIMESTAMP_SIMPLE')

In [20]:
test_df.head()

Unnamed: 0,EVENT_LABEL,str_ip,str_app,str_device,str_os,str_channel,STR_EVENT_HOUR
137879907,0,str_46901,str_3,str_1,str_18,str_280,str_1
147147685,0,str_99862,str_3,str_1,str_25,str_280,str_4
168360052,0,str_80048,str_9,str_1,str_19,str_244,str_11
170342105,0,str_37776,str_9,str_1,str_13,str_258,str_11
141482677,0,str_18520,str_15,str_1,str_18,str_245,str_3


In [21]:
train_file_name = 'train-' + str(train_df.shape[0]) + ".csv"
train_local_path = save_csv_local(raw_df=train_df, preproc_folder='data/train', 
                                  label='EVENT_LABEL', file_name=train_file_name)
print("train_local_path: ", train_local_path)

test_file_name = 'test-' + str(test_df.shape[0]) + ".csv"
test_local_path = save_csv_local(raw_df=test_df, preproc_folder='data/test', 
                                  label='EVENT_LABEL', file_name=test_file_name)
print("test_local_path: ", test_local_path)

data/train/train-180000.csv is saved
train_local_path:  data/train/train-180000.csv
data/test/test-20000.csv is saved
test_local_path:  data/test/test-20000.csv


In [22]:
import sagemaker

bucket = sagemaker.Session().default_bucket()
# 프로젝트 변수
project_prefix = 'adtalking_fraud_phase0'


# S3에 저장되는 데이터의 기본 폴더 위치
s3_train_data_uri = f"s3://{bucket}/{project_prefix}/train"
s3_test_data_uri = f"s3://{bucket}/{project_prefix}/test"

In [23]:
s3_train_data_uri = sagemaker.s3.S3Uploader.upload(
    local_path=train_local_path, 
    desired_s3_uri=s3_train_data_uri,    
)
print("s3_train_data_uri: \n", s3_train_data_uri)

s3_test_data_uri = sagemaker.s3.S3Uploader.upload(
    local_path=test_local_path, 
    desired_s3_uri=s3_test_data_uri,    
)
print("s3_test_data_uri: \n", s3_test_data_uri)

s3_train_data_uri: 
 s3://sagemaker-ap-northeast-2-806174985048/adtalking_fraud_phase0/train/train-180000.csv
s3_test_data_uri: 
 s3://sagemaker-ap-northeast-2-806174985048/adtalking_fraud_phase0/test/test-20000.csv


## (Option 1) Create a script for CatBoost

트레이닝과 추론에 필요한 스크립트를 생성하고 파일로 저장하도록 합니다.

In [81]:
%%writefile source/catboost_training.py

import argparse
import logging
import os
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
import numpy as np
import pandas as pd
import joblib


if __name__ =='__main__':

    print('extracting arguments')
    parser = argparse.ArgumentParser()
    
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--train-file', type=str, default='train.csv')
    parser.add_argument('--cat-features', type=str)  # in this script we ask user to explicitly name features
    parser.add_argument('--target', type=str) # in this script we ask user to explicitly name the target
    

    args, _ = parser.parse_known_args()

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    
    logging.info('reading data')
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))

    logging.info('building training and testing datasets')
    X = train_df.drop(args.target, axis=1)
    y = train_df[args.target]
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
    
        
    # define and train model
    clf = CatBoostClassifier(
        iterations=10,
        learning_rate=0.1, 
    )

    clf.fit(X_train, y_train, cat_features=args.cat_features.split(), eval_set=(X_val, y_val),)
    
    # persist model
    path = os.path.join(args.model_dir, "model.joblib")
    logging.info('saving to {}'.format(path))
    clf.save_model(path)
    
# inference functions ---------------
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf

Overwriting source/catboost_training.py


__(Note) CatBoost 알고리즘을 사용하기 위한 패키지들을 설치하고 저장하기 위한 requirements.txt 파일을 준비합니다. requirements.txt 파일은 스크립트 파일과 같은 장소에 저장하여 스크립트 실행시 같이 실행되도록 합니다.__

In [82]:
%%writefile source/requirements.txt

catboost

Overwriting source/requirements.txt


In [83]:
from sagemaker.sklearn.estimator import SKLearn

In [84]:
FRAMEWORK_VERSION = "0.23-1"

sklearn = SKLearn(
    base_job_name = "catboost-training",
    entry_point="catboost_training.py",    
    source_dir='source', 
    framework_version=FRAMEWORK_VERSION,
    instance_type="ml.c5.xlarge",
    role=role,
    sagemaker_session=sess,
    hyperparameters={'cat-features': 'str_ip str_app str_device str_os str_channel STR_EVENT_HOUR',
                     'target': 'EVENT_LABEL',
                    'train-file': 'train-180000.csv'},
)

In [85]:
sklearn.fit({'train':s3_train_data_uri}, logs=True)

2021-10-27 03:05:58 Starting - Starting the training job...
2021-10-27 03:06:22 Starting - Launching requested ML instancesProfilerReport-1635303958: InProgress
...
2021-10-27 03:06:53 Starting - Preparing the instances for training.........
2021-10-27 03:08:22 Downloading - Downloading input data
2021-10-27 03:08:22 Training - Downloading the training image.[34m2021-10-27 03:08:31,934 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-10-27 03:08:31,938 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-10-27 03:08:31,950 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2021-10-27 03:08:32,232 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/miniconda3/bin/python -m pip install -r requirements.txt[0m
[34mCollecting catboost
  Downloading catboost-1.0.0-cp37-none-manylinux1_x86_64.whl (76.4 MB)[0m
[34mCollec

### Inference

In [None]:
predictor = sklearn.deploy(
    initial_instance_count=1, instance_type="ml.c5.xlarge"
)

## (Option 2) Bring your own container for CatBoost

Docker 파일을 구축하도록 합니다.

In [1]:
%%writefile Dockerfile

FROM ubuntu:16.04

RUN apt-get update && \
    apt-get -y install build-essential libatlas-dev git wget curl nginx jq libatlas3-base

RUN curl -LO http://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
    bash Miniconda3-latest-Linux-x86_64.sh -bfp /miniconda3 && \
    rm Miniconda3-latest-Linux-x86_64.sh

ENV PATH=/miniconda3/bin:${PATH}
        
RUN apt-get update && apt-get install -y python-pip && pip install sagemaker-training catboost

ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8

Writing Dockerfile


Docker 파일 구축 후 Docker 파일을 생성하고 ECR 에 저장하기 위한 스크립트를 작성하도록 합니다.

In [87]:
%%writefile build_and_push.sh

ACCOUNT_ID=$1
REGION=$2
REPO_NAME=$3


sudo docker build -f Dockerfile -t $REPO_NAME .

docker tag $REPO_NAME $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$REPO_NAME:latest

$(aws ecr get-login --no-include-email --registry-ids $ACCOUNT_ID)

aws ecr describe-repositories --repository-names $REPO_NAME || aws ecr create-repository --repository-name $REPO_NAME

docker push $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/$REPO_NAME:latest

Writing build_and_push.sh


In [None]:
! bash build_and_push.sh $account_id $region $ecr_repository_name

In [5]:
container_image_uri = '{0}.dkr.ecr.{1}.amazonaws.com/{2}:latest'.format(account_id, region, ecr_repository_name)
print('ECR container ARN: {}'.format(container_image_uri))

ECR container ARN: 806174985048.dkr.ecr.ap-northeast-2.amazonaws.com/sagemaker-training-containers/catboost-image:latest


In [6]:
! pip install catboost

Collecting catboost
  Downloading catboost-1.0.0-cp36-none-manylinux1_x86_64.whl (76.4 MB)
[K     |████████████████████████████████| 76.4 MB 149.2 MB/s eta 0:00:01   |██████████▉                     | 25.8 MB 1.7 MB/s eta 0:00:31
Collecting graphviz
  Downloading graphviz-0.17-py3-none-any.whl (18 kB)
Installing collected packages: graphviz, catboost
Successfully installed catboost-1.0.0 graphviz-0.17
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [7]:
import os

import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

In [8]:
# we use the Boston housing dataset 
data = load_boston()

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.25, random_state=42)

trainX = pd.DataFrame(X_train, columns=data.feature_names)
trainX['target'] = y_train

testX = pd.DataFrame(X_test, columns=data.feature_names)
testX['target'] = y_test

In [10]:
local_train = 'boston_train.csv'
local_test = 'boston_test.csv'

trainX.to_csv(local_train)
testX.to_csv(local_test)

In [11]:
# send data to S3. SageMaker will take training data from S3
train_location = sess.upload_data(
    path=local_train, 
    bucket=bucket,
    key_prefix='catboost')

test_location = sess.upload_data(
    path=local_test, 
    bucket=bucket,
    key_prefix='catboost')


In [13]:
import tarfile

In [14]:
# first compress the code and send to S3
program = 'catboost_training.py'
source = 'source.tar.gz'
project = 'catboost'

tar = tarfile.open(source, 'w:gz')
tar.add(program)
tar.close()

submit_dir = sess.upload_data(
    path=source, 
    bucket=bucket,
    key_prefix=project+ '/' + source)

print(submit_dir)


s3://sagemaker-ap-northeast-2-806174985048/catboost/source.tar.gz/source.tar.gz


Estimator 를 추상화하는 CatBoost 전용 Estimator 를 구축할 수 있습니다.

In [16]:
from sagemaker.estimator import Framework

class CatBoostEstimator(Framework):
    def __init__(
        self,
        entry_point,
        source_dir=None,
        hyperparameters=None,
        py_version="py3",
        framework_version=None,
        image_name=None,
        distributions=None,
        **kwargs):
        
        super(CatBoostEstimator, self).__init__(
            entry_point, source_dir, hyperparameters, image_name=image_name, **kwargs)
    
    
    def _configure_distribution(self, distributions):
        return
    
    def create_model(
        self,
        model_server_workers=None,
        role=None,
        vpc_config_override=None,
        entry_point=None,
        source_dir=None,
        dependencies=None,
        image_name=None,
        **kwargs):
        
        return None

In [17]:
catboost = CatBoostEstimator(
    image_name=container_image_uri,
    role=role,
    entry_point='catboost_training.py',
    output_path=output_path,
    train_instance_count=1, 
    train_instance_type='ml.m5.xlarge',
    hyperparameters={'features': 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT',
                     'target': 'target'})

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
image_name has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [18]:
catboost.fit({'train':train_location, 'test': test_location}, logs=True)


2021-10-25 16:29:31 Starting - Starting the training job...
2021-10-25 16:29:33 Starting - Launching requested ML instancesProfilerReport-1635179371: InProgress
......
2021-10-25 16:31:02 Starting - Preparing the instances for training......
2021-10-25 16:32:02 Downloading - Downloading input data
2021-10-25 16:32:02 Training - Downloading the training image......
2021-10-25 16:33:02 Uploading - Uploading generated training model[34m2021-10-25 16:32:49,749 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-10-25 16:32:52,777 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-10-25 16:32:52,787 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-10-25 16:32:52,795 sagemaker-training-toolkit INFO     Invoking user script
[0m
[34mTraining Env:
[0m
[34m{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "test": "/opt/ml/i

## (Option 3) Testing local mode

In [36]:
X = train_df.drop("EVENT_LABEL", axis=1)
y = train_df["EVENT_LABEL"]

In [37]:
cat_features = ['str_ip','str_app','str_device','str_os','str_channel','STR_EVENT_HOUR']

In [38]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

In [40]:
from catboost import CatBoostClassifier
clf = CatBoostClassifier(
    iterations=10,
#     verbose=5,
)

clf.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_val, y_val),
)

Learning rate set to 0.5
0:	learn: 0.3169482	test: 0.3164411	best: 0.3164411 (0)	total: 32.6ms	remaining: 294ms
1:	learn: 0.1758481	test: 0.1754480	best: 0.1754480 (1)	total: 51.9ms	remaining: 208ms
2:	learn: 0.1044041	test: 0.1038631	best: 0.1038631 (2)	total: 66.9ms	remaining: 156ms
3:	learn: 0.0644762	test: 0.0640745	best: 0.0640745 (3)	total: 74.9ms	remaining: 112ms
4:	learn: 0.0415478	test: 0.0412070	best: 0.0412070 (4)	total: 86.4ms	remaining: 86.4ms
5:	learn: 0.0282423	test: 0.0279309	best: 0.0279309 (5)	total: 93.6ms	remaining: 62.4ms
6:	learn: 0.0202609	test: 0.0199464	best: 0.0199464 (6)	total: 103ms	remaining: 44.1ms
7:	learn: 0.0155297	test: 0.0153522	best: 0.0153522 (7)	total: 111ms	remaining: 27.7ms
8:	learn: 0.0126628	test: 0.0124792	best: 0.0124792 (8)	total: 119ms	remaining: 13.2ms
9:	learn: 0.0114287	test: 0.0113628	best: 0.0113628 (9)	total: 127ms	remaining: 0us

bestTest = 0.01136283006
bestIteration = 9



<catboost.core.CatBoostClassifier at 0x7f3e96b14828>

In [41]:
predictions = clf.predict(data=X_val)

In [42]:
from sklearn.metrics import classification_report

In [43]:
print(classification_report(y_val, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     35908
           1       0.85      0.12      0.21        92

    accuracy                           1.00     36000
   macro avg       0.92      0.56      0.60     36000
weighted avg       1.00      1.00      1.00     36000



In [44]:
from sklearn.metrics import confusion_matrix

In [50]:
confusion_matrix(y_val, predictions, labels=[0, 1])

array([[35906,     2],
       [   81,    11]])