In [61]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import os
import glob
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

In [62]:
# Run parameters. Both default to empty strings here.
# NOTE(review): presumably overridden per run by an injected parameters cell
# (e.g. papermill) -- confirm against the scheduler that executes this notebook.
period = ''
date = ''

# Root directory for input data and saved models; raises KeyError if the
# AIRFLOW_HOME environment variable is not set.
airflow_home = os.environ['AIRFLOW_HOME']

In [78]:
# Load the preprocessed CSV for the configured period and date.
preprocessed_csv = f'{airflow_home}/data/preprocessed/{period}/output_{date}.csv'
df_raw = pd.read_csv(preprocessed_csv)

In [79]:
# Columns kept for modelling: the input features followed by the target
# column ('CancelFlag'), which is split off into the label further down.
feature_cols = [
    'ChannelID',
    'Cluster',
    'prepay',
    'count_edit',
    'interval_time',
    'order_weekday',
    'weekday',
    'interval_high',
]
cols = feature_cols + ['CancelFlag']

In [80]:
# Restrict the raw frame to the modelling columns (all rows, listed columns only).
data = df_raw.loc[:, cols]

In [81]:
# Split the modelling frame into the feature matrix and the binary target.
X = data.drop(columns='CancelFlag')
y = data['CancelFlag']

# Replace the feature names with positional integer labels (0..n_features-1).
X.columns = range(X.shape[1])

# The former `y.columns = [0]` was removed: `y` is a Series, which has no
# `columns` axis, so that assignment only attached an unused attribute and
# renamed nothing.
# NOTE(review): if renaming the target was intended, `y = y.rename(0)` would do it.

In [90]:
# Stratified hold-out split: 33% of the rows go to the test set, with a fixed
# seed so the split is reproducible across runs.
split_options = dict(test_size=0.33, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Wrap both splits as LightGBM datasets. The raw data is kept
# (free_raw_data=False) and the test set is built with the training set as its
# reference.
train_data = lgb.Dataset(
    X_train,
    label=y_train,
    free_raw_data=False,
)
test_data = lgb.Dataset(
    X_test,
    label=y_test,
    reference=train_data,
    free_raw_data=False,
)

In [94]:
# Locate previously saved models. glob.glob() returns matches in arbitrary
# (filesystem-dependent) order, so the paths are sorted by modification time
# to guarantee that the last entry is the most recently written model file.
model_paths = sorted(glob.glob(f'{airflow_home}/model/*.txt'), key=os.path.getmtime)
models = [os.path.basename(path) for path in model_paths]
num_round = 100  # boosting rounds used when training from scratch

if models:
    # A saved model exists: load the newest one and refit it on the current
    # training split.
    model = lgb.Booster(model_file=model_paths[-1])
    bst = model.refit(X_train, y_train)
else:
    # No saved model yet: train a new binary classifier, evaluated by AUC on
    # the held-out dataset.
    params = {
        'num_leaves': 40,
        'objective': 'binary',
        'max_depth': -1,
        'learning_rate': 0.1,
        'tree_learner': 'data',
        'metric': 'auc',
        'boosting': 'dart',
    }
    # Early stopping is requested through the callback API rather than the
    # `early_stopping_rounds=` keyword, which newer LightGBM releases no longer
    # accept in lgb.train().
    # NOTE(review): LightGBM reports early stopping as unavailable in dart
    # mode, so with 'boosting': 'dart' all `num_round` rounds are expected to
    # run -- confirm whether dart or early stopping is the intended behaviour.
    # NOTE(review): the same hold-out set is used here for validation and
    # later for the reported AUC.
    bst = lgb.train(
        params,
        train_data,
        num_round,
        valid_sets=[test_data],
        callbacks=[lgb.early_stopping(stopping_rounds=5)],
    )

In [57]:
# Score the held-out features with the trained (or refitted) booster.
y_pred = bst.predict(data=X_test)

In [58]:
# ROC AUC of the predictions on the held-out split.
auc = float(roc_auc_score(y_test, y_pred))

# Encode the AUC in the file name as a fixed-width integer of AUC * 10000
# (0.8512 -> '8512', 0.85 -> '8500', 1.0 -> '10000'). The previous
# `str(round(auc, 4))[2:]` dropped trailing zeros and mapped 1.0 to '0', so
# file names were ambiguous and of varying width.
score = f'{int(round(auc * 10000)):04d}'

# Persist the model under a name that records period, date and score.
model_path = f'{airflow_home}/model/model_{period}_{date}_{score}.txt'
bst.save_model(model_path, num_iteration=bst.best_iteration)

<lightgbm.basic.Booster at 0x7f7c4485b160>