In [1]:
import sys
# Make the project root (one level up) importable so that the local
# `simulate` and `util` modules resolve. Guarded so that re-running this
# cell does not keep stacking duplicate entries onto sys.path.
if '../' not in sys.path:
    sys.path.append('../')

In [2]:
from simulate import simulate_surgeries
from util import *
import seaborn as sns
import lightgbm as lgbm
import joblib

In [9]:
# ---------------------------------------------------------------------------
# Dataset: simulated surgery schedule
# ---------------------------------------------------------------------------
# Column roles handed to the preprocessing / optimisation helpers below.
target_column = 'is_cancelled'
date_columns = ['scheduled_date']
continuous_columns = ['age', 'surgery_complexity']
ordinal_columns = [
    *(f"categorical_{i}" for i in range(1, 13)),  # categorical_1 .. categorical_12
    'gender',
    'race',
    'marital_status',
    'specialties',
]

# Generate the synthetic records (70000 is the argument given to the simulator).
df = simulate_surgeries(70000)

# Alternative toy dataset (kept for reference; swap with the section above):
# df = sns.load_dataset('titanic')
# continuous_columns = ['age', 'fare', 'sibsp', 'parch']
# ordinal_columns = ['sex', 'class']
# date_columns = []
# target_column = 'survived'

# ---------------------------------------------------------------------------
# Hyperparameter search settings
# ---------------------------------------------------------------------------
# Each inner list is one group; here every group holds a single parameter.
# Presumably groups are tuned one after another in this order - confirm in util.
hyperparams_stepwise_groups = [
    [param_name]
    for param_name in (
        'learning_rate',
        'lambda_l1',
        'lambda_l2',
        'num_leaves',
        'feature_fraction',
        'bagging_fraction',
        'bagging_freq',
        'min_child_samples',
    )
]

# LightGBM parameters passed to the search as `fixed_params`.
fixed_params = {
    'verbosity': -1,
    'objective': 'binary',
    'is_unbalance': True,
    'n_estimators': 100,
}

# Directory handed to the optimiser as its `save_dir` argument.
save_dir = '../runs/test_run'

In [10]:
# Preview the first five rows of the dataset.
df.head(n=5)

Unnamed: 0,age,gender,race,marital_status,surgery_complexity,specialties,scheduled_date,categorical_1,categorical_2,categorical_3,categorical_4,categorical_5,categorical_6,categorical_7,categorical_8,categorical_9,categorical_10,categorical_11,categorical_12,is_cancelled
0,24,f,c,single,2,cardiac,2021-09-18 22:30:40,A,E,A,A,D,E,C,D,D,E,E,B,0
1,51,f,i,married,4,urology,2022-12-24 06:51:37,E,D,D,E,A,C,E,A,E,A,B,A,0
2,55,m,m,married,4,gynecology,2023-05-03 07:00:04,C,C,C,C,B,B,D,B,D,D,C,B,0
3,66,m,c,married,7,urology,2021-11-02 01:39:28,B,D,C,B,E,A,D,D,E,E,D,C,0
4,75,f,c,single,3,urology,2022-11-09 05:37:59,E,B,C,B,A,A,B,A,D,D,D,B,0


In [11]:
# Run the LightGBM hyperparameter search on the dataset configured above.
# The helper returns a pair that is unpacked into `best_params` and `best_value`.
# To try the alternative mode, pass hyperparams_stepwise_groups=None instead
# (presumably disables the group-by-group search - confirm in util).
best_params, best_value = get_lgbm_optimized_params(
    df, continuous_columns, ordinal_columns, date_columns, target_column, save_dir,
    hyperparams_stepwise_groups=hyperparams_stepwise_groups,
    fixed_params=fixed_params,
    n_trials=10,
    random_state=0,
)

[I 2024-04-16 07:58:15,547] A new study created in memory with name: no-name-b664cdc6-5000-4229-a162-2716dece1a36


optimizing group: ['learning_rate']


[I 2024-04-16 07:58:18,672] Trial 0 finished with value: 0.6607851980792318 and parameters: {'learning_rate': 0.1}. Best is trial 0 with value: 0.6607851980792318.
[I 2024-04-16 07:58:21,811] Trial 1 finished with value: 0.667007931172469 and parameters: {'learning_rate': 0.044303752452182654}. Best is trial 1 with value: 0.667007931172469.
[I 2024-04-16 07:58:24,663] Trial 2 finished with value: 0.6569289035614246 and parameters: {'learning_rate': 0.13981961408994045}. Best is trial 1 with value: 0.667007931172469.
[I 2024-04-16 07:58:27,683] Trial 3 finished with value: 0.6656848259303721 and parameters: {'learning_rate': 0.06431172050131989}. Best is trial 1 with value: 0.667007931172469.
[I 2024-04-16 07:58:30,899] Trial 4 finished with value: 0.6666765426170468 and parameters: {'learning_rate': 0.0431171005868549}. Best is trial 1 with value: 0.667007931172469.
[I 2024-04-16 07:58:34,443] Trial 5 finished with value: 0.6688835654261704 and parameters: {'learning_rate': 0.018662266

KeyboardInterrupt: 

In [None]:
# save model
X, y = preprocess(df, continuous_columns, ordinal_columns, date_columns, target_column)
model = lgbm.LGBMClassifier(**best_params)
joblib.dump(model, '../runs/test_run/best_model.pkl')

['../runs/test_run/best_model.pkl']

In [14]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the full frame, seeded for reproducibility.
# Only `df` is split: the label series is needed for stratification but not as
# a second output, which also avoids overwriting IPython's `_` variable.
# The configured `target_column` replaces the hard-coded column name so the
# cell follows the dataset selected in the configuration cell.
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
    stratify=df[target_column],
)

In [16]:
# Write both splits to CSV, omitting the index column.
for split_frame, csv_path in (
    (df_train, '../data/dummy_train.csv'),
    (df_test, '../data/dummy_test.csv'),
):
    split_frame.to_csv(csv_path, index=False)