In [0]:
#Import libraries
import pandas as pd
import numpy as np
import mlflow
import optuna
import joblib

from sktime.classification.kernel_based import RocketClassifier
from sktime.transformations.panel.rocket import Rocket
from sktime.pipeline import make_pipeline
from sktime.datatypes import mtype, check_is_mtype, get_examples, convert

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, auc, roc_curve, roc_auc_score, precision_recall_curve, jaccard_score, f1_score
from sklearn.linear_model import LogisticRegression, SGDClassifier

import matplotlib.pyplot as plt

Collecting sktime
  Using cached sktime-0.32.1-py3-none-any.whl (28.9 MB)
Collecting scikit-base<0.9.0,>=0.6.1
  Using cached scikit_base-0.8.2-py3-none-any.whl (134 kB)
Installing collected packages: scikit-base, sktime
Successfully installed scikit-base-0.8.2 sktime-0.32.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting optuna
  Using cached optuna-3.6.1-py3-none-any.whl (380 kB)
Collecting colorlog
  Using cached colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting alembic>=1.5.0
  Using cached alembic-1.13.2-py3-none

## Preprocess

In [0]:
#Read in data from SQL table
df_temp = sql('SELECT * FROM df').toPandas()



In [0]:
#Obtain count of each variable, ensure consistent
print((df_temp.count(),len(df_temp.columns))) 

(Time                        1699908
fallduringeoc               1699908
age                         1699908
Gender                      1699908
hxfall                      1699908
                             ...   
PROTEIN                     1699908
RBC                         1699908
SODIUM_lab                  1699908
UREA_NITROGEN/CREATININE    1699908
WBC                         1699908
Length: 325, dtype: int64, 325)


In [0]:
#Remove initial index, and sort variables for multi-idex set
df_temp.drop(['Date'],axis=1,inplace=True)

df_temp = df_temp.sort_values(['Pt_ID','Time'])

In [0]:
#Copy to manipulate
df_final = df_temp.copy()

In [0]:
#Remove duplicates and drop Fall_Status 
df_classes = df_final.drop_duplicates(subset=['Pt_ID','Fall_Status'])

df_final = df_final.drop('Fall_Status',axis=1)

In [0]:
#Update for multiindex & set to numeric
df_final.set_index(['Pt_ID','Time'],inplace=True)

df_final = df_final.apply(pd.to_numeric)

In [0]:
#Verify data structure before modeling
check_is_mtype(df_final,mtype='pd-multiindex',scitype="Panel",msg_return_dict='dict')

True

In [0]:
#Set Index
df_classes.set_index('Pt_ID',inplace=True)

In [0]:
#Establish train/val/test ratios
train_ratio = 0.7
val_ratio = 0.2
test_ratio = 0.1

train_ix, test_ix = train_test_split(df_final.index.levels[0],random_state = 123, test_size = 1 - train_ratio)
val_ix, test_ix = train_test_split(test_ix, random_state= 123, test_size = test_ratio/(test_ratio + val_ratio))

In [0]:
X_train = df_final.loc[train_ix]
X_val = df_final.loc[val_ix]
X_test = df_final.loc[test_ix]
  
y_train = np.array(df_classes.loc[train_ix]['Fall_Status'])
y_val = np.array(df_classes.loc[val_ix]['Fall_Status'])
y_test = np.array(df_classes.loc[test_ix]['Fall_Status'])

In [0]:
#Obtain counts 
f_cnts = df_classes.groupby('Fall_Status').size().reset_index(name='Count')

total_cnt = len(df_classes)
f_cnts['Prop'] = f_cnts['Count']/total_cnt
f_cnts

Unnamed: 0,fallduringeoc,Count,Prop
0,0,236879,0.975437
1,1,5965,0.024563


In [0]:
set(y_train) - set(y_test) #Verify no differences in structure 

set()

## ROCKET

### Base ROCKET

In [0]:
#Base Rocket model
rocket = RocketClassifier(num_kernels=10000,
                          use_multivariate = 'auto',
                          n_jobs = 1,
                          random_state = 123,
                          #rocket_transform = 'multirocket'
                          )

#pipe_sktime = square * rocket
rkt_sktime = rocket
rkt_sktime.fit(X_train,y_train)

In [0]:
from sktime.utils import mlflow_sktime
file_path = 'path_to_file'

#Save via sklearn
#mlflow_sktime.save_model(sktime_model = pipe_sktime,path = file_path)
#mlflow.sklearn.save_model(rkt_sktime,file_path)

### SGD Pipeline

In [0]:
#Rocket Using SGD as Classifier
rocket_SGD_pipeline = make_pipeline(
  Rocket(random_state = 123),
  SGDClassifier(loss='log_loss',random_state = 123) #Base model
  #SGDClassifier(loss='log_loss', random_state = 123, class_weight = 'balanced', learning_rate = 'constant', eta0 = 0.1)
)

In [0]:
rocket_SGD_pipeline.fit(X_train,y_train)

In [0]:
from sktime.utils import mlflow_sktime
file_path = 'path_to_file' #Rocket with SGD

#Save via sklearn
mlflow.sklearn.save_model(rocket_SGD_pipeline,file_path)

### LGBM Pipeline

In [0]:
#ROCKET with LGBM

import lightgbm as lgbm

#trans = Rocket(random_state=123,num_kernels=20000)
trans = Rocket(random_state=123)
trans.fit(X_train)

In [0]:
X_train_tran = trans.transform(X_train)
X_val_tran = trans.transform(X_val)
X_test_tran = trans.transform(X_test)

In [0]:
X_test_tran.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,19960,19961,19962,19963,19964,19965,19966,19967,19968,19969,19970,19971,19972,19973,19974,19975,19976,19977,19978,19979,19980,19981,19982,19983,19984,19985,19986,19987,19988,19989,19990,19991,19992,19993,19994,19995,19996,19997,19998,19999
0,0.0,-0.190489,1.0,0.192439,0.0,-0.896651,0.0,-0.471308,1.0,0.521024,0.0,-0.004229,0.0,-0.352162,0.0,-0.808605,1.0,0.752859,0.0,-0.853394,0.0,-0.345842,1.0,0.87964,0.0,-0.097563,0.0,-0.376401,0.0,-0.430864,0.0,-0.222929,1.0,0.973633,1.0,0.937253,0.714286,4.343083,0.0,-0.874629,...,0.0,-0.907349,1.0,0.871435,0.0,-4.134502,0.0,-0.796465,0.0,-0.984255,0.0,-0.82657,0.0,-0.775824,0.0,-0.604808,0.0,-0.426462,1.0,0.42453,0.0,-0.863306,1.0,0.732052,0.0,-0.82839,0.0,-0.016611,0.0,-0.362066,0.0,-0.990614,0.0,-0.083339,1.0,0.528461,0.0,-0.309796,1.0,0.26904
1,0.0,-0.190489,1.0,0.192439,0.0,-0.896651,0.0,-0.471308,1.0,0.521024,0.0,-0.00423,0.0,-0.352162,0.0,-0.808605,1.0,0.752859,0.0,-0.853395,0.0,-0.345842,1.0,0.87964,0.0,-0.097563,0.0,-0.376401,0.0,-0.430864,0.0,-0.222929,1.0,0.973633,1.0,0.937253,1.0,0.371102,0.0,-0.874629,...,0.0,-0.907349,1.0,0.871435,0.0,-0.690346,0.0,-6.761166,0.0,-0.984255,0.0,-0.82657,0.0,-0.775824,0.0,-0.604808,0.285714,4.145315,1.0,0.42453,0.0,-0.863306,1.0,0.732052,0.0,-0.82839,0.0,-0.016611,0.0,-0.362066,0.0,-0.990614,0.0,-0.083339,1.0,0.528461,0.0,-0.309796,1.0,0.26904
2,0.0,-0.190489,1.0,0.192439,0.0,-0.896651,0.0,-0.471308,1.0,0.521024,0.0,-0.00423,0.0,-0.352162,0.0,-0.808605,1.0,0.752859,0.0,-0.853395,0.0,-0.345842,1.0,0.87964,0.0,-0.097563,0.0,-0.376401,0.0,-0.430864,0.0,-0.222929,1.0,0.973633,1.0,0.937253,1.0,0.371102,0.0,-0.874629,...,0.0,-0.907349,1.0,0.871435,0.0,-0.690346,0.0,-0.796465,0.0,-0.984255,0.0,-0.82657,0.0,-0.775824,0.0,-0.604808,0.0,-0.426462,1.0,0.42453,0.0,-0.863306,1.0,0.732052,0.0,-0.82839,0.0,-0.016611,0.0,-2.999028,0.0,-0.990613,0.0,-0.083339,1.0,0.528461,0.0,-0.309796,1.0,0.26904
3,0.0,-0.190489,1.0,0.192439,0.0,-0.896651,0.0,-0.471308,1.0,0.521024,0.0,-0.00423,0.0,-0.352162,0.0,-0.808605,1.0,0.752859,0.0,-0.853395,0.0,-0.345842,1.0,0.87964,0.0,-0.097563,0.0,-0.376401,0.0,-0.430864,0.0,-0.222929,1.0,0.973633,1.0,0.937253,0.428571,1.186471,0.0,-0.874629,...,0.0,-0.907349,1.0,0.871435,0.0,-5.507941,0.0,-0.796465,0.0,-0.984254,0.0,-0.82657,0.0,-0.775824,0.0,-0.604808,0.0,-0.426462,1.0,0.42453,0.0,-0.863306,1.0,0.732052,0.0,-0.82839,0.0,-0.016611,0.0,-0.362066,0.0,-0.990613,0.0,-0.083339,1.0,0.528461,0.0,-0.309796,1.0,0.26904
4,0.0,-0.190489,1.0,0.192439,0.0,-0.896651,0.0,-0.471308,1.0,0.521024,0.0,-0.00423,0.0,-0.352162,0.0,-0.808605,1.0,0.752859,0.0,-0.853395,0.0,-0.345842,1.0,0.87964,0.0,-0.097563,0.0,-0.376401,0.0,-0.430864,0.0,-0.222929,1.0,0.973633,1.0,0.937253,1.0,0.371102,0.0,-0.874629,...,0.0,-0.907349,1.0,0.871435,0.0,-0.690346,0.0,-1.788411,0.0,-0.984254,0.0,-0.82657,0.0,-0.775824,0.0,-0.604808,0.285714,0.538494,1.0,0.42453,0.0,-0.863306,1.0,0.732053,0.0,-0.82839,0.0,-0.016611,0.0,-0.362066,0.0,-0.990614,0.0,-0.083339,1.0,0.528461,0.0,-0.309796,1.0,0.26904


In [0]:
lgbm_mod = lgbm.LGBMClassifier(n_estimators = 10000, #number of boosing iterations
                               learning_rate = 0.025,
                               num_leaves = 1000, 
                               subsample = 0.5, #bagging_fraction
                               colsample_bytree = 0.8, #feature fraction
                               min_child_samples = 500, #min_data_in_leaf
                               #max_depth = 100, 
                               random_state=123,
                               #is_unbalance = True,
                               scale_pos_weight = 40) #neg_n/pos_n

lgbm_mod.fit(X_train_tran,y_train,eval_metric="PRAUC")



<Figure size 640x200480 with 1 Axes>

<Figure size 640x200480 with 1 Axes>

#### Parameter Tuning

In [0]:
#https://github.com/Microsoft/LightGBM/issues/695#issuecomment-315591634
##Based on basic model listed above, parameters search near default metrics 

import optuna
from optuna.samplers import RandomSampler

def objective(trial):
    params = {
        "objective": "binary",
        "metric": "f1",
        "verbosity": -1,

        "bagging_freq": 1, #Resampling data before each tree
        #"bagging_freq": trial.suggest_float("bagging_freq", 0, 1, step = 0.1),
        #"bagging_fraction": trial.suggest_float("bagging_fraction", 0, 1, step = 0.1),

        #"n_estimators": trial.suggest_int("n_estimators", 100, 5000, step = 100),
        "n_estimators": 10000,
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 7, 4095, step = 2**10), #tree complexity (terminal nodes)
        "subsample": trial.suggest_float("subsample", 0.1, 1.0, step = 0.01), #Proportion of data to be randomly selecteed
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0, step = 0.1), #Proportion of features for each tree
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 100, 1000), #Min data points each leaf node 
        "scale_pos_weight": trial.suggest_int("scale_pos_weight",1,100),
        #"n_iter_no_change": 50,  # early stopping
    }

    model = lgbm.LGBMClassifier(**params)
    model.fit(X_train_tran, y_train, verbose=False)
    
    y_pred_prob = model.predict_proba(X_test_tran)
    precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob[:,1])
    prauc = auc(recall,precision)
    return prauc
    
    #f1 = f1_score(y_test,y_pred_prob[:,1])
    #f_beta = sklearn.metrics.fbets_score(y_test,y_pred_prob, beta = 0.5) #set to [0.5,1,2]
    #accur = accuracy_score(y_train, predictions)
    #return f1

In [0]:
sampler = RandomSampler(seed=123)
study = optuna.create_study(study_name ="light_tune", direction = "maximize", sampler = sampler)
study.optimize(objective, n_trials = 5)

[I 2024-02-22 15:45:04,842] A new study created in memory with name: light_tune

The distribution is specified by [7, 4095] and step=1024, but the range is not divisible by `step`. It will be replaced by [7, 3079].


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.





[I 2024-02-22 15:55:58,756] Trial 0 finished with value: 0.14735216511639945 and parameters: {'learning_rate': 0.02471373418487883, 'num_leaves': 1031, 'subsample': 0.30000000000000004, 'colsample_bytree': 0.8, 'min_data_in_leaf': 748, 'scale_pos_weight': 43}. Best is trial 0 with value: 0.14735216511639945.

The distribution is specified by [7, 4095] and step=1024, but the range is not divisible by `step`. It will be replaced by [7, 3079].


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.





[I 2024-02-22 16:05:42,445] Trial 1 finished with value: 0.1527459757002671 and parameters: {'learning_rate': 0.09152261002780922, 'num_leaves': 2055, 'subsample': 0.53, 'colsample_bytree': 0.7, 'min_data_in_leaf': 409, 'scale_pos_weight': 73}. Best is trial 1 with value: 0.1527459757002671.

The distribution is specified by [7, 4095] and step=1024, but the range is not divisible by `step`. It will be replaced by [7, 3079].


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.





[I 2024-02-22 16:15:58,488] Trial 2 finished with value: 0.1530872589280664 and parameters: {'learning_rate': 0.007536062461086747, 'num_leaves': 7, 'subsample': 0.45999999999999996, 'colsample_bytree': 0.9, 'min_data_in_leaf': 264, 'scale_pos_weight': 18}. Best is trial 2 with value: 0.1530872589280664.

The distribution is specified by [7, 4095] and step=1024, but the range is not divisible by `step`. It will be replaced by [7, 3079].


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.





[I 2024-02-22 16:34:15,940] Trial 3 finished with value: 0.17581701629777358 and parameters: {'learning_rate': 0.011563857936940024, 'num_leaves': 2055, 'subsample': 0.67, 'colsample_bytree': 1.0, 'min_data_in_leaf': 752, 'scale_pos_weight': 62}. Best is trial 3 with value: 0.17581701629777358.

The distribution is specified by [7, 4095] and step=1024, but the range is not divisible by `step`. It will be replaced by [7, 3079].


'verbose' argument is deprecated and will be removed in a future release of LightGBM. Pass 'log_evaluation()' callback via 'callbacks' argument instead.





[I 2024-02-22 16:50:33,676] Trial 4 finished with value: 0.15564145630967766 and parameters: {'learning_rate': 0.02785394829873094, 'num_leaves': 1031, 'subsample': 0.42000000000000004, 'colsample_bytree': 0.6, 'min_data_in_leaf': 364, 'scale_pos_weight': 64}. Best is trial 3 with value: 0.17581701629777358.


<Figure size 640x200480 with 1 Axes>

<Figure size 640x200480 with 1 Axes>

<Figure size 640x200480 with 1 Axes>

<Figure size 640x200480 with 1 Axes>

<Figure size 640x200480 with 1 Axes>

<Figure size 640x200480 with 1 Axes>

<Figure size 640x200480 with 1 Axes>

<Figure size 640x200480 with 1 Axes>

<Figure size 640x200480 with 1 Axes>

<Figure size 640x200480 with 1 Axes>

In [0]:
import joblib
joblib.dump(study,'path_to_file')

['/dbfs/FileStore/PH_Files/Rocket_LGBM_OPTUNA_5k_022224']

In [0]:
study = joblib.load('path_to_file')

In [0]:
study.best_trial.params

{'learning_rate': 0.011563857936940024,
 'num_leaves': 2055,
 'subsample': 0.67,
 'colsample_bytree': 1.0,
 'min_data_in_leaf': 752,
 'scale_pos_weight': 62}

In [0]:
best_parameters = study.best_trial.params

In [0]:
from optuna.visualization import plot_param_importances

fig = plot_param_importances(study)
fig.show()

In [0]:
best_model = lgbm.LGBMClassifier(**best_parameters)

best_model.fit(X_train_tran,y_train,eval_metric="PRAUC")



<Figure size 640x200480 with 1 Axes>

<Figure size 640x200480 with 1 Axes>

In [0]:
lgbm_mod = lgbm.LGBMClassifier(n_estimators = 2000, #number of boosing iterations
                               num_leaves = 150, 
                               max_depth = 100,
                               min_child_samples = 500, 
                               #reg_lambda = 0.1, 
                               colsample_bytree= 0.75, 
                               random_state=123,
                               is_unbalance = True)

lgbm_mod.fit(X_train_tran,y_train,eval_metric="PRAUC")

[0;31m---------------------------------------------------------------------------[0m
[0;31mThe Python process exited with exit code 137 (SIGKILL: Killed). This may have been caused by an OOM error. Check your command's memory usage.[0m
[0;31m[0m
[0;31m[0m
[0;31m[0m
[0;31mThe last 10 KB of the process's stderr and stdout can be found below. See driver logs for full logs.[0m
[0;31m---------------------------------------------------------------------------[0m
[0;31mLast messages on stderr:[0m
[0;31mThu Jan 25 21:02:49 2024 Connection to spark from PID  3122[0m
[0;31mThu Jan 25 21:02:49 2024 Initialized gateway on port 42663[0m
[0;31mThu Jan 25 21:02:49 2024 Connected to spark.[0m
[0;31m---------------------------------------------------------------------------[0m
[0;31mLast messages on stdout:[0m
[0;31mNOTE: When using the `ipython kernel` entry point, Ctrl-C will not work.[0m
[0;31m[0m
[0;31mTo exit, you will have to explicitly quit this process, by either 

In [0]:
from sktime.utils import mlflow_sktime

file_path = 'path_to_file' 

In [0]:
#Save via sklearn
mlflow.sklearn.save_model(lgbm_mod,file_path)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
File [0;32m<command-1702104643796391>, line 2[0m
[1;32m      1[0m [38;5;66;03m#Save via sklearn[39;00m
[0;32m----> 2[0m mlflow[38;5;241m.[39msklearn[38;5;241m.[39msave_model([43mlgbm_mod[49m,file_path)

[0;31mNameError[0m: name 'lgbm_mod' is not defined