In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna
import altair as alt
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score, f1_score, classification_report
from optuna.logging import get_logger
import matplotlib.pyplot as plt
import seaborn as sns

random_state = 6
np.random.seed(random_state)

In [3]:
import mlflow
import mlflow.lightgbm

In [4]:
# Start mlflow with unique experiment name 
time_now = pd.Timestamp.now().strftime("%Y-%m-%d_%H-%M-%S")
experiment_name = f'earthquake_supervised_fine_tune_{time_now}'
mlflow.set_experiment(experiment_name)

2023/05/14 17:40:40 INFO mlflow.tracking.fluent: Experiment with name 'earthquake_supervised_fine_tune_2023-05-14_17-40-40' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/sabber/Documents/Research/Machine_learning/Earthquake_rupture_data_generation/mlruns/566379286685799737', creation_time=1684104040389, experiment_id='566379286685799737', last_update_time=1684104040389, lifecycle_stage='active', name='earthquake_supervised_fine_tune_2023-05-14_17-40-40', tags={}>

In [5]:
## look data with pandas
train_file = "data/rupturemodel_train.txt"
val_file = "data/rupturemodel_validate.txt"
test_file = "data/rupturemodel_test.txt"

df_train= pd.read_csv(train_file, sep=" ", header = None)
df_val= pd.read_csv(val_file, sep=" ", header = None)
df_test= pd.read_csv(test_file, sep=" ", header = None)

columns =  ['height', 'width', 'sxx', 'sxy', 'syy', 'sdrop', 'mud', 'dc', 'label']
df_train.columns = columns
df_val.columns = columns
df_test.columns = columns

frames = [df_train, df_val]
df_train = pd.concat(frames)
print(f'Train shape: {df_train.shape}\nVal shape: {df_val.shape}\nTest shape: {df_test.shape}')

Train shape: (1600, 9)
Val shape: (600, 9)
Test shape: (400, 9)


In [6]:
# data balance
print('train data label 1: {} and label 0: {}'.format(np.sum(df_train['label']==1), np.sum(df_train['label']==0)))

train data label 1: 559 and label 0: 1041


In [7]:
df_train.shape

(1600, 9)

### Balance data with SMOTE

In [8]:
### Data balance with SMOTE
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=random_state)
X_train, y_train = sm.fit_resample(df_train.drop('label', axis=1), df_train['label'])
print('train data shape {} and test data shape {}'.format(np.shape(X_train), np.shape(y_train)))

train data shape (2082, 8) and test data shape (2082,)


In [9]:
df_train = pd.concat([X_train, y_train], axis=1)

### Create new features

In [10]:
def create_new_features(df: pd.DataFrame) -> pd.DataFrame:
    df_new = df.copy()
    # Create new features
    df_new['height_width_ratio'] = df_new['height'] / df_new['width']
    df_new['normal_stress_diff'] = df_new['sxx'] - df_new['syy']
    df_new['friction_product'] = df_new['mud'] * (df_new['sdrop'])
    df_new['stress_ratio'] = df_new['sxy'] / df_new['syy']
    df_new['static_dynamic_friction_diff'] = (
        df_new['mud'] + df_new['sdrop']) - df_new['mud']
    df_new['stress_diff_dynamic_strength'] = df_new['sxy'] - \
        (df_new['syy'] * df_new['mud'])
    df_new['normalized_dc'] = df_new['dc'] / df_new['width']
    return df_new

In [11]:
print('train data shape {} and test data shape {}'.format(np.shape(df_train), np.shape(df_test)))

train data shape (2082, 9) and test data shape (400, 9)


In [12]:
X_train = df_train.drop('label', axis=1)
X_train = create_new_features(X_train)
y_train = df_train['label'].values

# Validation data
X_val = df_val.drop('label', axis=1)
X_val = create_new_features(X_val)
y_val = df_val['label'].values

# Test data
X_test = df_test.drop('label', axis=1)
X_test = create_new_features(X_test)
y_test = df_test['label'].values

In [13]:
print(f'Train data shape: {X_train.shape}\nValidation data shape: {X_val.shape}\nTest data shape: {X_test.shape}')
mlflow.log_param('train_data_shape', X_train.shape)
mlflow.log_param('val_data_shape', X_val.shape)
mlflow.log_param('test_data_shape', X_test.shape)

Train data shape: (2082, 15)
Validation data shape: (600, 15)
Test data shape: (400, 15)


(400, 15)

In [14]:
def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'seed': random_state,
    }

    lgb_train = lgb.Dataset(X_train, y_train)
    model = lgb.train(params, lgb_train, verbose_eval=False)
    
    y_val_pred = model.predict(X_val)
    y_val_pred = np.round(y_val_pred).astype(int)
    
    score = f1_score(y_val, y_val_pred, average='macro')
    
    return score


In [15]:
# Define a silent callback
def silent_callback(study, trial):
    if study.trials[-1].state == optuna.trial.TrialState.COMPLETE:
        pass

logger = get_logger("optuna")
logger.setLevel("INFO")

study = optuna.create_study(direction='maximize')

# Run optimization
study.optimize(
    objective,
    n_trials=30,
    show_progress_bar=True,
    callbacks=[silent_callback]
)

[32m[I 2023-05-14 17:40:40,791][0m A new study created in memory with name: no-name-75d681f6-77a0-4897-86df-ce2816621392[0m


  0%|          | 0/30 [00:00<?, ?it/s]

[32m[I 2023-05-14 17:40:42,123][0m Trial 0 finished with value: 0.2528019925280199 and parameters: {'scale_pos_weight': 8.066737385996792, 'learning_rate': 4.078294564438749e-06, 'n_estimators': 492, 'boosting_type': 'dart', 'lambda_l1': 0.11838907110272899, 'lambda_l2': 0.018153503530517222, 'num_leaves': 155, 'feature_fraction': 0.9356022514605962, 'bagging_fraction': 0.6283592065507116, 'bagging_freq': 4, 'min_child_samples': 97}. Best is trial 0 with value: 0.2528019925280199.[0m
[32m[I 2023-05-14 17:40:44,286][0m Trial 1 finished with value: 1.0 and parameters: {'scale_pos_weight': 8.928877933770089, 'learning_rate': 0.028693784173809512, 'n_estimators': 712, 'boosting_type': 'gbdt', 'lambda_l1': 0.03441707548940204, 'lambda_l2': 2.776297183383623e-07, 'num_leaves': 223, 'feature_fraction': 0.9463813776896577, 'bagging_fraction': 0.7217499799748738, 'bagging_freq': 4, 'min_child_samples': 34}. Best is trial 1 with value: 1.0.[0m
[32m[I 2023-05-14 17:40:45,171][0m Trial 2 f

In [16]:
trial = study.best_trial
print(f'Best score: {trial.value}')
print(f'Best Params: {trial.params}')
mlflow.log_param('best_score', trial.value)
mlflow.log_param('best_params', trial.params)

Best score: 1.0
Best Params: {'scale_pos_weight': 8.928877933770089, 'learning_rate': 0.028693784173809512, 'n_estimators': 712, 'boosting_type': 'gbdt', 'lambda_l1': 0.03441707548940204, 'lambda_l2': 2.776297183383623e-07, 'num_leaves': 223, 'feature_fraction': 0.9463813776896577, 'bagging_fraction': 0.7217499799748738, 'bagging_freq': 4, 'min_child_samples': 34}


{'scale_pos_weight': 8.928877933770089,
 'learning_rate': 0.028693784173809512,
 'n_estimators': 712,
 'boosting_type': 'gbdt',
 'lambda_l1': 0.03441707548940204,
 'lambda_l2': 2.776297183383623e-07,
 'num_leaves': 223,
 'feature_fraction': 0.9463813776896577,
 'bagging_fraction': 0.7217499799748738,
 'bagging_freq': 4,
 'min_child_samples': 34}

In [17]:
# Train model with the best parameters
best_params = trial.params
lgb_train = lgb.Dataset(X_train, y_train)

In [18]:
best_supervised_model = lgb.train(best_params, lgb_train)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 2082, number of used features: 15
[LightGBM] [Info] Start training from score 0.500000


In [34]:
### Load model
best_supervised_model = lgb.Booster(
    model_file='models/supervised_model_roc_0.8991_f1_0.8266.txt')


In [39]:
from sklearn.metrics import roc_auc_score

y_pred = best_supervised_model.predict(X_test)
test_roc_auc_score = roc_auc_score(y_test, y_pred)
print(f'Test Roc-AUC score: {test_roc_auc_score:.4f}')
mlflow.log_metric('test_roc_auc_score', test_roc_auc_score)

y_pred = np.round(y_pred).astype(int)
test_f1_macro_score = f1_score(y_test, y_pred, average='macro')
print(f'Test F1 score macro: {test_f1_macro_score:.4f}')
mlflow.log_metric('test_f1_macro_score', test_f1_macro_score)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(f'Confusion matrix:\n{cm}')

false_positive = cm[0][1]
false_negative = cm[1][0]
true_positive = cm[1][1]
true_negative = cm[0][0]

print(f'True Positive: {true_positive}')
print(f'True Negative: {true_negative}')
print(f'False Positive: {false_positive}')
print(f'False Negative: {false_negative}')

mlflow.log_metric('TruePositive', true_positive)
mlflow.log_metric('TrueNegative', true_negative)
mlflow.log_metric('FalsePositive', false_positive)
mlflow.log_metric('FalseNegative', false_negative)

Test Roc-AUC score: 0.8991
Test F1 score macro: 0.8266
Confusion matrix:
[[239  33]
 [ 28 100]]
True Positive: 100
True Negative: 239
False Positive: 33
False Negative: 28


In [40]:
### Classification report
y_pred = np.round(y_pred).astype(int)
print(classification_report(y_test, y_pred))
mlflow.log_param('classification_report', classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.90      0.88      0.89       272
         1.0       0.75      0.78      0.77       128

    accuracy                           0.85       400
   macro avg       0.82      0.83      0.83       400
weighted avg       0.85      0.85      0.85       400



'              precision    recall  f1-score   support\n\n         0.0       0.90      0.88      0.89       272\n         1.0       0.75      0.78      0.77       128\n\n    accuracy                           0.85       400\n   macro avg       0.82      0.83      0.83       400\nweighted avg       0.85      0.85      0.85       400\n'

In [21]:
### Save the model
roc_auc_score = round(test_roc_auc_score, 4)
f1_score_macro = round(test_f1_macro_score, 4)
model_name = f'./models/supervised_model_roc_{roc_auc_score}_f1_{f1_score_macro}.txt'
best_supervised_model.save_model(model_name)

# Save the model to MLflow
mlflow.sklearn.log_model(best_supervised_model, "model")

<mlflow.models.model.ModelInfo at 0x158a1e680>

In [41]:
### Features importance
features_importance = pd.DataFrame({'features': X_train.columns, 'importance': best_supervised_model.feature_importance( importance_type='split')})
# normalize the importances
features_importance['importance_normalized'] = features_importance['importance'] / features_importance['importance'].sum()
features_importance = features_importance.sort_values(by='importance', ascending=False)
features_importance = features_importance.reset_index(drop=True)
features_importance

Unnamed: 0,features,importance,importance_normalized
0,sdrop,5212,0.086835
1,mud,4912,0.081837
2,dc,4780,0.079637
3,stress_ratio,4731,0.078821
4,normal_stress_diff,4638,0.077272
5,normalized_dc,4158,0.069275
6,height,4147,0.069091
7,sxx,4123,0.068691
8,friction_product,3978,0.066276
9,width,3952,0.065843


In [22]:
mlflow.end_run()

### Visualization of hyperparameter importance using Optuna

In [23]:
import optuna.visualization as vis

In [24]:
# Plot optimization history
plot_optimization_history = vis.plot_optimization_history(study)
plot_optimization_history.show()

In [25]:
# Plot parameter importances
plot_param_importances = vis.plot_param_importances(study)
plot_param_importances.show()

In [26]:
# Plot parallel coordinate plot
plot_parallel_coordinate = vis.plot_parallel_coordinate(study)
plot_parallel_coordinate.show()

In [27]:
# Plot slice plot
plot_slice = vis.plot_slice(study)
plot_slice.show()

In [28]:
# Plot contour plot
# Best Params: {'scale_pos_weight': 8.849103938047977, 'learning_rate': 0.11651683871291287, 'n_estimators': 710, 'boosting_type': 'dart', 'lambda_l1': 1.0263206593735317e-05, 'lambda_l2': 3.729990954390867e-08, 'num_leaves': 48, 'feature_fraction': 0.49816303001038625, 'bagging_fraction': 0.7791692888330242, 'bagging_freq': 4, 'min_child_samples': 30}

# plot_contour = vis.plot_contour(study, params=['num_leaves', 'n_estimators'])
# plot_contour.show()
