In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import optuna
import joblib
import numpy as np
import pandas as pd
import lightgbm as lgb

import altair as alt
import seaborn as sns
import matplotlib.pyplot as plt

import optuna.visualization as vis
#  
random_state = 6
np.random.seed(random_state)

## Load data

In [3]:
df_generated = pd.read_csv('./data/train_generated_data.csv')

In [9]:
df_generated.head()

Unnamed: 0,height,width,sxx,sxy,syy,sdrop,mud,dc,label,height_width_ratio,normal_stress_diff,friction_product,stress_ratio,static_dynamic_friction_diff,stress_diff_dynamic_strength,normalized_dc,data_type
0,0.103861,1.145663,-102.509086,58.619371,-117.766562,0.483821,0.216681,0.295842,0.0,,,,,,,,train
1,0.088714,1.30436,-136.06227,51.391037,-126.715571,0.345944,0.447964,0.406466,1.0,,,,,,,,train
2,0.099706,1.260377,-117.558936,40.972081,-115.529343,0.292719,0.501697,0.38936,1.0,,,,,,,,train
3,0.115749,1.191782,-128.169036,94.020712,-157.830504,0.57171,0.202831,0.408976,0.0,,,,,,,,train
4,0.0179,1.10815,-106.35032,29.148969,-101.379323,0.253122,0.324653,0.398592,1.0,,,,,,,,train


### Understand the parameters of the generator

In [13]:
### Scatter plot color by data_type
alt.Chart(df_generated).mark_circle(size=10).encode(
    x='height', y='width', color='data_type'
).interactive()
 

In [5]:
# Plot distribution plot using altaire
alt.Chart(df_generated.sample(4000)).mark_bar().encode(
    alt.X('dc', bin=alt.Bin(maxbins=100)),
    y='count()',
    color='data_type'
).properties(
    width=600,
    height=400
).interactive()

### Understanding the parameters using optimization

In [4]:
train_columns = ['height', 'width', 'sxx', 'sxy',
                 'syy', 'sdrop', 'mud', 'dc']


In [6]:
def create_new_features(df: pd.DataFrame) -> pd.DataFrame:
    df_new = df.copy()
    # Create new features
    df_new['height_width_ratio'] = df_new['height'] / df_new['width']
    df_new['normal_stress_diff'] = df_new['sxx'] - df_new['syy']
    df_new['friction_product'] = df_new['mud'] * (df_new['sdrop'])
    df_new['stress_ratio'] = df_new['sxy'] / df_new['syy']
    df_new['static_dynamic_friction_diff'] = (
        df_new['mud'] + df_new['sdrop']) - df_new['mud']
    df_new['stress_diff_dynamic_strength'] = df_new['sxy'] - \
        (df_new['syy'] * df_new['mud'])
    df_new['normalized_dc'] = df_new['dc'] / df_new['width']
    return df_new


In [7]:
# Pre-trained model
supervised_model = lgb.Booster(model_file='./models/best_supervised_model.txt')

# Generateds parameters
gp = df_generated.describe().to_dict()

In [11]:
def objective_function(trial):

    # Define parameter search space
    width = trial.suggest_float("width", gp['width']['min'], gp['width']['max'])
    height = trial.suggest_float("height", gp['height']['min'], gp['height']['max'])
    syy = trial.suggest_float("syy", gp['syy']['min'], gp['syy']['max'])
    sxx = trial.suggest_float("sxx", gp['sxx']['min'], gp['sxx']['max'])
    mud = trial.suggest_float("mud",  gp['mud']['min'], gp['mud']['max'])
    sdrop = trial.suggest_float("sdrop",  gp['sdrop']['min'], gp['sdrop']['max'])
    sxy = trial.suggest_float("sxy",  gp['sxy']['min'], gp['sxy']['max'])
    dc = trial.suggest_float("dc", gp['dc']['min'], gp['dc']['max'])

    # List of all parameters
    params = [height, width, sxx, sxy, syy, sdrop, mud, dc]
    train_columns = ['height', 'width', 'sxx', 'sxy', 'syy', 'sdrop', 'mud', 'dc']
    
    # Create a dataframe usinf the parameters
    df = pd.DataFrame([params], columns=train_columns)

    df = create_new_features(df)
    score = supervised_model.predict(
        df, num_iteration=supervised_model.best_iteration)

    # Optimize for high strength and low friction coefficient
    return score


In [13]:
# Define the optimization study
optuna.logging.set_verbosity(optuna.logging.WARNING)
study = optuna.create_study(direction="maximize")
study.optimize(objective_function, n_trials=1000, show_progress_bar=True)


  0%|          | 0/1000 [00:00<?, ?it/s]

In [14]:
# Print the best parameter settings and reward found
best_params = study.best_params
best_reward = study.best_value
print(f"Best parameter settings: {best_params}")
print(f"Best reward: {best_reward}")

Best parameter settings: {'width': 1.8926798901564152, 'height': 0.07580283117126675, 'syy': -398.674686260144, 'sxx': -428.9065164656232, 'mud': 1.0394691052692824, 'sdrop': -0.7222881924010455, 'sxy': 292.151125183087, 'dc': 0.5819320501327709}
Best reward: 1.4350148272449377


In [15]:
### Get data ffrom study
df_study = study.trials_dataframe()
df_study['datetime_start'] = pd.to_datetime(df_study['datetime_start'])
df_study['datetime_complete'] = pd.to_datetime(df_study['datetime_complete'])
df_study['duration'] = df_study['datetime_complete'] - df_study['datetime_start']
df_study['duration'] = df_study['duration'].dt.total_seconds()

In [16]:
df_study.head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_dc,params_height,params_mud,params_sdrop,params_sxx,params_sxy,params_syy,params_width,state
0,0,-0.045207,2023-05-01 13:25:26.658753,2023-05-01 13:25:26.671332,0.012579,-0.386877,0.44066,1.484277,1.389087,-234.298767,273.645243,341.183644,3.349149,COMPLETE
1,1,0.127556,2023-05-01 13:25:26.672239,2023-05-01 13:25:26.678160,0.005921,1.179253,0.556782,1.360365,1.752044,-420.187281,-39.33948,195.811999,0.29828,COMPLETE
2,2,0.66635,2023-05-01 13:25:26.679950,2023-05-01 13:25:26.689942,0.009992,1.2943,0.566856,-0.735727,-0.648905,-476.428195,105.220608,107.317235,0.811614,COMPLETE
3,3,-0.077202,2023-05-01 13:25:26.691443,2023-05-01 13:25:26.699302,0.007859,0.052556,0.689274,1.599481,0.558125,-65.478611,-350.698031,-259.902839,3.116839,COMPLETE
4,4,0.038881,2023-05-01 13:25:26.700525,2023-05-01 13:25:26.706705,0.00618,1.097712,0.074066,0.244511,0.678494,-320.441958,-199.837396,449.911273,2.443808,COMPLETE


In [17]:
df_study['best_value'] = df_study['value'].cummax()

In [42]:
df_study.head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_dc,params_height,params_mud,params_sdrop,params_sxx,params_sxy,params_syy,params_width,state,best_value
0,0,-0.045207,2023-05-01 13:25:26.658753,2023-05-01 13:25:26.671332,0.012579,-0.386877,0.44066,1.484277,1.389087,-234.298767,273.645243,341.183644,3.349149,COMPLETE,-0.045207
1,1,0.127556,2023-05-01 13:25:26.672239,2023-05-01 13:25:26.678160,0.005921,1.179253,0.556782,1.360365,1.752044,-420.187281,-39.33948,195.811999,0.29828,COMPLETE,0.127556
2,2,0.66635,2023-05-01 13:25:26.679950,2023-05-01 13:25:26.689942,0.009992,1.2943,0.566856,-0.735727,-0.648905,-476.428195,105.220608,107.317235,0.811614,COMPLETE,0.66635
3,3,-0.077202,2023-05-01 13:25:26.691443,2023-05-01 13:25:26.699302,0.007859,0.052556,0.689274,1.599481,0.558125,-65.478611,-350.698031,-259.902839,3.116839,COMPLETE,0.66635
4,4,0.038881,2023-05-01 13:25:26.700525,2023-05-01 13:25:26.706705,0.00618,1.097712,0.074066,0.244511,0.678494,-320.441958,-199.837396,449.911273,2.443808,COMPLETE,0.66635


In [24]:
### Plot trilal vs value using altair
# also plot a line that shows the best value found sofar 
alt.Chart(df_study).mark_point().encode(
    x='number',
    y='value',
    color='state'
) + alt.Chart(df_study).mark_line().encode(
    x='number',
    y='best_value',
    color = alt.value('red'),
    tooltip=['number', 'best_value']
).properties(
    width=600,
    height = 300
).interactive()

#### Feature importance

In [40]:
importance = optuna.importance.get_param_importances(study)

# OrderedDict to dataframe
df_importance = pd.DataFrame(list(importance.items()), columns=['feature', 'importance'])

In [41]:
## Plot feature importance in sorting order
alt.Chart(df_importance).mark_bar().encode(
    y=alt.Y('feature', sort=None),
    x='importance',
    tooltip=['feature', 'importance']
).properties(
    width=200,
    height = 300
).interactive()


In [43]:
# Plot parallel coordinate plot
plot_parallel_coordinate = vis.plot_parallel_coordinate(study)
plot_parallel_coordinate.show()


### Inter feature relationship with probability to propagate

In [58]:
alt.Chart(df_study).mark_rect().encode(
    alt.X('params_height:Q', title='height'),
    alt.Y('params_width:Q', title='width'),
    alt.Color('value:Q', scale=alt.Scale(scheme='greenblue'))
).interactive()


In [63]:
# Plot the show the relationship between the parameters using altair
def plot_feature_relatinship(feature1, feature2, feature1_title, feature2_title):
    heatmap = alt.Chart(df_study).mark_rect().encode(
        alt.X(feature1, title=feature1_title),
        alt.Y(feature2, title=feature2_title),
        alt.Color('value:Q', scale=alt.Scale(scheme='greenblue'))
    )

    points = alt.Chart(df_study).mark_circle(
        color = 'red',
        size = 5
        ).encode(
        x=feature1,
        y=feature2,
        tooltip=[feature1, feature2, 'value']
    ).properties(
        width=400,
        height = 200
    ).interactive()
    
    return heatmap + points


plot_feature_relatinship('params_height', 'params_width', 'Height', 'Width') | plot_feature_relatinship('params_height', 'params_syy', 'Height', 'Syy') | plot_feature_relatinship('params_sxx', 'params_sxy', 'Sxx', 'Sxy') | plot_feature_relatinship('params_sdrop', 'params_mud', 'Sdrop', 'Mud') | plot_feature_relatinship('params_dc', 'params_sxy', 'Dc', 'Sxy')

In [None]:
# plot_contour = vis.plot_contour(study, params=["mud", "height"])
# plot_contour.show()