In [1]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.5.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.5.0-py3-none-win_amd64.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   - -------------------------------------- 0.1/1.4 MB 3.4 MB/s eta 0:00:01
   ------------------ --------------------- 0.7/1.4 MB 8.3 MB/s eta 0:00:01
   ---------------------------------------  1.4/1.4 MB 13.2 MB/s eta 0:00:01
   ---------------------------------------- 1.4/1.4 MB 11.5 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.5.0



[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_percentage_error
import joblib

In [9]:
# Read the CSV at `file_path` into the `data` DataFrame that the following cells work on.
file_path = 'new_dataset/output.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [10]:
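# NOTE: this cell is disabled and kept for reference only. The parameter ranges
# are now computed from the training rows in the cell that generates new
# parameter sets.
# # Extract the first 12 rows to determine the range of each parameter
# parameter_ranges = data.iloc[:12][['Q1', 'Q2', 'EN', 'SN', 'FN', 'F', 'M']]
# # Determine the min and max values for each parameter based on the first 12 rows
# min_values = parameter_ranges.min().values
# max_values = parameter_ranges.max().values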

In [11]:
# Keep the rows from position 12 onward (renumbered from 0) and remove three
# columns that the later cells do not use.
# NOTE: this rebinds `data`, so re-running this cell without re-running the
# load cell trims another 12 rows and then fails on the already-dropped columns.
data = (
    data.iloc[12:]
    .drop(columns=['Unnamed: 0', 'sim_status', 'final_neck_diameter'])
    .reset_index(drop=True)
)

In [12]:


# Define features and target
feature_columns = ['Q1', 'Q2', 'EN', 'SN', 'FN', 'F', 'M']
target_column = 'MAPE'

# Drop incomplete rows from features and target together. Calling dropna() on
# X and y separately removes different rows whenever the missing values do not
# sit in the same rows, which leaves X and y with mismatched lengths/indices.
complete_rows = data.dropna(subset=feature_columns + [target_column])
X = complete_rows[feature_columns]
y = complete_rows[target_column]


In [14]:

# Lowest MAPE observed in the training target; the candidate-generation cell
# uses it as the bar that predicted MAPE values must beat.
min_mape_threshold = y.min()

# Base LightGBM regressor.
# subsample_freq=1 enables bagging at every boosting iteration. With LightGBM's
# default of 0, bagging is switched off and the `subsample` values searched
# below would have no effect on the fitted models.
lightgbm_model = lgb.LGBMRegressor(random_state=42, subsample_freq=1)

# Candidate hyperparameter values sampled by the randomized search
param_dist = {
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [-1, 5, 10, 15],  # -1 means no depth limit
    'min_child_samples': [20, 30, 50],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Randomized search with 3-fold cross-validation. The scorer is the negated
# MAPE, so the search maximises it (i.e. minimises MAPE).
random_search = RandomizedSearchCV(
    estimator=lightgbm_model,
    param_distributions=param_dist,
    n_iter=50,  # Number of different combinations to try
    scoring='neg_mean_absolute_percentage_error',
    cv=3,
    verbose=2,
    n_jobs=-1,  # use all available cores
    random_state=42
)

# Fit the Randomized Search model
random_search.fit(X, y)

# Best model from Randomized Search (refit on all of X, y by default)
best_lgbm_model = random_search.best_estimator_

# Save the trained model to a file
model_file_path = 'best_lgbm_model.joblib'
joblib.dump(best_lgbm_model, model_file_path)

# Function to generate new parameters
def generate_parameters(n_samples, min_values, max_values, random_state=None):
    """Sample parameter sets uniformly between per-parameter bounds.

    Parameters
    ----------
    n_samples : int
        Number of parameter sets (rows) to generate.
    min_values, max_values : array-like
        Lower and upper bounds, ordered as Q1, Q2, EN, SN, FN, F, M.
        ``min_values`` must contain exactly seven entries.
    random_state : int, numpy.random.Generator or None, optional
        Seed or generator for reproducible sampling. When ``None`` (default)
        the draw comes from NumPy's global generator, so it follows any
        ``np.random.seed`` call made by the caller.

    Returns
    -------
    pandas.DataFrame
        One row per sample with the seven sampled columns plus ``Q3``,
        which is derived as ``Q1 ** 2`` rather than sampled.

    Raises
    ------
    ValueError
        If ``min_values`` does not hold one bound per parameter, or the
        bounds cannot be combined element-wise.
    """
    columns = ['Q1', 'Q2', 'EN', 'SN', 'FN', 'F', 'M']
    lower = np.asarray(min_values, dtype=float)
    upper = np.asarray(max_values, dtype=float)
    if lower.shape != (len(columns),):
        raise ValueError(
            f"min_values must contain {len(columns)} entries (one per parameter), "
            f"got shape {lower.shape}"
        )

    if random_state is None:
        # Global generator: keeps the behaviour callers had before random_state existed.
        unit_draws = np.random.rand(n_samples, len(columns))
    else:
        unit_draws = np.random.default_rng(random_state).random((n_samples, len(columns)))

    # Rescale the draws from [0, 1) to [min, max) column by column.
    random_parameters = unit_draws * (upper - lower) + lower
    parameter_df = pd.DataFrame(random_parameters, columns=columns)
    parameter_df['Q3'] = parameter_df['Q1'] ** 2  # Enforcing the new relationship
    return parameter_df



Fitting 3 folds for each of 50 candidates, totalling 150 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000108 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 938
[LightGBM] [Info] Number of data points in the train set: 400, number of used features: 7
[LightGBM] [Info] Start training from score 31.744118


In [18]:
# Generate new parameter sets within the ranges observed in the training rows
n_samples = 1000  # Generate a larger set to increase chances of finding lower MAPE
feature_names = ['Q1', 'Q2', 'EN', 'SN', 'FN', 'F', 'M']
parameter_ranges = data[feature_names]
min_values = parameter_ranges.min().values
max_values = parameter_ranges.max().values

# Seed NumPy's global generator, which generate_parameters draws from, so the
# sampled candidates and the CSV written below are the same on every run.
np.random.seed(42)
new_parameters = generate_parameters(n_samples, min_values, max_values)

# Predict MAPE using the trained LightGBM model
predicted_mape_new_lgbm = best_lgbm_model.predict(new_parameters[feature_names])

# Keep only the parameter sets whose predicted MAPE beats the minimum threshold.
# The mask is computed once, and .copy() makes the filtered frame independent so
# adding a column does not trigger pandas' SettingWithCopyWarning.
# NOTE(review): the threshold is the smallest MAPE in the training target, and
# the recorded run of this cell produced an empty result; a tree ensemble may
# rarely predict below its training minimum, so confirm this filter is the
# intended selection rule.
below_threshold = predicted_mape_new_lgbm < min_mape_threshold
lower_mape_parameters_lgbm = new_parameters[below_threshold].copy()
lower_mape_parameters_lgbm['Predicted_MAPE'] = predicted_mape_new_lgbm[below_threshold]

# Sort the surviving parameter sets from lowest to highest predicted MAPE
lower_mape_parameters_sorted_lgbm = lower_mape_parameters_lgbm.sort_values(by='Predicted_MAPE').reset_index(drop=True)

# Save the lower MAPE parameters to a CSV file for further analysis
lower_mape_parameters_sorted_lgbm.to_csv('lower_mape_parameters_lgbm.csv', index=False)

# Print out the top 10 results
print(lower_mape_parameters_sorted_lgbm.head(10))


Empty DataFrame
Columns: [Q1, Q2, EN, SN, FN, F, M, Q3, Predicted_MAPE]
Index: []
