<a href="https://colab.research.google.com/github/msquareddd/ai-engineering-notebooks/blob/main/machine_time_prediction_xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [None]:
!pip install optuna

In [None]:
import numpy as np
import pandas as pd
import optuna
import pickle
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error
import warnings
warnings.filterwarnings("ignore")

# Dataset Reading

In [None]:
# Ask the user for the Excel workbook through the Colab upload widget
from google.colab import files
uploaded = files.upload()

# `uploaded` is keyed by file name; the first uploaded file is the one read later on
uploaded_names = list(uploaded.keys())
FILE_NAME = uploaded_names[0]
# print(FILE_NAME)

In [None]:
# Load the worksheet that holds the records into a DataFrame
SHEET_NAME = "Foglio3"
dataset = pd.read_excel(FILE_NAME, sheet_name=SHEET_NAME)
# dataset.head()

# Data Preprocessing

In [None]:
# Spreadsheet headers -> identifier-friendly column names (spaces and dots replaced)
COLUMN_RENAMES = {
    'NUM FACCE': "NUM_FACCE",
    "RETT TANG": "RETT_TANG",
    'EDM DRILL': "EDM_DRILL",
    "EDM FILO": "EDM_FILO",
    "EDM TUFFO": "EDM_TUFFO",
    "RETT TANG.1": "RETT_TANG_1",
    "RETT TOND": "RETT_TOND",
    "RETT TOND.1": "RETT_TOND_1",
    "RETT PLAN": "RETT_PLAN",
}
dataset = dataset.rename(columns=COLUMN_RENAMES)
# print(dataset.columns)

# Columns used as model inputs
feature_cols = ['CICLO', 'X', 'Y', 'Z', 'NUM_FACCE', 'MATERIALE', 'TRATTAMENTO']

# Every remaining column is treated as a prediction target (original column order kept)
is_target = ~dataset.columns.isin(feature_cols)
target_cols = dataset.columns[is_target].tolist()

# print(feature_cols)
# print(target_cols)

# Filled by the training loop: cycle -> list of saved model file names
models_dict = dict()

In [None]:
# Get cycle numbers
cycle_list = dataset["CICLO"].unique().tolist()
# print(cycle_list)

# Surface feature, decided ROW BY ROW:
#   X == 0 -> round piece, Y is used as the diameter
#   Y == 0 -> round piece, X is used as the diameter
#   else   -> rectangular piece, X * Y
# A frame-wide `.any()` test must not be used here: a single round piece would
# switch every row to the circle formula and give rectangular pieces a wrong area.
x_dim = dataset["X"]
y_dim = dataset["Y"]
superf = np.where(
    x_dim == 0,
    (y_dim / 2) ** 2 * np.pi,
    np.where(y_dim == 0, (x_dim / 2) ** 2 * np.pi, x_dim * y_dim),
)

# Build the feature frame without touching `dataset`: drop the targets and the raw
# X/Y dimensions (replaced by SUPERF). Rows stay aligned with `dataset`.
features_df = dataset.drop(columns=target_cols + ["X", "Y"]).assign(SUPERF=superf)

# print(features_df.head())


# XGB Hyperparameters Optimization

In [None]:
# Define hyperparameters optimization function
def objective(trial):
    """Optuna objective: fit one XGBRegressor with sampled hyperparameters.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    variables, so those must be assigned (by the training loop) before
    ``study.optimize`` calls this function.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error of the fitted model on ``(X_test, y_test)``.
        Lower is better, so the study must use ``direction="minimize"``.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 1000, 12000, step=100),
        "max_depth": trial.suggest_int("max_depth", 1, 5),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
        "gamma": trial.suggest_float("gamma", 0.1, 1.0, step=0.1),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 7),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-6, 100.),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-6, 100.),
    }

    # `gpu_id` is intentionally not passed: it is deprecated in recent XGBoost
    # releases and has no effect here because no GPU tree method is selected.
    model = XGBRegressor(
        **params,
        n_jobs=-1,
        random_state=42,  # same seed as the final model fitted with the best params
        early_stopping_rounds=300,
    )

    # NOTE: the same hold-out set drives early stopping and the returned score,
    # so the score is an optimistic estimate of generalization error.
    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        verbose=0,
    )

    y_hat = model.predict(X_test)

    return mean_squared_error(y_test, y_hat)

# XGB Training and Fitting

In [None]:
# Silence Optuna's per-trial log lines (set once instead of on every iteration)
optuna.logging.set_verbosity(optuna.logging.WARNING)

# CICLO only decides which rows feed which models; it is not a model input
model_inputs_df = features_df.drop(columns="CICLO")

# Identify categorical columns
cat_cols = model_inputs_df.columns[model_inputs_df.dtypes == "object"]

# One-hot encode the categorical columns and pass the other columns through.
# The encoding does not depend on cycle or target, so it is fitted once on all rows:
# every model, and the `ct` reused later for single predictions, shares one feature layout.
# sparse_threshold=0 forces a dense result; converting a scipy sparse matrix with
# NumPy would otherwise produce a 0-d object array that cannot be split.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), cat_cols)],
    remainder='passthrough',
    sparse_threshold=0,
)
X_all = np.asarray(ct.fit_transform(model_inputs_df))

# Get feature names after OHE to visualize importance later on
feature_names = ct.get_feature_names_out()

for cycle in cycle_list:
    model_list = []

    # Row mask of the current cycle. The SAME mask selects features and target,
    # so X and y always have the same rows in the same order.
    cycle_mask = (features_df["CICLO"] == cycle).to_numpy()
    X_transformed = X_all[cycle_mask]
    # print(X_transformed.shape)

    for col in target_cols:
        y = dataset.loc[cycle_mask, col]

        # Use train_test_split for a hold-out evaluation.
        # These four names are module-level on purpose: objective() reads them.
        X_train, X_test, y_train, y_test = train_test_split(
            X_transformed, y, test_size=0.2, random_state=42)

        # Hyperparameters optimization with optuna (seeded sampler -> reproducible search)
        study = optuna.create_study(
            direction="minimize",
            sampler=optuna.samplers.TPESampler(seed=42),
        )
        study.optimize(objective, n_trials=50)

        best_params = study.best_params
        # objective() returns the mean squared error on the hold-out set
        best_mse = study.best_value
        print(f"Best MSE: {best_mse}")
        print(f"Best params: {best_params}")

        # Pipeline wrapping the regressor; it expects already-encoded features
        pipeline = Pipeline([
            ('regressor', XGBRegressor(**best_params, random_state=42))
        ])

        fit_params = {
            "regressor__eval_set": [(X_test, y_test)],
            "regressor__verbose": False,
        }

        # Fit the pipeline
        pipeline.fit(X_train, y_train, **fit_params)

        # Predict test set
        preds = pipeline.predict(X_test)

        # Evaluation metrics
        # NOTE: the hold-out set was also used to pick the hyperparameters, so the
        # two hold-out metrics below are optimistic; the CV MAE is the fairer estimate.
        # Mean Absolute Error
        result = mean_absolute_error(y_test, preds)
        print(f"{col} - Train/Test Split MAE: {result:.4f}")

        # Pipeline score
        pip_score = pipeline.score(X_test, y_test)
        print(f"{col} - Train/Test Split R2 Score: {pip_score:.4f}")

        # Cross-validation on the rows of this cycle only
        cv_scores = cross_val_score(
            pipeline, X_transformed, y,
            scoring='neg_mean_absolute_error', cv=5)
        print(f"{col} - CV MAE: {-np.mean(cv_scores):.4f}")

        # Visualize feature importance, most important feature on top
        importance = pipeline.named_steps['regressor'].feature_importances_
        sorted_idx = np.argsort(importance)[::-1]

        plt.figure(figsize=(10, 6))
        plt.barh([feature_names[i] for i in sorted_idx], importance[sorted_idx])
        plt.xlabel("Feature Importance")
        plt.ylabel("Features")
        plt.title("XGBoost Feature Importance")
        plt.gca().invert_yaxis()
        plt.show()

        # Saving model: one pickle per (cycle, target) pair
        model_name = f"{cycle}_{col}_model.pkl"
        print(model_name)
        model_list.append(model_name)
        with open(model_name, 'wb') as file:
            pickle.dump(pipeline, file)

    # Updating dictionary with list of models for every cycle
    # (file names are stored in the same order as target_cols)
    models_dict[cycle] = model_list


# Single Prediction

In [None]:
# New list of values never seen by the models (same order as feature_cols)
X_new = [5000,209,153,60,520,"1.2379","TR 1"]

# Convert X_new to a DataFrame
X_new_df = pd.DataFrame([X_new], columns=feature_cols)

# Surface feature, computed row by row with the same rule used for the training features:
#   X == 0 -> round piece with diameter Y, Y == 0 -> round piece with diameter X,
#   otherwise rectangular piece (X * Y)
x_dim = X_new_df["X"]
y_dim = X_new_df["Y"]
X_new_df["SUPERF"] = np.where(
    x_dim == 0,
    (y_dim / 2) ** 2 * np.pi,
    np.where(y_dim == 0, (x_dim / 2) ** 2 * np.pi, x_dim * y_dim),
)

# Drop the raw dimensions, now summarized by SUPERF
X_new_df = X_new_df.drop(columns=["X", "Y"])

# print(X_new_df.head())

# Transform the new sample and encode categorical variables.
# Only the columns the encoder was fitted on are passed, in fit-time order: CICLO
# stays in X_new_df (it is needed to pick the models) but is not an encoder input.
X_new_transformed = ct.transform(X_new_df[list(ct.feature_names_in_)])

In [None]:
# The cycle of the new sample selects which group of saved models to use
cycle_values = X_new_df["CICLO"].unique().tolist()
single_pred_cycle = cycle_values[0]
# print(single_pred_cycle)

machine_times_list = []

# One saved pipeline per target: load it, predict, keep the single predicted value.
# NOTE: pickle.load executes code from the file; only load model files written by this notebook.
for model_path in models_dict[single_pred_cycle]:
  # print(model_path)
  with open(model_path, 'rb') as model_file:
      loaded_pipeline = pickle.load(model_file)
  prediction = loaded_pipeline.predict(X_new_transformed)
  # print(prediction)
  machine_times_list.append(prediction[0])
# print(machine_times_list)

# Report one predicted machine time per target column (models were saved in target_cols order)
for i, time in enumerate(machine_times_list):
  print(f"{target_cols[i]}: {time:.1f}")