---
title: "Manufacturing Yield Optimization"
format: html
---

# ⚙️ Process Optimization
**Portfolio Project 4 — Manufacturing Yield Optimisation**

---

## Objective
Use surrogate-based optimisation (a gradient-boosting surrogate searched with
differential evolution) and one-at-a-time sensitivity analysis to find the
process-parameter settings that maximise product yield in a
simulated chemical / manufacturing process.

## Dataset
**Simulated Process Data** (structure mirrors common chemical-process benchmarks)

---

In [None]:
# 1. Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns
from scipy.optimize import minimize, differential_evolution
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
import warnings
# Suppress every warning category for the rest of the session so that cell
# output stays uncluttered.
# NOTE(review): this is a blanket filter — it also hides deprecation warnings
# from the libraries imported above; narrow it if results look suspicious.
warnings.filterwarnings('ignore')

# Apply one shared Matplotlib style to all figures created below.
plt.style.use('seaborn-v0_8-whitegrid')
# Simple confirmation that the import cell ran to completion.
print('Imports OK')

## 1. Simulated Process Environment

In [None]:
# 2. Black-box process simulator
def process_simulator(params, noise_std=0.03, seed=None):
    """
    Black-box reactor model: map five process settings to a yield in [0, 100].

    Parameters
    ----------
    params : sequence of 5 items
        Unpacked in order as temperature, pressure, pH, flow and catalyst
        loading. Each item may be a scalar or an array; arrays are
        broadcast element-wise through the formula.
    noise_std : float, default 0.03
        Standard deviation of the additive Gaussian noise, expressed as a
        fraction of the 0-100 scale (0.03 -> 3 yield points). Noise is only
        added when this is strictly positive.
    seed : optional
        Forwarded to ``np.random.default_rng`` to make the noise repeatable.

    Returns
    -------
    Yield clipped to [0, 100]; scalar for scalar inputs, array otherwise.
    """
    rng = np.random.default_rng(seed)
    T, P, pH, flow, cat = params

    # Individual contributions, kept separate so each effect is easy to read.
    temp_term = 20 * np.exp(-0.5 * ((T - 75)/10)**2)   # bell curve, peak at T = 75
    pres_term = 15 * (1 - ((P - 3)/2)**2)              # parabola, peak at P = 3
    ph_penalty = 8 * (pH - 7.0)**2 / 4                 # quadratic loss away from pH 7
    flow_term = 10 * np.sin(np.pi * flow / 5)          # periodic in flow
    cat_term = 5 * cat                                 # linear in catalyst loading
    tp_coupling = 3 * (T - 75) * (P - 3) / 100         # temperature-pressure interaction

    # Same left-to-right summation order as the formula is defined in, so the
    # floating-point result is unchanged.
    clean = np.clip(
        50 + temp_term + pres_term - ph_penalty + flow_term + cat_term
        - tp_coupling,
        0, 100,
    )
    if not noise_std > 0:
        return clean

    # Noise can push the value outside the physical range, so clip again.
    noisy = clean + rng.normal(0, noise_std * 100, np.shape(clean))
    return np.clip(noisy, 0, 100)


# Search space: one (name, (low, high)) entry per process parameter, listed in
# the order the simulator unpacks them.
PARAM_NAMES = [
    'Temperature(°C)',
    'Pressure(bar)',
    'pH',
    'Flow(L/h)',
    'Catalyst(g/L)',
]
PARAM_BOUNDS = [
    (50, 100),
    (1, 5),
    (5, 9),
    (0.5, 10),
    (0, 1),
]

# Smoke test: evaluate one hand-picked operating point without noise.
sample = [75, 3, 7, 2.5, 0.5]
print('Simulator defined. Testing a sample:')
print(f'  Params={sample} → Yield={process_simulator(sample, noise_std=0):.1f}%')

## 2. Generate Exploration Data

In [None]:
# 3. Random exploration — Latin Hypercube Sampling
def latin_hypercube(n, bounds, seed=10):
    """
    Draw an ``n``-point Latin Hypercube design inside box ``bounds``.

    Each dimension is split into ``n`` equal-width strata; every stratum is
    used exactly once per dimension, with a uniform random offset inside it.

    Parameters
    ----------
    n : int
        Number of design points (rows).
    bounds : sequence of (low, high) pairs
        One pair per dimension (column).
    seed : optional
        Seed for ``np.random.default_rng``.

    Returns
    -------
    ndarray of shape (n, len(bounds)).
    """
    rng = np.random.default_rng(seed)
    design = np.empty((n, len(bounds)))
    for col, (lo, hi) in enumerate(bounds):
        # Draw order matters for reproducibility: strata first, then jitter.
        strata = rng.permutation(n)
        jitter = rng.uniform(0, 1, n)
        unit = (strata + jitter) / n          # position in [0, 1)
        design[:, col] = lo + unit * (hi - lo)
    return design


# Space-filling design over the full parameter box.
N_EXPLORE = 300
X_explore = latin_hypercube(N_EXPLORE, PARAM_BOUNDS)

# One noisy simulated run per design point; the row index doubles as the
# noise seed so every run is individually reproducible.
y_explore = np.array([
    process_simulator(point, noise_std=0.03, seed=run_id)
    for run_id, point in enumerate(X_explore)
])

# Tabular view: parameter columns followed by the observed yield.
df_exp = pd.DataFrame(X_explore, columns=PARAM_NAMES).assign(Yield=y_explore)
print(f'Exploration data: {df_exp.shape}')
df_exp.describe().round(2)

## 3. Surrogate Model — Gradient Boosting

In [None]:
# 4. Fit surrogate
# Gradient-boosting surrogate trained on the exploration runs.
gb_settings = dict(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    random_state=0,
)
surrogate = GradientBoostingRegressor(**gb_settings).fit(X_explore, y_explore)

# 5-fold cross-validated R² as a sanity check on surrogate quality.
scores = cross_val_score(surrogate, X_explore, y_explore, cv=5, scoring='r2')
print(f'Surrogate CV R²: {scores.mean():.3f} ± {scores.std():.3f}')

# Horizontal bar chart of feature importances, least important at the bottom.
imp = (
    pd.Series(surrogate.feature_importances_, index=PARAM_NAMES)
    .sort_values(ascending=True)
)
fig, ax = plt.subplots(figsize=(8, 4))
imp.plot(kind='barh', ax=ax, color='steelblue', edgecolor='white')
ax.set(title='Feature Importance (Surrogate)', xlabel='Importance')
plt.tight_layout()
plt.show()

## 4. Sensitivity Analysis — One-at-a-Time

In [None]:
# 5. OAT sensitivity
# Reference operating point; each sweep varies one coordinate and keeps the
# other four fixed at these values.
base_point = np.array([75, 3, 7, 2.5, 0.5])

fig, axes = plt.subplots(2, 3, figsize=(16, 9))
axes = axes.flatten()

for i, (name, (lo, hi)) in enumerate(zip(PARAM_NAMES, PARAM_BOUNDS)):
    sweep = np.linspace(lo, hi, 60)

    # Build all sweep points at once (one row per value of parameter i) and
    # query the surrogate in a single batched call instead of row by row.
    grid = np.tile(base_point, (sweep.size, 1))
    grid[:, i] = sweep
    yields = surrogate.predict(grid)

    axes[i].plot(sweep, yields, lw=2, color='steelblue')
    axes[i].axvline(base_point[i], color='crimson', ls='--',
                    lw=1, label=f'Base={base_point[i]}')
    axes[i].set_title(name, fontsize=11)
    axes[i].set_xlabel(name)
    axes[i].set_ylabel('Predicted Yield (%)')
    axes[i].legend(fontsize=8)

# Hide only the panels that have no parameter assigned, rather than assuming
# that exactly the last panel is spare.
for spare_ax in axes[len(PARAM_NAMES):]:
    spare_ax.set_visible(False)
plt.suptitle('One-at-a-Time Sensitivity Analysis', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

## 5. Global Optimization — Differential Evolution

In [None]:
# 6. Optimise using scipy differential_evolution on the surrogate
def neg_surrogate(x):
    """
    Objective for a minimiser: the negated surrogate prediction at ``x``.

    ``x`` is a 1-D parameter vector; it is reshaped into a single-row matrix
    because the surrogate's ``predict`` is called on 2-D input.
    """
    prediction = surrogate.predict(x.reshape(1, -1))
    return -prediction[0]


# Global search over the parameter box; minimising the negated surrogate is
# the same as maximising predicted yield.
de_options = dict(
    bounds=PARAM_BOUNDS,
    maxiter=1000,
    tol=1e-8,
    seed=42,
    polish=True,
)
result = differential_evolution(neg_surrogate, **de_options)

opt_params = result.x
opt_yield_surrogate = -result.fun          # undo the sign flip
# Re-evaluate on the noise-free simulator to check the surrogate's optimum.
opt_yield_true = process_simulator(opt_params, noise_std=0)

# Assemble the report first, then emit it in one go.
banner = '=' * 55
report = [banner, '  OPTIMISATION RESULTS', banner]
report.extend(
    f'  {name:25s}: {val:8.3f}' for name, val in zip(PARAM_NAMES, opt_params)
)
report.append(f'  {"Surrogate Yield":25s}: {opt_yield_surrogate:8.2f} %')
report.append(f'  {"True Yield (no noise)":25s}: {opt_yield_true:8.2f} %')
report.append(banner)
print('\n'.join(report))

In [None]:
# 7. Visualise optimal vs explored
# Three-panel summary: parallel coordinates, yield histogram, T-vs-P scatter.
fig = plt.figure(figsize=(14, 5))
gs = GridSpec(1, 3, wspace=0.4)

# Parallel coordinates — top 20 vs bottom 20
ax1 = fig.add_subplot(gs[0])
top20 = df_exp.nlargest(20, 'Yield')
bot20 = df_exp.nsmallest(20, 'Yield')

# Rescale every parameter to [0, 1] over its search bounds. The parameters
# live on very different scales (temperature 50-100, catalyst 0-1), so raw
# values cannot share one axis, and the y-label promises normalised values.
lows = np.array([lo for lo, _ in PARAM_BOUNDS], dtype=float)
spans = np.array([hi - lo for lo, hi in PARAM_BOUNDS], dtype=float)
axis_pos = range(len(PARAM_NAMES))

for group, colour, group_label in ((top20, 'green', 'Top 20'),
                                   (bot20, 'red', 'Bottom 20')):
    unit_rows = (group[PARAM_NAMES].to_numpy(dtype=float) - lows) / spans
    # A 2-D y draws one line per column, hence the transpose (one run each).
    ax1.plot(axis_pos, unit_rows.T, color=colour, alpha=0.3, lw=0.8)
    # Empty proxy line so each group gets exactly one legend entry.
    ax1.plot([], [], color=colour, lw=1.5, label=group_label)

opt_unit = (np.asarray(opt_params, dtype=float) - lows) / spans
ax1.plot(axis_pos, opt_unit, 'k-o', lw=2, ms=8, label='Optimal')
ax1.set_xticks(axis_pos)
ax1.set_xticklabels(PARAM_NAMES, rotation=20, ha='right', fontsize=8)
ax1.set_ylim(-0.05, 1.05)
ax1.set_ylabel('Normalised Parameter Value')
ax1.set_title('Parallel Coordinates')
ax1.legend()

# Yield histogram of the (noisy) exploration runs, with the noise-free yield
# at the optimised settings marked for comparison.
ax2 = fig.add_subplot(gs[1])
ax2.hist(df_exp['Yield'], bins=40, color='steelblue',
         edgecolor='white', alpha=0.7)
ax2.axvline(opt_yield_true, color='crimson', ls='--',
            lw=2, label=f'Optimal={opt_yield_true:.1f}%')
ax2.set_title('Yield Distribution')
ax2.set_xlabel('Yield (%)')
ax2.legend()

# 2-D scatter: Temperature vs Pressure coloured by Yield
ax3 = fig.add_subplot(gs[2])
sc = ax3.scatter(df_exp['Temperature(°C)'], df_exp['Pressure(bar)'],
                 c=df_exp['Yield'], cmap='viridis', s=20, edgecolors='none')
# opt_params[0] / opt_params[1] are temperature and pressure respectively.
ax3.scatter(opt_params[0], opt_params[1], s=200, c='red',
            edgecolors='black', zorder=5, label='Optimal')
plt.colorbar(sc, ax=ax3, label='Yield (%)')
ax3.set_xlabel('Temperature (°C)')
ax3.set_ylabel('Pressure (bar)')
ax3.set_title('T vs P (colour = Yield)')
ax3.legend()

plt.suptitle('Optimisation Summary', fontsize=14, y=1.03)
plt.tight_layout()
plt.show()

## Summary
- Built a surrogate model (GB regressor) from Latin-Hypercube sampled process data
- One-at-a-Time sensitivity analysis identified critical parameters
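- Differential Evolution searched the surrogate landscape for the yield-maximising settings (a stochastic global search: the result is the best point found, not a guaranteed global optimum)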
- The optimal settings can now be validated in the real process