---
title: "Manufacturing Quality Analytics"
format: html
---

# ✅ Quality Data Analysis & Defect Prediction
**Portfolio Project 7 — Manufacturing Quality Analytics**

---

## Objective
Analyse product quality data, perform root-cause analysis using
correlation and regression, and build a defect-prediction model.

## Dataset
**Simulated manufacturing quality log** (mirrors structure of common industry datasets
such as the Steel Defect Detection dataset on Kaggle or the UCI Concrete Quality dataset)

---

In [None]:
# 1. Imports
# Data handling and numerics.
import pandas as pd
import numpy as np
# Plotting.
import matplotlib.pyplot as plt
import seaborn as sns
# Modelling: data splitting / CV helpers, classifiers, metrics, feature scaling.
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import StandardScaler
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation and convergence warnings raised by the
# plotting and modelling cells below — consider narrowing it.
warnings.filterwarnings('ignore')

# Apply the 'seaborn-v0_8-whitegrid' matplotlib style sheet to all figures.
plt.style.use('seaborn-v0_8-whitegrid')
# Simple confirmation that the cell ran to completion.
print('Imports OK')

## 1. Synthetic Quality Data

In [None]:
# 2. Generate manufacturing quality records
def gen_quality_data(n=5000, seed=2025):
    """Simulate a manufacturing quality log as a pandas DataFrame.

    Process inputs are sampled first, the measured quality dimensions are
    derived from them as linear combinations plus Gaussian noise, and a
    binary ``Defect`` label is drawn from a logistic model.

    Parameters
    ----------
    n : int
        Number of records (rows) to generate.
    seed : int
        Seed passed to ``np.random.default_rng``; the same ``(n, seed)``
        pair always produces the same frame.

    Returns
    -------
    pandas.DataFrame
        ``n`` rows with nine feature columns followed by the 0/1
        ``Defect`` column.
    """
    gen = np.random.default_rng(seed)

    # NOTE: every value comes from the single generator above, so the order
    # of the draws below determines the output for a given seed.

    # --- Process inputs ---
    speed = gen.normal(500, 30, n)                # RPM
    temp = gen.normal(220, 8, n)                  # °C
    material = gen.normal(85, 5, n)               # supplier quality score
    humid = gen.uniform(30, 80, n)                # %
    experience = gen.choice([1, 2, 3, 5, 8], n)   # years

    # --- Measured quality dimensions (linear in the inputs + noise) ---
    thick = 10.0 + 0.02*speed - 0.05*temp + gen.normal(0, 0.3, n)
    finish = 3.0 - 0.005*speed + 0.01*material + gen.normal(0, 0.2, n)
    tensile = 400 + 0.5*material - 0.2*humid + gen.normal(0, 15, n)
    hard = 55 + 0.1*temp - 0.05*humid + gen.normal(0, 2, n)

    # --- Defect label: Bernoulli draw from a logistic probability ---
    thickness_dev = np.abs(thick - 10.0)   # distance from the 10.0 reference
    logit = (
        -3.0
        + 0.05 * (temp - 220)
        - 0.04 * material
        + 0.02 * humid
        - 0.3 * experience
        + 0.01 * thickness_dev * 100
    )
    prob = 1 / (1 + np.exp(-logit))
    draws = gen.uniform(0, 1, n)
    is_defect = (draws < prob).astype(int)

    # Assemble the frame; continuous columns are rounded for readability.
    columns = {
        'Machine_Speed': speed.round(1),
        'Machine_Temp': temp.round(1),
        'Raw_Material_Quality': material.round(1),
        'Humidity': humid.round(1),
        'Operator_Experience': experience,
        'Thickness': thick.round(3),
        'Surface_Finish': finish.round(3),
        'Tensile_Strength': tensile.round(1),
        'Hardness': hard.round(2),
        'Defect': is_defect,
    }
    return pd.DataFrame(columns)


# Build the dataset with the generator's default size and seed.
df = gen_quality_data()
defect_share = df["Defect"].mean()
print(f'Shape: {df.shape}  |  Defect rate: {defect_share*100:.1f}%')
# Final expression of the cell: per-column summary statistics, 2 decimals.
summary = df.describe()
summary.round(2)

## 2. Exploratory Quality Analysis

In [None]:
# 3. Defect rate by operator experience
# Share of defective records (in %) for each distinct experience level.
defect_by_exp = df.groupby('Operator_Experience')['Defect'].mean()*100

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel — bar chart of defect rate per experience level.
defect_by_exp.plot(kind='bar', ax=axes[0],
                   color='steelblue', edgecolor='white')
axes[0].set_title('Defect Rate by Operator Experience')
axes[0].set_ylabel('Defect Rate (%)')
axes[0].set_xlabel('Experience (years)')
axes[0].tick_params(axis='x', rotation=0)

# Right panel — violin: Machine_Temp distribution split by Defect (0 / 1).
# NOTE(review): newer seaborn releases warn when `palette` is passed without
# `hue` (the warning is hidden by the global filter) — confirm against the
# installed version before upgrading.
sns.violinplot(data=df, x='Defect', y='Machine_Temp',
               ax=axes[1], palette=['#4c72b0', '#c44e52'])
axes[1].set_title('Machine Temperature by Defect Status')
# Pin the tick positions before relabelling them: set_xticklabels on its own
# relies on whatever ticks happen to exist and fails if there are not exactly
# two of them (e.g. a sample that contains no defects).
axes[1].set_xticks([0, 1])
axes[1].set_xticklabels(['OK', 'Defective'])

plt.tight_layout()
plt.show()

In [None]:
# 4. Correlation heatmap
# Order the columns so that 'Defect' is the last row/column of the matrix.
corr_cols = [col for col in df.columns if col != 'Defect']
ordered_cols = corr_cols + ['Defect']
corr = df[ordered_cols].corr()

fig, ax = plt.subplots(figsize=(10, 8))
# Diverging colour map centred on 0 and pinned to the full [-1, 1] range.
heatmap_opts = dict(annot=True, fmt='.2f', cmap='coolwarm', center=0,
                    vmin=-1, vmax=1, linewidths=0.5)
sns.heatmap(corr, ax=ax, **heatmap_opts)
ax.set_title('Correlation Matrix (incl. Defect)', fontsize=13)
plt.tight_layout()
plt.show()

## 3. Root-Cause Analysis — Feature Importance

In [None]:
# 5. Logistic regression coefficients as root-cause proxy
# Every column except the label is used as a feature.
FEAT_COLS = [col for col in df.columns if col != 'Defect']
y = df['Defect']

# Standardise the features so coefficient magnitudes are comparable.
scaler = StandardScaler()
X_s = scaler.fit_transform(df[FEAT_COLS])

lr = LogisticRegression(max_iter=1000, random_state=0).fit(X_s, y)

# One row per feature, ordered by absolute coefficient (smallest first, so
# the largest effect ends up at the top of the horizontal bar chart).
coef_df = pd.DataFrame(
    {'Feature': FEAT_COLS, 'Coefficient': lr.coef_[0]}
).sort_values('Coefficient', key=abs, ascending=True)

# Red for positive coefficients, blue otherwise.
colors = coef_df['Coefficient'].map(
    lambda c: '#c44e52' if c > 0 else '#4c72b0').tolist()

fig, ax = plt.subplots(figsize=(9, 5))
coef_df.plot.barh(x='Feature', y='Coefficient', ax=ax,
                  color=colors, edgecolor='white', legend=False)
ax.axvline(0, color='black', lw=0.8)
ax.set_title('Logistic Regression Coefficients (Root-Cause Proxy)')
ax.set_xlabel('Coefficient (standardised)')

# Colour key, positioned in axes-fraction coordinates below the plot area.
for y_frac, note, shade in [
    (-0.5, 'Red = increases defect risk', '#c44e52'),
    (-0.65, 'Blue = decreases defect risk', '#4c72b0'),
]:
    ax.text(0.02, y_frac, note,
            color=shade, fontsize=8, transform=ax.transAxes)
plt.tight_layout()
plt.show()

## 4. Defect Prediction Model

In [None]:
# 6. Train classifiers
# Split the *unscaled* features first and fit the scaler on the training rows
# only. Scaling the full dataset before splitting lets test-set statistics
# (mean / std) leak into preprocessing and makes the test metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df[FEAT_COLS], y, test_size=0.25, stratify=y, random_state=42)

split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# Candidate models, all seeded for reproducibility.
models = {
    'Logistic Regression':  LogisticRegression(max_iter=1000, random_state=0),
    'Random Forest':        RandomForestClassifier(n_estimators=200, max_depth=8, random_state=0),
    'Gradient Boosting':    GradientBoostingClassifier(n_estimators=200, max_depth=4, random_state=0),
}

# Fit each model and keep everything the later cells need: hard predictions,
# positive-class probabilities, ROC curve points, AUC and the fitted model.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    proba = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, proba)
    auc_val = auc(fpr, tpr)
    results[name] = {'preds': preds, 'proba': proba,
                     'fpr': fpr, 'tpr': tpr, 'auc': auc_val, 'model': model}
    print(f'{name:25s} | AUC = {auc_val:.3f}')

# NOTE(review): the winner is chosen on the same test split it is later
# evaluated on, so its reported metrics are slightly optimistic.
best_name = max(results, key=lambda k: results[k]['auc'])
print(f'\n📋 Best AUC: {best_name} ({results[best_name]["auc"]:.3f})')

In [None]:
# 7. ROC + Confusion matrix for best model
# The "best" model is the one with the highest test-set AUC.
best = max(results, key=lambda k: results[k]['auc'])

fig, axes = plt.subplots(1, 2, figsize=(13, 5))

# Left panel — ROC curves of all models, with the chance diagonal for scale.
for name, r in results.items():
    axes[0].plot(r['fpr'], r['tpr'], lw=2,
                 label=f"{name} (AUC={r['auc']:.3f})")
axes[0].plot([0, 1], [0, 1], 'k--', lw=0.8)
axes[0].set_xlabel('FPR')
axes[0].set_ylabel('TPR')
axes[0].set_title('ROC Curves — Defect Prediction')
axes[0].legend(loc='lower right', fontsize=8)

# Right panel — confusion matrix of the best model. `labels` is given
# explicitly so the matrix is always 2x2 and matches the two tick labels,
# even if one class never appears among the predictions.
cm = confusion_matrix(y_test, results[best]['preds'], labels=[0, 1])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[1],
            xticklabels=['OK', 'Defect'], yticklabels=['OK', 'Defect'])
axes[1].set_title(f'Confusion Matrix — {best}')
axes[1].set_xlabel('Predicted')
axes[1].set_ylabel('Actual')

plt.tight_layout()
plt.show()

# zero_division=0 keeps precision/recall defined (as 0) when the model
# predicts no defects at all, instead of relying on a suppressed warning.
print(classification_report(
    y_test, results[best]['preds'], labels=[0, 1],
    target_names=['OK', 'Defect'], zero_division=0))

In [None]:
# 8. Feature importance from best tree-based model
# Restrict the candidates to models that expose `feature_importances_` and
# take the one with the highest AUC among them. Checking only the overall
# winner would silently skip this plot whenever Logistic Regression wins.
tree_results = {name: r for name, r in results.items()
                if hasattr(r['model'], 'feature_importances_')}

if tree_results:
    best_tree = max(tree_results, key=lambda k: tree_results[k]['auc'])
    # Sorted ascending so the most important feature is drawn at the top.
    imp = pd.Series(tree_results[best_tree]['model'].feature_importances_,
                    index=FEAT_COLS).sort_values(ascending=True)
    fig, ax = plt.subplots(figsize=(9, 5))
    imp.plot(kind='barh', ax=ax, color='steelblue', edgecolor='white')
    ax.set_title(f'Feature Importance — {best_tree}')
    ax.set_xlabel('Importance')
    plt.tight_layout()
    plt.show()

## Summary
- Modelled a realistic manufacturing quality pipeline with process → quality → defect linkage
- Root-cause analysis via logistic coefficients highlighted temperature, humidity, and operator experience
- Gradient Boosting delivered the best AUC for defect prediction
- Feature importance confirmed alignment with the engineered data-generating process