In [1]:
# Setup: Connect to PostgreSQL database
import os
import psycopg2
import pandas as pd
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Connect to PostgreSQL
conn = psycopg2.connect(
    host=os.getenv("DB_HOST", "localhost"),
    port=os.getenv("DB_PORT", "5434"),
    user=os.getenv("DB_USER", "postgres"),
    password=os.getenv("DB_PASSWORD"),
    database=os.getenv("DB_NAME", "manufacturing_db")
)

print("âœ… Successfully connected to PostgreSQL database!")

âœ… Successfully connected to PostgreSQL database!


![manufacturing gears](manufacturing.jpg)

Manufacturing processes for any product is like putting together a puzzle. Products are pieced together step by step, and keeping a close eye on the process is important.

For this project, you're supporting a team that wants to improve how they monitor and control a manufacturing process. The goal is to implement a more methodical approach known as statistical process control (SPC). SPC is an established strategy that uses data to determine whether the process works well. Processes are only adjusted if measurements fall outside of an acceptable range. 

This acceptable range is defined by an upper control limit (UCL) and a lower control limit (LCL), the formulas for which are:

$ucl = avg\_height + 3 * \frac{stddev\_height}{\sqrt{5}}$

$lcl = avg\_height - 3 * \frac{stddev\_height}{\sqrt{5}}$

The UCL defines the highest acceptable height for the parts, while the LCL defines the lowest acceptable height for the parts. Ideally, parts should fall between the two limits.

Using SQL window functions and nested queries, you'll analyze historical manufacturing data to define this acceptable range and identify any points in the process that fall outside of the range and therefore require adjustments. This will ensure a smooth running manufacturing process consistently making high-quality products.

## The data
The data is available in the `manufacturing_parts` table which has the following fields:
- `item_no`: the item number
- `length`: the length of the item made
- `width`: the width of the item made
- `height`: the height of the item made
- `operator`: the operating machine

In [2]:
# Preview the manufacturing_parts data
query = """
SELECT *
FROM manufacturing_parts
LIMIT 10;
"""

df_preview = pd.read_sql_query(query, conn)
display(df_preview)

  df_preview = pd.read_sql_query(query, conn)


Unnamed: 0,index,item_no,length,width,height,operator
0,0,1,102.67,49.53,19.69,Op-1
1,1,2,102.5,51.42,19.63,Op-1
2,2,3,95.37,52.25,21.51,Op-1
3,3,4,94.77,49.24,18.6,Op-1
4,4,5,104.26,47.9,19.46,Op-1
5,5,6,105.18,49.39,20.36,Op-1
6,6,7,97.35,48.05,20.22,Op-1
7,7,8,99.35,44.59,21.03,Op-1
8,8,9,90.62,47.29,19.78,Op-1
9,9,10,97.22,52.14,20.71,Op-1


In [3]:
# Statistical Process Control Analysis using Window Functions
query = """
WITH stats AS (
    SELECT
        operator,
        item_no,
        height,
        ROW_NUMBER() OVER (PARTITION BY operator ORDER BY item_no) AS row_number,
        AVG(height) OVER (
            PARTITION BY operator
            ORDER BY item_no
            ROWS BETWEEN 4 PRECEDING AND CURRENT ROW
        ) AS avg_height,
        STDDEV(height) OVER (
            PARTITION BY operator
            ORDER BY item_no
            ROWS BETWEEN 4 PRECEDING AND CURRENT ROW
        ) AS stddev_height,
        COUNT(*) OVER (
            PARTITION BY operator
            ORDER BY item_no
            ROWS BETWEEN 4 PRECEDING AND CURRENT ROW
        ) AS window_count
    FROM manufacturing_parts
)
SELECT
    operator,
    row_number,
    height,
    avg_height,
    stddev_height,
    (avg_height + 3 * stddev_height) AS ucl,
    (avg_height - 3 * stddev_height) AS lcl,
    CASE
        WHEN height > (avg_height + 3 * stddev_height)
          OR height < (avg_height - 3 * stddev_height)
        THEN TRUE ELSE FALSE
    END AS alert
FROM stats
WHERE window_count = 5
ORDER BY item_no;
"""

df_spc = pd.read_sql_query(query, conn)
display(df_spc)

# Summary statistics
print("\nðŸ“Š Statistical Process Control Summary:")
print(f"Total measurements analyzed: {len(df_spc)}")
print(f"Alerts triggered (out of control): {df_spc['alert'].sum()}")
print(f"Percentage out of control: {(df_spc['alert'].sum() / len(df_spc) * 100):.2f}%")
print(f"\nOperators analyzed: {df_spc['operator'].nunique()}")

  df_spc = pd.read_sql_query(query, conn)


Unnamed: 0,operator,row_number,height,avg_height,stddev_height,ucl,lcl,alert
0,Op-1,5,19.46,19.778,1.062812,22.966437,16.589563,False
1,Op-1,6,20.36,19.912,1.090812,23.184435,16.639565,False
2,Op-1,7,20.22,20.030,1.084574,23.283721,16.776279,False
3,Op-1,8,21.03,19.934,0.931225,22.727675,17.140325,False
4,Op-1,9,19.78,20.170,0.598832,21.966497,18.373503,False
...,...,...,...,...,...,...,...,...
415,Op-20,17,20.96,20.370,0.853698,22.931094,17.808906,False
416,Op-20,18,19.68,20.362,0.861464,22.946392,17.777608,False
417,Op-20,19,19.19,20.098,0.996454,23.087361,17.108639,False
418,Op-20,20,21.60,20.146,1.075119,23.371356,16.920644,False



ðŸ“Š Statistical Process Control Summary:
Total measurements analyzed: 420
Alerts triggered (out of control): 0
Percentage out of control: 0.00%

Operators analyzed: 20
