In [None]:
-- Select the role and the default database/schema for this notebook session.
-- Unqualified object names in later cells (e.g. DIM_LINES) resolve against this schema.
USE ROLE AI_ENGINEER;
USE SCHEMA AI_DEVELOPMENT.SI_BOTTLING_COMPANY

# Imports

In [None]:
# Import python packages
import streamlit as st
import pandas as pd
import random
import numpy as np
import json
from datetime import datetime, timedelta

# Seed both random sources used in this notebook (numpy and the stdlib `random`
# module) so the random draws are repeatable. Generated timestamps still depend
# on datetime.now() and therefore differ between runs.
np.random.seed(42)
random.seed(42)

from snowflake.snowpark.context import get_active_session
from snowflake.snowpark import functions as F
from snowflake.cortex import complete
# Snowpark session of the running notebook; used below for table writes and
# for building the Snowpark DataFrame that calls Cortex `complete`.
session = get_active_session()

# 1. Generate Machine Data

In [None]:
# load machine definitions
# A context manager closes the file handle deterministically instead of
# leaving it to the garbage collector.
with open('machines.json', 'r') as machine_definitions_file:
    machine_definitions = json.load(machine_definitions_file)

# Number of plants and production lines to simulate.
plants = 3
num_lines = 10

minutes_to_generate = 60*24*14 # 14 days of sensor data
# The generated series ends "now": the first reading lies minutes_to_generate in the past.
start_generation_time = datetime.now()-timedelta(minutes=minutes_to_generate)

# select machines
# Number of machines that receive an injected anomaly.
machines_with_anomalies = 7

## 1.1 Generate Lines

In [None]:
# Build one record per bottling line and assign every line to a random plant.
lines = []
for i in range(num_lines):
    # Draw the plant from the configured plant count rather than a hard-coded 3,
    # so changing `plants` in the configuration cell actually takes effect.
    plant_id = f'P_{random.randint(1, plants):04d}'
    line_id = f'L_{i+1:04d}'
    line_name = f'Bottling Line {i+1:04d}'
    lines.append({'PLANT_ID': plant_id, 'LINE_ID': line_id, 'LINE_NAME': line_name})
lines_df = pd.DataFrame(lines)
lines_df.head()

## 1.2 Generate Machines per Line

In [None]:
# Every line gets one machine of each type listed in machine_definitions;
# the concrete manufacturer/model is picked at random from that type's options.
machines = []
for line_pos, line_row in lines_df.iterrows():
    for type_no, (machine_type, type_spec) in enumerate(machine_definitions.items(), start=1):
        variant = np.random.choice(type_spec['machines'])
        machines.append(dict(
            LINE_ID=line_row['LINE_ID'],
            MACHINE_ID=f'M_{line_pos+1:04d}_{type_no:02d}',
            MACHINE_NAME=machine_type,
            MACHINE_MANUFACTURER=variant['manufacturer'],
            MACHINE_MODEL=variant['machine_model'],
        ))

machines_df = pd.DataFrame(machines)
machines_df.head()

## 1.3 Generate Sensors per Machine

In [None]:
# One record per sensor of every machine. The sensor id reuses the machine id
# without its 'M_' prefix and appends a two-digit running number.
machine_sensors = [
    {
        'MACHINE_ID': machine_row['MACHINE_ID'],
        'SENSOR_ID': f"S_{machine_row['MACHINE_ID'][2:]}_{sensor_no:02d}",
        'SENSOR_NAME': sensor_spec['sensor_name'],
        'SENSOR_METRIC': sensor_spec['metric'],
        'SENSOR_UNIT': sensor_spec['unit'],
        'SENSOR_MIN': sensor_spec['min'],
        'SENSOR_MAX': sensor_spec['max'],
    }
    for _, machine_row in machines_df.iterrows()
    for sensor_no, sensor_spec in enumerate(
        machine_definitions[machine_row['MACHINE_NAME']]['sensors'], start=1
    )
]

machine_sensors_df = pd.DataFrame(machine_sensors)
machine_sensors_df.head()

## 1.4 Generate Sensor Data

In [None]:
def generate_sensor_data_walk(sensor_id, min_val, max_val, start_time, duration_minutes):
    """
    Simulate one reading per minute for a sensor as a bounded random walk.

    The walk starts at a value drawn uniformly within +/-10% of the midpoint of
    [min_val, max_val]. For every minute a uniform step of at most 5% of the
    range is added and the result is clipped back into [min_val, max_val].

    Returns a list of (sensor_id, value, timestamp) tuples, one per minute,
    with timestamps counting up from start_time.
    """
    midpoint = np.mean([min_val, max_val])
    level = np.random.uniform(midpoint * 0.9, midpoint * 1.1)

    # Largest move allowed per minute; a smaller fraction gives a smoother series.
    step_limit = (max_val - min_val) * 0.05

    readings = []
    for minute in range(duration_minutes):
        # Take one random step, then force the value back into the valid range.
        level = np.clip(level + np.random.uniform(-step_limit, step_limit), min_val, max_val)
        readings.append((sensor_id, level, start_time + timedelta(minutes=minute)))

    return readings

# Run the random walk for every sensor and collect all readings in one long list.
sensor_values = []
for _, sensor_row in machine_sensors_df.iterrows():
    sensor_values += generate_sensor_data_walk(
        sensor_row['SENSOR_ID'],
        sensor_row['SENSOR_MIN'],
        sensor_row['SENSOR_MAX'],
        start_generation_time,
        minutes_to_generate,
    )

sensor_value_columns = ['SENSOR_ID','VALUE','TIMESTAMP']
sensor_values_df = pd.DataFrame(sensor_values, columns=sensor_value_columns)
sensor_values_df.head()

## 1.5 Generate Production Data

In [None]:
def generate_production_data_walk(machine_id, min_val, max_val, start_time, duration_minutes):
    """
    Generates per-minute production counts for a machine using a bounded random walk.

    Args:
        machine_id: identifier copied into every output tuple.
        min_val: lower bound for the units produced per minute.
        max_val: upper bound for the units produced per minute; also reported
            as the expected units for every minute.
        start_time: timestamp of the first minute.
        duration_minutes: number of one-minute records to generate.

    Returns:
        A list of (machine_id, units_produced, units_expected, units_scrapped,
        timestamp) tuples, one per minute. units_produced is an integer within
        [min_val, max_val] (integer bounds assumed, as used by the caller);
        scrap is 25% of the shortfall against max_val, truncated to an integer.
    """
    production_data = []

    # Initialize the walk near the midpoint of the allowed range
    mean_val = np.mean([min_val, max_val])
    # The walk state is kept as a float. Truncating every step to an integer
    # turns any step smaller than one unit into 0, which freezes the series
    # for narrow ranges such as 90..100 (maximum step 0.5).
    level = np.random.uniform(mean_val * 0.9, mean_val * 1.1)

    # Maximum step size per minute, as a small percentage of the total range.
    # A smaller step size means less fluctuation.
    max_step = (max_val - min_val) * 0.05

    for i in range(duration_minutes):
        # Take a random step (positive or negative) and clamp to the valid range
        level = float(np.clip(level + np.random.uniform(-max_step, max_step), min_val, max_val))

        # Report whole units only; the fractional part stays in the walk state
        units_produced = int(round(level))

        # Create the timestamp for the current minute
        current_time = start_time + timedelta(minutes=i)

        scrap = int((max_val - units_produced) * 0.25)

        production_data.append((machine_id, units_produced, max_val, scrap, current_time))

    return production_data

# Every machine uses the same production band: between 90 and 100 units per minute.
production_value_min, production_value_max = 90, 100

produced_products = []
for machine_id in machines_df['MACHINE_ID']:
    produced_products += generate_production_data_walk(
        machine_id,
        production_value_min,
        production_value_max,
        start_generation_time,
        minutes_to_generate,
    )

production_columns = ['MACHINE_ID','UNITS_PRODUCED','UNITS_EXPECTED','UNITS_SCRAPED','TIMESTAMP']
produced_products_df = pd.DataFrame(produced_products, columns=production_columns)
produced_products_df.head()

## 1.6 Add Anomalies

In [None]:
# filter sensors with 0 or negative min values to not break anomaly logic
filtered_machines = machine_sensors_df[machine_sensors_df['SENSOR_MIN'] > 0]

# Sample from the distinct machine ids. Sampling the MACHINE_ID column directly
# draws sensor rows, so a machine with several sensors could be picked twice.
anomaly_machines = (
    filtered_machines['MACHINE_ID']
    .drop_duplicates()
    .sample(n=machines_with_anomalies)
)

# Anomalies start no earlier than two days into the generated period.
earliest_start_offset = 60*24*2

# select sensors for machines
anomaly_sensors = []
for anomaly_machine in anomaly_machines:
    all_sensors = filtered_machines[filtered_machines['MACHINE_ID'] == anomaly_machine]
    anomaly_duration = random.randint(60,600)
    # The last generated reading is at minute (minutes_to_generate - 1). Keep the
    # whole anomaly window inside the generated data; otherwise the window slice
    # is empty or truncated and the gradual-change step fails.
    latest_start_offset = minutes_to_generate - 1 - anomaly_duration
    anomaly_start = start_generation_time + timedelta(
        minutes=random.randint(earliest_start_offset, latest_start_offset)
    )
    anomaly_end = anomaly_start + timedelta(minutes=anomaly_duration)
    # Up to two sensors per machine; machines with a single eligible sensor keep one.
    # .copy() makes the sample an independent frame before columns are added.
    anomaly_df = all_sensors.sample(n=min(2, len(all_sensors))).copy()
    anomaly_df['ANOMALY_START'] = anomaly_start
    anomaly_df['ANOMALY_END'] = anomaly_end
    anomaly_df['ANOMALY_DURATION'] = anomaly_duration
    # Coin flip: drift far above the normal maximum or far below the normal minimum.
    if random.randint(0,1) == 1:
        anomaly_df['TARGET_VALUE'] = anomaly_df['SENSOR_MAX']*(1+random.randint(50,100)/100)
    else:
        anomaly_df['TARGET_VALUE'] = anomaly_df['SENSOR_MIN']*(random.randint(20,50)/100)
    anomaly_sensors.append(anomaly_df)

anomalies_df = pd.concat(anomaly_sensors)
anomalies_df

In [None]:
def apply_gradual_change(data_series: pd.Series, target_value: float, steps: int, fill_value: float) -> pd.Series:
    """
    Replace a series by a linear ramp towards target_value followed by a constant fill.

    The ramp starts one increment after the first value of data_series and reaches
    target_value after `steps` points. All remaining points are set to fill_value.

    Args:
        data_series: values of the window to overwrite; only its first value,
            length and index are used.
        target_value: value reached at the end of the ramp.
        steps: number of points of the ramp. If it exceeds the series length the
            ramp is cut off at the end of the series (slope unchanged); if it is
            zero or negative the whole series is filled with fill_value.
        fill_value: value used after the ramp (may be NaN).

    Returns:
        A new float Series with the same index as data_series. An empty input
        yields an empty Series.
    """
    n_points = len(data_series)
    if n_points == 0:
        # Nothing to overwrite; avoid the IndexError from reading the first value.
        return pd.Series([], index=data_series.index, dtype=float)

    # Get the starting value
    start_value = data_series.iloc[0]

    if steps > 0:
        # Change per step so that the target is reached after `steps` points
        change_per_step = (target_value - start_value) / steps
        ramp = start_value + np.arange(1, steps + 1) * change_per_step
    else:
        # No ramp requested; also avoids dividing by zero.
        ramp = np.empty(0, dtype=float)

    # Never produce more ramp points than the series holds
    ramp = ramp[:n_points]

    # fill values for everything after the ramp
    tail = np.full(n_points - len(ramp), fill_value)

    return pd.Series(np.concatenate([ramp, tail]), index=data_series.index)

In [None]:
# Inject every selected anomaly into the sensor readings and into the
# production figures of the affected machine.
for ix, row in anomalies_df.iterrows():
    sensor_id = row['SENSOR_ID']
    machine_id = row['MACHINE_ID']
    anomaly_start = row['ANOMALY_START']
    anomaly_end = row['ANOMALY_END']
    anomaly_duration = row['ANOMALY_DURATION']
    target_value = row['TARGET_VALUE']
    # The drift phase takes 25-50% of the anomaly duration
    steps = int(random.randint(25,50)/100*anomaly_duration)

    # Get sensor values
    sensor_values_anomaly = sensor_values_df.loc[
        (sensor_values_df['SENSOR_ID'] == sensor_id) &
        (sensor_values_df['TIMESTAMP'] >= anomaly_start) &
        (sensor_values_df['TIMESTAMP'] <= anomaly_end),
        'VALUE'
    ]

    # A window that lies (partly) outside the generated data yields an empty or
    # short slice; skip / shorten the ramp instead of raising.
    if not sensor_values_anomaly.empty:
        sensor_steps = min(steps, len(sensor_values_anomaly))
        # Drift to the target value, then report no value (NaN) for the rest of the window.
        # Only the VALUE column is written; the other columns are left untouched.
        sensor_values_df.loc[sensor_values_anomaly.index, 'VALUE'] = apply_gradual_change(
            sensor_values_anomaly, target_value, sensor_steps, np.nan
        )

    # Get machine values
    produced_products_anomaly = produced_products_df[
        (produced_products_df['MACHINE_ID'] == machine_id) &
        (produced_products_df['TIMESTAMP'] >= anomaly_start) &
        (produced_products_df['TIMESTAMP'] <= anomaly_end)
    ]

    if not produced_products_anomaly.empty:
        production_steps = min(steps, len(produced_products_anomaly))
        window_index = produced_products_anomaly.index
        # Production ramps down to 0 and stays there; scrap ramps up to 50 and
        # is 0 once nothing is produced any more.
        produced_products_df.loc[window_index, 'UNITS_PRODUCED'] = apply_gradual_change(
            produced_products_anomaly['UNITS_PRODUCED'], 0, production_steps, 0
        ).astype(int)
        produced_products_df.loc[window_index, 'UNITS_SCRAPED'] = apply_gradual_change(
            produced_products_anomaly['UNITS_SCRAPED'], 50, production_steps, 0
        ).astype(int)

## 1.7 Generate OEE Data

In [None]:
# add OEE metrics
units_produced = produced_products_df['UNITS_PRODUCED']
units_scrapped = produced_products_df['UNITS_SCRAPED']

# Performance: produced units relative to the expected units.
produced_products_df['OEE_PERFORMANCE'] = units_produced.div(produced_products_df['UNITS_EXPECTED'])
# Quality: produced units relative to produced plus scrapped units.
# NOTE(review): rows where both counts are 0 (the fill phase of an anomaly) give 0/0 = NaN.
produced_products_df['OEE_QUALITY'] = units_produced.div(units_produced.add(units_scrapped))
# Availability: 1 while the machine produced anything in that minute, else 0.
produced_products_df['OEE_AVAILABILITY'] = units_produced.gt(0).astype(int)
produced_products_df.sample(n=100)

## 1.8 Generate Maintenance Reports

In [None]:
# Build one LLM prompt per anomaly row. Each prompt carries the incident facts
# followed by a fixed example report that serves as the output template.
maintenance_report_rows = []
# Incidents are numbered sequentially. The index labels of anomalies_df are the
# original sensor-row labels kept by pd.concat and are not guaranteed to be
# unique, so they are not a safe basis for the INCIDENT_ID key.
for incident_no, (ix, row) in enumerate(anomalies_df.iterrows(), start=1):
    prompt = f"""
    You are a technican that works on a machine incident.
    
    MACHINE_ID: {row['MACHINE_ID']}
    SENSOR_ID: {row['SENSOR_ID']}
    SENSOR_NAME: {row['SENSOR_NAME']}
    SENSOR_METRIC: {row['SENSOR_METRIC']}
    SENSOR_UNIT: {row['SENSOR_UNIT']}
    Sensor Min Value (Normal behavior): {row['SENSOR_MIN']}
    Sensor Max Value (Normal Behavior): {row['SENSOR_MAX']}
    Anomaly Start: {row['ANOMALY_START']}
    Anomaly End: {row['ANOMALY_END']}
    Actual Sensor Value during incident: {row['TARGET_VALUE']}
    
    Generate a realistic looking maintenance report.
    Explain the issue and the steps taken to resolve the issue.
    Resolving the issue could include replacing a part, recalibrating a sensor, etc.

    Use this template to create maintenance reports:

    # MAINTENANCE REPORT
    
    **Report ID:** MR-2025-0902-M0007  
    **Date:** September 3, 2025  
    **Technician:** J. Martinez (ID: T-4471)  
    **Supervisor:** K. Thompson  
    
    ---
    
    ## INCIDENT SUMMARY
    
    **Machine ID:** M_0007_02  
    **Sensor ID:** S_0007_02_01  
    **Sensor Name:** Fill Level Sensor  
    **Incident Start:** September 2, 2025 - 20:13:37  
    **Incident End:** September 3, 2025 - 00:58:37  
    **Total Downtime:** 4 hours 45 minutes  
    
    ---
    
    ## PROBLEM DESCRIPTION
    
    The Fill Level Sensor (S_0007_02_01) on Machine M_0007_02 reported critically high readings of 284.24mm, significantly exceeding the normal operating range of 148.0-152.0mm. This represents an 87% deviation above maximum threshold, triggering automatic safety shutdown protocols.
    
    **Symptoms Observed:**
    - Sensor reading stuck at 284.24mm for entire incident duration
    - No response to actual fill level changes
    - Machine safety interlock engaged
    - Production line halted
    
    ---
    
    ## ROOT CAUSE ANALYSIS
    
    Upon investigation, the following issues were identified:
    
    1. **Primary Cause:** Ultrasonic sensor face contaminated with dried adhesive residue from packaging material
    2. **Secondary Cause:** Sensor mounting bracket had loosened, causing slight misalignment
    3. **Contributing Factor:** Inadequate cleaning schedule for sensor maintenance
    
    The contamination caused false echo returns, resulting in erroneous distance calculations and inflated fill level readings.
    
    ---
    
    ## CORRECTIVE ACTIONS TAKEN
    
    ### Immediate Actions:
    1. **20:45** - Isolated machine and locked out power supply
    2. **20:50** - Accessed sensor housing and documented contamination
    3. **21:15** - Cleaned sensor face using approved solvent (IPA 99%)
    4. **21:30** - Removed and inspected mounting hardware
    
    ### Repair Work:
    1. **21:45** - Replaced loose mounting bolts (Part #: MB-8x25-SS)
    2. **22:00** - Realigned sensor to manufacturer specifications
    3. **22:15** - Applied thread locker to mounting hardware
    4. **22:30** - Performed sensor calibration procedure
    
    ### Testing & Validation:
    1. **22:45** - Conducted 5-point calibration verification
    2. **23:00** - Performed operational test with various fill levels
    3. **23:30** - Monitored sensor readings for 30-minute stability test
    4. **00:15** - Final system integration test
    5. **00:45** - Production restart and monitoring
    
    ---
    
    ## PARTS USED
    
    | Part Number | Description | Quantity | Cost |
    |-------------|-------------|----------|------|
    | MB-8x25-SS | Stainless Steel Mounting Bolt | 4 | $12.50 |
    | TL-242 | Thread Locker Medium Strength | 1 tube | $8.75 |
    
    **Total Parts Cost:** $21.25

    ---
    
    ## PREVENTIVE MEASURES
    
    1. **Immediate:** Added sensor cleaning to weekly maintenance checklist
    2. **Short-term:** Scheduled monthly sensor alignment verification
    3. **Long-term:** Recommended installation of protective sensor shroud (Part #: PS-ULT-001)
    
    ---
    
    ## POST-REPAIR VERIFICATION
    
    - Sensor readings stable within normal range (149.2-151.8mm)
    - No false alarms or erratic behavior observed
    - Machine returned to full production capacity
    - 24-hour follow-up monitoring completed successfully
    
    ---
    
    ## RECOMMENDATIONS
    
    1. Implement protective covering for sensor to prevent future contamination
    2. Review packaging material handling procedures near sensor location
    3. Consider upgrading to sealed sensor housing for harsh environment applications
    
    ---
    
    **Report Completed:** September 3, 2025 - 08:30  
    **Technician Signature:** J. Martinez  
    **Supervisor Approval:** K. Thompson  
    **Next Scheduled Maintenance:** September 17, 2025
    
    """
    maintenance_report_rows.append({
        'INCIDENT_ID': f'INC_ID_{incident_no:05d}',
        'MACHINE_ID': row['MACHINE_ID'],
        # The report is filed 6 to 48 hours after the anomaly ended
        'TIMESTAMP': row['ANOMALY_END']+timedelta(hours=random.randint(6,48)),
        'DOCUMENT_PROMPT': prompt
    })

# Collect the prompts in a pandas frame, then let Cortex generate one document
# per row; the prompt column is dropped once the document column exists.
maintenance_reports = pd.DataFrame(maintenance_report_rows)
maintenance_reports_df = (
    session
        .create_dataframe(maintenance_reports)
        .with_column('MAINTENANCE_DOCUMENT', complete('claude-4-sonnet', F.col('DOCUMENT_PROMPT')))
        .drop('DOCUMENT_PROMPT')
)

## 1.9 Save Data

In [None]:
# save data
# The min/max bounds are only needed for data generation and are not stored in
# DIM_SENSORS. errors='ignore' keeps this cell re-runnable: on a second run the
# columns are already gone and a plain drop would raise a KeyError.
machine_sensors_df = machine_sensors_df.drop(columns=['SENSOR_MIN', 'SENSOR_MAX'], errors='ignore')

# pandas frames and the tables they are written to (existing tables are replaced)
pandas_tables = {
    'DIM_LINES': lines_df,
    'DIM_MACHINES': machines_df,
    'DIM_SENSORS': machine_sensors_df,
    'FACT_SENSOR_VALUES': sensor_values_df,
    'FACT_OEE': produced_products_df,
}

for table_name, table_df in pandas_tables.items():
    session.write_pandas(
        df=table_df,
        table_name=table_name,
        overwrite=True,
        use_logical_type=True,
        auto_create_table=True
    )

# The maintenance reports are a Snowpark DataFrame, so they are saved through
# the Snowpark writer instead of write_pandas.
maintenance_reports_df.write.save_as_table(table_name='MAINTENANCE_REPORTS', mode='overwrite')

## 1.10 Generate downsampled Sensor Data

In [None]:
-- Downsample the per-minute readings: one row per sensor and 10-minute bucket,
-- holding the average of the values that fall into that bucket.
CREATE OR REPLACE TABLE FACT_SENSOR_VALUES_10_MINUTES AS
SELECT
    sensor_id,
    AVG(value) AS value,
    TIME_SLICE(timestamp, 10, 'minute') AS time_bucket
FROM AI_DEVELOPMENT.SI_BOTTLING_COMPANY.FACT_SENSOR_VALUES
GROUP BY sensor_id, time_bucket
ORDER BY sensor_id, time_bucket DESC;

## 1.11 Create Table to store Anomalies

In [None]:
-- Table for detected sensor anomalies; it is created empty here.
CREATE OR REPLACE TABLE ANOMALIES (
    MACHINE_ID        TEXT,       -- machine the sensor belongs to
    SENSOR_ID         TEXT,       -- sensor that reported the value
    SENSOR_NAME       TEXT,       -- human-readable sensor name
    ANOMALY_TIMESTAMP TIMESTAMP,  -- when the anomalous value was recorded
    SENSOR_VALUE      DOUBLE      -- the anomalous reading
);

# Add Metadata

In [None]:
-- Table Descriptions
-- These comments are stored as table metadata, so their wording is kept
-- consistent with the table descriptions used in the semantic view.
COMMENT ON TABLE DIM_LINES IS 'A dimension table containing static information about each unique production line, including its name and the plant it belongs to. This table provides the core business context for analyzing machine and sensor data.';
COMMENT ON TABLE DIM_MACHINES IS 'A dimension table that catalogs all machines on the factory floor. It links each machine to a specific production line and contains descriptive attributes such as the machine''s name, manufacturer, and model.';
COMMENT ON TABLE DIM_SENSORS IS 'A dimension table that stores the details of every sensor installed on the machines. It provides critical context by defining what each sensor measures, the metric name, and its corresponding unit of measurement.';
COMMENT ON TABLE FACT_SENSOR_VALUES IS 'A fact table that records all time-series sensor measurements. This is the central repository for dynamic data, storing the specific value and timestamp for each sensor reading, which is used for performance analysis and monitoring.';
COMMENT ON TABLE FACT_SENSOR_VALUES_10_MINUTES IS 'A fact table that records all time-series sensor measurements. The data has been downsampled to 10 minute intervals.';
COMMENT ON TABLE FACT_OEE IS 'This table stores OEE information.';
COMMENT ON TABLE MAINTENANCE_REPORTS IS 'This table stores maintenance reports from technicians.';
COMMENT ON TABLE ANOMALIES IS 'This table stores sensor anomalies identified using Outlier Detection in Python.';

-- Column Descriptions
-- One comment per column, grouped by table. The texts are stored as column metadata.
COMMENT ON COLUMN DIM_LINES.PLANT_ID IS 'A unique identifier for the manufacturing plant where the line is located. It serves as a foreign key to a plant dimension table.';
COMMENT ON COLUMN DIM_LINES.LINE_ID IS 'The primary key and unique identifier for each production line.';
COMMENT ON COLUMN DIM_LINES.LINE_NAME IS 'The functional name of the production line (e.g., ''Water Bottling Line A'').';

COMMENT ON COLUMN DIM_MACHINES.LINE_ID IS 'The foreign key linking to DIM_LINES, specifying which production line the machine is part of.';
COMMENT ON COLUMN DIM_MACHINES.MACHINE_ID IS 'The primary key and unique identifier for each machine.';
COMMENT ON COLUMN DIM_MACHINES.MACHINE_NAME IS 'The descriptive name of the machine based on its function (e.g., ''Bottle Filler'').';
COMMENT ON COLUMN DIM_MACHINES.MACHINE_MANUFACTURER IS 'The manufacturer of the machine (e.g., ''Krones'').';
COMMENT ON COLUMN DIM_MACHINES.MACHINE_MODEL IS 'The specific model number or name of the machine (e.g., ''Variopac Pro'').';

COMMENT ON COLUMN DIM_SENSORS.MACHINE_ID IS 'The foreign key linking to DIM_MACHINES, specifying the machine the sensor is installed on.';
COMMENT ON COLUMN DIM_SENSORS.SENSOR_ID IS 'The primary key and unique identifier for each sensor.';
COMMENT ON COLUMN DIM_SENSORS.SENSOR_NAME IS 'The descriptive name of the sensor based on its purpose (e.g., ''Torque Sensor'').';
COMMENT ON COLUMN DIM_SENSORS.SENSOR_METRIC IS 'The physical quantity the sensor measures (e.g., ''pressure'').';
COMMENT ON COLUMN DIM_SENSORS.SENSOR_UNIT IS 'The unit of measurement for the sensor readings (e.g., ''bar'').';

COMMENT ON COLUMN FACT_SENSOR_VALUES.SENSOR_ID IS 'The foreign key linking to DIM_SENSORS, identifying the source of the measurement.';
COMMENT ON COLUMN FACT_SENSOR_VALUES.VALUE IS 'The recorded numerical value of the sensor reading at the specific timestamp.';
COMMENT ON COLUMN FACT_SENSOR_VALUES.TIMESTAMP IS 'The exact date and time when the sensor measurement was recorded.';

COMMENT ON COLUMN FACT_SENSOR_VALUES_10_MINUTES.SENSOR_ID IS 'The foreign key linking to DIM_SENSORS, identifying the source of the measurement.';
COMMENT ON COLUMN FACT_SENSOR_VALUES_10_MINUTES.VALUE IS 'The recorded numerical value of the sensor reading at the specific timestamp.';
COMMENT ON COLUMN FACT_SENSOR_VALUES_10_MINUTES.TIME_BUCKET IS 'The date and time when the sensor measurement was recorded. Sensor values were downsampled to 10 minute intervals.';

COMMENT ON COLUMN FACT_OEE.MACHINE_ID IS 'The foreign key linking to DIM_MACHINES, specifying the machine.';
COMMENT ON COLUMN FACT_OEE.UNITS_PRODUCED IS 'The number of units produced.';
COMMENT ON COLUMN FACT_OEE.UNITS_EXPECTED IS 'The number of units expected';
COMMENT ON COLUMN FACT_OEE.UNITS_SCRAPED IS 'The number of units scraped due to poor quality.';
COMMENT ON COLUMN FACT_OEE.OEE_PERFORMANCE IS 'Overall Equipment Effectiveness (Performance). Defined as (Actual Production Rate / Ideal Production Rate)';
COMMENT ON COLUMN FACT_OEE.OEE_QUALITY IS 'Overall Equipment Effectiveness (Quality). Defined as (Good Units Produced / Total Units Produced)';
COMMENT ON COLUMN FACT_OEE.OEE_AVAILABILITY IS 'Overall Equipment Effectiveness (Availability). Defined as (Total Operating Time / Planned Production Time)';
COMMENT ON COLUMN FACT_OEE.TIMESTAMP IS 'The exact date and time when the machine produced products.';

COMMENT ON COLUMN MAINTENANCE_REPORTS.INCIDENT_ID IS 'The primary key and unique identifier for each incident.';
COMMENT ON COLUMN MAINTENANCE_REPORTS.MACHINE_ID IS 'The foreign key linking to DIM_MACHINES, specifying the machine the sensor is installed on.';
COMMENT ON COLUMN MAINTENANCE_REPORTS.TIMESTAMP IS 'The exact date and time when the technician provided the report.';
COMMENT ON COLUMN MAINTENANCE_REPORTS.MAINTENANCE_DOCUMENT IS 'The maintenance document from the technician.';

COMMENT ON COLUMN ANOMALIES.MACHINE_ID IS 'Unique identifier of the machine whose sensors had anomalies.';
COMMENT ON COLUMN ANOMALIES.SENSOR_ID IS 'Unique identifier for the specific sensor on the machine that had anomalies.';
COMMENT ON COLUMN ANOMALIES.SENSOR_NAME IS 'Human-readable name for the sensor.';
COMMENT ON COLUMN ANOMALIES.ANOMALY_TIMESTAMP IS 'The date and time when the sensor value (anomaly) was recorded.';
COMMENT ON COLUMN ANOMALIES.SENSOR_VALUE IS 'The value of the sensor at the time of the anomaly.';

-- Primary / Foreign Keys
-- Primary keys are declared before the foreign keys that reference them.
ALTER TABLE DIM_LINES ADD CONSTRAINT PK_DIM_LINES PRIMARY KEY (LINE_ID);

ALTER TABLE DIM_MACHINES ADD CONSTRAINT PK_DIM_MACHINES PRIMARY KEY (MACHINE_ID);
ALTER TABLE DIM_MACHINES
ADD CONSTRAINT FK_DIM_MACHINES FOREIGN KEY (LINE_ID)
REFERENCES DIM_LINES(LINE_ID);

ALTER TABLE DIM_SENSORS ADD CONSTRAINT PK_DIM_SENSORS PRIMARY KEY (SENSOR_ID);
ALTER TABLE DIM_SENSORS
ADD CONSTRAINT FK_DIM_SENSORS FOREIGN KEY (MACHINE_ID)
REFERENCES DIM_MACHINES(MACHINE_ID);

ALTER TABLE FACT_SENSOR_VALUES
ADD CONSTRAINT FK_FACT_SENSOR_VALUES FOREIGN KEY (SENSOR_ID)
REFERENCES DIM_SENSORS(SENSOR_ID);

ALTER TABLE FACT_SENSOR_VALUES_10_MINUTES
ADD CONSTRAINT FK_FACT_SENSOR_VALUES_10_MINUTES FOREIGN KEY (SENSOR_ID)
REFERENCES DIM_SENSORS(SENSOR_ID);

-- FACT_OEE.MACHINE_ID is documented as a foreign key to DIM_MACHINES; declare it.
ALTER TABLE FACT_OEE
ADD CONSTRAINT FK_FACT_OEE FOREIGN KEY (MACHINE_ID)
REFERENCES DIM_MACHINES(MACHINE_ID);

ALTER TABLE MAINTENANCE_REPORTS ADD CONSTRAINT PK_MAINTENANCE_REPORTS PRIMARY KEY (INCIDENT_ID);
ALTER TABLE MAINTENANCE_REPORTS
ADD CONSTRAINT FK_MAINTENANCE_REPORTS FOREIGN KEY (MACHINE_ID)
REFERENCES DIM_MACHINES(MACHINE_ID);

-- ANOMALIES rows are related to DIM_SENSORS via SENSOR_ID in the semantic view; declare the key.
ALTER TABLE ANOMALIES
ADD CONSTRAINT FK_ANOMALIES_SENSOR FOREIGN KEY (SENSOR_ID)
REFERENCES DIM_SENSORS(SENSOR_ID);

# 2. Search Services for High Cardinality Columns

In [None]:
-- Cortex Search service over the distinct production line names in DIM_LINES.
CREATE OR REPLACE CORTEX SEARCH SERVICE _CA_LINE_NAME
    ON LINE_NAME
    WAREHOUSE = AI_WH
    TARGET_LAG = '12 hour'
    EMBEDDING_MODEL = 'snowflake-arctic-embed-l-v2.0'
AS (
    SELECT DISTINCT LINE_NAME
    FROM DIM_LINES
);

In [None]:
-- Cortex Search service over the distinct machine names in DIM_MACHINES.
CREATE OR REPLACE CORTEX SEARCH SERVICE _CA_MACHINE_NAME
    ON MACHINE_NAME
    WAREHOUSE = AI_WH
    TARGET_LAG = '12 hour'
    EMBEDDING_MODEL = 'snowflake-arctic-embed-l-v2.0'
AS (
    SELECT DISTINCT MACHINE_NAME
    FROM DIM_MACHINES
);

In [None]:
-- Cortex Search service over the distinct machine manufacturers in DIM_MACHINES.
CREATE OR REPLACE CORTEX SEARCH SERVICE _CA_MACHINE_MANUFACTURER
    ON MACHINE_MANUFACTURER
    WAREHOUSE = AI_WH
    TARGET_LAG = '12 hour'
    EMBEDDING_MODEL = 'snowflake-arctic-embed-l-v2.0'
AS (
    SELECT DISTINCT MACHINE_MANUFACTURER
    FROM DIM_MACHINES
);

In [None]:
-- Cortex Search service over the distinct machine models in DIM_MACHINES.
CREATE OR REPLACE CORTEX SEARCH SERVICE _CA_MACHINE_MODEL
    ON MACHINE_MODEL
    WAREHOUSE = AI_WH
    TARGET_LAG = '12 hour'
    EMBEDDING_MODEL = 'snowflake-arctic-embed-l-v2.0'
AS (
    SELECT DISTINCT MACHINE_MODEL
    FROM DIM_MACHINES
);

In [None]:
-- Cortex Search service over the distinct sensor names in DIM_SENSORS.
CREATE OR REPLACE CORTEX SEARCH SERVICE _CA_SENSOR_NAME
    ON SENSOR_NAME
    WAREHOUSE = AI_WH
    TARGET_LAG = '12 hour'
    EMBEDDING_MODEL = 'snowflake-arctic-embed-l-v2.0'
AS (
    SELECT DISTINCT SENSOR_NAME
    FROM DIM_SENSORS
);

# 3. Create Semantic View

In [None]:
create or replace semantic view AI_DEVELOPMENT.SI_BOTTLING_COMPANY.FACTORY_DATA_MODEL tables (
    ANOMALIES with synonyms =(
        'anomalies',
        'sensor_anomalies',
        'machine_anomalies',
        'outlier_data',
        'sensor_outliers',
        'machine_outlier_data'
    ) comment = 'This table stores sensor anomalies identified using Outlier Detection in Python, providing a record of unusual sensor readings from industrial machines, including the machine and sensor ID, sensor name, timestamp of the anomaly, and the anomalous sensor value.',
    DIM_LINES primary key (LINE_ID) with synonyms =(
        'production_lines',
        'manufacturing_lines',
        'factory_lines',
        'assembly_lines',
        'production_facilities',
        'manufacturing_facilities',
        'factory_floor',
        'production_areas'
    ) comment = 'A dimension table containing static information about each unique production line, including its name and the plant it belongs to. This table provides the core business context for analyzing machine and sensor data.',
    DIM_MACHINES primary key (MACHINE_ID) with synonyms =(
        'Machines',
        'Factory Equipment',
        'Production Machines',
        'Manufacturing Assets',
        'Factory Floor Assets',
        'Industrial Equipment',
        'Production Line Machines',
        'Manufacturing Machinery'
    ) comment = 'A dimension table that catalogs all machines on the factory floor. It links each machine to a specific production line and contains descriptive attributes such as the machine''''s name, manufacturer, and model.',
    DIM_SENSORS primary key (SENSOR_ID) with synonyms =(
        'Sensor Details',
        'Sensor Information',
        'Sensor Master',
        'Sensor Reference',
        'Sensor Registry',
        'Sensor Catalog'
    ) comment = 'A dimension table that stores the details of every sensor installed on the machines. It provides critical context by defining what each sensor measures, the metric name, and its corresponding unit of measurement.',
    FACT_OEE with synonyms =(
        'OEE_Facts',
        'OEE_Data',
        'Equipment_Effectiveness',
        'Machine_Performance',
        'Production_Efficiency',
        'Manufacturing_OEE',
        'OEE_Statistics',
        'Factory_Performance',
        'Production_OEE',
        'Equipment_Performance_Data'
    ) comment = 'This table stores Overall Equipment Effectiveness (OEE) information for manufacturing machines, capturing key performance metrics such as production rates, quality, and availability, to measure the efficiency and effectiveness of each machine.',
    FACT_SENSOR_VALUES_10_MINUTES with synonyms =(
        'Sensor Readings',
        'Sensor Measurements',
        'Time Series Data',
        'Fact Sensor Data',
        'Downsamlped Sensor Values',
        '10 Minute Sensor Intervals',
        'Sensor Records',
        'Measurement Data',
        'Time Series Sensor Records'
    ) comment = 'A fact table that records all time-series sensor measurements. The data has been downsampled to 10 minute intervals.'
) relationships (
    SENSORS_X_ANOMALIES as ANOMALIES(SENSOR_ID) references DIM_SENSORS(SENSOR_ID),
    LINES_X_MACHINES as DIM_MACHINES(LINE_ID) references DIM_LINES(LINE_ID),
    MACHINES_X_SENSORS as DIM_SENSORS(MACHINE_ID) references DIM_MACHINES(MACHINE_ID),
    MACHINES_X_OEE as FACT_OEE(MACHINE_ID) references DIM_MACHINES(MACHINE_ID),
    SENSORS_X_SENSOR_VALUES as FACT_SENSOR_VALUES_10_MINUTES(SENSOR_ID) references DIM_SENSORS(SENSOR_ID)
) facts (
    PUBLIC ANOMALIES.SENSOR_VALUE as SENSOR_VALUE with synonyms =(
        'sensor_reading',
        'sensor_measurement',
        'anomaly_value',
        'sensor_data',
        'reading_value',
        'measurement_value',
        'sensor_output'
    ) comment = 'The value of the sensor at the time of the anomaly.',
    PUBLIC FACT_OEE.OEE_AVAILABILITY as OEE_AVAILABILITY with synonyms =(
        'operating_time_percentage',
        'planned_production_time_utilization',
        'uptime_percentage',
        'availability_rate',
        'operating_efficiency'
    ) comment = 'verall Equipment Effectiveness (Availability). Defines as (Total Operating Time / Planned Production Time)',
    PUBLIC FACT_OEE.OEE_PERFORMANCE as OEE_PERFORMANCE with synonyms =(
        'performance_efficiency',
        'production_rate',
        'actual_production_rate',
        'ideal_production_rate_ratio',
        'equipment_performance',
        'production_efficiency_ratio',
        'oee_performance_rate'
    ) comment = 'Overall Equipment Effectiveness (Performance). Defined as (Actual Production Rate / Ideal Production Rate)',
    PUBLIC FACT_OEE.OEE_QUALITY as OEE_QUALITY with synonyms =(
        'good_units_produced_rate',
        'quality_rate',
        'defect_rate',
        'yield_rate',
        'quality_efficiency',
        'quality_performance'
    ) comment = 'Overall Equipment Effectiveness (Quality). Defines as (Good Units Produced / Total Units Produced)',
    PUBLIC FACT_OEE.UNITS_EXPECTED as UNITS_EXPECTED with synonyms =(
        'target_units',
        'expected_production',
        'planned_units',
        'forecasted_units',
        'projected_units',
        'desired_output',
        'expected_output',
        'target_production',
        'planned_production'
    ) comment = 'The number of units expected',
    PUBLIC FACT_OEE.UNITS_PRODUCED as UNITS_PRODUCED with synonyms =(
        'units_manufactured',
        'items_produced',
        'production_output',
        'total_units_made',
        'produced_quantity',
        'manufactured_goods'
    ) comment = 'The number of units produced.',
    PUBLIC FACT_OEE.UNITS_SCRAPED as UNITS_SCRAPED with synonyms =(
        'units_discarded',
        'defective_units',
        'rejected_units',
        'units_defective',
        'scrapped_items',
        'non_conforming_units',
        'defective_production',
        'rejected_production'
    ) comment = 'The number of units scraped due to poor quality.',
    PUBLIC FACT_SENSOR_VALUES_10_MINUTES.VALUE as VALUE with synonyms =(
        'measurement',
        'reading',
        'sensor_reading',
        'recorded_value',
        'numerical_value',
        'data_point',
        'metric',
        'sensor_measurement',
        'value_reading'
    ) comment = 'Sensor reading values recorded at 10-minute intervals.'
) dimensions (
    PUBLIC ANOMALIES.ANOMALY_TIMESTAMP as ANOMALY_TIMESTAMP with synonyms =(
        'anomaly_date',
        'anomaly_time',
        'anomaly_recording_time',
        'sensor_anomaly_timestamp',
        'anomaly_detection_time',
        'anomaly_occurrence_time'
    ) comment = 'The date and time when the sensor value (anomaly) was recorded.',
    PUBLIC ANOMALIES.MACHINE_ID as MACHINE_ID with synonyms =(
        'machine_identifier',
        'equipment_id',
        'device_number',
        'asset_id',
        'equipment_identifier',
        'unit_id'
    ) comment = 'Unique identifier of the machine whose sensors had anomalies.',
    PUBLIC ANOMALIES.SENSOR_ID as SENSOR_ID with synonyms =(
        'sensor_identifier',
        'device_id',
        'sensor_code',
        'sensor_tag',
        'device_tag',
        'sensor_serial_number'
    ) comment = 'Unique identifier for the specific sensor on the machine that had anomalies.',
    PUBLIC ANOMALIES.SENSOR_NAME as SENSOR_NAME with synonyms =(
        'sensor_label',
        'device_name',
        'sensor_description',
        'device_label',
        'measurement_name'
    ) comment = 'Human-readable name for the sensor.',
    PUBLIC DIM_LINES.LINE_ID as LINE_ID with synonyms =(
        'production_line_id',
        'line_key',
        'line_identifier',
        'production_line_key',
        'line_code'
    ) comment = 'The primary key and unique identifier for each production line.',
    PUBLIC DIM_LINES.LINE_NAME as LINE_NAME with synonyms =(
        'production_line_name',
        'line_description',
        'line_label',
        'manufacturing_line_name',
        'production_line_label'
    ) comment = 'The functional name of the production line (e.g., ''Water Bottling Line A'').',
    PUBLIC DIM_LINES.PLANT_ID as PLANT_ID with synonyms =(
        'plant_code',
        'manufacturing_facility_id',
        'factory_identifier',
        'production_site_id',
        'facility_number'
    ) comment = 'A unique identifier for the manufacturing plant where the line is located. It serves as a foreign key to a plant dimension table.',
    PUBLIC DIM_MACHINES.LINE_ID as LINE_ID with synonyms =(
        'production_line_id',
        'line_key',
        'production_line_key',
        'factory_line_id',
        'manufacturing_line_id'
    ) comment = 'The foreign key linking to DIM_LINES, specifying which production line the machine is part of.',
    PUBLIC DIM_MACHINES.MACHINE_ID as MACHINE_ID with synonyms =(
        'machine_key',
        'unique_machine_identifier',
        'equipment_id',
        'device_id',
        'asset_id'
    ) comment = 'The primary key and unique identifier for each machine.',
    PUBLIC DIM_MACHINES.MACHINE_MANUFACTURER as MACHINE_MANUFACTURER with synonyms =(
        'machine_maker',
        'equipment_supplier',
        'device_producer',
        'machine_builder',
        'equipment_vendor',
        'device_manufacturer',
        'machine_supplier'
    ) comment = 'The manufacturer of the machine (e.g., ''Krones'').',
    PUBLIC DIM_MACHINES.MACHINE_MODEL as MACHINE_MODEL with synonyms =(
        'machine_type',
        'equipment_model',
        'device_number',
        'machine_designation',
        'equipment_designation',
        'model_number',
        'machine_identifier'
    ) comment = 'The specific model number or name of the machine (e.g., ''Variopac Pro'').',
    PUBLIC DIM_MACHINES.MACHINE_NAME as MACHINE_NAME with synonyms =(
        'machine_description',
        'equipment_name',
        'device_name',
        'machine_label',
        'equipment_label',
        'device_description'
    ) comment = 'The descriptive name of the machine based on its function (e.g., ''Bottle Filler'').',
    PUBLIC DIM_SENSORS.MACHINE_ID as MACHINE_ID with synonyms =(
        'machine_identifier',
        'equipment_id',
        'device_number',
        'asset_number',
        'equipment_reference'
    ) comment = 'The foreign key linking to DIM_MACHINES, specifying the machine the sensor is installed on.',
    PUBLIC DIM_SENSORS.SENSOR_ID as SENSOR_ID with synonyms =(
        'sensor_key',
        'sensor_identifier',
        'sensor_unique_id',
        'sensor_code',
        'sensor_reference'
    ) comment = 'The primary key and unique identifier for each sensor.',
    PUBLIC DIM_SENSORS.SENSOR_METRIC as SENSOR_METRIC with synonyms =(
        'physical_quantity',
        'measured_value',
        'sensor_measurement',
        'metric_name',
        'measured_attribute',
        'sensor_reading_type'
    ) comment = 'The physical quantity the sensor measures (e.g., ''pressure'').',
    PUBLIC DIM_SENSORS.SENSOR_NAME as SENSOR_NAME with synonyms =(
        'sensor_description',
        'sensor_label',
        'sensor_title',
        'device_name',
        'measurement_name',
        'sensor_tag'
    ) comment = 'The descriptive name of the sensor based on its purpose (e.g., ''Torque Sensor'').',
    PUBLIC DIM_SENSORS.SENSOR_UNIT as SENSOR_UNIT with synonyms =(
        'unit_of_measurement',
        'measurement_unit',
        'sensor_measurement_unit',
        'unit',
        'measurement_type',
        'data_unit'
    ) comment = 'The unit of measurement for the sensor readings (e.g., ''bar'').',
    PUBLIC FACT_OEE.MACHINE_ID as MACHINE_ID with synonyms =(
        'machine_number',
        'equipment_id',
        'device_id',
        'asset_id',
        'production_unit_id',
        'manufacturing_unit_id',
        'factory_machine_id'
    ) comment = 'The foreign key linking to DIM_MACHINES, specifying the machine.',
    PUBLIC FACT_OEE.TIMESTAMP as TIMESTAMP with synonyms =(
        'date',
        'datetime',
        'creation_time',
        'record_time',
        'log_time',
        'event_time',
        'timestamp_ntz',
        'time_of_event',
        'date_and_time'
    ) comment = 'The exact date and time when the machine produced products.',
    PUBLIC FACT_SENSOR_VALUES_10_MINUTES.SENSOR_ID as SENSOR_ID with synonyms =(
        'sensor_key',
        'measurement_source',
        'sensor_identifier',
        'device_id',
        'measurement_device',
        'sensor_reference'
    ) comment = 'Unique identifier for a specific sensor.',
    PUBLIC FACT_SENSOR_VALUES_10_MINUTES.TIME_BUCKET as TIME_BUCKET with synonyms =(
        'time_interval',
        'measurement_time',
        'sampling_time',
        'timestamp',
        'time_slot',
        'bucket_time',
        'interval_time',
        'measurement_interval',
        'time_period'
    ) comment = 'The time at which sensor values were recorded, aggregated into 10-minute intervals.'
) metrics (
    PUBLIC FACT_OEE.OEE_OVERALL as AVG(OEE_AVAILABILITY * OEE_PERFORMANCE * OEE_QUALITY) with synonyms =('overall_oee') comment = 'Overall Equipment Effectiveness defined as AVG(Availability * Performance * Quality).'
) comment = 'This semantic data model provides a comprehensive view of manufacturing operations by linking machine performance, sensor data, and production line information. It helps teams monitor equipment efficiency and quickly identify and analyze anomalies.
 
 ---
 
 What the Model Contains
 
 The model is built on four core tables:
 
 * ANOMALIES: Contains records of unusual sensor readings, including which machine and sensor were affected, the type of anomaly, and the specific timestamp and value.
 * DIM_LINES: Provides details about each production line, such as its name and the manufacturing plant it''s in.
 * DIM_MACHINES: Lists all the machines, detailing their manufacturer, model, and the production line they belong to.
 * DIM_SENSORS: Describes each sensor, including the machine it''s installed on, what physical quantity it measures (e.g., pressure), and the unit of measurement.
 * FACT_OEE: Tracks Overall Equipment Effectiveness (OEE) metrics like Availability, Performance, and Quality for each machine over time, along with the number of units produced and scrapped.
 * FACT_SENSOR_VALUES_10_MINUTES: Stores sensor readings aggregated into 10-minute intervals.
 
 ---
 
 Key Business Questions It Can Answer
 
 This model helps you answer critical questions about factory performance and potential issues, such as:
 
 * Which machines and production lines have the lowest OEE scores?
 * What is the average OEE for a specific manufacturing plant or production line?
 * How does a machine''s performance or quality change over time?
 * Which sensors are showing the most frequent or severe anomalies?' with extension (
    CA = '{"tables":[{"name":"ANOMALIES","dimensions":[{"name":"MACHINE_ID","sample_values":["M_0007_02"]},{"name":"SENSOR_ID","sample_values":["S_0007_02_02","S_0007_02_03","S_0007_02_01"]},{"name":"SENSOR_NAME","sample_values":["Fill Level Sensor","Pressure Sensor","Flow Meter"],"cortex_search_service":{"database":"AI_DEVELOPMENT","schema":"SI_BOTTLING_COMPANY","service":"_CA_SENSOR_NAME"}}],"facts":[{"name":"SENSOR_VALUE","sample_values":["151.99","240.46","35.44"]}],"time_dimensions":[{"name":"ANOMALY_TIMESTAMP","sample_values":["2025-08-26T13:40:00.000+0000","2025-08-26T13:30:00.000+0000","2025-08-22T23:00:00.000+0000"]}]},{"name":"DIM_LINES","dimensions":[{"name":"LINE_ID","sample_values":["L_0003","L_0001","L_0002"]},{"name":"LINE_NAME","sample_values":["Bottling Line 0001","Bottling Line 0002","Bottling Line 0003"],"cortex_search_service":{"database":"AI_DEVELOPMENT","schema":"SI_BOTTLING_COMPANY","service":"_CA_LINE_NAME"}},{"name":"PLANT_ID","sample_values":["P_0003","P_0002","P_0001"]}]},{"name":"DIM_MACHINES","dimensions":[{"name":"LINE_ID","sample_values":["L_0003","L_0001","L_0002"]},{"name":"MACHINE_ID","sample_values":["M_0001_02","M_0009_01","M_0001_01"]},{"name":"MACHINE_MANUFACTURER","sample_values":["Accutek","Sidel","Lanfranchi"],"cortex_search_service":{"database":"AI_DEVELOPMENT","schema":"SI_BOTTLING_COMPANY","service":"_CA_MACHINE_MANUFACTURER"}},{"name":"MACHINE_MODEL","sample_values":["Actima","Rotary Air Rinser","AccuWeight Fillers"],"cortex_search_service":{"database":"AI_DEVELOPMENT","schema":"SI_BOTTLING_COMPANY","service":"_CA_MACHINE_MODEL"}},{"name":"MACHINE_NAME","sample_values":["Bottle Rinser","Bottle 
Filler","Capper"],"cortex_search_service":{"database":"AI_DEVELOPMENT","schema":"SI_BOTTLING_COMPANY","service":"_CA_MACHINE_NAME"}}]},{"name":"DIM_SENSORS","dimensions":[{"name":"MACHINE_ID","sample_values":["M_0001_02","M_0009_01","M_0001_01"]},{"name":"SENSOR_ID","sample_values":["S_0009_04_01","S_0002_03_01","S_0005_07_03"]},{"name":"SENSOR_METRIC","sample_values":["motor_speed","fill_level","pressure"]},{"name":"SENSOR_NAME","sample_values":["Fill Level Sensor","Pressure Sensor","Flow Sensor"],"cortex_search_service":{"database":"AI_DEVELOPMENT","schema":"SI_BOTTLING_COMPANY","service":"_CA_SENSOR_NAME"}},{"name":"SENSOR_UNIT","sample_values":["m3/h","bar","°C"]}]},{"name":"FACT_OEE","dimensions":[{"name":"MACHINE_ID","sample_values":["M_0009_01","M_0008_07","M_0008_06"]}],"facts":[{"name":"OEE_AVAILABILITY","sample_values":["1","0"]},{"name":"OEE_PERFORMANCE","sample_values":["0.9","0.97","0.92"]},{"name":"OEE_QUALITY","sample_values":["1","0.9782608696","0.9787234043"]},{"name":"UNITS_EXPECTED","sample_values":["100"]},{"name":"UNITS_PRODUCED","sample_values":["90","92","97"]},{"name":"UNITS_SCRAPED","sample_values":["2","1","0"]}],"metrics":[{"name":"OEE_OVERALL"}],"time_dimensions":[{"name":"TIMESTAMP","sample_values":["2025-08-30T08:39:20.663+0000","2025-08-30T08:40:20.663+0000","2025-08-30T08:41:20.663+0000"]}]},{"name":"FACT_SENSOR_VALUES_10_MINUTES","dimensions":[{"name":"SENSOR_ID","sample_values":["S_0002_02_04","S_0002_02_03","S_0002_02_05"]}],"facts":[{"name":"VALUE","sample_values":["145.525780401","72.908447532","114.251841949"]}],"time_dimensions":[{"name":"TIME_BUCKET","sample_values":["2025-08-30T03:10:00.000+0000","2025-08-26T20:00:00.000+0000","2025-09-03T13:50:00.000+0000"]}]}],"relationships":[{"name":"SENSORS_X_ANOMALIES"},{"name":"LINES_X_MACHINES"},{"name":"MACHINES_X_SENSORS"},{"name":"MACHINES_X_OEE"},{"name":"SENSORS_X_SENSOR_VALUES"}],"verified_queries":[{"name":"Which machine manufacturers had the lowest overall OEE (calculated as 
AVG(oee_availability * oee_performance * oee_quality)) in the past 14 days? Provide the bottom 5 manufacturers ranked by OEE.","question":"Which machine manufacturers had the lowest overall OEE (calculated as AVG(oee_availability * oee_performance * oee_quality)) in the past 14 days? Provide the bottom 5 manufacturers ranked by OEE.","sql":"SELECT\\n  dm.machine_manufacturer,\\n  AVG(\\n    fo.oee_availability * fo.oee_performance * fo.oee_quality\\n  ) AS overall_oee\\nFROM\\n  fact_oee AS fo\\n  LEFT OUTER JOIN dim_machines AS dm ON fo.machine_id = dm.machine_id\\nWHERE\\n  fo.timestamp >= DATEADD(DAY, -14, CURRENT_DATE)\\nGROUP BY\\n  dm.machine_manufacturer\\nORDER BY\\n  overall_oee ASC\\nLIMIT\\n  5","use_as_onboarding_question":false,"verified_by":"Michael Gorkow","verified_at":1757098301},{"name":"List all machines that had anomalies in the past 14 days (previous 14 calendar days from current date) along with the count of anomalies for each machine, showing line ID, machine ID, machine name, and anomaly count.","question":"List all machines that had anomalies in the past 14 days (previous 14 calendar days from current date) along with the count of anomalies for each machine, showing line ID, machine ID, machine name, and anomaly count.","sql":"SELECT\\n  dm.line_id,\\n  dm.machine_id,\\n  dm.machine_name,\\n  COUNT(a.anomaly_timestamp) AS anomaly_count\\nFROM\\n  anomalies AS a\\n  LEFT OUTER JOIN dim_sensors AS ds ON a.sensor_id = ds.sensor_id\\n  LEFT OUTER JOIN dim_machines AS dm ON ds.machine_id = dm.machine_id\\nWHERE\\n  a.anomaly_timestamp >= DATEADD(DAY, -14, CURRENT_DATE)\\nGROUP BY\\n  dm.line_id,\\n  dm.machine_id,\\n  dm.machine_name\\nORDER BY\\n  dm.machine_id","use_as_onboarding_question":false,"verified_by":"Michael Gorkow","verified_at":1757098393},{"name":"Which bottle filler machines had the lowest oee in the past 14 days?","question":"Which bottle filler machines had the lowest oee in the past 14 days?","sql":"SELECT\\n  
dm.machine_id,\\n  dm.machine_name,\\n  AVG(\\n    fo.oee_availability * fo.oee_performance * fo.oee_quality\\n  ) AS overall_oee\\nFROM\\n  fact_oee AS fo\\n  LEFT OUTER JOIN dim_machines AS dm ON fo.machine_id = dm.machine_id\\nWHERE\\n  fo.timestamp >= DATEADD(DAY, -14, CURRENT_DATE)\\n  AND dm.machine_name = ''Bottle Filler''\\nGROUP BY\\n  dm.machine_id,\\n  dm.machine_name\\nORDER BY\\n  overall_oee ASC","use_as_onboarding_question":false,"verified_by":"Michael Gorkow","verified_at":1757098878}]}'
);

# 4. Create Search Service for Maintenance Reports

In [None]:
-- Cortex Search service over the MAINTENANCE_REPORTS table.
--   ON          : MAINTENANCE_DOCUMENT is the text column that gets embedded and searched.
--   ATTRIBUTES  : INCIDENT_ID, MACHINE_ID and TIMESTAMP are exposed alongside each hit
--                 (usable as filters in search requests).
--   TARGET_LAG  : the index may trail the base table by up to one hour.
-- The source query takes every column of the table, so all of them are returnable.
CREATE OR REPLACE CORTEX SEARCH SERVICE MAINTENANCE_REPORTS
  ON MAINTENANCE_DOCUMENT
  ATTRIBUTES INCIDENT_ID, MACHINE_ID, TIMESTAMP
  WAREHOUSE = AI_WH
  TARGET_LAG = '1 hour'
  EMBEDDING_MODEL = 'snowflake-arctic-embed-l-v2.0'
AS (
  SELECT * FROM MAINTENANCE_REPORTS
);

# 5. Create Custom Tools

In [None]:
-- Stored procedure behind the agent's "detect-anomalies" tool.
-- Runs a per-sensor Local Outlier Factor over the 10-minute sensor readings of one
-- machine for an inclusive date range (YYYY-MM-DD strings), stores newly found
-- anomalies in ANOMALIES and returns a plain-text summary for the agent.
CREATE OR REPLACE PROCEDURE detect_anomalies(MACHINE_ID TEXT, START_DATE TEXT, END_DATE TEXT)
RETURNS TEXT
LANGUAGE PYTHON
RUNTIME_VERSION = '3.11'
PACKAGES = ('snowflake-snowpark-python','pandas','scikit-learn')
HANDLER = 'detect_outliers'
AS
$$
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
from snowflake.snowpark import Session
from snowflake.snowpark import functions as F

# Columns written to ANOMALIES, in the positional order used by the INSERT below.
ANOMALY_COLUMNS = ['MACHINE_ID', 'SENSOR_ID', 'SENSOR_NAME', 'TIME_BUCKET', 'VALUE']
# LocalOutlierFactor needs at least one neighbour besides the sample itself.
MIN_READINGS_PER_SENSOR = 2


def _sql_literal(value) -> str:
    """Render a single Python value as a SQL constant for a VALUES clause."""
    if isinstance(value, str):
        # Escape backslashes and single quotes so names such as "O'Brien" stay valid SQL.
        escaped = value.replace('\\', '\\\\').replace("'", "''")
        return f"'{escaped}'"
    if value is None or pd.isna(value):
        return 'NULL'
    if hasattr(value, 'isoformat'):
        # Timestamps are sent as ISO-8601 strings and cast implicitly by Snowflake.
        return f"'{value.isoformat()}'"
    return str(value)


def _find_sensor_anomalies(readings: pd.DataFrame) -> pd.DataFrame:
    """Return the readings flagged as outliers, with the model fitted per sensor."""
    flagged = []
    for _, sensor_readings in readings.groupby('SENSOR_ID', sort=False):
        if len(sensor_readings) < MIN_READINGS_PER_SENSOR:
            # A single reading has no neighbours to compare against.
            continue
        labels = LocalOutlierFactor(contamination='auto').fit_predict(sensor_readings[['VALUE']])
        # fit_predict marks outliers with -1; select them without mutating the slice.
        flagged.append(sensor_readings.loc[labels == -1, ANOMALY_COLUMNS])
    if not flagged:
        return pd.DataFrame(columns=ANOMALY_COLUMNS)
    return pd.concat(flagged).round(2)


def detect_outliers(session: Session, machine_id: str, start_date: str, end_date: str) -> str:
    """Detect and persist sensor anomalies for one machine.

    Args:
        session: Snowpark session injected by Snowflake.
        machine_id: MACHINE_ID whose sensors are analysed.
        start_date: First day to include, as YYYY-MM-DD.
        end_date: Last day to include (the whole day), as YYYY-MM-DD.

    Returns:
        A human-readable summary: how many sensors were analysed, how many
        anomalies were found, and the anomaly records themselves.
    """
    # Half-open window [start_date, end_date + 1 day) so every reading taken
    # during end_date is included, not only the one at midnight.
    window_start = F.to_date(F.lit(start_date))
    window_end = F.dateadd('day', F.lit(1), F.to_date(F.lit(end_date)))

    readings = (
        session.table('DIM_SENSORS')
            .join(session.table('FACT_SENSOR_VALUES_10_MINUTES'), on=['SENSOR_ID'])
            .filter(F.col('MACHINE_ID') == machine_id)
            .filter((F.col('TIME_BUCKET') >= window_start) & (F.col('TIME_BUCKET') < window_end))
            .filter(F.col('VALUE').is_not_null())
            .to_pandas()
    )
    if readings.empty:
        return f'No sensor readings found for machine {machine_id} between {start_date} and {end_date}.'

    sensor_count = readings['SENSOR_ID'].nunique()
    anomalies = _find_sensor_anomalies(readings)
    if anomalies.empty:
        return f'Found {sensor_count} sensors for machine {machine_id}. None of them had anomalies.'

    value_rows = ",\n                ".join(
        "(" + ", ".join(_sql_literal(value) for value in row) + ")"
        for row in anomalies.itertuples(index=False, name=None)
    )
    # An anomaly is identified by sensor AND timestamp; matching on the timestamp
    # alone would drop anomalies of other sensors that share a time bucket.
    sql_stmt = f"""
            MERGE INTO ANOMALIES AS anomalies
            USING (
              SELECT * FROM VALUES
                {value_rows}
              AS new_anomalies(MACHINE_ID, SENSOR_ID, SENSOR_NAME, ANOMALY_TIMESTAMP, VALUE)
            ) AS new_anomalies
            ON anomalies.sensor_id = new_anomalies.sensor_id
               AND anomalies.anomaly_timestamp = new_anomalies.anomaly_timestamp
            WHEN NOT MATCHED THEN
              INSERT
              VALUES (MACHINE_ID, SENSOR_ID, SENSOR_NAME, ANOMALY_TIMESTAMP, VALUE)
            """
    session.sql(sql_stmt).collect()

    return (
        f"Found {sensor_count} sensors for machine {machine_id}. "
        f"A total of {len(anomalies)} anomalies were found and stored in the table ANOMALIES. "
        f"Records with anomalies: {anomalies.to_dict(orient='records')}"
    )
$$;

# 6. Create the Agent

In [None]:
-- Factory agent: combines the semantic view (text-to-SQL), the maintenance report
-- search service and two stored-procedure tools (send-email, detect-anomalies).
-- Names used in the orchestration instructions must match the tool names and the
-- ANOMALIES table exactly, otherwise the model cannot resolve them.
CREATE OR REPLACE AGENT SNOWFLAKE_INTELLIGENCE.AGENTS.FACTORY_AGENT
profile='{"display_name":"Factory Agent","avatar":"AiIcon","color":"var(--chartDim_3-x11sbcwy)"}'
comment='This agent provides insights into factory data.'
FROM SPECIFICATION 
$$
{
  "models": {
    "orchestration": "claude-4-sonnet"
  },
  "instructions": {
    "orchestration": "When sending emails, make sure to provide well formatted content using html.\nWhen being asked about anomalies, first check the ANOMALIES table if there are already anomalies for the relevant time period.\nIf there are no anomalies found, run the detect-anomalies tool. Otherwise use the data from the ANOMALIES table unless the user explicitly asks to run the detect-anomalies tool."
  },
  "tools": [
    {
      "tool_spec": {
        "type": "cortex_analyst_text_to_sql",
        "name": "Factory-Data",
        "description": "This semantic data model provides a comprehensive view of manufacturing operations by linking machine performance, sensor data, and production line information. It helps teams monitor equipment efficiency and quickly identify and analyze anomalies.\n \nThe model is built on six core tables:\n \n * ANOMALIES: Contains records of unusual sensor readings, including which machine and sensor were affected, the type of anomaly, and the specific timestamp and value.\n * DIM_LINES: Provides details about each production line, such as its name and the manufacturing plant it's in.\n * DIM_MACHINES: Lists all the machines, detailing their manufacturer, model, and the production line they belong to.\n * DIM_SENSORS: Describes each sensor, including the machine it's installed on, what physical quantity it measures (e.g., pressure), and the unit of measurement.\n * FACT_OEE: Tracks Overall Equipment Effectiveness (OEE) metrics like Availability, Performance, and Quality for each machine over time, along with the number of units produced and scrapped.\n * FACT_SENSOR_VALUES_10_MINUTES: Stores sensor readings aggregated into 10-minute intervals.\n \nThis model helps you answer critical questions about factory performance and potential issues, such as:\n \n * Which machines and production lines have the lowest OEE scores?\n * What is the average OEE for a specific manufacturing plant or production line?\n * How does a machine's performance or quality change over time?\n * Which sensors are showing the most frequent or severe anomalies?\n * Can we link a recent drop in OEE to a specific sensor anomaly on a machine?"
      }
    },
    {
      "tool_spec": {
        "type": "cortex_search",
        "name": "Maintenance-Reports",
        "description": "This tool provides access to maintenance reports from technicians. The reports provide details about machine incidents and actions taken to mitigate them."
      }
    },
    {
      "tool_spec": {
        "type": "generic",
        "name": "send-email",
        "description": "Use this tool to send emails.",
        "input_schema": {
          "type": "object",
          "properties": {
            "recipient": {
              "description": "The email address of the recipient.",
              "type": "string"
            },
            "subject": {
              "description": "The subject of the email.",
              "type": "string"
            },
            "text": {
              "description": "The text of the email. Supports html code for formatted emails.",
              "type": "string"
            }
          },
          "required": [
            "recipient",
            "subject",
            "text"
          ]
        }
      }
    },
    {
      "tool_spec": {
        "type": "generic",
        "name": "detect-anomalies",
        "description": "Use this tool to perform anomaly detection for all sensors of a machine.",
        "input_schema": {
          "type": "object",
          "properties": {
            "end_date": {
              "description": "End date for historical data. Data on and before that date are included in anomaly detection.\nProvided as YYYY-MM-DD.",
              "type": "string"
            },
            "machine_id": {
              "description": "The ID of the machine for which to detect anomalies.",
              "type": "string"
            },
            "start_date": {
              "description": "Start date for historical data. Data on and after that date are included in anomaly detection.\nProvided as YYYY-MM-DD.",
              "type": "string"
            }
          },
          "required": [
            "end_date",
            "machine_id",
            "start_date"
          ]
        }
      }
    }
  ],
  "tool_resources": {
    "Factory-Data": {
      "semantic_view": "AI_DEVELOPMENT.SI_BOTTLING_COMPANY.FACTORY_DATA_MODEL"
    },
    "Maintenance-Reports": {
      "max_results": 4,
      "name": "AI_DEVELOPMENT.SI_BOTTLING_COMPANY.MAINTENANCE_REPORTS",
      "title_column": "INCIDENT_ID"
    },
    "detect-anomalies": {
      "execution_environment": {
        "type": "warehouse",
        "warehouse": "AI_WH"
      },
      "identifier": "AI_DEVELOPMENT.SI_BOTTLING_COMPANY.DETECT_ANOMALIES",
      "name": "DETECT_ANOMALIES(VARCHAR, VARCHAR, VARCHAR)",
      "type": "procedure"
    },
    "send-email": {
      "execution_environment": {
        "type": "warehouse",
        "warehouse": "AI_WH"
      },
      "identifier": "AI_DEVELOPMENT.PUBLIC.SEND_MAIL",
      "name": "SEND_MAIL(VARCHAR, VARCHAR, VARCHAR)",
      "type": "procedure"
    }
  }
}
$$