# Roche Capstone - Advanced Data Generation V2.0

This notebook generates complex, highly realistic synthetic data for the lab simulation. Unlike V1, this version uses **Structural Causality** instead of simple randomness.

### Key Features (The "Robust" Logic)
1. **Seasonality**: Booking times are sampled with weekday and hour-of-day weights, approximating a Non-Homogeneous Poisson Process (peaks at 10am/2pm, Mon-Wed).
2. **Machine Degradation**: Instruments lose 'health' over usage hours, increasing failure probability.
3. **Bad Batches**: Specific reagent batches cause cluster failures (5x risk).
4. **Probabilistic Delays**: Delays are drawn from a Gamma distribution based on a multi-factor Risk Score.

In [13]:
import pandas as pd
import numpy as np
import random
from scipy.stats import gamma

# Run size: number of synthetic experiment rows generated by this notebook.
N_ROWS = 350_000

# Seed both random sources used below (the stdlib `random` module and
# NumPy's global legacy RNG) so a fresh run draws the same sequence.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

### 1. The Time Engine (Seasonality)

In [14]:
def generate_seasonal_timestamps(n, start_date='2024-01-01', days=365):
    """Sample ``n`` booking timestamps with weekly and daily seasonality.

    Each timestamp is built from three independent draws: a calendar day
    (weighted by weekday), an hour of day (weighted by a two-peak curve),
    and a uniformly random minute and second.

    All randomness comes from NumPy's global legacy RNG (``np.random.*``),
    so call ``np.random.seed`` beforehand for reproducible output.

    Parameters
    ----------
    n : int
        Number of timestamps to generate.
    start_date : str, default '2024-01-01'
        First calendar day of the sampling window.
    days : int, default 365
        Length of the sampling window in days.

    Returns
    -------
    pandas.Series
        ``n`` timestamps sorted ascending, with a fresh 0..n-1 index.
    """
    print("Generating Seasonal Timestamps...")

    # 1. Select the calendar day based on the weekly cycle.
    # Weights: Mon(0)-Sun(6) -> Mon/Tue/Wed busiest, weekends nearly idle.
    day_weights = {0: 4, 1: 4, 2: 4, 3: 3, 4: 2, 5: 0.5, 6: 0.5}

    all_days = pd.date_range(start_date, periods=days, freq='D')
    day_probs = [day_weights[d.dayofweek] for d in all_days]
    day_probs = np.array(day_probs) / sum(day_probs)

    # Draw positions into `all_days` rather than the dates themselves, so the
    # chosen days stay a DatetimeIndex and can be offset in one vector op.
    day_idx = np.random.choice(len(all_days), n, p=day_probs)

    # 2. Select the hour of day based on the daily cycle.
    # Peaks: 10am, 2pm. Low: lunch (12), night (20-06).
    hours = np.arange(24)
    hour_weights = [
        0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.5, 2, 4, 3, 5, 4,  # 00-11 (peak 10)
        2, 3, 5, 4, 3, 2, 1, 0.5, 0.2, 0.1, 0.1, 0.1       # 12-23 (lunch dip, peak 14)
    ]
    hour_probs = np.array(hour_weights) / sum(hour_weights)
    chosen_hours = np.random.choice(hours, n, p=hour_probs)

    # 3. Random minute and second for every row, drawn in one call.
    # Column 0 is the minute, column 1 the second.
    minute_second = np.random.randint(0, 60, size=(n, 2))

    # Combine hour/minute/second into a single offset in seconds and add it
    # to the chosen days without a per-row Python loop.
    offset_seconds = chosen_hours * 3600 + minute_second[:, 0] * 60 + minute_second[:, 1]
    timestamps = all_days[day_idx] + pd.to_timedelta(offset_seconds, unit='s')

    return pd.Series(timestamps).sort_values().reset_index(drop=True)

### 2. Core Data Generation

In [15]:
# --- Setup ---
timestamps = generate_seasonal_timestamps(N_ROWS)
experiment_types = ['Validation', 'QC', 'Pilot', 'Screening', 'R&D']
# Which instrument types each experiment type may run on.
inst_map = {
    'Validation': ['Microscope', 'Centrifuge'], 'QC': ['Spectrometer', 'HPLC'],
    'Pilot': ['Incubator', 'PCR'], 'Screening': ['PCR', 'Centrifuge'], 'R&D': ['HPLC', 'Incubator']
}

# --- Assignments ---
exp_types = np.random.choice(experiment_types, N_ROWS, p=[0.25, 0.25, 0.2, 0.2, 0.1])
inst_types = [random.choice(inst_map[et]) for et in exp_types]

# --- Instrument ID Assignment (Critical for Degradation) ---
# NOTE: `n_instruments` / `instrument_ids_pool` are not used by the assignment
# below; the ids actually assigned come from `inst_df` (5 units for each
# distinct type in `inst_map`). They are kept only so the names stay defined.
n_instruments = 20
instrument_ids_pool = [f'INST_{i:03d}' for i in range(n_instruments)]

# Each instrument has a fixed type. Iterate the types in sorted order so the
# inventory row order does not depend on set/hash iteration order.
instrument_type_names = sorted({itype for itypes in inst_map.values() for itype in itypes})
inst_inventory = []
for itype in instrument_type_names:
    for k in range(5):  # 5 of each type
        inst_inventory.append({'id': f'{itype}_{k:02d}', 'type': itype, 'health': 1.0, 'hours_used': 0})
inst_df = pd.DataFrame(inst_inventory)

# Build the type -> candidate ids lookup once instead of filtering `inst_df`
# again for every row.
ids_by_type = {
    itype: inst_df[inst_df['type'] == itype]['id'].values
    for itype in instrument_type_names
}

# Assign an instrument_id available for that type.
# This approximation randomly picks a valid instrument for each row.
assigned_inst_ids = [np.random.choice(ids_by_type[itype]) for itype in inst_types]

# --- Reagent Batches ---
batches = [f'BATCH_{i:03d}' for i in range(50)]
# Three batches are flagged as bad; downstream they multiply the risk score.
BAD_BATCHES = np.random.choice(batches, 3, replace=False)
print(f"Selected Bad Batches: {BAD_BATCHES}")
reagent_batch_ids = np.random.choice(batches, N_ROWS)

# Create DataFrame
# (dict entries are evaluated top to bottom, which fixes the order of the
# poisson / choice / normal draws from the global RNG)
df = pd.DataFrame({
    'experiment_id': [f'EXP_{i:06d}' for i in range(N_ROWS)],
    'booking_time': timestamps,
    'experiment_type': exp_types,
    'instrument_type': inst_types,
    'instrument_id': assigned_inst_ids,
    'reagent_batch_id': reagent_batch_ids,
    'scientist_workload': np.random.poisson(5, N_ROWS).clip(1, 15),
    'scientist_experience_level': np.random.choice(['Junior', 'Mid', 'Senior'], N_ROWS, p=[0.4, 0.4, 0.2]),
    'lab_occupancy_level': np.random.normal(70, 15, N_ROWS).clip(0, 100).astype(int)
})

# Expected Durations (minutes per experiment type)
duration_map = {'Validation': 60, 'QC': 45, 'Pilot': 90, 'Screening': 30, 'R&D': 120}
df['expected_duration'] = df['experiment_type'].map(duration_map)

Generating Seasonal Timestamps...


  dates = pd.date_range(start=start_date, periods=days*24, freq='H') # Placeholder backbone


Selected Bad Batches: ['BATCH_044' 'BATCH_027' 'BATCH_022']


### 3. Calculating Machine Degradation (The Drift)
We simulate the chronological usage of machines. Every hour of use slightly damages the machine.

In [16]:
# Wear accumulates in booking order, so process rows chronologically.
df = df.sort_values('booking_time').reset_index(drop=True)

# Minutes of expected run time that take an instrument from health 1.0 to 0.0.
# Each booking removes expected_duration / MINUTES_TO_ZERO_HEALTH, i.e.
# 100 hours (6000 mins) of use costs 0.012 health.
MINUTES_TO_ZERO_HEALTH = 500000

health_scores = []
# Every instrument in the inventory starts at full health.
current_health = {inst_id: 1.0 for inst_id in inst_df['id']}

print("Simulating Machine Degradation...")
# Only two columns are needed, so walk them directly rather than building a
# Series per row with iterrows().
for iid, dur in zip(df['instrument_id'].tolist(), df['expected_duration'].tolist()):
    # Record the health the instrument has *before* this booking runs.
    health_scores.append(current_health[iid])

    # Degrade, never dropping below zero.
    decay = dur / MINUTES_TO_ZERO_HEALTH
    current_health[iid] = max(0.0, current_health[iid] - decay)

df['instrument_health'] = health_scores

# Visual check: this is the mean of the recorded pre-booking health over all
# rows, not the end-of-simulation state held in `current_health`.
print(f"Final average health: {df['instrument_health'].mean():.4f}")

Simulating Machine Degradation...
Final average health: 0.3606


### 4. The Probabilistic Output Engine
Calculating `Delay_Risk` and Gamma-distributed delays.

In [17]:
# --- 1. Ambient temperature ---
# Readings are centred on the 22C nominal value; the absolute distance from
# nominal is what feeds the risk score.
df['mean_ambient_temp'] = np.random.normal(22, 2, N_ROWS)
df['temp_deviation'] = (df['mean_ambient_temp'] - 22).abs()

# --- 2. Risk factors, each scaled to roughly 0-1 ---
norm_occ = df['lab_occupancy_level'].div(100)
norm_wear = df['instrument_health'].rsub(1.0)  # 0 = brand new, 1 = worn out
norm_temp = df['temp_deviation'].div(10)       # deviations above 10 are expected to be rare

# --- 3. Weighted risk, with the bad-batch multiplier ---
batch_mask = df['reagent_batch_id'].isin(BAD_BATCHES)
risk_score = (0.4 * norm_occ) + (0.3 * norm_wear) + (0.3 * norm_temp)
# Rows using a bad reagent batch carry five times the risk; others are unchanged.
risk_score = risk_score.where(~batch_mask, risk_score * 5.0)

# --- 4. Gamma-distributed delay ---
# Gamma(shape=k, scale=theta) has mean k*theta. Risk drives the shape k, so a
# higher risk score gives a larger expected delay. Scale is fixed at 5 minutes;
# the +0.1 keeps the shape strictly positive.
gamma_shape = (risk_score * 10) + 0.1
df['delay'] = np.random.gamma(shape=gamma_shape, scale=5)

# --- 5. Incident labels ---
# Applied from weakest to strongest rule, so a later rule overrides an earlier one:
# >30 contention, >60 instrument failure, bad batch with >45 reagent quality.
delay = df['delay']
df['incident_type'] = (
    pd.Series('None', index=df.index)
    .mask(delay > 30, 'Resource_Contention')
    .mask(delay > 60, 'Instrument_Failure')
    .mask(batch_mask & (delay > 45), 'Reagent_Quality')
)

# Actual duration = planned duration plus the sampled delay.
df['actual_duration'] = df['expected_duration'].add(df['delay'])

preview_cols = ['booking_time', 'instrument_health', 'reagent_batch_id', 'delay']
display(df[preview_cols].head(10))

Unnamed: 0,booking_time,instrument_health,reagent_batch_id,delay
0,2024-01-01 00:18:04,1.0,BATCH_013,8.721965
1,2024-01-01 00:44:22,1.0,BATCH_016,1.459434
2,2024-01-01 01:05:07,1.0,BATCH_005,17.299596
3,2024-01-01 01:21:09,1.0,BATCH_046,9.812713
4,2024-01-01 01:42:56,1.0,BATCH_034,25.88854
5,2024-01-01 02:32:41,1.0,BATCH_005,12.662073
6,2024-01-01 02:38:47,1.0,BATCH_032,5.124272
7,2024-01-01 02:52:22,0.99982,BATCH_046,11.171419
8,2024-01-01 04:05:38,0.99994,BATCH_019,11.987329
9,2024-01-01 04:27:58,1.0,BATCH_044,66.874387


In [18]:
# Export the generated frame as three separate log tables.

# Workflow log: one row per experiment with scheduling context and outcome.
workflow_cols = [
    'experiment_id', 'experiment_type', 'instrument_type', 'instrument_id',
    'scientist_workload', 'scientist_experience_level', 'lab_occupancy_level',
    'expected_duration', 'booking_time', 'actual_duration', 'delay', 'incident_type',
]
df_wk = df[workflow_cols].copy()

# Reagent log: which batch each experiment used, and when.
df_rg = df[['experiment_id', 'reagent_batch_id', 'booking_time']].copy()

# Telemetry log: simplified to one reading per experiment (V1 schema), using
# the mean ambient temperature as the reading and the booking time as its
# timestamp. Kept 1-to-1 rather than long format to match the merger logic
# in notebook 2/3.
df_tl = df[['experiment_id', 'mean_ambient_temp', 'booking_time']].rename(
    columns={'mean_ambient_temp': 'ambient_temp', 'booking_time': 'timestamp'}
)

for frame, path in (
    (df_wk, 'workflow_logs.csv'),
    (df_rg, 'reagent_logs.csv'),
    (df_tl, 'telemetry_logs.csv'),
):
    frame.to_csv(path, index=False)

print("Data Generation V2 Complete. Saved csvs.")

Data Generation V2 Complete. Saved csvs.
