# Imports & Auth

In [2]:
import os
import re
import json
import math
import pandas as pd

from pathlib import Path
from datetime import datetime

## Utils

In [3]:
def extract_file_id(url):
    """Extract a Google Drive / Colab / Sheets file or folder ID from a URL.

    The URL is tested against a list of known link shapes and the ID captured
    by the first matching pattern is returned.

    Args:
        url: The link to inspect.

    Returns:
        The captured ID with surrounding whitespace stripped, or ``None`` when
        ``url`` is not a string or none of the patterns match.
    """
    if not isinstance(url, str):
        return None

    # Every capture stops at '/', '?' or '#' (and '&' / '#' for query values)
    # so trailing path segments, query strings and fragments never end up in
    # the returned ID.
    patterns = [
        r"/spreadsheets/d/([^/?#]+)",  # Matches /spreadsheets/d/{file_id}
        r"/file/d/([^/?#]+)",          # Matches /file/d/{file_id}
        r"[?&]id=([^&#]+)",            # Matches ?id={file_id} or &id={file_id}
        # /folders/ must be tried before /drive/: for ".../drive/folders/{id}"
        # the /drive/ pattern would otherwise capture the literal "folders".
        r"/folders/([^/?#]+)",         # Matches /folders/{folder_id}
        r"/drive/([^/?#]+)",           # Matches /drive/{file_id}
    ]

    for pattern in patterns:
        match_ = re.search(pattern, url)
        if match_:
            return match_.group(1).strip()

    return None

# Parse Proto
Please ensure you have copied your protobuf file into the root of this project. Once done, update the value of `proto_file_name` below.

In [43]:
from proto_reader import parse_proto_file

# Protobuf file to parse; the path is given relative to the working directory.
proto_file_name = 'tool_use_metadata_set_v2.pb'
samples = parse_proto_file(proto_file_name)

# Store the parsed samples under a top-level 'result' key as indented JSON.
parsed_payload = {'result': samples}
with open('pb_jsons.json', 'w') as json_out:
    json.dump(parsed_payload, json_out, indent=4)

# Create Batches and Configuration Files for Docker Runs

In [20]:
# Debug lookup: reload the parsed samples and display those whose Colab link
# contains the given file ID.
proto_json = 'pb_jsons.json'
with open(proto_json, 'r') as f:
    all_samples = json.load(f)['result']

target_colab_id = '1WLs202u3t0ybjjkt4G1P6XqZ42FtMMUa'

# A sample may carry its link under 'colab_url' or 'colab_link'; accept either
# so the lookup does not raise KeyError on samples that only have 'colab_url'.
[
    sample for sample in all_samples
    if target_colab_id in (sample['colab_url'] if 'colab_url' in sample else sample['colab_link'])
]

[{'user_simulation_metadata': {'description_of_desired_final_state': 'A Google Calendar titled "Appointments" exists and has no appointment scheduled in the next week containing the word "vaccination appointment" in the title (case-insensitive). No further action will be taken.',
   'initial_query': 'Send a reminder email via Gmail to each pet owner with a vaccination appointment scheduled in the next week in my "Appointments" Google Calendar using the template for vaccination reminder saved as a draft in my Gmail.'},
  'environment_config': {'tools_needed_for_task': [{'connector': 'GOOGLE_CALENDAR'},
    {'connector': 'GMAIL'}],
   'initialization': {'initialization_env_code': 'import google_calendar\nimport gmail\n\n# --- Load existing simulation states ---\ngoogle_calendar.SimulationEngine.db.load_state("/content/DBs/CalendarDefaultDB.json")\ngmail.SimulationEngine.db.load_state("/content/DBs/GmailDefaultDB.json")\n\n# --- 1) Create the "Appointments" calendar ---\ncalendar = google

In [45]:
# Reload the parsed samples from the JSON dump.
proto_json = 'pb_jsons.json'
with open(proto_json, 'r') as f:
    all_samples = json.load(f)['result']

# A sample may carry its notebook link under 'colab_url' or 'colab_link'.
colab_urls = [sample['colab_url'] if 'colab_url' in sample else sample['colab_link'] for sample in all_samples]
colab_ids = [extract_file_id(colab_url) for colab_url in colab_urls]

total_samples = len(colab_ids)
if total_samples == 0:
    # With no samples the batch size below would be 0 and the summary print
    # would fail with ZeroDivisionError; stop with a clear message instead.
    raise ValueError(f'No samples found in {proto_json}; nothing to batch.')

max_container = 500
# Smallest batch size that keeps the number of batches within max_container.
max_batch_size = math.ceil(total_samples / max_container)
print(f'Max Batches: {math.ceil(total_samples/max_batch_size)}\nMax Samples Per Batch: {max_batch_size}')

api_version = '0.1.0'
notebooks = [{'path': notebook, 'api_version': api_version} for notebook in colab_ids]
notebooks_df = pd.DataFrame(notebooks)

# Number the notebooks within each api_version in row order, then cut that
# running index into chunks of max_batch_size. batch_id is
# "<api_version>_<batch number>".
position_in_version = notebooks_df.groupby('api_version').cumcount()
batch_numbers = (position_in_version // max_batch_size).astype(str)
notebooks_df['batch_id'] = notebooks_df['api_version'] + '_' + batch_numbers

notebooks_df.to_csv('execution_configs.csv', index=False)

Max Batches: 434
Max Samples Per Batch: 7


# Docker Orchestration

## For Local Run (where you have root access)

In [None]:
import sanity_orchestrator_with_download as orchestrator

orchestrator.DOCKER_IMAGE = 'gen-agents-auto-qc'

# Number of batch IDs (taken from the start of the sorted list) handed to the orchestrator.
max_batches_to_run = 5

exec_config = pd.read_csv("execution_configs.csv")
# NOTE(review): this sort is lexicographic on the batch_id strings, so an ID
# ending in "_10" orders before one ending in "_2" — confirm this is the
# intended selection of batches.
run_identifiers = sorted(set(exec_config['batch_id']))

# Set outside the try block so run_name exists for the result-processing cell
# even when the orchestration raises one of the handled errors.
start_time = datetime.now()
run_name = f'sanity_check_{start_time.strftime("%Y%m%d_%H%M%S")}'

try:
    orchestrator.run_orchestration(run_name, run_identifiers[:max_batches_to_run], "Proto")
    # total_seconds() covers the whole elapsed time; timedelta.seconds only
    # holds the sub-day remainder and would under-report runs longer than a day.
    elapsed_seconds = int((datetime.now() - start_time).total_seconds())
    print(f"Finished Docker Run. Time Taken: {elapsed_seconds} Seconds")
except (FileNotFoundError, FileExistsError, ConnectionError) as e:
    print(f"\n❌ A critical error occurred: {e}")

--- Step 1: Validating Host Environment ---
✅ Docker client connected.

--- Step 2: Preparing Host Directories ---
✅ Created log directory for this run at: /Users/nabeel/PycharmProjects/e2e_sanity_checks/execution_logs/sanity_check_20250822_123530
✅ Created result directory for this run at: /Users/nabeel/PycharmProjects/e2e_sanity_checks/results/sanity_check_20250822_123530
✅ Created result directory for this run at: /Users/nabeel/PycharmProjects/e2e_sanity_checks/executed_notebooks/sanity_check_20250822_123530

--- Step 4: Launching Containers in Parallel ---
  -> Launching container 'sanity_check_20250822_123530-0' for batch 0...
  -> Launching container 'sanity_check_20250822_123530-1' for batch 1...
  -> Launching container 'sanity_check_20250822_123530-2' for batch 2...
  -> Launching container 'sanity_check_20250822_123530-3' for batch 3...
  -> Launching container 'sanity_check_20250822_123530-4' for batch 4...

--- Step 5: Waiting for All Containers to Finish ---
  -> ✅ SUCCESS

## For Environments Where Docker Must Be Run with sudo

In [1]:
# Run runner.py with the interpreter at .venv/bin/python under sudo.
# NOTE(review): the recorded output shows sudo aborting because no password was
# supplied — presumably this needs passwordless sudo or a real terminal; confirm.
!sudo .venv/bin/python runner.py

Password:sudo: a password is required


# Process Results

In [5]:
# Load every result file of the run and flatten the records into one DataFrame.
# Relies on run_name being set by the Docker orchestration cell.
output_dir = f'results/{run_name}'

# Sort the names so the row order is reproducible (os.listdir returns entries
# in arbitrary order). Skip sub-directories, which cannot be opened as files,
# and hidden entries such as .DS_Store, which are presumably never result files.
output_files = sorted(
    name for name in os.listdir(output_dir)
    if not name.startswith('.') and (Path(output_dir) / name).is_file()
)

complete_data = []
for file in output_files:
    full_path = Path(output_dir) / file
    with open(full_path, 'r') as f:
        # Each file holds its records under the top-level 'result' key.
        complete_data.extend(json.load(f)['result'])

# Use json_normalize to flatten the data
sanity_df = pd.json_normalize(complete_data)
sanity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 11 columns):
 #   Column                                                Non-Null Count  Dtype 
---  ------                                                --------------  ----- 
 0   notebook                                              35 non-null     object
 1   contains_golden_answer                                35 non-null     object
 2   contains_final_assert                                 35 non-null     bool  
 3   script_passed                                         35 non-null     bool  
 4   script_failure_msg                                    35 non-null     object
 5   Set Up - Install Dependencies and Clone Repositories  35 non-null     object
 6   Set Up - Import APIs and initiate DBs                 35 non-null     object
 7   Final Assertion_NO_ACTION                             35 non-null     object
 8   Initial Assertion                                     35 non-null     ob

In [6]:
# --- Error-type labels for the output of a single code block ---
FA_FAILED_ASSERTION = "FA Failed - Assertion Error"
IA_FAILED_ASSERTION = "IA Failed - Assertion Error"
NON_ASSERTION_ERROR = "Non Assertion Error"
ASSERTION_ERROR = "Assertion Error"
NO_ERROR_FOUND = "No Error Found"
UNDEFINED_ERROR = "Undefined Error Type"

# --- Overall Auto QC verdict labels ---
NEEDS_FIXES = "Needs Fixes"
GOOD_TO_GO = "Good To Go"
NEEDS_MANUAL_REVIEW = "Needs Manual Review"
CHECK_NOT_EXECUTED = "Check Not Executed"

def add_error_type(error_message):
    """Classify the recorded error output of one code block.

    The error type is read from the first line of the message, as the text
    after that line's last ':' (or the whole line when it has no ':').

    Args:
        error_message: Error text recorded for the block; an empty string
            means the block produced no error.

    Returns:
        NO_ERROR_FOUND for an empty message, ASSERTION_ERROR when the parsed
        type is 'AssertionError', NON_ASSERTION_ERROR for any other parsed
        type, and UNDEFINED_ERROR when the value is not a string (e.g. None
        or NaN for a missing result) and so cannot be parsed.
    """
    # Missing values cannot be split; flag them instead of raising AttributeError.
    if not isinstance(error_message, str):
        return UNDEFINED_ERROR

    if error_message == "":
        return NO_ERROR_FOUND

    first_line = error_message.split('\n')[0]
    error_type = first_line.split(':')[-1].strip()

    if error_type == 'AssertionError':
        return ASSERTION_ERROR
    return NON_ASSERTION_ERROR

def get_auto_qc_status(row):
    """Derive the overall Auto QC verdict for one row from its per-step statuses.

    Rules are evaluated in order and the first match wins:
      1. The Auto QC script itself failed                      -> NEEDS_FIXES
      2. Any step reports a Non Assertion Error                -> NEEDS_FIXES
      3. Initial Assertion reports an Assertion Error          -> NEEDS_FIXES
      4. Final Assertion (with action) reports Assertion Error -> NEEDS_FIXES
      5. Final Assertion without action reports Assertion Error-> GOOD_TO_GO
      6. Every step reports No Error Found                     -> NEEDS_FIXES when a
         final assert exists (it should have failed without the action),
         otherwise GOOD_TO_GO
      7. Anything else (e.g. an Assertion Error raised by Initialisation or
         Action)                                               -> NEEDS_MANUAL_REVIEW

    Args:
        row: Mapping-like row providing the 'Execution Status ...' entries,
            'contains_final_assert' and 'script_passed'.

    Returns:
        pd.Series holding ``(status, message)``.
    """
    init_status = row['Execution Status Initialisation']
    status_fa_no_action = row['Execution Status FA w/o Action']
    status_ia = row['Execution Status IA']
    status_action = row['Execution Status Action']
    status_fa = row['Execution Status FA']
    contains_final_assert = row['contains_final_assert']
    script_success = row['script_passed']

    step_statuses = [init_status, status_fa_no_action, status_ia, status_action, status_fa]

    if not script_success:
        return pd.Series((NEEDS_FIXES, "Failed: Script to run Auto QC failed"))

    if NON_ASSERTION_ERROR in step_statuses:
        return pd.Series((NEEDS_FIXES, "Failed: One of the code block contains Non Assertion Error(s)"))

    if status_ia == ASSERTION_ERROR:
        return pd.Series((NEEDS_FIXES, "Failed: Assertion Failure in Initial Assertion."))

    if status_fa == ASSERTION_ERROR:
        return pd.Series((
            NEEDS_FIXES,
            "Failed: Final Assertion Failure even when Action is executed. Either Final Assertion or Action needs to be fixed.",
        ))

    if status_fa_no_action == ASSERTION_ERROR:
        return pd.Series((GOOD_TO_GO, "Passes: All Steps executed successfully and FA failed w/o action."))

    if all(step_status == NO_ERROR_FOUND for step_status in step_statuses):
        if contains_final_assert:
            return pd.Series((NEEDS_FIXES, "Failed: If FA is present, it must fail in absense of the action"))
        return pd.Series((GOOD_TO_GO, "Passed: No FA block found so FA without action is expected to pass."))

    # No rule matched. Report it explicitly rather than returning an empty
    # status and message, which would silently vanish from the summary.
    return pd.Series((
        NEEDS_MANUAL_REVIEW,
        "Manual Review: Step statuses did not match any known pass/fail rule.",
    ))


In [7]:
# Each derived status column is paired with the raw output column it classifies.
# Dict order fixes the order in which the new columns are appended.
status_column_sources = {
    'Execution Status Install Dependencies and Clone Repositories': 'Set Up - Install Dependencies and Clone Repositories',
    'Execution Status Initialisation': 'Set Up - Import APIs and initiate DBs',
    'Execution Status FA w/o Action': 'Final Assertion_NO_ACTION',
    'Execution Status IA': 'Initial Assertion',
    'Execution Status Action': 'Action',
    'Execution Status FA': 'Final Assertion',
}
for status_column, source_column in status_column_sources.items():
    sanity_df[status_column] = sanity_df[source_column].apply(add_error_type)

sanity_df = sanity_df.rename(columns={'notebook': 'colab_id'})

# Row-wise verdict: get_auto_qc_status returns a (status, message) pair per row.
qc_columns = ['Auto QC Status', 'Auto QC Message']
sanity_df[qc_columns] = sanity_df.apply(get_auto_qc_status, axis=1)
sanity_df['Auto QC Status'].value_counts()

Auto QC Status
Good To Go     23
Needs Fixes    12
Name: count, dtype: int64

In [None]:
# Count the rows for each value of script_passed.
sanity_df['script_passed'].value_counts()
# Alternative ad-hoc views, left disabled. To display one, uncomment it and make it the cell's last line.
# NOTE(review): the recorded output of this cell looks like it came from the second view, not from value_counts — re-run to refresh.
# sanity_df[sanity_df['Auto QC Status']=='Needs Fixes'].sort_values(['colab_id'])['colab_id']
# sanity_df.sort_values(['Auto QC Status', 'colab_id'])[['colab_id', 'Auto QC Status']]

Unnamed: 0,colab_id,Auto QC Status
11,10f0vvJUTIYQPJO0WeIpNr1vEiUkLi2qS,Good To Go
34,145bFTPyeqF77isaP9uTU1f5v_hQqLv9u,Good To Go
15,151c0r7t6kqeFl5TNCQ-1aRgzp32BaqyZ,Good To Go
33,16loMQwMXpQLm9ritJ2vzQn4UftbikaUE,Good To Go
26,1C2tSchsGNsqO5Eq6fk96SUCwYBG7Vxrk,Good To Go
1,1DG9D64M2fXLYqTj_9lGSvcwCa5mtlAuk,Good To Go
14,1GvNQs7rQl2ZpFTM1L70d8PW99McWyVAf,Good To Go
30,1MN41-UjBE9GjzO01ZNuhvqtwwvkjle0K,Good To Go
23,1PZ25p3xrm4jS2a04djRozJLNBi7uhvQS,Good To Go
19,1Q17ut341xeHh8juMY5yDtyiU7O5Cffzf,Good To Go


In [52]:
def trim_text(text, max_length=49999):
    """Truncate a string to at most ``max_length`` characters.

    Args:
        text: Value to trim. Anything that is not a string (None, NaN,
            booleans, ...) is returned unchanged, so the function can be
            applied to object columns holding mixed values without raising.
        max_length: Maximum number of characters to keep. Defaults to 49999.

    Returns:
        The first ``max_length`` characters of ``text`` when it is a string,
        otherwise ``text`` itself.
    """
    if not isinstance(text, str):
        return text
    return text[:max_length]

# Apply trim_text to every value of each object/string-typed column.
text_columns = sanity_df.select_dtypes(include=['object', 'string']).columns.tolist()
for column_name in text_columns:
    trimmed_values = sanity_df[column_name].apply(trim_text)
    sanity_df[column_name] = trimmed_values

In [53]:
# Write the Auto QC results to CSV without the DataFrame index column.
sanity_df.to_csv(path_or_buf='auto_qc_result.csv', index=False)