# Imports & Auth

In [1]:
import os
# import io
import re
# import sys
import json
import math
# import copy
# import time
# import shlex
# import shutil
# import random
# import pathlib
# import subprocess
# import traceback
# import concurrent.futures

# import docker
# import nbformat
# import gspread

from pathlib import Path
# from functools import partial
from datetime import datetime
# from tqdm.notebook import tqdm
# from concurrent.futures import ThreadPoolExecutor, as_completed
# from typing import Callable, Dict, List, Sequence, Iterable, Union

import pandas as pd
# from rclone_python import rclone
# from nbclient import NotebookClient
# from rclone_python.remote_types import RemoteTypes

# from google.auth import default
from google.oauth2 import service_account
from googleapiclient.discovery import build, Resource
# from googleapiclient.http import BatchHttpRequest, MediaIoBaseDownload, MediaIoBaseUpload
from googleapiclient.errors import HttpError

In [2]:
# Service-account JSON key file, given as a relative path.
SERVICE_ACCOUNT_FILE = "turing-delivery-g-ga-e36eb2300714.json"

# A single credential is requested with both the Drive and the Sheets scope.
SCOPES = [
    'https://www.googleapis.com/auth/drive',
    'https://www.googleapis.com/auth/spreadsheets',
]

def authenticate_with_service_account(service_account_file=None, scopes=None):
    """Load service-account credentials from a JSON key file.

    Args:
        service_account_file: Path to the JSON key file. When omitted, the
            module-level SERVICE_ACCOUNT_FILE is used.
        scopes: OAuth scopes to request. When omitted, the module-level
            SCOPES list is used.

    Returns:
        The credentials object produced by
        ``service_account.Credentials.from_service_account_file``.
    """
    # Defaults are resolved at call time so later changes to the module
    # constants are picked up without redefining this function.
    key_file = SERVICE_ACCOUNT_FILE if service_account_file is None else service_account_file
    requested_scopes = SCOPES if scopes is None else scopes
    return service_account.Credentials.from_service_account_file(
        key_file,
        scopes=requested_scopes,
    )

# Get the shared credentials object; the Drive and Sheets service clients
# defined in the following cells are both built from it.
credentials = authenticate_with_service_account()

# Setup

In [3]:
from rich.console import Console
from rich.theme import Theme

# @title Logger Configs
# Style names understood by the shared console; Logger refers to them by key.
_log_styles = dict(
    info="cyan",
    warning="magenta",
    error="bold red",
)
custom_theme = Theme(_log_styles)
console = Console(theme=custom_theme)

class Logger:
  """Thin wrapper that prints messages through the shared rich `console`."""

  @staticmethod
  def log(message):
    """Print `message` with the theme's "info" style."""
    console.print(message, style="info")

  # Declared static like `log`; without the decorator, calling this on an
  # instance would pass the instance as `message` and raise TypeError.
  @staticmethod
  def error(message):
    """Print `message` with the theme's "error" style."""
    console.print(message, style="error")

In [4]:
# @title GoogleService Class
class GoogleService:
  """Helpers shared by the Google Drive and Google Sheets wrappers."""

  # Tried in order; the first match wins. "/folders/<id>" must be tested
  # before the generic "/drive/<id>" form, otherwise a link such as
  # ".../drive/folders/<id>" would yield the literal string "folders".
  # Every ID capture stops at "/", "?" or "#" (or "&"/"#" for query values)
  # so suffixes like "?usp=sharing" are never part of the result.
  _FILE_ID_PATTERNS = (
      r"/spreadsheets/d/([^/?#]+)",  # Matches /spreadsheets/d/{sheet_id}
      r"/file/d/([^/?#]+)",          # Matches /file/d/{file_id}
      r"[?&]id=([^&#]+)",            # Matches ?id={file_id} or &id={file_id}
      r"/folders/([^/?#]+)",         # Matches /folders/{folder_id}
      r"/drive/([^/?#]+)",           # Matches /drive/{file_id}
  )

  @classmethod
  def extract_file_id(cls, url):
      """
      Extract a Google file/folder/spreadsheet ID from a URL.

      Args:
          url: The URL to inspect.

      Returns:
          The captured ID with surrounding whitespace removed, or None when
          `url` is not a string or matches none of the known URL shapes.
      """
      # Missing cells (None/NaN) reach this function when it is applied to a
      # DataFrame column; report "no ID" instead of raising inside re.search.
      if not isinstance(url, str):
          return None

      for pattern in cls._FILE_ID_PATTERNS:
          match_ = re.search(pattern, url)
          if match_:
              return match_.group(1).strip()
      return None

In [5]:
# @title GoogleDrive Functionality
class GoogleDrive(GoogleService):
    """Google Drive v3 helpers that share one class-level API client."""

    service = build("drive", "v3", credentials=credentials)

    @classmethod
    def get_file_names_in_batch(cls, file_ids):
        """
        Look up the names of several Drive files with one batched HTTP request.

        Args:
            file_ids: Iterable of Drive file IDs. Each ID is also used as the
                request ID of its sub-request.

        Returns:
            A list of ``{'colab_id': <file id>, 'colab_name': <name or None>}``
            dicts, appended in the order the per-request callbacks fire.
            ``colab_name`` is None when the lookup for that ID failed.
        """
        collected = []

        def _record(request_id, response, exception):
            # Called once per sub-request; a failed lookup is reported on
            # stdout and still recorded, with a None name.
            if exception:
                print(f"Error for file ID {request_id}: {exception}")
                resolved_name = None
            else:
                resolved_name = response.get('name')
            collected.append({'colab_id': request_id, 'colab_name': resolved_name})

        batch = cls.service.new_batch_http_request(callback=_record)

        # Queue one files().get() per ID, keyed by the file ID itself.
        for file_id in file_ids:
            lookup = cls.service.files().get(
                fileId=file_id,
                fields='name',
                supportsAllDrives=True,
            )
            batch.add(lookup, request_id=file_id)

        batch.execute()
        return collected

In [6]:
# @title GoogleSheets Functionality
class GoogleSheet(GoogleService):
  """Google Sheets v4 helpers that share one class-level API client."""

  # service = build("sheets", "v4")
  service = build("sheets", "v4", credentials=credentials)

  @classmethod
  def get_sheet_data(cls, sheet_id: str, tab_name: str, **kwargs):
    """
    Read one tab of an existing Google Sheet into a pandas DataFrame.

    The first returned row is used as the header (names are stripped). Every
    data row is padded with None or truncated to the header width, so a short
    first data row can no longer drop columns for the whole sheet. A tab that
    holds only a header row yields an empty DataFrame with those columns.

    Args:
        sheet_id: The ID of the existing Google Sheet.
        tab_name: Name of the tab to read (passed to the API as the range).
        **kwargs: Optional ``column=value`` filters, combined with AND.
            A list/tuple/set value keeps rows whose cell is one of the
            members; any other value keeps rows whose cell equals it.

    Returns:
        The (optionally filtered) DataFrame.

    Raises:
        Exception: If the tab returns no values at all, or if a filter
            column is not present in the header.
    """
    vals = (
        cls.service.spreadsheets()
        .values()
        .get(spreadsheetId=sheet_id, range=tab_name)
        .execute()
        .get("values", [])
    )
    if not vals:
      sheet_name = cls.get_spreadsheet_name_by_id(sheet_id)
      raise Exception(f"No data found in the Tab: {tab_name}. Sheet ID: {sheet_name}")

    header = [str(column).strip() for column in vals[0]]
    width = len(header)
    # Rows can come back shorter than the header (trailing empty cells) or
    # longer (values beyond the last header cell); normalise both to `width`.
    # `[None] * negative` is an empty list, so long rows are only truncated.
    rows = [list(row[:width]) + [None] * (width - len(row)) for row in vals[1:]]
    df = pd.DataFrame(rows, columns=header)

    filters = {str(col).strip(): value for col, value in kwargs.items()}
    missing_cols = [col for col in filters if col not in df.columns]
    if missing_cols:
      raise Exception(f"Could not find column(s) in the sheet. {missing_cols}")

    # Boolean masks instead of a string-built df.query(): column names that
    # are not valid identifiers (e.g. "w/o") and values containing quotes
    # would otherwise produce an invalid query expression.
    for col, value in filters.items():
      if isinstance(value, (list, tuple, set)):
        df = df[df[col].isin(list(value))]
      else:
        df = df[df[col] == value]
    return df


  @classmethod
  def tab_exists(cls, spreadsheet_id, tab_name):
    """
    Report whether the spreadsheet has a tab titled exactly `tab_name`.

    Args:
        spreadsheet_id: The ID of the Google Spreadsheet.
        tab_name: Tab title to look for.

    Returns:
        True if a sheet with that title exists, otherwise False.
        HttpError from the API call is not caught here.
    """
    spreadsheet_metadata = cls.service.spreadsheets().get(
        spreadsheetId=spreadsheet_id,
        fields='sheets.properties'
    ).execute()

    for sheet in spreadsheet_metadata.get('sheets', []):
      properties = sheet.get('properties')
      if properties and (properties.get('title') == tab_name):
        return True
    return False


  @classmethod
  def add_dataframe_to_sheet(cls, spreadsheet_id, df, tab_name, valueInputOption='RAW', drop_duplicates_on=['sample_id']):
    """
    Write a DataFrame to a tab of an existing Google Sheet.

    If the tab already exists, its current contents are read, `df` is placed
    in front of them, and duplicates on `drop_duplicates_on` are dropped
    keeping the first occurrence (so rows from `df` win). Otherwise the tab
    is created and `df` is written as-is. Values are written starting at A1,
    header row first. Missing cells (None/NaN) are written as empty strings.

    HttpError is logged through Logger and swallowed; the method then
    returns None. Other exceptions (for example the one get_sheet_data
    raises for a tab with no values) propagate.

    Args:
        spreadsheet_id: The ID of the existing Google Sheet.
        df: The Pandas DataFrame to export.
        tab_name: Name of the tab to create or update.
        valueInputOption: Passed through to the values().update() call.
        drop_duplicates_on: Column subset used for de-duplication when the
            tab already exists. The default list is never mutated here.
    """
    try:
      if cls.tab_exists(spreadsheet_id, tab_name):
        Logger.log(f"Tab '{tab_name}' already exists in the spreadsheet.")
        existing_df = cls.get_sheet_data(spreadsheet_id, tab_name)
        # TODO: Add dataframe validation check
        # DataFrame.info() prints its report itself and returns None, so it
        # is called directly instead of being passed to Logger.log (which
        # would only log the literal "None").
        Logger.log(f"Existing Dataframe")
        existing_df.info()

        combined_df = pd.concat([df, existing_df], ignore_index=True)
        df_to_upload = combined_df.drop_duplicates(subset=drop_duplicates_on, keep='first', ignore_index=True)
        Logger.log(f"Combined Dataframe")
        df_to_upload.info()

      else:
        Logger.log(f"Tab '{tab_name}' does not exist in the spreadsheet. Creating a new tab.")
        batch_update_body = {
            'requests': [{
                'addSheet': {
                    'properties': {
                        'title': tab_name
                    }
                }
            }]
        }
        response = cls.service.spreadsheets().batchUpdate(
            spreadsheetId=spreadsheet_id,
            body=batch_update_body
        ).execute()
        # Get the ID of the newly created sheet (optional, but useful)
        new_sheet_id = response.get('replies')[0].get('addSheet').get('properties').get('sheetId')
        Logger.log(f"Successfully added new tab: '{tab_name}' with ID: {new_sheet_id}")
        df_to_upload = df

      # NaN is not valid JSON, and concatenating frames with different
      # columns introduces NaN cells; replace every missing value with "".
      cell_values = df_to_upload.astype(object).where(df_to_upload.notna(), "")
      values = [cell_values.columns.tolist()] + cell_values.values.tolist()
      Logger.log(f"Uploading {len(df_to_upload)} rows to tab '{tab_name}'.")
      range_name = f"'{tab_name}'!A1" # Ensure tab name is quoted if it has spaces or special characters
      result = cls.service.spreadsheets().values().update(
          spreadsheetId=spreadsheet_id,
          range=range_name,
          valueInputOption=valueInputOption,
          body={'values': values}
      ).execute()

      Logger.log(f"{result.get('updatedCells')} cells updated in tab '{tab_name}'.")

    except HttpError as err:
      Logger.error(f"An error occurred: {err}")
      if err.resp.status == 400: # Bad Request, often due to sheet name already existing
        Logger.error("Error details: The tab name might already exist or the request is malformed.")
      elif err.resp.status == 403: # Forbidden, often due to incorrect permissions
        Logger.error("Error details: Check your API permissions or if the service account/user has access to the sheet.")
      elif err.resp.status == 404: # Not Found, often due to incorrect spreadsheet ID
        Logger.error("Error details: The spreadsheet ID might be incorrect.")


  @classmethod
  def get_spreadsheet_name_by_id(cls, spreadsheet_id):
    """
    Retrieves the name (title) of a Google Spreadsheet given its ID.

    Args:
        spreadsheet_id: The ID of the Google Spreadsheet.

    Returns:
        The title of the spreadsheet, or None if an HttpError occurs
        (the error is printed).
    """
    try:
      # Only 'properties.title' is requested to keep the response small.
      spreadsheet_metadata = cls.service.spreadsheets().get(
          spreadsheetId=spreadsheet_id,
          fields='properties.title'
      ).execute()
      return spreadsheet_metadata.get('properties', {}).get('title')
    except HttpError as error:
      print(f'An error occurred: {error}')
      if error.resp.status == 404:
        print(f"Spreadsheet with ID '{spreadsheet_id}' not found.")
      return None


  @classmethod
  def add_dropdown_to_range(cls, spreadsheet_id: str, sheet_id: int,
                            dropdown_options: list,
                            range_start_row: int, range_end_row: int,
                            range_start_col: int, range_end_col: int):
    """
    Apply a strict ONE_OF_LIST dropdown validation rule to a cell range.

    Args:
        spreadsheet_id: The ID of the Google Spreadsheet.
        sheet_id: ID of the tab, sent as the range's 'sheetId' (the value
            get_sheet_id_by_name returns).
        dropdown_options: Allowed values shown in the dropdown.
        range_start_row: Sent as 'startRowIndex'.
        range_end_row: Sent as 'endRowIndex'.
        range_start_col: Sent as 'startColumnIndex'.
        range_end_col: Sent as 'endColumnIndex'.

    Any exception raised by the request is printed and swallowed.
    """
    requests = [
        {
            'setDataValidation': {
                'range': {
                    'sheetId': sheet_id,
                    'startRowIndex': range_start_row,
                    'endRowIndex': range_end_row,
                    'startColumnIndex': range_start_col,
                    'endColumnIndex': range_end_col
                },
                'rule': {
                    'condition': {
                        'type': 'ONE_OF_LIST',
                        'values': [{'userEnteredValue': option} for option in dropdown_options]
                    },
                    'strict': True,  # Users can only enter values from the list
                    'showCustomUi': True, # Show dropdown arrow
                }
            }
        }
    ]

    # --- Execute the batch update request ---
    try:
      cls.service.spreadsheets().batchUpdate(
          spreadsheetId=spreadsheet_id,
          body={'requests': requests}
      ).execute()
      print(f"Dropdown added to Sheet ID {sheet_id}, Range row{range_start_row+1}:row{range_end_row}.")
    except Exception as e:
      print(f"An error occurred: {e}")


  @classmethod
  def get_sheet_id_by_name(cls, spreadsheet_id: str, tab_name: str):
    """
    Retrieves the numerical Sheet ID (gid) for a given tab name within a spreadsheet.

    Args:
        spreadsheet_id (str): The ID of the Google Spreadsheet.
        tab_name (str): The exact name (title) of the tab/sheet to find.

    Returns:
        The 'sheetId' property of the matching tab, or None if the tab is
        not found or any error occurs (a message is printed in both cases).
    """
    try:
      # Only 'sheets.properties' is requested: it carries both the titles
      # and the sheet IDs.
      spreadsheet_metadata = cls.service.spreadsheets().get(
          spreadsheetId=spreadsheet_id,
          fields='sheets.properties'
      ).execute()

      for sheet in spreadsheet_metadata.get('sheets', []):
        properties = sheet.get('properties')
        if properties and properties.get('title') == tab_name:
          return properties.get('sheetId') # Return the sheetId (gid)

      # If the loop completes, the tab was not found
      print(f"Tab '{tab_name}' not found in spreadsheet with ID '{spreadsheet_id}'.")
      return None
    except HttpError as error:
      if error.resp.status == 404:
        print(f"Spreadsheet with ID '{spreadsheet_id}' not found. Error: {error}")
      else:
        print(f'An HTTP error occurred: {error}')
      return None
    except Exception as e:
      print(f"An unexpected error occurred while retrieving sheet ID for tab '{tab_name}': {e}")
      return None

# Download APIs Code

In [47]:
def download_apis(VERSION="0.1.0", download_datasets=False, save_directory='clean_workspace'):
    """
    Download one versioned APIs bundle from Google Drive and prepare a workspace.

    Steps:
      1. Recreate ``<save_directory>/<VERSION>`` from scratch.
      2. Find ``APIs_V<VERSION>.zip`` (case-insensitive) in the APIs Drive folder.
      3. Download it and extract the ``APIs/``, ``DBs/`` and ``Scripts/`` trees.
      4. Append the APIs directory and the workspace directory to ``sys.path``.
      5. Call ``Scripts.FCSpec.generate_package_schema`` for every package
         directory under ``APIs``, writing into ``Schemas``.
      6. Optionally download the dataset folder and the WS dataset archive.

    Args:
        VERSION: Version string embedded in the zip file name.
        download_datasets: When True, also fetch the dataset folder and the
            WS dataset zip into the workspace.
        save_directory: Parent directory of the per-version workspace.

    Raises:
        SystemExit: If the zip file cannot be found, is not a valid zip, or
            extraction fails (the function calls ``sys.exit`` in those cases).
    """
    import io
    import os
    import sys
    import zipfile
    import shutil
    from googleapiclient.http import MediaIoBaseDownload

    drive_service = GoogleDrive.service

    # Define paths
    CONTENT_DIR = os.path.join(save_directory, VERSION)
    # Start from an empty workspace. os.remove() cannot delete a directory, so
    # a workspace left by a previous run has to be removed with rmtree; this
    # also clears the APIs/DBs/Scripts/Schemas sub-directories and the zip.
    if os.path.isdir(CONTENT_DIR) and not os.path.islink(CONTENT_DIR):
        shutil.rmtree(CONTENT_DIR)
    elif os.path.lexists(CONTENT_DIR):
        os.remove(CONTENT_DIR)
    os.makedirs(CONTENT_DIR, exist_ok=True)

    APIS_DIR = os.path.join(CONTENT_DIR, 'APIs')
    DBS_DIR = os.path.join(CONTENT_DIR, 'DBs')
    SCRIPTS_DIR = os.path.join(CONTENT_DIR, 'Scripts')
    FC_DIR = os.path.join(CONTENT_DIR, 'Schemas')
    ZIP_PATH = os.path.join(CONTENT_DIR, f'APIs_V{VERSION}.zip')

    # Google Drive Folder ID where versioned APIs zip files are stored
    APIS_FOLDER_ID = '1QpkAZxXhVFzIbm8qPGPRP1YqXEvJ4uD4'

    # Archive members are extracted only if their path starts with one of these
    ITEMS_TO_EXTRACT = ('APIs/', 'DBs/', 'Scripts/')

    def download_drive_file(service, file_id, output_path, file_name=None, show_progress=True):
        """Download a Drive file's content to `output_path`, chunk by chunk.

        `file_name` is accepted for call-site readability but not used.
        """
        request = service.files().get_media(fileId=file_id)
        with io.FileIO(output_path, 'wb') as fh:
            downloader = MediaIoBaseDownload(fh, request)
            done = False
            while not done:
                status, done = downloader.next_chunk()
                if show_progress:
                    print(f"Download progress: {int(status.progress() * 100)}%")

    # 1. List files in the specified APIs folder
    print(f"Searching for APIs zip file with version {VERSION} in folder: {APIS_FOLDER_ID}...")
    apis_file_id = None
    wanted_name = f'apis_v{VERSION.lower()}.zip'

    try:
        query = f"'{APIS_FOLDER_ID}' in parents and trashed=false"
        page_token = None
        # files().list() is paginated; keep following nextPageToken so the
        # zip is still found when the folder holds more than one page.
        while apis_file_id is None:
            results = drive_service.files().list(
                q=query,
                fields="nextPageToken, files(id, name)",
                pageToken=page_token,
            ).execute()
            for file in results.get('files', []):
                file_name = file.get('name', '')
                if file_name.lower() == wanted_name:
                    apis_file_id = file.get('id')
                    print(f"Found matching file: {file_name} (ID: {apis_file_id})")
                    break
            page_token = results.get('nextPageToken')
            if not page_token:
                break

    except Exception as e:
        print(f"An error occurred while listing files in Google Drive: {e}")

    if not apis_file_id:
        print(f"Error: Could not find APIs zip file with version {VERSION} in the specified folder.")
        sys.exit("Required APIs zip file not found.")

    # 2. Download the found APIs zip file
    print(f"Downloading APIs zip file with ID: {apis_file_id}...")
    download_drive_file(drive_service, apis_file_id, ZIP_PATH, file_name=f'APIs_V{VERSION}.zip')

    # 3. Extract specific items from the zip file into the workspace
    print(f"Extracting specific items from {ZIP_PATH} to {CONTENT_DIR}...")
    try:
        with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
            for member in zip_ref.namelist():
                if member.startswith(ITEMS_TO_EXTRACT):
                    zip_ref.extract(member, CONTENT_DIR)

    except zipfile.BadZipFile:
        print(f"Error: The downloaded file at {ZIP_PATH} is not a valid zip file.")
        sys.exit("Invalid zip file downloaded.")
    except Exception as e:
        print(f"An error occurred during extraction: {e}")
        sys.exit("Extraction failed.")

    # 4. Clean up
    if os.path.exists(ZIP_PATH):
        os.remove(ZIP_PATH)

    # 5. Add APIs to path (only once, so repeated calls do not grow sys.path)
    if os.path.exists(APIS_DIR):
        if APIS_DIR not in sys.path:
            sys.path.append(APIS_DIR)
    else:
        print(f"Error: APIS directory not found at {APIS_DIR} after extraction. Cannot add to path.")

    # 6. Quick verification
    # Check for the presence of the extracted items
    all_present = True
    print("\nVerifying extracted items:")
    for path in [APIS_DIR, DBS_DIR, SCRIPTS_DIR]:
        if os.path.exists(path):
            print(f"✅ {path} is present.")
        else:
            print(f"❌ {path} is MISSING!")
            all_present = False

    if all_present:
        print(f"\n✅ Setup complete! Required items extracted to {CONTENT_DIR}.")
    else:
        print("\n❌ Setup failed! Not all required items were extracted.")

    # 7. Generate Schemas

    # The workspace root goes on sys.path so that `Scripts` is importable.
    if os.path.exists(CONTENT_DIR):
        if CONTENT_DIR not in sys.path:
            sys.path.append(CONTENT_DIR)
    else:
        print(f"Error: CONTENT_DIR directory not found at {CONTENT_DIR} after extraction. Cannot add to path.")

    from Scripts.FCSpec import generate_package_schema

    print("\nGenerating FC Schemas")
    os.makedirs(FC_DIR, exist_ok=True)

    # One schema-generation call per package directory; plain files are skipped
    for package_name in os.listdir(APIS_DIR):
        package_path = os.path.join(APIS_DIR, package_name)
        if os.path.isdir(package_path):
            generate_package_schema(package_path, output_folder_path=FC_DIR)
    print(f"✅ Successfully generated {len(os.listdir(FC_DIR))} FC Schemas to {FC_DIR}")

    if download_datasets:
        def download_drive_folder(service, folder_id, destination_path):
            """
            Recursively downloads all files in a Google Drive folder using the `download_drive_file`
            """
            os.makedirs(destination_path, exist_ok=True)

            query = f"'{folder_id}' in parents and trashed=false"
            page_token = None

            while True:
                results = service.files().list(
                    q=query,
                    spaces='drive',
                    fields='nextPageToken, files(id, name, mimeType)',
                    pageToken=page_token
                ).execute()

                for item in results.get('files', []):
                    file_id = item['id']
                    file_name = item['name']
                    mime_type = item['mimeType']

                    if mime_type == 'application/vnd.google-apps.folder':
                        # Recursively download subfolders
                        new_path = os.path.join(destination_path, file_name)
                        print(f"Creating subfolder and downloading: {new_path}")
                        download_drive_folder(service, file_id, new_path)
                    else:
                        # Construct full file path and pass it as output_path
                        full_path = os.path.join(destination_path, file_name)
                        print(f"Downloading file: {file_name} to {full_path}")
                        download_drive_file(service, file_id, full_path, file_name=file_name, show_progress=False)

                page_token = results.get('nextPageToken', None)
                if not page_token:
                    break

        # --- Configuration for Dataset Download ---
        # This FOLDER_ID should contain the 'Quotewk.csv' file.
        FOLDER_ID = "1tZqZB1vAxp4TTxbPm6O2YjfkZD4FM-ml"
        DATASET_FOLDER = os.path.join(CONTENT_DIR, 'workspace/Datasets')

        print(f"Starting download of folder {FOLDER_ID} to {DATASET_FOLDER}...")
        download_drive_folder(drive_service, FOLDER_ID, DATASET_FOLDER)
        print("Dataset download complete.")

        # --- Configuration for WS Dataset Download ---
        # This FOLDER_ID should contain the 'WS Multihop Datasets' file.
        WS_DATA_ID = "1kmXZ1oarBPlE0OQL52eGoc1xPbupJ1n9"
        WS_DATA_ZIP_PATH = os.path.join(CONTENT_DIR, 'WS_DATA.zip')

        print(f"Downloading WS Dataset zip file with ID: {WS_DATA_ID}...")
        download_drive_file(drive_service, WS_DATA_ID, WS_DATA_ZIP_PATH, file_name='WS_DATA.zip')
        print("Dataset download complete.")

        # Extract the Datasets
        with zipfile.ZipFile(WS_DATA_ZIP_PATH, 'r') as zip_ref:
            zip_ref.extractall(CONTENT_DIR)
        print(f"Extracted to {CONTENT_DIR}")

        # Moving 'file_dataset_pb2.py' to root directory
        src_path = os.path.join(CONTENT_DIR, 'WS_DATA', 'file_dataset_pb2.py')
        dst_path = os.path.join(CONTENT_DIR, 'file_dataset_pb2.py')

        if os.path.exists(src_path):
            shutil.move(src_path, dst_path)
            print(f"Moved {src_path} to {dst_path}")
        else:
            print(f"Source file not found: {src_path}")

        # Clean up
        if os.path.exists(WS_DATA_ZIP_PATH):
            os.remove(WS_DATA_ZIP_PATH)

In [50]:
# Build the workspace for the default API version, datasets included.
download_apis(
    download_datasets=True,
    save_directory='clean_workspace',
)

Searching for APIs zip file with version 0.1.0 in folder: 1QpkAZxXhVFzIbm8qPGPRP1YqXEvJ4uD4...
Found matching file: APIs_V0.1.0.zip (ID: 1hLV2slrHhH0RquKU-8oWRJRs_nHh5CT_)
Downloading APIs zip file with ID: 1hLV2slrHhH0RquKU-8oWRJRs_nHh5CT_...
Download progress: 100%
Extracting specific items from clean_workspace/0.1.0/APIs_V0.1.0.zip to clean_workspace/0.1.0...

Verifying extracted items:
✅ clean_workspace/0.1.0/APIs is present.
✅ clean_workspace/0.1.0/DBs is present.
✅ clean_workspace/0.1.0/Scripts is present.

✅ Setup complete! Required items extracted to clean_workspace/0.1.0.

Generating FC Schemas
‚úÖ notes_and_lists Schema generation complete: clean_workspace/0.1.0/Schemas/notes_and_lists.json


Processing mutation notes_and_lists.mutations.m01...
‚úÖ notes_and_lists.mutations.m01 Schema generation complete: clean_workspace/0.1.0/MutationSchemas/m01/notes_and_lists.json

‚úÖ google_maps Schema generation complete: clean_workspace/0.1.0/Schemas/google_maps.json


Processi

  """
  """
  """
  """
  """
  """
  """
  """
  """
  """
  """
  """
  """
  """
  """


‚úÖ workday Schema generation complete: clean_workspace/0.1.0/Schemas/workday.json


Processing mutation workday.mutations.m01...
‚úÖ workday.mutations.m01 Schema generation complete: clean_workspace/0.1.0/MutationSchemas/m01/workday.json

‚úÖ azure Schema generation complete: clean_workspace/0.1.0/Schemas/azure.json


Processing mutation azure.mutations.m01...
‚úÖ azure.mutations.m01 Schema generation complete: clean_workspace/0.1.0/MutationSchemas/m01/azure.json

‚úÖ media_control Schema generation complete: clean_workspace/0.1.0/Schemas/media_control.json


Processing mutation media_control.mutations.m01...
‚úÖ media_control.mutations.m01 Schema generation complete: clean_workspace/0.1.0/MutationSchemas/m01/media_control.json

‚úÖ google_meet Schema generation complete: clean_workspace/0.1.0/Schemas/google_meet.json


Processing mutation google_meet.mutations.m01...
‚úÖ google_meet.mutations.m01 Schema generation complete: clean_workspace/0.1.0/MutationSchemas/m01/google_meet.json



  """
  """
  """
  """
  """
  """


‚úÖ jira.mutations.m01 Schema generation complete: clean_workspace/0.1.0/MutationSchemas/m01/jira.json

‚úÖ github Schema generation complete: clean_workspace/0.1.0/Schemas/github.json


Processing mutation github.mutations.m01...
‚úÖ github.mutations.m01 Schema generation complete: clean_workspace/0.1.0/MutationSchemas/m01/github.json

‚úÖ figma Schema generation complete: clean_workspace/0.1.0/Schemas/figma.json


Processing mutation figma.mutations.m01...
‚úÖ figma.mutations.m01 Schema generation complete: clean_workspace/0.1.0/MutationSchemas/m01/figma.json

‚úÖ github_actions Schema generation complete: clean_workspace/0.1.0/Schemas/github_actions.json


Processing mutation github_actions.mutations.m01...
‚úÖ github_actions.mutations.m01 Schema generation complete: clean_workspace/0.1.0/MutationSchemas/m01/github_actions.json

‚úÖ zendesk Schema generation complete: clean_workspace/0.1.0/Schemas/zendesk.json


Processing mutation zendesk.mutations.m01...
‚úÖ zendesk.mutations.m01 

# Fetch & Download Colabs / Notebooks

In [39]:
sheet_id = "1V6IyjZMqXcQ07zc0naOm6cbYySKLxN95GRjuqLKzVDU"
data_tab = "auto_qc_data"

# Read the tracking tab and derive the Drive file ID from each notebook URL.
colabs_df = GoogleSheet.get_sheet_data(sheet_id, data_tab).assign(
    colab_id=lambda frame: frame['colab_url'].apply(GoogleService.extract_file_id)
)

# Optional row filters, currently disabled:
# colabs_df = colabs_df.loc[(colabs_df['status'] == "FALSE")]
# colabs_df = colabs_df.loc[((colabs_df['w/o'] != 'No Error Found') | (colabs_df['with'] != 'No Error Found')) & (colabs_df['status'] == 'Needs Fixes')]

colabs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222 entries, 0 to 221
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sample_id  222 non-null    object
 1   colab_url  222 non-null    object
 2   status     222 non-null    object
 3   colab_id   222 non-null    object
dtypes: object(4)
memory usage: 7.1+ KB


In [40]:
colab_names = []
name_request_batch_size = 99

# Each file ID doubles as the request ID inside a batch, so the IDs sent must
# be unique; rows whose URL produced no ID are left out of the lookup.
unique_colab_ids = colabs_df['colab_id'].dropna().drop_duplicates().tolist()
for start in range(0, len(unique_colab_ids), name_request_batch_size):
    colab_names += GoogleDrive.get_file_names_in_batch(unique_colab_ids[start:start+name_request_batch_size])

# Explicit columns keep the frame well-formed even when no name came back.
colab_name_df = pd.DataFrame(colab_names, columns=['colab_id', 'colab_name'])
colab_name_df = colab_name_df[~colab_name_df['colab_name'].isna()]

# Dropping a colab_name left by an earlier run of this cell keeps the merge
# from producing colab_name_x / colab_name_y on re-execution.
colabs_df = pd.merge(
    colab_name_df,
    colabs_df.drop(columns=['colab_name'], errors='ignore'),
    on='colab_id',
)
# colabs_df = colabs_df.drop_duplicates(['colab_id'])
colabs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222 entries, 0 to 221
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   colab_id    222 non-null    object
 1   colab_name  222 non-null    object
 2   sample_id   222 non-null    object
 3   colab_url   222 non-null    object
 4   status      222 non-null    object
dtypes: object(5)
memory usage: 8.8+ KB


# Create Batches and Configuration Files for Docker Runs

In [47]:
# Number of distinct notebooks to run.
total_samples = len(set(colabs_df['colab_id']))
# Upper bound on the number of batches; the orchestration output shows one
# container launched per batch.
max_container = 50
# Clamped to at least 1 so an empty sheet does not divide by zero here or in
# the batch assignment that uses max_batch_size afterwards.
max_batch_size = max(1, math.ceil(total_samples / max_container))
print(f'Max Batches: {math.ceil(total_samples/max_batch_size)}\nMax Samples Per Batch: {max_batch_size}')

Max Batches: 45
Max Samples Per Batch: 5


In [49]:
api_version = '0.1.0'

# drop_duplicates() keeps first-seen order, so the same sheet always yields
# the same batches; iterating a set of strings is not stable across kernels.
unique_colab_ids = colabs_df['colab_id'].drop_duplicates().tolist()
notebooks = [{'path': notebook, 'api_version': api_version} for notebook in unique_colab_ids]
notebooks_df = pd.DataFrame(notebooks, columns=['path', 'api_version'])

# Within each API version, number the notebooks 0..n-1 and cut that sequence
# into chunks of max_batch_size; batch_id is "<api_version>_<chunk index>".
position_in_version = notebooks_df.groupby('api_version').cumcount()
notebooks_df['batch_id'] = (
    notebooks_df['api_version'] + '_' + (position_in_version // max_batch_size).astype(str)
)

notebooks_df.to_csv('execution_configs.csv', index=False)

# Docker Orchestration

## For Local Run (where you have root access)

In [50]:
exec_config = pd.read_csv("execution_configs.csv")
# unique() keeps the order in which batch IDs appear in the file, so the list
# handed to the orchestrator is the same on every run (set order is not).
run_identifiers = exec_config['batch_id'].unique().tolist()

In [23]:
import sanity_orchestrator_with_download as orchestrator

# run_name is defined outside the try block because the results cell reads it
# even when the orchestration reports a critical error.
start_time = datetime.now()
run_name = f'sanity_check_{start_time.strftime("%Y%m%d_%H%M%S")}'
try:
    orchestrator.run_orchestration(run_name, run_identifiers)
    # total_seconds() covers the whole duration; timedelta.seconds holds only
    # the sub-day remainder and would under-report a run longer than 24 hours.
    elapsed_seconds = int((datetime.now() - start_time).total_seconds())
    print(f"Finished Docker Run. Time Taken: {elapsed_seconds} Seconds")
except (FileNotFoundError, FileExistsError, ConnectionError) as e:
    print(f"\n❌ A critical error occurred: {e}")

--- Step 1: Validating Host Environment ---
✅ Docker client connected.

--- Step 2: Preparing Host Directories ---
✅ Created log directory for this run at: /Users/nabeel/PycharmProjects/e2e_sanity_checks/execution_logs/sanity_check_20250821_131047
✅ Created result directory for this run at: /Users/nabeel/PycharmProjects/e2e_sanity_checks/results/sanity_check_20250821_131047
✅ Created result directory for this run at: /Users/nabeel/PycharmProjects/e2e_sanity_checks/executed_notebooks/sanity_check_20250821_131047

--- Step 4: Launching Containers in Parallel ---
  -> Launching container 'sanity_check_20250821_131047-0' for batch 0...
  -> Launching container 'sanity_check_20250821_131047-1' for batch 1...
  -> Launching container 'sanity_check_20250821_131047-2' for batch 2...
  -> Launching container 'sanity_check_20250821_131047-3' for batch 3...
  -> Launching container 'sanity_check_20250821_131047-4' for batch 4...
  -> Launching container 'sanity_check_20250821_131047-5' fo

## For VM Run (where you need to use sudo to run docker)

In [1]:
# Runs runner.py with the project's virtualenv interpreter under sudo.
# NOTE(review): the recorded output of this cell is "sudo: a password is
# required", i.e. sudo could not obtain a password when launched from the
# notebook. Presumably this needs passwordless sudo for the command, or has
# to be started from a terminal instead - confirm.
!sudo .venv/bin/python runner.py

Password:sudo: a password is required


# Process Results

In [24]:
# Gather the 'result' list of every file the run wrote into its results
# directory, then flatten the records into one table.
output_dir = f'results/{run_name}'
output_files = os.listdir(output_dir)
complete_data = []
for result_name in output_files:
    with open(Path(output_dir) / result_name, 'r') as result_file:
        payload = json.load(result_file)
    complete_data.extend(payload['result'])

# json_normalize flattens nested keys into columns
sanity_df = pd.json_normalize(complete_data)
sanity_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222 entries, 0 to 221
Data columns (total 11 columns):
 #   Column                                                Non-Null Count  Dtype 
---  ------                                                --------------  ----- 
 0   notebook                                              222 non-null    object
 1   contains_golden_answer                                222 non-null    bool  
 2   contains_final_assert                                 222 non-null    bool  
 3   script_passed                                         222 non-null    bool  
 4   script_failure_msg                                    222 non-null    object
 5   Set Up - Install Dependencies and Clone Repositories  222 non-null    object
 6   Set Up - Import APIs and initiate DBs                 222 non-null    object
 7   Final Assertion_NO_ACTION                             222 non-null    object
 8   Initial Assertion                                     222 non-null    

In [25]:
# Preview the first rows of the flattened results table.
sanity_df.head()

Unnamed: 0,notebook,contains_golden_answer,contains_final_assert,script_passed,script_failure_msg,Set Up - Install Dependencies and Clone Repositories,Set Up - Import APIs and initiate DBs,Final Assertion_NO_ACTION,Initial Assertion,Action,Final Assertion
0,1xWzV1ZwJYVqJBc2xoYzOsYzF2F2hS0RW,False,True,True,,,,ErrorType: AssertionError\nError Description: ...,,,ErrorType: NameError\nError Description: name ...
1,1_vPLqKE0J-nLQxjPkwDVN6kP42qvVK4l,True,False,False,"Traceback (most recent call last):\n File ""/a...",,,,,,
2,1n5qtNhll6vZnnHMh7H4Ec6FQuSmYVFon,True,True,True,,,,ErrorType: AssertionError\nError Description: ...,,ErrorType: AttributeError\nError Description: ...,ErrorType: AssertionError\nError Description: ...
3,17I3m3qQGZE9BuBnwB7GSUYhQUsvv-v_d,True,True,True,,,,ErrorType: NameError\nError Description: name ...,,,
4,1_sMiRI0ieIgdtHY-bzVMBdxqTX6QoF8-,True,True,True,,,,ErrorType: AttributeError\nError Description: ...,,,


In [32]:
# --- Error categories for a single executed code block ---
# The two *_FAILED_ASSERTION labels are not referenced in the visible part of
# this notebook.
FA_FAILED_ASSERTION = "FA Failed - Assertion Error"
IA_FAILED_ASSERTION = "IA Failed - Assertion Error"
NON_ASSERTION_ERROR = "Non Assertion Error"
ASSERTION_ERROR = "Assertion Error"
NO_ERROR_FOUND = "No Error Found"
UNDEFINED_ERROR = "Undefined Error Type"

# --- Overall Auto QC verdicts for a notebook ---
NEEDS_FIXES = "Needs Fixes"
GOOD_TO_GO = "Good To Go"
NEEDS_MANUAL_REVIEW = "Needs Manual Review"
CHECK_NOT_EXECUTED = "Check Not Executed"

def add_error_type(error_message):
    if error_message == "":
        return NO_ERROR_FOUND
    error_type = error_message.split('\n')[0].split(':')[-1].strip()

    if error_type != 'AssertionError':
        return NON_ASSERTION_ERROR
    if error_type == 'AssertionError':
        return ASSERTION_ERROR

    return UNDEFINED_ERROR

def get_auto_qc_status(row):
    """Derive the overall Auto QC verdict for one notebook row.

    Rules are evaluated in this order; the first match wins:

    1. The QC script itself failed                      -> NEEDS_FIXES
    2. Any step reports a Non Assertion Error           -> NEEDS_FIXES
    3. Initial Assertion raised an Assertion Error      -> NEEDS_FIXES
    4. Final Assertion (after Action) raised one        -> NEEDS_FIXES
    5. Final Assertion without Action raised one        -> GOOD_TO_GO
    6. Every step reports No Error Found                -> NEEDS_FIXES if the
       notebook contains a final assert, otherwise GOOD_TO_GO
    7. Anything else (e.g. an Assertion Error only in Initialisation or
       Action, or an unrecognised status label)         -> NEEDS_MANUAL_REVIEW

    Args:
        row: Mapping-like row (e.g. a DataFrame row) providing the five
            'Execution Status ...' values read below plus
            'contains_final_assert' and 'script_passed'.

    Returns:
        pd.Series of two items: (status, message).
    """
    # Read every field up front so a missing column fails loudly regardless
    # of which rule ends up matching.
    init_status = row['Execution Status Initialisation']
    status_fa_no_action = row['Execution Status FA w/o Action']
    status_ia = row['Execution Status IA']
    status_action = row['Execution Status Action']
    status_fa = row['Execution Status FA']
    contains_final_assert = row['contains_final_assert']
    script_success = row['script_passed']

    # NOTE(review): the 'Execution Status Install Dependencies and Clone
    # Repositories' column is not consulted here - confirm that is intended.
    step_statuses = [init_status, status_fa_no_action, status_ia, status_action, status_fa]

    if not script_success:
        status = NEEDS_FIXES
        message = "Failed: Script to run Auto QC failed"

    elif NON_ASSERTION_ERROR in step_statuses:
        status = NEEDS_FIXES
        message = "Failed: One of the code block contains Non Assertion Error(s)"

    elif status_ia == ASSERTION_ERROR:
        status = NEEDS_FIXES
        message = "Failed: Assertion Failure in Initial Assertion."

    elif status_fa == ASSERTION_ERROR:
        status = NEEDS_FIXES
        message = "Failed: Final Assertion Failure even when Action is executed. Either Final Assertion or Action needs to be fixed."

    elif status_fa_no_action == ASSERTION_ERROR:
        status = GOOD_TO_GO
        message = "Passes: All Steps executed successfully and FA failed w/o action."

    elif all(step_status == NO_ERROR_FOUND for step_status in step_statuses):
        if contains_final_assert:
            status = NEEDS_FIXES
            message = "Failed: If FA is present, it must fail in absense of the action"
        else:
            status = GOOD_TO_GO
            message = "Passed: No FA block found so FA without action is expected to pass."

    else:
        # Previously this combination fell through with an empty status and
        # message; surface it explicitly so the row is not silently unlabeled.
        status = NEEDS_MANUAL_REVIEW
        message = "Manual Review: Unexpected combination of execution statuses (e.g. Assertion Error in Initialisation or Action)."

    return pd.Series((status, message))


In [27]:
# List the column names; the per-step error-message columns shown here are
# the inputs for the 'Execution Status ...' columns derived next.
sanity_df.columns

Index(['notebook', 'contains_golden_answer', 'contains_final_assert',
       'script_passed', 'script_failure_msg',
       'Set Up - Install Dependencies and Clone Repositories',
       'Set Up - Import APIs and initiate DBs', 'Final Assertion_NO_ACTION',
       'Initial Assertion', 'Action', 'Final Assertion'],
      dtype='object')

In [29]:
# Derive one 'Execution Status ...' column per notebook step by classifying
# that step's captured error message. Pairs are (status column, source column);
# order determines the order in which the new columns are appended.
for status_column, message_column in (
    ('Execution Status Install Dependencies and Clone Repositories',
     'Set Up - Install Dependencies and Clone Repositories'),
    ('Execution Status Initialisation', 'Set Up - Import APIs and initiate DBs'),
    ('Execution Status FA w/o Action', 'Final Assertion_NO_ACTION'),
    ('Execution Status IA', 'Initial Assertion'),
    ('Execution Status Action', 'Action'),
    ('Execution Status FA', 'Final Assertion'),
):
    sanity_df[status_column] = sanity_df[message_column].apply(add_error_type)

In [None]:
# Rename 'notebook' to 'colab_id' so sanity_df shares its key column name
# with colabs_df, which it is merged with next.
sanity_df = sanity_df.rename(columns={'notebook': 'colab_id'})

In [55]:
# Attach sample_id to every sanity-check row. This is an inner join on
# 'colab_id', so rows without a match on either side are dropped.
colab_keys = colabs_df[['colab_id', 'sample_id']]
merged_df = colab_keys.merge(sanity_df, on='colab_id')

In [56]:
# Preview the merged frame.
# NOTE(review): the saved output below already lists 'Auto QC Status' and
# 'Auto QC Message', which are only added by a later cell - the cells were
# evidently run out of order; re-run top to bottom before relying on it.
merged_df.head()

Unnamed: 0,colab_id,sample_id,contains_golden_answer,contains_final_assert,script_passed,script_failure_msg,Set Up - Install Dependencies and Clone Repositories,Set Up - Import APIs and initiate DBs,Final Assertion_NO_ACTION,Initial Assertion,Action,Final Assertion,Execution Status Install Dependencies and Clone Repositories,Execution Status Initialisation,Execution Status FA w/o Action,Execution Status IA,Execution Status Action,Execution Status FA,Auto QC Status,Auto QC Message
0,1WZF_kmS0Z78ZaSbRT3s69grD5EuvFTFg,1WZF_kmS0Z78ZaSbRT3s69grD5EuvFTFg,False,True,True,,,,ErrorType: AssertionError\nError Description: ...,,,,No Error Found,No Error Found,Assertion Error,No Error Found,No Error Found,No Error Found,Good To Go,Passes: All Steps executed successfully and FA...
1,1ddOYUiLnmflqzQQ7r9AYb3p-D-THcE7j,1ddOYUiLnmflqzQQ7r9AYb3p-D-THcE7j,False,True,True,,,,ErrorType: AssertionError\nError Description: ...,,ErrorType: NameError\nError Description: name ...,ErrorType: AssertionError\nError Description: ...,No Error Found,No Error Found,Assertion Error,No Error Found,Non Assertion Error,Assertion Error,Needs Fixes,Failed: One of the code block contains Non Ass...
2,1oSI1iciSHXJd5WPS6gN-ZFl8NtK7F_rC,1oSI1iciSHXJd5WPS6gN-ZFl8NtK7F_rC,True,True,True,,,,ErrorType: AssertionError\nError Description: ...,,ErrorType: NameError\nError Description: name ...,ErrorType: AssertionError\nError Description: ...,No Error Found,No Error Found,Assertion Error,No Error Found,Non Assertion Error,Assertion Error,Needs Fixes,Failed: One of the code block contains Non Ass...
3,1sOHIUssSmkKuXK8IASssFHaS9s1WMtgV,1sOHIUssSmkKuXK8IASssFHaS9s1WMtgV,True,False,True,,,,,,ErrorType: NameError\nError Description: name ...,,No Error Found,No Error Found,No Error Found,No Error Found,Non Assertion Error,No Error Found,Needs Fixes,Failed: One of the code block contains Non Ass...
4,149DQS_PcWIVB81k248hatMQlQiJLk0wJ,149DQS_PcWIVB81k248hatMQlQiJLk0wJ,True,False,True,,,,,,ErrorType: CommandExecutionError\nError Descri...,,No Error Found,No Error Found,No Error Found,No Error Found,Non Assertion Error,No Error Found,Needs Fixes,Failed: One of the code block contains Non Ass...


In [57]:
# Compute the verdict for every row; get_auto_qc_status returns a two-item
# Series (status, message) which is assigned to the two new columns.
merged_df[['Auto QC Status', 'Auto QC Message']] = merged_df.apply(get_auto_qc_status, axis=1)
# Show how many notebooks fall under each verdict.
merged_df['Auto QC Status'].value_counts()

Auto QC Status
Needs Fixes    191
Good To Go      31
Name: count, dtype: int64

In [58]:
# Count how many notebooks the QC script itself ran successfully on.
merged_df['script_passed'].value_counts()

script_passed
True     176
False     46
Name: count, dtype: int64

In [59]:
# Replace missing values with empty strings before the text columns are
# trimmed and the frame is uploaded.
merged_df = merged_df.fillna("")

In [60]:
# Check column dtypes and non-null counts after filling missing values.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222 entries, 0 to 221
Data columns (total 20 columns):
 #   Column                                                        Non-Null Count  Dtype 
---  ------                                                        --------------  ----- 
 0   colab_id                                                      222 non-null    object
 1   sample_id                                                     222 non-null    object
 2   contains_golden_answer                                        222 non-null    bool  
 3   contains_final_assert                                         222 non-null    bool  
 4   script_passed                                                 222 non-null    bool  
 5   script_failure_msg                                            222 non-null    object
 6   Set Up - Install Dependencies and Clone Repositories          222 non-null    object
 7   Set Up - Import APIs and initiate DBs                         222 non-null    ob

In [61]:
def trim_text(text, max_chars=49999):
    """Truncate a string to at most ``max_chars`` characters.

    Args:
        text: Value to trim. Strings are cut to their first ``max_chars``
            characters; any non-string value is returned unchanged instead of
            raising.
        max_chars: Maximum number of characters to keep. The default of 49999
            is presumably chosen to stay under the Google Sheets per-cell
            character limit, since the trimmed frame is uploaded to a sheet
            afterwards - TODO confirm.

    Returns:
        The trimmed string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    return text[:max_chars]

In [62]:
# Cap the length of every value in the text-like (object / string dtype)
# columns by running each value through trim_text.
for col in merged_df.select_dtypes(include=['object', 'string']).columns:
    merged_df[col] = merged_df[col].map(trim_text)

In [64]:
# Upload the Auto QC results to the dev spreadsheet.
output_tab = 'auto_qc_response_parser'
dev_sheet_id = '1V6IyjZMqXcQ07zc0naOm6cbYySKLxN95GRjuqLKzVDU'
# merged_df no longer has a 'notebook' column (it was renamed to 'colab_id'
# before the merge), so de-duplicate on the key column that actually exists.
# NOTE(review): assumes drop_duplicates_on names columns of the uploaded
# frame - confirm against GoogleSheet.add_dataframe_to_sheet.
GoogleSheet.add_dataframe_to_sheet(dev_sheet_id, merged_df, output_tab, drop_duplicates_on=['colab_id'])
