## Uploading Ask Sage Documentation into an Ask Sage Dataset

This script uploads the Ask Sage documentation into an Ask Sage dataset. It is designed as an example use case; users will still need to adapt the code to what they want to accomplish.

Make sure to read the README file for more information on how to use this script.

In [8]:
# pip install -r requirements.txt
from asksageclient import AskSageClient
from dotenv import load_dotenv
import logging
from dataclasses import dataclass
import os
import pandas as pd
from datetime import datetime
from pathlib import Path
import time
import requests
import hashlib
import time
from typing import Dict, List, Tuple, Optional

# ============================================================================


In [None]:
# ============================================================================
# LOGGING CONFIGURATION
# ============================================================================

# Configure the root logger for this notebook session: records at INFO level
# and above are emitted as "<timestamp> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the functions defined in this cell.
logger = logging.getLogger(__name__)

# ============================================================================
# CONFIGURATION
# ============================================================================

@dataclass
class TenantInstance:
    """Everything needed to work with one tenant, kept in a single record.

    Groups the API client with its access token and API key, the base URLs
    the client was created with, and the dataset metadata (match count and
    selected dataset name) that is filled in after the dataset lookup.
    """
    name: str
    client: AskSageClient
    token: str
    api_key: str
    user_base_url: Optional[str] = None
    server_base_url: Optional[str] = None
    datasets_count: int = 0
    dataset_name: Optional[str] = None

    def __repr__(self) -> str:
        """Return a printable summary that never exposes the token or API key."""
        # Only reveal whether a token is present, never its value.
        masked_token = '***' if self.token else 'None'
        shown_fields = [
            f"name='{self.name}'",
            f"datasets_count={self.datasets_count}",
            f"dataset_name='{self.dataset_name}'",
            f"token={masked_token}",
        ]
        return "TenantInstance(" + ", ".join(shown_fields) + ")"

# Tenant configuration - modify these values to match the tenant you are working with.
TENANT_NAME = "Ask Sage Environment Name"  # label used in console and log messages
API_KEY_ENV = "API_KEY"  # name of the environment variable that holds the API key
USER_BASE_URL = "https://api.asksage.ai/user/"
SERVER_BASE_URL = "https://api.asksage.ai/server/"

# Default string used to select the target dataset by name. The dataset is
# assumed to exist already: this script only looks it up and uploads documents to it.
DEFAULT_FILTER_STRING = '_example-testing-dataset-script_'

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def create_tenant_instance(email: str) -> TenantInstance:
    """
    Build the TenantInstance for the configured tenant.

    The API key is read from the environment variable named by API_KEY_ENV,
    a client is created against USER_BASE_URL / SERVER_BASE_URL, and the
    access token is taken from the client's 'x-access-tokens' header.

    Args:
        email: User email address

    Returns:
        TenantInstance holding the client, token, API key and base URLs

    Raises:
        ValueError: If email is empty or has no '@', or the API key is missing
        Exception: Any error raised while creating the client is logged and re-raised
    """
    email_looks_valid = bool(email) and '@' in email
    if not email_looks_valid:
        raise ValueError(f"Invalid email address: {email}")

    api_key = os.getenv(API_KEY_ENV)
    if not api_key:
        raise ValueError(f"{API_KEY_ENV} not found in environment variables")

    endpoints = {
        'user_base_url': USER_BASE_URL,
        'server_base_url': SERVER_BASE_URL,
    }

    try:
        client = AskSageClient(email=email, api_key=api_key, **endpoints)

        # A missing token is only reported; the instance is still returned.
        token = client.headers.get('x-access-tokens', '')
        if not token:
            logger.warning(f"No token found in headers for {TENANT_NAME}")

        tenant = TenantInstance(
            name=TENANT_NAME,
            client=client,
            token=token,
            api_key=api_key,
            **endpoints
        )
        logger.info(f"Successfully created client for {TENANT_NAME}")
        return tenant

    except Exception as exc:
        logger.error(f"Error creating client for {TENANT_NAME}: {exc!s}", exc_info=True)
        raise


def get_filtered_datasets(
    instance: TenantInstance, 
    filter_string: str = DEFAULT_FILTER_STRING
) -> pd.DataFrame:
    """
    Retrieve the tenant's datasets and keep those whose name contains filter_string.

    The match is a case-insensitive, literal substring match (the filter is
    not interpreted as a regular expression). The instance metadata is kept
    in sync with the result: ``datasets_count`` is the number of matches and
    ``dataset_name`` is the first match, or None when nothing matched, so a
    stale name from an earlier lookup is never reused.

    Args:
        instance: TenantInstance object (its metadata fields are updated)
        filter_string: String to filter dataset names; empty disables filtering

    Returns:
        DataFrame with filtered datasets. An empty DataFrame with columns
        ['id', 'name'] is returned when there are no datasets or when the
        lookup fails (the error is logged, and the instance metadata is left
        as it was because the real state is unknown).
    """
    try:
        datasets = instance.client.get_datasets()
        
        if not datasets:
            logger.info(f"No datasets found for {instance.name}")
            instance.datasets_count = 0
            instance.dataset_name = None
            return pd.DataFrame(columns=['id', 'name'])
        
        df = pd.DataFrame(datasets)
        
        # Drop status column if it exists
        if 'status' in df.columns:
            df = df.drop(columns=['status'])
        
        # Rename response to name if needed
        if 'response' in df.columns:
            df = df.rename(columns={'response': 'name'})
        
        # Filter datasets by name. regex=False keeps characters such as
        # '.', '+' or '(' in the filter from being read as regex syntax.
        if filter_string and 'name' in df.columns:
            original_count = len(df)
            matches = df['name'].str.contains(
                filter_string, case=False, na=False, regex=False
            )
            df = df[matches]
            logger.info(f"Filtered {original_count} datasets to {len(df)} for {instance.name}")
        
        # Update instance metadata
        instance.datasets_count = len(df)
        
        if not df.empty and 'name' in df.columns:
            instance.dataset_name = df.iloc[0]['name']
            if len(df) > 1:
                # Make the silent "first match wins" choice visible.
                logger.warning(
                    f"{len(df)} datasets match for {instance.name}; "
                    f"using the first one"
                )
            logger.info(f"Set dataset name for {instance.name}: {instance.dataset_name}")
        else:
            # Nothing usable matched: clear any name left by a previous call.
            instance.dataset_name = None
        
        return df
    
    except Exception as e:
        logger.error(f"Error retrieving datasets for {instance.name}: {str(e)}", exc_info=True)
        return pd.DataFrame(columns=['id', 'name'])


def print_summary(instance: TenantInstance, datasets_df: pd.DataFrame) -> None:
    """
    Print a short recap of the dataset lookup.

    Args:
        instance: TenantInstance whose name and dataset name are reported
        datasets_df: DataFrame with datasets; only its row count is reported
    """
    summary_lines = (
        "\nüìà Summary:",
        f"   Tenant: {instance.name}",
        f"   Total datasets found: {len(datasets_df)}",
        # Fall back to a placeholder when no dataset was selected.
        f"   Dataset name: {instance.dataset_name or 'N/A'}",
    )
    for line in summary_lines:
        print(line)


# ============================================================================
# MAIN EXECUTION
# ============================================================================

def main() -> Tuple[TenantInstance, pd.DataFrame]:
    """
    Initialize the tenant and look up its matching datasets.

    Loads the '.env' file, reads EMAIL from the environment, creates the
    tenant instance, retrieves the filtered datasets and prints the results.

    Returns:
        Tuple containing:
        - TenantInstance object
        - DataFrame with filtered datasets

    Raises:
        ValueError: If EMAIL environment variable is not set
    """
    load_dotenv(dotenv_path='.env')
    email = os.getenv("EMAIL")
    if not email:
        raise ValueError("EMAIL not found in environment variables. Please set it in .env file")

    rule = "=" * 80

    print(rule)
    print("üöÄ Ask Sage Dataset Retrieval")
    print(rule)

    print(f"\nüì° Processing {TENANT_NAME} tenant...")

    # Step 1: client and token
    instance = create_tenant_instance(email)
    token_state = 'Retrieved' if instance.token else 'Missing'
    print(f"‚úÖ Client created for {TENANT_NAME}")
    print(f"   Token: {token_state}")

    # Step 2: datasets matching the default filter
    datasets_df = get_filtered_datasets(instance)

    if datasets_df.empty:
        print("‚ÑπÔ∏è  No matching datasets found")
    else:
        print(f"‚úÖ Found {len(datasets_df)} matching dataset(s)")
        print(f"   Dataset name: {instance.dataset_name}")

        print("\n" + rule)
        print("üìä DATASETS TABLE")
        print(rule)
        display(datasets_df)

    print_summary(instance, datasets_df)

    return instance, datasets_df


# ============================================================================
# UTILITY FUNCTIONS FOR ACCESSING STORED DATA
# ============================================================================

def get_client(instance: TenantInstance) -> AskSageClient:
    """Return the API client stored on the given tenant instance."""
    client = instance.client
    return client


def get_token(instance: TenantInstance) -> str:
    """Return the access token stored on the given tenant instance."""
    token = instance.token
    return token


# ============================================================================
# EXECUTE
# ============================================================================

if __name__ == "__main__":
    # Run the tenant initialization; the results stay in the notebook
    # namespace so that later cells can use them.
    try:
        tenant_instance, datasets_df = main()
    except Exception as exc:
        # Log with the traceback, then let the failure surface in the cell.
        logger.exception(f"Fatal error in main execution: {exc!s}")
        raise

üöÄ Ask Sage Dataset Retrieval

üì° Processing Ask Sage Environment Name tenant...


2025-11-05 16:18:20,543 - INFO - Successfully created client for Ask Sage Environment Name


‚úÖ Client created for Ask Sage Environment Name
   Token: Retrieved


2025-11-05 16:18:20,880 - INFO - Filtered 71 datasets to 1 for Ask Sage Environment Name
2025-11-05 16:18:20,881 - INFO - Set dataset name for Ask Sage Environment Name: user_custom_34125_example-testing-dataset-script_content


‚úÖ Found 1 matching dataset(s)
   Dataset name: user_custom_34125_example-testing-dataset-script_content

üìä DATASETS TABLE


Unnamed: 0,name
70,user_custom_34125_example-testing-dataset-scri...



üìà Summary:
   Tenant: Ask Sage Environment Name
   Total datasets found: 1
   Dataset name: user_custom_34125_example-testing-dataset-script_content


In [None]:
# ============================================================================
# LOGGING CONFIGURATION
# ============================================================================

# Same logging setup as the tenant-initialization cell, repeated so this cell
# can run on its own. logging.basicConfig leaves an already-configured root
# logger untouched, so running both cells does not add a second handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the functions defined in this cell.
logger = logging.getLogger(__name__)

# ============================================================================
# INGESTION CONFIGURATION
# ============================================================================

INGESTION_LOG_PATH = "ingestion_log.csv"  # CSV that records which files were ingested
DOCS_BASE_PATH = "docs"  # directory scanned recursively for documents
VERIFICATION_RETRIES = 3  # default number of API checks per verification
VERIFICATION_DELAY = 5  # seconds between verification attempts
UPLOAD_DELAY = 10  # seconds between uploads
DELETE_DELAY = 3  # seconds after deletion before re-upload

# Temporary files to delete after script completes
TEMP_FILES = ["files_to_upload.csv", "upload_progress.csv", "files_to_delete.csv"]

# Upload limit configuration: an integer number of files, or the string 'ALL'
UPLOAD_LIMIT = 5  # Change to 'ALL' for production

# Supported file extensions (lower-case, including the leading dot)
SUPPORTED_EXTENSIONS = {'.docx', '.doc', '.pdf', '.pptx', '.ppt'}

# ============================================================================
# INGESTION LOG MANAGEMENT
# ============================================================================

def load_ingestion_log() -> pd.DataFrame:
    """Return the ingestion log, creating an empty log file on first use.

    When INGESTION_LOG_PATH exists it is read as CSV. Otherwise an empty
    frame with the log columns is written to that path and returned.
    """
    if not Path(INGESTION_LOG_PATH).exists():
        print("üìù Creating new ingestion log")
        empty_log = pd.DataFrame(columns=[
            'relative_path', 'filename', 'file_modified_date',
            'file_hash', 'size_kb', 'file_type',
            'ingestion_date', 'status',
        ])
        empty_log.to_csv(INGESTION_LOG_PATH, index=False)
        return empty_log

    existing_log = pd.read_csv(INGESTION_LOG_PATH)
    print(f"‚úì Loaded ingestion log with {len(existing_log)} records")
    return existing_log


def save_ingestion_log(df: pd.DataFrame):
    """Write the ingestion log to INGESTION_LOG_PATH as CSV, without the index."""
    destination = INGESTION_LOG_PATH
    df.to_csv(destination, index=False)
    print(f"üíæ Saved ingestion log to {destination}")

# ============================================================================
# FILE MANAGEMENT FUNCTIONS
# ============================================================================

def get_file_hash(file_path: Path) -> str:
    """Return the MD5 hex digest of a file's content (used for change detection).

    The file is read in 4096-byte blocks so large files are never loaded
    into memory at once.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:  # empty read means end of file
                break
            digest.update(block)
    return digest.hexdigest()


def get_document_files(base_path: str = DOCS_BASE_PATH) -> pd.DataFrame:
    """Recursively find all Word, PDF, and PowerPoint files with metadata.

    A path is included when it is a regular file whose suffix, compared
    case-insensitively, is in SUPPORTED_EXTENSIONS (so ``REPORT.PDF`` is
    found as well as ``report.pdf``, and a directory named ``x.pdf`` is not).

    Args:
        base_path: Directory to scan.

    Returns:
        DataFrame with one row per file and the columns filename,
        relative_path (relative to the parent of base_path),
        file_modified_date, size_kb, file_hash, file_type and full_path,
        sorted by file_modified_date, newest first. An empty DataFrame with
        no columns is returned when no supported file is found.

    Raises:
        FileNotFoundError: If base_path does not exist.
    """
    docs_path = Path(base_path)
    if not docs_path.exists():
        raise FileNotFoundError(f"Directory not found: {docs_path}")
    
    # Single directory walk; compare suffixes in lower case so the match does
    # not depend on the filesystem's case sensitivity. Sorting the paths
    # makes the result order reproducible.
    wanted = {ext.lower() for ext in SUPPORTED_EXTENSIONS}
    file_list = sorted(
        p for p in docs_path.rglob("*")
        if p.is_file() and p.suffix.lower() in wanted
    )
    
    if not file_list:
        print(f"‚ö†Ô∏è  No supported document files found in {docs_path}")
        print(f"   Supported types: {', '.join(SUPPORTED_EXTENSIONS)}")
        return pd.DataFrame()
    
    print(f"‚úì Found {len(file_list)} document files")
    
    # Count by type
    type_counts = {}
    for f in file_list:
        ext = f.suffix.lower()
        type_counts[ext] = type_counts.get(ext, 0) + 1
    
    print("   File type breakdown:")
    for ext, count in sorted(type_counts.items()):
        print(f"   - {ext}: {count} files")
    
    records = []
    for f in file_list:
        info = f.stat()  # one stat call supplies both mtime and size
        records.append({
            "filename": f.name,
            "relative_path": str(f.relative_to(docs_path.parent)),
            "file_modified_date": datetime.fromtimestamp(info.st_mtime).strftime('%Y-%m-%d %H:%M:%S'),
            "size_kb": round(info.st_size / 1024, 2),
            "file_hash": get_file_hash(f),
            "file_type": f.suffix.lower(),
            "full_path": str(f),
        })
    
    df = pd.DataFrame(records, columns=[
        "filename", "relative_path", "file_modified_date",
        "size_kb", "file_hash", "file_type", "full_path",
    ])
    
    # A stable sort keeps files with the same timestamp in path order.
    return df.sort_values(
        by="file_modified_date", ascending=False, kind="mergesort"
    ).reset_index(drop=True)


def cleanup_temp_files():
    """Remove the files listed in TEMP_FILES that currently exist.

    A file that cannot be removed is reported and skipped; it does not stop
    the cleanup. Returns the list of file names that were deleted.
    """
    removed = []
    for candidate in TEMP_FILES:
        if not Path(candidate).exists():
            continue
        try:
            os.remove(candidate)
            removed.append(candidate)
            print(f"üóëÔ∏è  Deleted temporary file: {candidate}")
        except Exception as exc:
            print(f"‚ö†Ô∏è  Could not delete {candidate}: {exc}")

    removed_count = len(removed)
    if removed_count:
        print(f"\n‚úì Cleaned up {removed_count} temporary file(s)")
    return removed

# ============================================================================
# API FUNCTIONS
# ============================================================================

def get_api_base_url(instance) -> str:
    """Return the server API base URL without a trailing slash.

    Uses the instance's server_base_url when it is set, otherwise the
    default public endpoint.
    """
    configured = instance.server_base_url
    return configured.rstrip('/') if configured else "https://api.asksage.ai/server"


def fetch_api_ingested_files(instance, timeout: float = 30.0) -> pd.DataFrame:
    """Fetch currently ingested files from API.

    Sends a POST to ``<api base url>/get-all-files-ingested`` with the
    instance's access token and turns the list found under the "response"
    key of the JSON reply into a DataFrame.

    Args:
        instance: Tenant instance providing ``token`` and ``server_base_url``.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout a stalled connection would block the caller (including
            the verification retry loops) indefinitely.

    Returns:
        DataFrame built from the returned file list. An empty DataFrame is
        returned both when the API reports no files and when the request
        fails for any reason (the error is printed, not raised), so callers
        cannot tell these two cases apart.
    """
    print(f"üîç Fetching ingested files from API...")
    url = f"{get_api_base_url(instance)}/get-all-files-ingested"
    headers = {
        "accept": "application/json",
        "x-access-tokens": instance.token
    }
    
    try:
        response = requests.post(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        data = response.json()
        files = data.get("response", [])
        
        if files and isinstance(files, list):
            df = pd.DataFrame(files)
            print(f"‚úì Found {len(df)} files in API")
            return df
        else:
            print(f"‚ö†Ô∏è  No files found in API")
            return pd.DataFrame()
    except Exception as e:
        # Deliberately best-effort: report the problem and return "no files".
        print(f"‚ùå Error fetching from API: {e}")
        return pd.DataFrame()


def delete_file_from_dataset(filename: str, instance, timeout: float = 30.0) -> Tuple[bool, str]:
    """Delete a file from the dataset using the API.

    Sends a POST to ``<api base url>/delete-filename-from-dataset`` with the
    dataset name and filename as a JSON body.

    Args:
        filename: Name of the file to remove from the dataset.
        instance: Tenant instance providing ``dataset_name``, ``token`` and
            ``server_base_url``.
        timeout: Seconds to wait for the API before giving up, so a stalled
            connection cannot hang the batch.

    Returns:
        ``(True, message)`` when the API answers with a 2xx status, otherwise
        ``(False, reason)``. No exception is raised for request failures.
    """
    if not instance.dataset_name:
        return False, "No dataset name configured"
    
    url = f"{get_api_base_url(instance)}/delete-filename-from-dataset"
    headers = {
        "accept": "application/json",
        "Content-Type": "application/json",
        "x-access-tokens": instance.token
    }
    data = {
        "dataset": instance.dataset_name,
        "filename": filename
    }
    
    try:
        response = requests.post(url, headers=headers, json=data, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        # e.response can be None in principle, so read the status defensively.
        status_code = getattr(e.response, 'status_code', None)
        if status_code == 404:
            return False, f"File not found in dataset: {filename}"
        return False, f"HTTP error: {str(e)}"
    except Exception as e:
        return False, f"Error deleting file: {str(e)}"
    
    # Success is decided by the status code alone. The body is not parsed
    # here, so a 2xx reply with an empty or non-JSON body is still a success
    # instead of being misreported as a failed deletion.
    if 200 <= response.status_code < 300:
        return True, f"Successfully deleted {filename} from {instance.dataset_name}"
    
    # Non-error but non-2xx reply: use the API's message when one is available.
    fallback = f'Failed to delete {filename}'
    try:
        return False, response.json().get('message', fallback)
    except (ValueError, AttributeError):
        return False, fallback


def upload_file_to_api(file_path: str, instance) -> Tuple[bool, str]:
    """Upload one file to the instance's dataset through the API client.

    Returns ``(True, message)`` when the client call completes and
    ``(False, reason)`` when no dataset is configured or the call raises.
    """
    if not instance.dataset_name:
        return False, "No dataset name configured"

    try:
        outcome = instance.client.train_with_file(
            file_path=str(file_path),
            dataset=instance.dataset_name
        )
        success_message = f"Upload successful to {instance.dataset_name}: {outcome}"
    except Exception as exc:
        # Any client error is reported to the caller as text, not raised.
        return False, str(exc)

    return True, success_message


def verify_file_ingestion(filename: str, instance, max_retries: int = VERIFICATION_RETRIES) -> bool:
    """Check, with retries, that the API lists the given file as ingested.

    The file list is fetched up to max_retries times, waiting
    VERIFICATION_DELAY seconds before every attempt after the first. The
    filename is looked up in the 'filename' column when present, otherwise
    in the 'name' column. Returns True on the first hit, False otherwise.
    """
    for attempt_no in range(max_retries):
        if attempt_no > 0:
            print(f"   Retry {attempt_no}/{max_retries - 1} for {filename}...")
            time.sleep(VERIFICATION_DELAY)

        ingested = fetch_api_ingested_files(instance)
        if ingested.empty:
            continue

        # 'filename' takes precedence; 'name' is only used when it is absent.
        lookup_column = next(
            (col for col in ('filename', 'name') if col in ingested.columns),
            None,
        )
        if lookup_column is not None and filename in ingested[lookup_column].values:
            return True

    return False


def verify_file_deletion(filename: str, instance, max_retries: int = VERIFICATION_RETRIES) -> bool:
    """Check, with retries, that the API no longer lists the given file.

    The file list is fetched up to max_retries times, waiting
    VERIFICATION_DELAY seconds before every attempt after the first. An
    empty list counts as deleted. Otherwise the filename is looked up in the
    'filename' column when present, else in the 'name' column. Returns True
    as soon as the file is absent, False otherwise.
    """
    for attempt_no in range(max_retries):
        if attempt_no > 0:
            print(f"   Retry {attempt_no}/{max_retries - 1} verifying deletion of {filename}...")
            time.sleep(VERIFICATION_DELAY)

        remaining = fetch_api_ingested_files(instance)
        if remaining.empty:
            return True

        # 'filename' takes precedence; 'name' is only used when it is absent.
        lookup_column = next(
            (col for col in ('filename', 'name') if col in remaining.columns),
            None,
        )
        if lookup_column is not None and filename not in remaining[lookup_column].values:
            return True

    return False

# ============================================================================
# ANALYSIS FUNCTIONS
# ============================================================================

def analyze_files(local_df: pd.DataFrame, log_df: pd.DataFrame, api_df: pd.DataFrame) -> dict:
    """Analyze files and categorize them for upload decisions.

    Each local file is compared with its ingestion-log entry (matched on
    'relative_path'):

    - no log entry                         -> 'new_files'
    - hash differs from the logged hash    -> 'modified_files' (with
      'last_ingestion_date', 'previous_hash' and 'previous_status' added)
    - same hash and status 'ingested'      -> 'unchanged_files' (with
      'last_ingestion_date' added)
    - same hash and any other status       -> 'new_files' (to be retried)

    Log entries with status 'ingested' whose path is no longer present
    locally are returned under 'deleted_files', including their
    'relative_path'.

    Args:
        local_df: Files found on disk; may be empty and column-less.
        log_df: Ingestion log; may be empty. When a path occurs more than
            once, the last row for that path is used.
        api_df: Files reported by the API. Accepted for interface
            compatibility; it is not used in the classification.

    Returns:
        Dict with the keys 'new_files', 'modified_files', 'unchanged_files'
        and 'deleted_files', each a list of dicts.
    """
    
    results = {
        'new_files': [],
        'modified_files': [],
        'unchanged_files': [],
        'deleted_files': []
    }
    
    if log_df.empty or 'relative_path' not in log_df.columns:
        log_dict = {}
    else:
        # to_dict('index') requires unique index values, so collapse repeated
        # paths to their most recent row first.
        latest_rows = log_df.drop_duplicates(subset='relative_path', keep='last')
        log_dict = latest_rows.set_index('relative_path').to_dict('index')
    
    # An empty scan result has no columns at all; treat it as "no local files".
    has_local_paths = 'relative_path' in local_df.columns
    
    if has_local_paths:
        for _, row in local_df.iterrows():
            rel_path = row['relative_path']
            log_entry = log_dict.get(rel_path)
            
            if log_entry is None:
                results['new_files'].append(row.to_dict())
                continue
            
            status = log_entry.get('status')
            
            if row['file_hash'] != log_entry.get('file_hash'):
                results['modified_files'].append({
                    **row.to_dict(),
                    'last_ingestion_date': log_entry.get('ingestion_date'),
                    'previous_hash': log_entry.get('file_hash'),
                    'previous_status': status
                })
            elif status == 'ingested':
                results['unchanged_files'].append({
                    **row.to_dict(),
                    'last_ingestion_date': log_entry.get('ingestion_date')
                })
            else:
                # Same content but never successfully ingested: try again.
                results['new_files'].append(row.to_dict())
    
    local_paths = set(local_df['relative_path']) if has_local_paths else set()
    for rel_path, log_entry in log_dict.items():
        if rel_path not in local_paths and log_entry.get('status') == 'ingested':
            # The path is the dict key, not part of log_entry; add it back so
            # the caller can tell which file the entry refers to.
            results['deleted_files'].append({'relative_path': rel_path, **log_entry})
    
    return results


def update_ingestion_log(log_df: pd.DataFrame, files_to_mark: list, status: str = 'ingested') -> pd.DataFrame:
    """Return a copy of the ingestion log with the given files recorded.

    For a file whose 'relative_path' is already in the log, the first
    matching row gets the new ingestion date, status, hash, modified date,
    size and file type. Files not yet in the log are appended as new rows.
    The frame passed in is never modified; use the returned frame.

    Args:
        log_df: Current ingestion log.
        files_to_mark: Dicts with 'relative_path', 'filename', 'file_hash',
            'file_modified_date', 'size_kb' and optionally 'file_type'.
        status: Status value to store for every file in files_to_mark.

    Returns:
        Updated ingestion log. The index is renumbered when rows are appended.
    """
    current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    
    # Work on a copy so updates and appends behave the same way: neither
    # touches the caller's frame.
    updated = log_df.copy()
    
    # Map each known path to the index label of its first row.
    known_rows = {}
    if 'relative_path' in updated.columns:
        for row_label, path in updated['relative_path'].items():
            known_rows.setdefault(path, row_label)
    
    # New rows are collected here (keyed by path) and appended in one concat
    # instead of one concat per file.
    pending_rows = {}
    
    for file_info in files_to_mark:
        rel_path = file_info['relative_path']
        tracked_fields = {
            'file_modified_date': file_info['file_modified_date'],
            'file_hash': file_info['file_hash'],
            'size_kb': file_info['size_kb'],
            'file_type': file_info.get('file_type', ''),
            'ingestion_date': current_time,
            'status': status,
        }
        
        if rel_path in known_rows:
            row_label = known_rows[rel_path]
            for column, value in tracked_fields.items():
                updated.at[row_label, column] = value
        elif rel_path in pending_rows:
            # Same new file listed twice: the later entry refreshes the row.
            pending_rows[rel_path].update(tracked_fields)
        else:
            pending_rows[rel_path] = {
                'relative_path': rel_path,
                'filename': file_info['filename'],
                **tracked_fields,
            }
    
    if pending_rows:
        additions = pd.DataFrame(list(pending_rows.values()))
        if updated.empty:
            # Avoid concatenating with an empty frame; keep any existing
            # column order and add columns the log did not have yet.
            ordered_columns = list(updated.columns) + [
                col for col in additions.columns if col not in updated.columns
            ]
            updated = additions.reindex(columns=ordered_columns)
        else:
            updated = pd.concat([updated, additions], ignore_index=True)
    
    return updated

# ============================================================================
# BATCH OPERATIONS
# ============================================================================

def batch_delete_files(files_to_delete: List[Dict], instance, verify: bool = True) -> Dict:
    """Delete each listed file from the instance's dataset, one at a time.

    Every file dict is annotated in place with 'delete_time' plus either
    'delete_response' (success) or 'error' (failure). When verify is true,
    each successful deletion is checked against the API. The function waits
    DELETE_DELAY seconds between files.

    Returns:
        Dict with the lists 'successful', 'failed', 'verified', 'unverified'.
    """
    results = {key: [] for key in ('successful', 'failed', 'verified', 'unverified')}

    total = len(files_to_delete)
    print(f"\nüóëÔ∏è  Starting deletion of {total} modified files from dataset...")
    print(f"   Dataset: {instance.dataset_name}")
    print("=" * 80)

    for position, file_info in enumerate(files_to_delete, start=1):
        filename = file_info['filename']
        file_type = file_info.get('file_type', 'unknown')
        more_to_come = position < total

        print(f"\n[{position}/{total}] Deleting: {filename}")
        print(f"   Path: {file_info['relative_path']}")
        print(f"   Type: {file_type}")
        print(f"   Dataset: {instance.dataset_name}")
        print("   Reason: File was modified (hash changed)")

        started_at = datetime.now()
        deleted, detail = delete_file_from_dataset(filename, instance)

        if not deleted:
            print(f"   ‚ö†Ô∏è  Deletion failed: {detail}")
            file_info['delete_time'] = started_at.strftime('%Y-%m-%d %H:%M:%S')
            file_info['error'] = detail
            results['failed'].append(file_info)

            # Failures pause too, just without the progress message.
            if more_to_come:
                time.sleep(DELETE_DELAY)
            continue

        print("   ‚úì Deletion successful")
        file_info['delete_time'] = started_at.strftime('%Y-%m-%d %H:%M:%S')
        file_info['delete_response'] = detail
        results['successful'].append(file_info)

        if verify:
            print("   üîç Verifying deletion...")
            if verify_file_deletion(filename, instance):
                print("   ‚úì Verified deletion from API")
                results['verified'].append(file_info)
            else:
                print("   ‚ö†Ô∏è  Could not verify deletion")
                results['unverified'].append(file_info)

        if more_to_come:
            print(f"   ‚è≥ Waiting {DELETE_DELAY} seconds...")
            time.sleep(DELETE_DELAY)

    return results


def batch_upload_files(files_to_upload: List[Dict], instance, 
                       verify: bool = True, limit: Optional[int] = None, 
                       is_reupload: bool = False) -> Dict:
    """Upload each listed file to the instance's dataset, one at a time.

    When limit is a positive number smaller than the number of files, only
    the first limit files are processed and the rest are returned under
    'skipped'. Every processed file dict is annotated in place with
    'upload_time' plus either 'upload_response' (success) or 'error'
    (failure). When verify is true, each successful upload is checked
    against the API. The function waits UPLOAD_DELAY seconds between files.

    Returns:
        Dict with the lists 'successful', 'failed', 'verified', 'unverified'
        and 'skipped'.
    """
    results = {
        key: [] for key in ('successful', 'failed', 'verified', 'unverified', 'skipped')
    }

    limit_applies = limit is not None and limit > 0 and len(files_to_upload) > limit
    if limit_applies:
        results['skipped'] = files_to_upload[limit:]
        files_to_upload = files_to_upload[:limit]
        print(f"\n‚ö†Ô∏è  UPLOAD LIMIT: Processing only {limit} files (skipping {len(results['skipped'])})")

    total = len(files_to_upload)
    if is_reupload:
        upload_type = "Re-uploading"
    else:
        upload_type = "Uploading"
    print(f"\nüì§ Starting {upload_type.lower()} of {total} files...")
    print(f"   Dataset: {instance.dataset_name}")
    print("=" * 80)

    for position, file_info in enumerate(files_to_upload, start=1):
        file_path = file_info['full_path']
        filename = file_info['filename']
        file_type = file_info.get('file_type', 'unknown')
        more_to_come = position < total

        print(f"\n[{position}/{total}] {upload_type}: {filename}")
        print(f"   Path: {file_info['relative_path']}")
        print(f"   Type: {file_type}")
        print(f"   Dataset: {instance.dataset_name}")
        print(f"   Size: {file_info['size_kb']} KB")

        started_at = datetime.now()
        uploaded, detail = upload_file_to_api(file_path, instance)

        if not uploaded:
            print(f"   ‚ùå Upload failed: {detail}")
            file_info['upload_time'] = started_at.strftime('%Y-%m-%d %H:%M:%S')
            file_info['error'] = detail
            results['failed'].append(file_info)

            # Failures pause too, just without the progress message.
            if more_to_come:
                time.sleep(UPLOAD_DELAY)
            continue

        print("   ‚úì Upload successful")
        file_info['upload_time'] = started_at.strftime('%Y-%m-%d %H:%M:%S')
        file_info['upload_response'] = detail
        results['successful'].append(file_info)

        if verify:
            print("   üîç Verifying ingestion...")
            if verify_file_ingestion(filename, instance):
                print("   ‚úì Verified in API")
                results['verified'].append(file_info)
            else:
                print("   ‚ö†Ô∏è  Could not verify in API")
                results['unverified'].append(file_info)

        if more_to_come:
            print(f"   ‚è≥ Waiting {UPLOAD_DELAY} seconds...")
            time.sleep(UPLOAD_DELAY)

    return results

# ============================================================================
# MAIN EXECUTION
# ============================================================================

def main(tenant_instance, upload_limit=UPLOAD_LIMIT):
    """
    Main execution function for document ingestion

    Scans the local documents, compares them with the ingestion log, uploads
    new files, replaces modified files (delete, then re-upload) and records
    the outcome in the ingestion log. Temporary files are cleaned up on
    every exit path.

    New files are uploaded first. Modified files are only deleted from the
    dataset once it is known that the upload limit leaves room to re-upload
    them, so a file is never removed without a re-upload being attempted.
    Modified files that do not fit within the limit are left untouched and
    are picked up again on the next run.

    Args:
        tenant_instance: TenantInstance object from Code Block 1
        upload_limit: Number of files to upload or 'ALL'

    Returns:
        (None, None) when no tenant instance is given,
        (tenant_instance, None) when the tenant has no dataset name,
        otherwise (tenant_instance, ingestion log DataFrame).
    """
    
    try:
        print("=" * 80)
        print("üìö DOCUMENT INGESTION SYSTEM")
        print("   Supported formats: Word (.docx, .doc), PDF (.pdf), PowerPoint (.pptx, .ppt)")
        print("=" * 80)
        
        if not tenant_instance:
            print("\n‚ùå No tenant instance provided!")
            print("   Please run Code Block 1 first to initialize the tenant.")
            return None, None
        
        if not tenant_instance.dataset_name:
            print(f"\n‚ö†Ô∏è  WARNING: No dataset found. Skipping ingestion.")
            return tenant_instance, None
        
        print(f"\nüìä Dataset: {tenant_instance.dataset_name}")
        
        log_df = load_ingestion_log()
        
        print("\nüìÇ Scanning local files...")
        local_df = get_document_files()
        
        if local_df.empty:
            print("\n‚ùå No supported document files found!")
            print(f"   Supported types: {', '.join(SUPPORTED_EXTENSIONS)}")
            return tenant_instance, log_df
        
        limit = None
        if upload_limit != 'ALL':
            try:
                limit = int(upload_limit)
                print(f"\nüîß Upload limit set to: {limit} files")
            except (TypeError, ValueError):
                # TypeError covers values such as None that int() rejects.
                print(f"\n‚ö†Ô∏è  Invalid upload limit '{upload_limit}', using ALL")
        else:
            print(f"\nüîß Upload limit set to: ALL files")
        
        api_df = fetch_api_ingested_files(tenant_instance)
        analysis = analyze_files(local_df, log_df, api_df)
        
        print(f"\nüìä ANALYSIS RESULTS")
        print("=" * 80)
        
        summary_df = pd.DataFrame({
            'Category': [
                'üÜï New Files',
                'üìù Modified Files',
                '‚úì Unchanged Files',
                'üóëÔ∏è  Deleted Files'
            ],
            'Count': [
                len(analysis['new_files']),
                len(analysis['modified_files']),
                len(analysis['unchanged_files']),
                len(analysis['deleted_files'])
            ]
        })
        display(summary_df)
        
        files_to_upload = analysis['new_files']
        modified_files = analysis['modified_files']
        
        if not files_to_upload and not modified_files:
            print(f"\n‚úì All files are up to date!")
            return tenant_instance, log_df
        
        # Both result sets always exist, whichever branches run below.
        upload_results_new = {'successful': [], 'failed': [], 'verified': [], 'unverified': [], 'skipped': []}
        upload_results_modified = {'successful': [], 'failed': [], 'verified': [], 'unverified': [], 'skipped': []}
        
        # Upload new files
        if files_to_upload:
            upload_results_new = batch_upload_files(files_to_upload, tenant_instance, verify=True, 
                                                   limit=limit, is_reupload=False)
        
        # Apply what is left of the limit BEFORE deleting anything, so that
        # only files that will actually be re-uploaded are removed.
        if modified_files and limit is not None:
            remaining_limit = limit - len(upload_results_new['successful'])
            if remaining_limit <= 0:
                print(f"\n‚ö†Ô∏è  Upload limit reached")
                modified_files = []
            elif len(modified_files) > remaining_limit:
                deferred = len(modified_files) - remaining_limit
                print(f"\n‚ö†Ô∏è  UPLOAD LIMIT: Replacing only {remaining_limit} modified files (deferring {deferred})")
                modified_files = modified_files[:remaining_limit]
        
        # Handle modified files (delete then re-upload)
        if modified_files:
            delete_results = batch_delete_files(modified_files, tenant_instance, verify=True)
            
            print(f"\nüìä DELETION RESULTS")
            print("=" * 80)
            print(f"‚úì Successfully Deleted: {len(delete_results['successful'])}")
            print(f"‚úì Verified Deletion: {len(delete_results['verified'])}")
            print(f"‚ùå Failed to Delete: {len(delete_results['failed'])}")
            
            # Only re-upload files that were successfully deleted
            if delete_results['failed']:
                failed_filenames = {f['filename'] for f in delete_results['failed']}
                modified_files = [f for f in modified_files if f['filename'] not in failed_filenames]
            
            # The list was already trimmed to the limit above.
            if modified_files:
                upload_results_modified = batch_upload_files(modified_files, tenant_instance, verify=True,
                                                            limit=None, is_reupload=True)
        
        print(f"\nüìä FINAL UPLOAD RESULTS")
        print("=" * 80)
        
        total_successful = len(upload_results_new['successful']) + len(upload_results_modified['successful'])
        total_verified = len(upload_results_new['verified']) + len(upload_results_modified['verified'])
        total_failed = len(upload_results_new['failed']) + len(upload_results_modified['failed'])
        
        print(f"‚úì Total Uploaded: {total_successful}")
        print(f"‚úì Total Verified: {total_verified}")
        print(f"‚ùå Total Failed: {total_failed}")
        
        # Update log with verified uploads
        all_verified = upload_results_new['verified'] + upload_results_modified['verified']
        if all_verified:
            log_df = update_ingestion_log(log_df, all_verified, status='ingested')
        
        # Update log with unverified uploads
        all_unverified = upload_results_new['unverified'] + upload_results_modified['unverified']
        if all_unverified:
            log_df = update_ingestion_log(log_df, all_unverified, status='pending_verification')
        
        # Update log with failed uploads
        all_failed = upload_results_new['failed'] + upload_results_modified['failed']
        if all_failed:
            log_df = update_ingestion_log(log_df, all_failed, status='failed')
        
        save_ingestion_log(log_df)
        
        print("\n" + "=" * 80)
        print("üìú FINAL INGESTION LOG SUMMARY")
        print("=" * 80)
        print(f"Total tracked files: {len(log_df)}")
        print(f"Log file location: {INGESTION_LOG_PATH}")
        
        if 'status' in log_df.columns:
            print(f"\nStatus Summary:")
            status_counts = log_df['status'].value_counts()
            for status, count in status_counts.items():
                if pd.notna(status):
                    print(f"  {status}: {count}")
        
        print(f"\nSample of ingestion log:")
        display(log_df.head(10))
        
        return tenant_instance, log_df
    
    finally:
        # Runs on every exit path, including early returns and errors.
        print("\n" + "=" * 80)
        print("üßπ CLEANUP")
        print("=" * 80)
        cleanup_temp_files()
        print(f"\n‚úì Kept permanent file: {INGESTION_LOG_PATH}")

# ============================================================================
# EXECUTE
# ============================================================================

if __name__ == "__main__":
    # Check for the tenant instance explicitly. Catching NameError around the
    # whole main() call would also swallow a NameError raised inside main()
    # and misreport it as a missing tenant instance.
    if 'tenant_instance' not in globals():
        print("‚ùå ERROR: tenant_instance not found!")
        print("   Please run Code Block 1 first to initialize the tenant instance.")
    else:
        try:
            
            # To run with a limit instead:
            # tenant_instance, log_df = main(tenant_instance, upload_limit=5)
            
            # To run for all files:
            tenant_instance, log_df = main(tenant_instance, upload_limit='ALL')
            
        except Exception as e:
            logger.error(f"Fatal error in main execution: {str(e)}", exc_info=True)
            raise

üìö DOCUMENT INGESTION SYSTEM
   Supported formats: Word (.docx, .doc), PDF (.pdf), PowerPoint (.pptx, .ppt)

üìä Dataset: user_custom_34125_example-testing-dataset-script_content
‚úì Loaded ingestion log with 5 records

üìÇ Scanning local files...
‚úì Found 5 document files
   File type breakdown:
   - .docx: 3 files
   - .pdf: 2 files

üîß Upload limit set to: ALL files
üîç Fetching ingested files from API...
‚úì Found 83 files in API

üìä ANALYSIS RESULTS


Unnamed: 0,Category,Count
0,üÜï New Files,0
1,üìù Modified Files,0
2,‚úì Unchanged Files,5
3,üóëÔ∏è Deleted Files,0



‚úì All files are up to date!

üßπ CLEANUP

‚úì Kept permanent file: ingestion_log.csv
