In [6]:
"""
Test script for fetch_data.py module
Tests the fetch_pdfs_from_s3 function
"""
import os
import sys
from pathlib import Path

# Add src directory to path to import fetch_data
sys.path.insert(0, str(Path.cwd().parent))

from src.knowledge.fetch_data import fetch_pdfs_from_s3

print("=" * 60)
print("Testing fetch_data.py - fetch_pdfs_from_s3 function")
print("=" * 60)


Testing fetch_data.py - fetch_pdfs_from_s3 function


In [7]:
# Check AWS credentials
print("\n1. Checking AWS Credentials...")
print("-" * 60)

aws_access_key = os.getenv("AWS_ACCESS_KEY_ID")
aws_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
aws_region = os.getenv("AWS_DEFAULT_REGION", "us-east-1")

if aws_access_key and aws_secret_key:
    print(f"✓ AWS_ACCESS_KEY_ID: {'*' * 20}...{aws_access_key[-4:]}")
    print(f"✓ AWS_SECRET_ACCESS_KEY: {'*' * 20}...{aws_secret_key[-4:]}")
    print(f"✓ AWS_DEFAULT_REGION: {aws_region}")
    print("\n✓ AWS credentials found in environment variables")
else:
    print("✗ AWS credentials not found in environment variables")
    print("\nPlease set the following environment variables:")
    print("  - AWS_ACCESS_KEY_ID")
    print("  - AWS_SECRET_ACCESS_KEY")
    print("  - AWS_DEFAULT_REGION (optional, defaults to us-east-1)")



1. Checking AWS Credentials...
------------------------------------------------------------
✓ AWS_ACCESS_KEY_ID: ********************...YEXC
✓ AWS_SECRET_ACCESS_KEY: ********************...GUop
✓ AWS_DEFAULT_REGION: ap-northeast-3

✓ AWS credentials found in environment variables


In [9]:
# Test 1: Test with default parameters
print("\n2. Test 1: Testing with default parameters")
print("-" * 60)

try:
    # Use a local test directory instead of /opt/airflow/data for testing
    test_data_dir = Path.cwd().parent / "data" / "test_downloads"
    test_data_dir.mkdir(parents=True, exist_ok=True)
    
    print(f"Test data directory: {test_data_dir}")
    print("\nCalling fetch_pdfs_from_s3() with default parameters...")
    print("(bucket_name='knowledge-assistant-bucket', s3_prefix='raw-pdf-data/')")
    
    result_path = fetch_pdfs_from_s3(
        bucket_name="knowledge-assistant-project",
        s3_prefix="raw-pdf-data/",
        data_dir=str(test_data_dir)
    )
    
    print(f"\n✓ Function returned path: {result_path}")
    
    # Verify the directory exists
    if Path(result_path).exists():
        print(f"✓ Directory exists: {result_path}")
        
        # List downloaded files
        pdf_files = list(Path(result_path).glob("*.pdf"))
        print(f"\n✓ Found {len(pdf_files)} PDF file(s) in directory:")
        for pdf_file in pdf_files:
            file_size = pdf_file.stat().st_size / 1024  # Size in KB
            print(f"  - {pdf_file.name} ({file_size:.2f} KB)")
    else:
        print(f"✗ Directory does not exist: {result_path}")
        
except ValueError as e:
    print(f"\n✗ ValueError: {e}")
except Exception as e:
    print(f"\n✗ Error: {type(e).__name__}: {e}")



2. Test 1: Testing with default parameters
------------------------------------------------------------
Test data directory: /Users/user/Desktop/pdf-knowledge-assistant/data/test_downloads

Calling fetch_pdfs_from_s3() with default parameters...
(bucket_name='knowledge-assistant-bucket', s3_prefix='raw-pdf-data/')
Connecting to S3 bucket: knowledge-assistant-project
Fetching PDFs from path: raw-pdf-data/

Found 1 PDF file(s) in S3:
--------------------------------------------------
Downloading: 1.pdf
  ✓ Successfully downloaded: 1.pdf
--------------------------------------------------

Downloaded 1 PDF file(s):
  - 1.pdf

All PDFs saved to: /Users/user/Desktop/pdf-knowledge-assistant/data/test_downloads

✓ Function returned path: /Users/user/Desktop/pdf-knowledge-assistant/data/test_downloads
✓ Directory exists: /Users/user/Desktop/pdf-knowledge-assistant/data/test_downloads

✓ Found 1 PDF file(s) in directory:
  - 1.pdf (833.24 KB)


In [7]:
# Test 2: Test with custom parameters
print("\n3. Test 2: Testing with custom parameters")
print("-" * 60)

try:
    # Custom test directory
    custom_data_dir = Path.cwd().parent / "data" / "custom_test"
    custom_data_dir.mkdir(parents=True, exist_ok=True)
    
    # You can modify these parameters based on your S3 setup
    custom_bucket = "knowledge-assistant-bucket"  # Change if needed
    custom_prefix = "raw-pdf-data/"  # Change if needed
    
    print(f"Custom data directory: {custom_data_dir}")
    print(f"Custom bucket: {custom_bucket}")
    print(f"Custom prefix: {custom_prefix}")
    print("\nCalling fetch_pdfs_from_s3() with custom parameters...")
    
    result_path = fetch_pdfs_from_s3(
        bucket_name=custom_bucket,
        s3_prefix=custom_prefix,
        data_dir=str(custom_data_dir)
    )
    
    print(f"\n✓ Function returned path: {result_path}")
    
except ValueError as e:
    print(f"\n✗ ValueError: {e}")
except Exception as e:
    print(f"\n✗ Error: {type(e).__name__}: {e}")



3. Test 2: Testing with custom parameters
------------------------------------------------------------
Custom data directory: /Users/user/Desktop/pdf-knowledge-assistant/data/custom_test
Custom bucket: knowledge-assistant-bucket
Custom prefix: raw-pdf-data/

Calling fetch_pdfs_from_s3() with custom parameters...

✗ ValueError: AWS credentials not found in environment variables


In [None]:
# Test 3: Test error handling - Missing credentials
print("\n4. Test 3: Testing error handling (Missing AWS credentials)")
print("-" * 60)

# Temporarily remove AWS credentials to test error handling
original_access_key = os.environ.pop("AWS_ACCESS_KEY_ID", None)
original_secret_key = os.environ.pop("AWS_SECRET_ACCESS_KEY", None)

try:
    result_path = fetch_pdfs_from_s3()
    print("✗ Expected ValueError but function succeeded")
except ValueError as e:
    print(f"✓ Correctly raised ValueError: {e}")
except Exception as e:
    print(f"✗ Unexpected error: {type(e).__name__}: {e}")
finally:
    # Restore credentials
    if original_access_key:
        os.environ["AWS_ACCESS_KEY_ID"] = original_access_key
    if original_secret_key:
        os.environ["AWS_SECRET_ACCESS_KEY"] = original_secret_key
    print("\n✓ Restored AWS credentials")


In [None]:
# Test 4: Test error handling - Invalid bucket
print("\n5. Test 4: Testing error handling (Invalid bucket name)")
print("-" * 60)

# Make sure credentials are restored
if not os.getenv("AWS_ACCESS_KEY_ID") or not os.getenv("AWS_SECRET_ACCESS_KEY"):
    print("⚠ Skipping test - AWS credentials not available")
else:
    try:
        test_data_dir = Path.cwd().parent / "data" / "test_invalid"
        test_data_dir.mkdir(parents=True, exist_ok=True)
        
        # Use a non-existent bucket name
        result_path = fetch_pdfs_from_s3(
            bucket_name="non-existent-bucket-12345",
            s3_prefix="raw-pdf-data/",
            data_dir=str(test_data_dir)
        )
        print("✗ Expected ValueError but function succeeded")
    except ValueError as e:
        print(f"✓ Correctly raised ValueError: {e}")
    except Exception as e:
        print(f"ℹ Error raised: {type(e).__name__}: {e}")
        print("  (This is expected for non-existent buckets)")


In [None]:
# Summary
print("\n" + "=" * 60)
print("Test Summary")
print("=" * 60)
print("\nTests completed! Review the output above for results.")
print("\nNote: Some tests may fail if:")
print("  - AWS credentials are not configured")
print("  - S3 bucket doesn't exist or is not accessible")
print("  - No PDF files are present in the specified S3 path")
print("\nTo run these tests successfully, ensure:")
print("  1. AWS credentials are set in environment variables")
print("  2. S3 bucket exists and is accessible")
print("  3. PDF files are present in the specified S3 prefix")
