# Notebook 00: Setup and Configuration

**Purpose**: Environment setup, code packaging, and configuration validation

**Key Tasks**:
1. Validate environment (Python, S3 access)
2. Load and validate training configuration
3. Package `query_predictor` for Spark distribution
4. Upload to S3 with versioning
5. Generate the Spark configuration snippet for notebooks 01-04

**Prerequisites**:
- Python 3.8+
- AWS credentials configured
- S3 bucket access

**Duration**: ~5 minutes

## 1. Import Dependencies and Validate Environment

In [1]:
import sys
import os
import boto3
import yaml
import shutil
import zipfile
from datetime import datetime
from pathlib import Path

# Report which interpreter is running this notebook.
print(f"Python version: {sys.version}")
print(f"Python executable: {sys.executable}")

# Validate Python version.
# An explicit raise is used instead of `assert`: asserts are removed when
# Python runs with -O, which would let the "validated" message print
# without any check having happened.
if sys.version_info < (3, 8):
    raise RuntimeError("Python 3.8+ required")
print("✅ Python version validated")

# Validate S3 access by issuing a HEAD request against the bucket.
# The `s3` client created here is reused by the later cells.
bucket = 'uip-datalake-bucket-prod'
try:
    s3 = boto3.client('s3')
    s3.head_bucket(Bucket=bucket)
except Exception as e:
    # Surface a readable message, then re-raise so the notebook run stops here.
    print(f"❌ S3 access failed: {e}")
    raise
else:
    print(f"✅ S3 access validated: {bucket}")

Python version: 3.11.11 (main, Jun 20 2025, 00:00:00) [GCC 11.5.0 20240719 (Red Hat 11.5.0-5)]
Python executable: /usr/local/bin/python
✅ Python version validated
✅ S3 access validated: uip-datalake-bucket-prod


## 2. Load and Validate Configuration

In [2]:
# Method 1: Load from local file (PRIMARY - for initial setup)
config_path = '../config/training_config.yaml'
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)

print("✅ Configuration loaded from local file")

# Method 2: Download from S3 (ALTERNATIVE - for reproducibility)
# Uncomment to use config from previous run:
# import boto3
# s3_client = boto3.client('s3')
# s3_bucket = 'uip-datalake-bucket-prod'
# s3_key = 'sf_trino/trino_query_predictor/config/training_config_latest.yaml'
# config_path = '/tmp/training_config.yaml'
# s3_client.download_file(s3_bucket, s3_key, config_path)
# with open(config_path) as f:
#     config = yaml.safe_load(f)
# print("✅ Configuration loaded from S3")

# Put the repository root (parent of the working directory) on sys.path so
# the query_predictor package can be imported, then run the validator.
import sys
sys.path.insert(0, str(Path.cwd().parent))
from query_predictor.training.config_validator import ConfigValidator

validator = ConfigValidator()
errors = validator.validate(config)

# A non-empty result is treated as a list of problems: show them all, then abort.
if errors:
    print("\n❌ Configuration validation failed:")
    print("\n".join(f"  - {problem}" for problem in errors))
    raise ValueError(f"Invalid configuration: {len(errors)} errors found")

print("✅ Configuration validated")

# OPTIONAL: Override config parameters after loading
# Example: Change date range for quick testing
# config['data_loading']['start_date'] = '2025-09-01'
# config['data_loading']['end_date'] = '2025-09-15'
# Example: Change sampling ratio
# config['boundary_sampling']['balance_ratio'] = 3.0
# Example: Disable analysis for faster execution
# config['analysis']['enabled'] = False

# Summarise the settings that matter most for the training run
print("\n📋 Training Configuration:")

data_cfg = config['data_loading']
print(f"  Date range: {data_cfg['start_date']} to {data_cfg['end_date']}")

feature_cfg = config['features']
feature_summary = (
    f"{feature_cfg['base_feature_count']} base + "
    f"{feature_cfg['historical_feature_count']} historical + "
    f"{feature_cfg['tfidf_vocab_size']} TF-IDF = "
    f"{feature_cfg['total_features']} total"
)
print(f"  Features: {feature_summary}")

balance_ratio = config['boundary_sampling']['balance_ratio']
print(f"  Balance ratio: {balance_ratio}:1 (Small:Heavy)")

algorithm_name = config['model']['algorithm'].upper()
print(f"  Model: {algorithm_name}")

target_recall = config['prd_requirements']['target_heavy_recall']
print(f"  Target recall: ≥{target_recall}")

✅ Configuration loaded from local file
✅ Configuration validated

📋 Training Configuration:
  Date range: 2025-08-01 to 2025-10-01
  Features: 78 base + 17 historical + 250 TF-IDF = 345 total
  Balance ratio: 5.0:1 (Small:Heavy)
  Model: XGBOOST
  Target recall: ≥0.98


## 3. Package query_predictor for Spark Distribution

In [3]:
# Resolve the locations used for packaging: the package source lives next to
# the notebooks directory; staging and the archive go under /tmp.
repo_root = Path.cwd().parent
src_dir = repo_root / 'query_predictor'
temp_dir = Path('/tmp/query_predictor_package')
zip_path = Path('/tmp/query_predictor.zip')

print(f"Repository root: {repo_root}")
print(f"Source directory: {src_dir}")

# Start from an empty staging directory so files from an earlier run
# cannot end up in the archive.
if temp_dir.exists():
    shutil.rmtree(temp_dir)
temp_dir.mkdir(parents=True)

# Stage a copy of the package without bytecode caches or Finder metadata
print("\n📦 Packaging query_predictor...")
dest_dir = temp_dir / 'query_predictor'
skip_patterns = shutil.ignore_patterns('__pycache__', '*.pyc', '*.pyo', '.DS_Store')
shutil.copytree(src_dir, dest_dir, ignore=skip_patterns)

# Replace any archive left behind by a previous run
if zip_path.exists():
    zip_path.unlink()

with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as archive:
    for current_dir, subdirs, filenames in os.walk(temp_dir):
        # Prune in place so os.walk never descends into __pycache__
        subdirs[:] = [name for name in subdirs if name != '__pycache__']
        for filename in filenames:
            if filename.endswith(('.pyc', '.pyo')):
                continue
            member_path = os.path.join(current_dir, filename)
            # Archive names are relative to the staging dir, so the zip's
            # top-level entry is the query_predictor/ package itself.
            archive.write(member_path, os.path.relpath(member_path, temp_dir))

zip_size_mb = zip_path.stat().st_size / 1024 / 1024
print(f"✅ Package created: {zip_path} ({zip_size_mb:.2f} MB)")

Repository root: /home/pmannem/trino-query-predictor
Source directory: /home/pmannem/trino-query-predictor/query_predictor

📦 Packaging query_predictor...
✅ Package created: /tmp/query_predictor.zip (0.08 MB)


## 4. Upload to S3 with Versioning

In [4]:
# The upload time doubles as the package version
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
code_prefix = f"{config['s3']['prefix']}/code"
s3_key = f"{code_prefix}/query_predictor_{timestamp}.zip"

print(f"\n☁️  Uploading to S3...")
target_bucket = config['s3']['bucket']
print(f"  Bucket: {target_bucket}")
print(f"  Key: {s3_key}")

# Publish the immutable, timestamped copy first
s3.upload_file(str(zip_path), target_bucket, s3_key)
print(f"✅ Uploaded versioned package: s3://{target_bucket}/{s3_key}")

# Then copy it within the bucket onto the fixed "latest" key
latest_key = f"{code_prefix}/query_predictor_latest.zip"
copy_source = {'Bucket': target_bucket, 'Key': s3_key}
s3.copy_object(Bucket=target_bucket, CopySource=copy_source, Key=latest_key)
print(f"✅ Updated latest package: s3://{target_bucket}/{latest_key}")

# Later cells reference the package through the "latest" location
package_s3_path = f"s3://{target_bucket}/{latest_key}"
print(f"\n📍 Package S3 path: {package_s3_path}")


☁️  Uploading to S3...
  Bucket: uip-datalake-bucket-prod
  Key: sf_trino/trino_query_predictor/code/query_predictor_20251016_215152.zip
✅ Uploaded versioned package: s3://uip-datalake-bucket-prod/sf_trino/trino_query_predictor/code/query_predictor_20251016_215152.zip
✅ Updated latest package: s3://uip-datalake-bucket-prod/sf_trino/trino_query_predictor/code/query_predictor_latest.zip

📍 Package S3 path: s3://uip-datalake-bucket-prod/sf_trino/trino_query_predictor/code/query_predictor_latest.zip


## 5. Generate Spark Configuration Snippet

In [5]:
import json

# Build the session settings as a real dict and let json.dumps serialise it.
# Hand-assembling JSON with an f-string does no escaping, so a value containing
# a quote, or a missing (None) number, would silently produce invalid JSON.
# int()/str() keep the emitted types the same as before (cores as JSON numbers,
# everything else as strings) and fail loudly on an unusable value.
spark_settings = config['spark']
session_config = {
    "pyFiles": [
        package_s3_path,
        "s3://uipds-108043591022/dataintelligence-dev/di-airflow-prod/dags/common/utils/ParseArgs.py",
    ],
    "driverMemory": str(spark_settings['driver_memory']),
    "driverCores": int(spark_settings['driver_cores']),
    "executorMemory": str(spark_settings['executor_memory']),
    "executorCores": int(spark_settings['executor_cores']),
    "conf": {
        "spark.driver.maxResultSize": "8G",
        "spark.dynamicAllocation.enabled": "true",
        "spark.dynamicAllocation.minExecutors": str(spark_settings['min_executors']),
        "spark.dynamicAllocation.maxExecutors": str(spark_settings['max_executors']),
    },
}

# indent=4 reproduces the layout of the previous hand-written template;
# the magic line must come first in the cell it is pasted into.
spark_config = "%%configure -f\n" + json.dumps(session_config, indent=4, ensure_ascii=False)

print("\n" + "="*70)
print("SPARK CONFIGURATION FOR NEXT NOTEBOOKS")
print("="*70)
print("\nCopy this to the first cell of notebooks 01-04:\n")
print(spark_config)
print("\n" + "="*70)


SPARK CONFIGURATION FOR NEXT NOTEBOOKS

Copy this to the first cell of notebooks 01-04:

%%configure -f
{
    "pyFiles": [
        "s3://uip-datalake-bucket-prod/sf_trino/trino_query_predictor/code/query_predictor_latest.zip",
        "s3://uipds-108043591022/dataintelligence-dev/di-airflow-prod/dags/common/utils/ParseArgs.py"
    ],
    "driverMemory": "16G",
    "driverCores": 4,
    "executorMemory": "20G",
    "executorCores": 5,
    "conf": {
        "spark.driver.maxResultSize": "8G",
        "spark.dynamicAllocation.enabled": "true",
        "spark.dynamicAllocation.minExecutors": "2",
        "spark.dynamicAllocation.maxExecutors": "20"
    }
}



## 6. Save Configuration Metadata

In [6]:
import json

# Upload training config to S3 for reproducibility
print("\nUploading training config to S3...")

# Notebooks 01-04 read the config back from S3, so what gets uploaded must
# match the `config` dict this notebook validated and used. If overrides were
# applied after loading, the file at `config_path` is stale; in that case
# upload a serialised copy of the in-memory config instead. When nothing
# changed, the original file is uploaded as-is (comments and layout intact).
with open(config_path, 'rb') as f:
    config_on_disk = yaml.safe_load(f)

if config_on_disk == config:
    config_upload_path = config_path
else:
    config_upload_path = '/tmp/training_config_effective.yaml'
    with open(config_upload_path, 'w', encoding='utf-8') as f:
        # sort_keys=False keeps the sections in their original order
        yaml.safe_dump(config, f, sort_keys=False, allow_unicode=True)
    print("ℹ️  In-memory config differs from the local file; uploading the effective config")

config_s3_key = f"{config['s3']['prefix']}/config/training_config_{timestamp}.yaml"
s3.upload_file(config_upload_path, config['s3']['bucket'], config_s3_key)
config_s3_path = f"s3://{config['s3']['bucket']}/{config_s3_key}"
print(f"✅ Config uploaded: {config_s3_path}")

# Also save as "latest"
latest_config_key = f"{config['s3']['prefix']}/config/training_config_latest.yaml"
s3.copy_object(
    Bucket=config['s3']['bucket'],
    CopySource={'Bucket': config['s3']['bucket'], 'Key': config_s3_key},
    Key=latest_config_key
)
latest_config_path = f"s3://{config['s3']['bucket']}/{latest_config_key}"
print(f"✅ Updated latest config: {latest_config_path}")

# Prepare metadata describing this setup run
metadata = {
    'timestamp': timestamp,
    'package_version': timestamp,  # package and config share the same run timestamp
    'package_s3_path': package_s3_path,
    'config_path': config_s3_path,  # S3 path to versioned config
    'latest_config_path': latest_config_path,  # S3 path to latest config
    'date_range': f"{config['data_loading']['start_date']} to {config['data_loading']['end_date']}",
    'total_features': config['features']['total_features'],
    'spark_config': config['spark']
}

# Save locally
metadata_path = '/tmp/setup_metadata.json'
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"✅ Metadata saved locally: {metadata_path}")

# Upload to S3
metadata_s3_key = f"{config['s3']['prefix']}/metadata/setup_{timestamp}.json"
s3.upload_file(metadata_path, config['s3']['bucket'], metadata_s3_key)
print(f"✅ Metadata uploaded: s3://{config['s3']['bucket']}/{metadata_s3_key}")

print(f"\n📍 For notebooks 01-04, config will be downloaded from: {latest_config_path}")


Uploading training config to S3...
✅ Config uploaded: s3://uip-datalake-bucket-prod/sf_trino/trino_query_predictor/config/training_config_20251016_215152.yaml
✅ Updated latest config: s3://uip-datalake-bucket-prod/sf_trino/trino_query_predictor/config/training_config_latest.yaml
✅ Metadata saved locally: /tmp/setup_metadata.json
✅ Metadata uploaded: s3://uip-datalake-bucket-prod/sf_trino/trino_query_predictor/metadata/setup_20251016_215152.json

📍 For notebooks 01-04, config will be downloaded from: s3://uip-datalake-bucket-prod/sf_trino/trino_query_predictor/config/training_config_latest.yaml


## 7. Cleanup

In [7]:
# Remove the staging directory and the local archive, each only if it
# is still present on disk.
cleanup_steps = (
    (temp_dir, lambda: shutil.rmtree(temp_dir)),
    (zip_path, zip_path.unlink),
)
for leftover, remove in cleanup_steps:
    if leftover.exists():
        remove()

# Emit the closing banner and next-step instructions in a single write
rule = "=" * 70
closing_lines = [
    "✅ Cleanup completed",
    "\n" + rule,
    "SETUP COMPLETE!",
    rule,
    "\nNext Steps:",
    "1. Copy the Spark configuration above",
    "2. Open notebook 01_data_loading.ipynb",
    "3. Paste Spark configuration in first cell",
    "4. Run the data loading pipeline",
    rule,
]
print("\n".join(closing_lines))

✅ Cleanup completed

SETUP COMPLETE!

Next Steps:
1. Copy the Spark configuration above
2. Open notebook 01_data_loading.ipynb
3. Paste Spark configuration in first cell
4. Run the data loading pipeline
