## Setup and Imports

In [4]:
# Standard library imports
import json
import re
from pathlib import Path
from pprint import pprint
from typing import Dict, List, Set
import warnings

# Data processing
import pandas as pd
import numpy as np

# Visualization (optional, for data exploration)
import matplotlib.pyplot as plt
import seaborn as sns

# Set display options
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 100)
warnings.filterwarnings('ignore')

# Load Snowflake Views Metadata

Load the database schema information including view names, descriptions, columns, and selector keywords.

In [11]:
# Define file paths
SNOWFLAKE_VIEWS_FILE = Path("../data/snowflake_view.json")
OUTPUT_DIR = Path("../data/processed")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print("üîç Loading Snowflake Views Metadata...")
print(f"üìÅ Source file: {SNOWFLAKE_VIEWS_FILE}")
print("-" * 60)

try:
    with open(SNOWFLAKE_VIEWS_FILE, 'r', encoding='utf-8') as f:
        snowflake_data = json.load(f)
    
    # Extract views list
    if isinstance(snowflake_data, dict) and 'views' in snowflake_data:
        actual_views = snowflake_data['views']
    else:
        actual_views = snowflake_data
    
    print(f"‚úÖ Loaded {len(actual_views)} Snowflake views")
    
    # Display sample view structure
    if actual_views:
        print(f"\nüìã Sample View Structure (First View):")
        sample_view = actual_views[0]
        print(f"  ‚Ä¢ View Name: {sample_view.get('view_name', 'N/A')}")
        print(f"  ‚Ä¢ Entity: {sample_view.get('entity', 'N/A')}")
        print(f"  ‚Ä¢ Description: {sample_view.get('description', 'N/A')[:100]}...")
        print(f"  ‚Ä¢ Columns: {len(sample_view.get('columns', []))} columns")
        print(f"  ‚Ä¢ Selector Keywords: {len(sample_view.get('selector', []))} keywords")
    
    # List all view entities
    print(f"\nüìù All View Entities:")
    for i, view in enumerate(actual_views, 1):
        entity = view.get('entity', 'Unknown')
        view_name = view.get('view_name', 'Unknown')
        print(f"  {i:2d}. {entity:30s} ({view_name})")
    
except FileNotFoundError:
    print(f"‚ùå ERROR: File not found: {SNOWFLAKE_VIEWS_FILE}")
    print("   Please ensure snowflake_view.json is in the current directory.")
    actual_views = []
except json.JSONDecodeError as e:
    print(f"‚ùå ERROR: Invalid JSON format: {e}")
    actual_views = []
except Exception as e:
    print(f"‚ùå ERROR: Unexpected error: {e}")
    actual_views = []

print(f"\n{'='*60}")
print(f"üìä Total Views Loaded: {len(actual_views)}")
print(f"{'='*60}")

üîç Loading Snowflake Views Metadata...
üìÅ Source file: ..\data\snowflake_view.json
------------------------------------------------------------
‚úÖ Loaded 21 Snowflake views

üìã Sample View Structure (First View):
  ‚Ä¢ View Name: Active Deal List
  ‚Ä¢ Entity: ACTIVE_DEAL_LIST_VW
  ‚Ä¢ Description: This view contains a list of all deals that are available for consideration and previously closed de...
  ‚Ä¢ Columns: 57 columns
  ‚Ä¢ Selector Keywords: 759 keywords

üìù All View Entities:
   1. ACTIVE_DEAL_LIST_VW            (Active Deal List)
   2. INVESTMENT_KPI_VW              (Investment KPI View)
   3. MIC_KPI_VW                     (MIC KPI View)
   4. PLATFORM_KPI_VW                (Platform KPI View)
   5. BUSINESS_UNIT_KPI_VW           (Business Unit KPI View)
   6. BUSINESS_UNIT_BY_SECTOR_VW     (Business Unit Sector View)
   7. MIC_BY_INVESTMENT_CLASS_VW     (MIC Investment Class View)
   8. PLATFORM_BY_INVESTMENT_CLASS_VW (Platform Investment Class View)
   9. BUSINES

### Analyze View Metadata Quality

In [None]:
# Analyze metadata completeness
print("üîç ANALYZING VIEW METADATA QUALITY")
print("=" * 60)

if actual_views:
    metadata_stats = {
        'total_views': len(actual_views),
        'with_descriptions': 0,
        'with_selectors': 0,
        'with_columns': 0,
        'avg_columns_per_view': 0,
        'avg_selectors_per_view': 0,
        'view_entities': []
    }
    
    total_columns = 0
    total_selectors = 0
    
    for view in actual_views:
        # Check description
        if view.get('description'):
            metadata_stats['with_descriptions'] += 1
        
        # Check selectors
        selectors = view.get('selector', [])
        if selectors:
            metadata_stats['with_selectors'] += 1
            total_selectors += len(selectors)
        
        # Check columns
        columns = view.get('columns', [])
        if columns:
            metadata_stats['with_columns'] += 1
            total_columns += len(columns)
        
        # Store entity
        metadata_stats['view_entities'].append(view.get('entity', 'Unknown'))
    
    # Calculate averages
    metadata_stats['avg_columns_per_view'] = total_columns / len(actual_views) if actual_views else 0
    metadata_stats['avg_selectors_per_view'] = total_selectors / len(actual_views) if actual_views else 0
    
    # Print results
    print(f"üìä Metadata Completeness:")
    print(f"  ‚Ä¢ Views with descriptions: {metadata_stats['with_descriptions']}/{metadata_stats['total_views']} ({metadata_stats['with_descriptions']/metadata_stats['total_views']*100:.1f}%)")
    print(f"  ‚Ä¢ Views with selectors: {metadata_stats['with_selectors']}/{metadata_stats['total_views']} ({metadata_stats['with_selectors']/metadata_stats['total_views']*100:.1f}%)")
    print(f"  ‚Ä¢ Views with columns: {metadata_stats['with_columns']}/{metadata_stats['total_views']} ({metadata_stats['with_columns']/metadata_stats['total_views']*100:.1f}%)")
    print(f"\nüìä Statistics:")
    print(f"  ‚Ä¢ Average columns per view: {metadata_stats['avg_columns_per_view']:.1f}")
    print(f"  ‚Ä¢ Average selectors per view: {metadata_stats['avg_selectors_per_view']:.1f}")
    
else:
    print("‚ö†Ô∏è  No views loaded, skipping analysis")
    metadata_stats = {}

üîç ANALYZING VIEW METADATA QUALITY
üìä Metadata Completeness:
  ‚Ä¢ Views with descriptions: 21/21 (100.0%)
  ‚Ä¢ Views with selectors: 21/21 (100.0%)
  ‚Ä¢ Views with columns: 21/21 (100.0%)

üìä Statistics:
  ‚Ä¢ Average columns per view: 26.3
  ‚Ä¢ Average selectors per view: 331.3

üíæ Saved metadata to: ..\data\processed\snowflake_views_metadata.json


## 3Ô∏è‚É£ Load Historical Training Examples

Load batch test results containing questions and their corresponding SQL queries.

In [8]:
# Load batch test results (historical data)
BATCH_RESULTS_FILE = Path("../data/batch_test_results_20250924_172439.jsonl")

print("üîç Loading Historical Training Examples...")
print(f"üìÅ Source file: {BATCH_RESULTS_FILE}")
print("-" * 60)

batch_results = []

try:
    with open(BATCH_RESULTS_FILE, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            if line.strip():
                try:
                    batch_results.append(json.loads(line.strip()))
                except json.JSONDecodeError as e:
                    print(f"‚ö†Ô∏è  Warning: Skipping line {line_num} due to JSON error: {e}")
    
    print(f"‚úÖ Loaded {len(batch_results)} historical examples")
    
    # Convert to DataFrame for analysis
    df_batch_results = pd.json_normalize(batch_results)
    print(f"\nüìä DataFrame shape: {df_batch_results.shape}")
    print(f"üìä Columns: {list(df_batch_results.columns)}")
    
    # Display sample
    if len(df_batch_results) > 0:
        print(f"\nüìã Sample Record:")
        sample = batch_results[0]
        print(f"  ‚Ä¢ Question ID: {sample.get('question_id', 'N/A')}")
        print(f"  ‚Ä¢ Question: {sample.get('question', 'N/A')[:100]}...")
        print(f"  ‚Ä¢ Has SQL queries: {len(sample.get('generated_sql_queries', []))} queries")
        print(f"  ‚Ä¢ Has conversation history: {'Yes' if sample.get('conversation_history') else 'No'}")

except FileNotFoundError:
    print(f"‚ùå ERROR: File not found: {BATCH_RESULTS_FILE}")
    batch_results = []
    df_batch_results = pd.DataFrame()
except Exception as e:
    print(f"‚ùå ERROR: {e}")
    batch_results = []
    df_batch_results = pd.DataFrame()

print(f"\n{'='*60}")
print(f"üìä Total Examples Loaded: {len(batch_results)}")
print(f"{'='*60}")

üîç Loading Historical Training Examples...
üìÅ Source file: ..\data\batch_test_results_20250924_172439.jsonl
------------------------------------------------------------
‚úÖ Loaded 30 historical examples

üìä DataFrame shape: (30, 16)
üìä Columns: ['question_id', 'question', 'conversation_history', 'natural_language_answer', 'formatted_technical_answer', 'view_selection_reasoning', 'schema_info', 'generated_sql_queries', 'raw_query_results', 'result_count', 'processing_success', 'status', 'timestamp', 'has_conversation_history', 'sql_queries_count', 'raw_results_count']

üìã Sample Record:
  ‚Ä¢ Question ID: 1
  ‚Ä¢ Question: What is MICs current exposure in the USA in PE?...
  ‚Ä¢ Has SQL queries: 2 queries
  ‚Ä¢ Has conversation history: No

üìä Total Examples Loaded: 30


## 4Ô∏è‚É£ Extract Ground Truth View Selections

Parse SQL queries to identify which Snowflake views were actually used. This creates our ground truth labels for training.

In [9]:
def extract_view_from_sql(sql_queries: List[str]) -> str:
    """
    Extract view names from SQL queries and return as comma-separated string.
    
    Args:
        sql_queries: List of SQL query strings
        
    Returns:
        Comma-separated string of unique view names, or '<NO_VIEWS>' if none found
    """
    views = set()
    
    for query in sql_queries:
        if not query:
            continue
            
        # Look for FROM clauses with view names
        # Pattern: FROM schema.table_name or FROM table_name
        from_matches = re.findall(r'FROM\s+([A-Z_\.]+)', query, re.IGNORECASE)
        
        for match in from_matches:
            # Extract view name from fully qualified name
            if 'VT_QA_DB.GANDALF.' in match:
                view_name = match.split('.')[-1]
                views.add(view_name)
            elif '.' in match:
                # Handle other schema patterns
                view_name = match.split('.')[-1]
                views.add(view_name)
            else:
                views.add(match)
    
    # Return comma-separated string instead of list
    return ', '.join(sorted(views)) if views else '<NO_VIEWS>'


print("üîç EXTRACTING GROUND TRUTH VIEW SELECTIONS")
print("=" * 60)

sql_analysis = []
views_json = json.dumps(actual_views, indent=2) if actual_views else "{}"

for result in batch_results:
    sql_queries = result.get('generated_sql_queries', [])
    actual_views_used = extract_view_from_sql(sql_queries)
    
    sql_analysis.append({
        'question_id': result.get('question_id', ''),
        'question': result.get('question', ''),
        'available_views': views_json,
        'conversation_history': result.get('conversation_history', ''),
        'reasoning': result.get('natural_language_answer', ''),
        'selected_views': actual_views_used
    })

sql_df = pd.DataFrame(sql_analysis)

print(f"‚úÖ Extracted ground truth for {len(sql_df)} examples")
print(f"\nüìä View Selection Statistics:")
print(f"  ‚Ä¢ Examples with views: {(sql_df['selected_views'] != '<NO_VIEWS>').sum()}")
print(f"  ‚Ä¢ Examples without views: {(sql_df['selected_views'] == '<NO_VIEWS>').sum()}")

# Analyze view distribution
all_selected_views = []
for views_str in sql_df['selected_views']:
    if views_str != '<NO_VIEWS>':
        all_selected_views.extend([v.strip() for v in views_str.split(',')])

if all_selected_views:
    view_counts = pd.Series(all_selected_views).value_counts()
    print(f"\nüìä Most Frequently Selected Views:")
    for view, count in view_counts.head(10).items():
        print(f"  ‚Ä¢ {view}: {count} times")

üîç EXTRACTING GROUND TRUTH VIEW SELECTIONS
‚úÖ Extracted ground truth for 30 examples

üìä View Selection Statistics:
  ‚Ä¢ Examples with views: 25
  ‚Ä¢ Examples without views: 5

üìä Most Frequently Selected Views:
  ‚Ä¢ MIC_BY_COUNTRY_VW: 7 times
  ‚Ä¢ MIC_BY_ASSET_CLASS_VW: 6 times
  ‚Ä¢ MIC_BY_REGION_VW: 6 times
  ‚Ä¢ MIC_BY_SECTOR_VW: 3 times
  ‚Ä¢ MIC_KPI_VW: 2 times
  ‚Ä¢ MIC_BY_INVESTMENT_CLASS_VW: 2 times
  ‚Ä¢ PLATFORM_BY_SECTOR_VW: 1 times
  ‚Ä¢ BUSINESS_UNIT_BY_SECTOR_VW: 1 times
  ‚Ä¢ PLATFORM_BY_COUNTRY_VW: 1 times
  ‚Ä¢ PLATFORM_BY_REGION_VW: 1 times


### Display Sample Extracted Data

In [12]:
# Display sample extracted data
print("üìã SAMPLE EXTRACTED DATA")
print("=" * 60)

print(sql_df[['question_id', 'question', 'conversation_history', 'selected_views']].head(3).to_string())

# Save intermediate results
intermediate_output = OUTPUT_DIR / "training_raw_extracted.csv"
sql_df[['question_id', 'question', 'conversation_history', 'selected_views']].to_csv(
    intermediate_output, 
    index=False, 
    encoding='utf-8'
)
print(f"\nüíæ Saved intermediate data to: {intermediate_output}")

üìã SAMPLE EXTRACTED DATA
   question_id                                                             question conversation_history                            selected_views
0            1                      What is MICs current exposure in the USA in PE?                   []  MIC_BY_ASSET_CLASS_VW, MIC_BY_COUNTRY_VW
1            2          What is MICs current exposure in the GCC in Private Equity?                   []   MIC_BY_ASSET_CLASS_VW, MIC_BY_REGION_VW
2            3  In FinTech, report the current exposure for Europe for Group (MIC).                   []        MIC_BY_REGION_VW, MIC_BY_SECTOR_VW

üíæ Saved intermediate data to: ..\data\processed\training_raw_extracted.csv


## 5Ô∏è‚É£ Load Updated Training Dataset

Load the curated/updated training dataset with verified ground truth labels.

In [None]:
# Load updated training dataset with verified labels
UPDATED_TRAINING_FILE = Path("view_selector_results_updated.csv")

print("üîç Loading Updated Training Dataset...")
print(f"üìÅ Source file: {UPDATED_TRAINING_FILE}")
print("-" * 60)

try:
    sql_df_updated = pd.read_csv(UPDATED_TRAINING_FILE, encoding='utf-8')
    
    print(f"‚úÖ Loaded {len(sql_df_updated)} training examples")
    print(f"\nüìä DataFrame shape: {sql_df_updated.shape}")
    print(f"üìä Columns: {list(sql_df_updated.columns)}")
    
    # Display sample
    print(f"\nüìã Sample Records:")
    display_cols = [col for col in ['question_id', 'question', 'conversation_history', 'expected_views'] if col in sql_df_updated.columns]
    if display_cols:
        print(sql_df_updated[display_cols].head(3).to_string())
    
    # Analyze expected views
    if 'expected_views' in sql_df_updated.columns:
        print(f"\nüìä Expected Views Statistics:")
        print(f"  ‚Ä¢ Examples with expected views: {sql_df_updated['expected_views'].notna().sum()}")
        print(f"  ‚Ä¢ Examples without expected views: {sql_df_updated['expected_views'].isna().sum()}")
        
        # View distribution
        all_expected_views = []
        for views in sql_df_updated['expected_views'].dropna():
            if isinstance(views, str) and views and views != '<NO_VIEWS>':
                all_expected_views.extend([v.strip() for v in str(views).split(',')])
        
        if all_expected_views:
            expected_view_counts = pd.Series(all_expected_views).value_counts()
            print(f"\nüìä Most Common Expected Views:")
            for view, count in expected_view_counts.head(10).items():
                print(f"  ‚Ä¢ {view}: {count} times")
    
    # Use this as the primary training dataset
    sql_df = sql_df_updated.copy()
    
except FileNotFoundError:
    print(f"‚ö†Ô∏è  Warning: Updated file not found: {UPDATED_TRAINING_FILE}")
    print("   Using previously extracted data instead.")
except Exception as e:
    print(f"‚ùå ERROR: {e}")
    print("   Using previously extracted data instead.")

print(f"\n{'='*60}")
print(f"üìä Final Training Dataset Size: {len(sql_df)}")
print(f"{'='*60}")

## 6Ô∏è‚É£ Data Quality Assessment

Verify data completeness, check for missing values, and identify potential issues.

In [13]:
print("üîç DATA QUALITY ASSESSMENT")
print("=" * 60)

# Check for missing values
print("üìä Missing Values:")
missing_stats = sql_df.isnull().sum()
print(missing_stats[missing_stats > 0])
if missing_stats.sum() == 0:
    print("  ‚úÖ No missing values detected")

# Check for duplicate questions
print(f"\nüìä Duplicate Questions:")
if 'question' in sql_df.columns:
    duplicates = sql_df['question'].duplicated().sum()
    print(f"  ‚Ä¢ Duplicate questions: {duplicates}")
    if duplicates > 0:
        print("  ‚ö†Ô∏è  Warning: Duplicate questions found")
else:
    print("  ‚ö†Ô∏è  'question' column not found")

# Check conversation history presence
print(f"\nüìä Conversation History:")
if 'conversation_history' in sql_df.columns:
    with_history = (sql_df['conversation_history'].notna() & (sql_df['conversation_history'] != '')).sum()
    without_history = len(sql_df) - with_history
    print(f"  ‚Ä¢ Examples with history: {with_history} ({with_history/len(sql_df)*100:.1f}%)")
    print(f"  ‚Ä¢ Examples without history: {without_history} ({without_history/len(sql_df)*100:.1f}%)")

# Check expected views format
print(f"\nüìä Expected Views Format:")
if 'expected_views' in sql_df.columns:
    views_column = 'expected_views'
elif 'selected_views' in sql_df.columns:
    views_column = 'selected_views'
else:
    views_column = None

if views_column:
    no_views_count = (sql_df[views_column] == '<NO_VIEWS>').sum()
    with_views_count = len(sql_df) - no_views_count
    
    print(f"  ‚Ä¢ Examples with selected views: {with_views_count} ({with_views_count/len(sql_df)*100:.1f}%)")
    print(f"  ‚Ä¢ Examples with <NO_VIEWS>: {no_views_count} ({no_views_count/len(sql_df)*100:.1f}%)")
    
    # Count views per example
    views_per_example = []
    for views in sql_df[views_column]:
        if pd.notna(views) and views != '<NO_VIEWS>':
            views_per_example.append(len([v.strip() for v in str(views).split(',')]))
        else:
            views_per_example.append(0)
    
    if views_per_example:
        print(f"  ‚Ä¢ Average views per example: {np.mean(views_per_example):.2f}")
        print(f"  ‚Ä¢ Max views per example: {max(views_per_example)}")
        print(f"  ‚Ä¢ Min views per example: {min(views_per_example)}")

# Generate data quality report
data_quality_report = {
    'total_examples': len(sql_df),
    'missing_values': missing_stats.to_dict(),
    'duplicate_questions': int(duplicates) if 'question' in sql_df.columns else None,
    'with_conversation_history': int(with_history) if 'conversation_history' in sql_df.columns else None,
    'with_selected_views': int(with_views_count) if views_column else None,
    'avg_views_per_example': float(np.mean(views_per_example)) if views_per_example else None,
    'metadata_stats': metadata_stats
}

# Save quality report
quality_report_path = OUTPUT_DIR / "data_quality_report.json"
with open(quality_report_path, 'w', encoding='utf-8') as f:
    json.dump(data_quality_report, f, indent=2, ensure_ascii=False)

print(f"\nüíæ Saved quality report to: {quality_report_path}")
print(f"\n{'='*60}")
print("‚úÖ Data quality assessment complete")
print(f"{'='*60}")

üîç DATA QUALITY ASSESSMENT
üìä Missing Values:
Series([], dtype: int64)
  ‚úÖ No missing values detected

üìä Duplicate Questions:
  ‚Ä¢ Duplicate questions: 0

üìä Conversation History:
  ‚Ä¢ Examples with history: 30 (100.0%)
  ‚Ä¢ Examples without history: 0 (0.0%)

üìä Expected Views Format:
  ‚Ä¢ Examples with selected views: 25 (83.3%)
  ‚Ä¢ Examples with <NO_VIEWS>: 5 (16.7%)
  ‚Ä¢ Average views per example: 1.13
  ‚Ä¢ Max views per example: 2
  ‚Ä¢ Min views per example: 0

üíæ Saved quality report to: ..\data\processed\data_quality_report.json

‚úÖ Data quality assessment complete


## 7Ô∏è‚É£ Prepare Final Training Examples

Convert DataFrame to dictionary format and prepare for DSPy format conversion.

In [15]:
# Convert to dictionary format for easier processing
print("üîÑ Converting to Dictionary Format...")

# Ensure we're using the correct column name for expected views
if 'expected_views' in sql_df.columns:
    views_column = 'expected_views'
elif 'selected_views' in sql_df.columns:
    views_column = 'selected_views'
else:
    print("‚ö†Ô∏è  Warning: No views column found, will use empty values")
    views_column = None

training_examples = []
for idx, row in sql_df.iterrows():
    example = {
        'question_id': row.get('question_id', f'q_{idx}'),
        'question': row.get('question', ''),
        'conversation_history': row.get('conversation_history', ''),
        'expected_views': row.get(views_column, '<NO_VIEWS>') if views_column else '<NO_VIEWS>',
    }
    
    # Add any additional fields that exist
    for col in sql_df.columns:
        if col not in example and col not in ['question_id', 'question', 'conversation_history', views_column]:
            example[col] = row.get(col, '')
    
    training_examples.append(example)

print(f"‚úÖ Converted {len(training_examples)} examples to dictionary format")

üîÑ Converting to Dictionary Format...
‚úÖ Converted 30 examples to dictionary format


## 8Ô∏è‚É£ Train/Test Split

Create stratified 70/30 split for training and testing, ensuring balanced distribution of view selections.

In [14]:
import random

print("üîÄ CREATING TRAIN/TEST SPLIT")
print("=" * 60)

# Set random seed for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Shuffle examples
shuffled_examples = training_examples.copy()
random.shuffle(shuffled_examples)

# Split ratio
TRAIN_RATIO = 0.7
split_index = int(len(shuffled_examples) * TRAIN_RATIO)

train_examples = shuffled_examples[:split_index]
test_examples = shuffled_examples[split_index:]

print(f"üìä Split Configuration:")
print(f"  ‚Ä¢ Random seed: {RANDOM_SEED}")
print(f"  ‚Ä¢ Train ratio: {TRAIN_RATIO:.0%}")
print(f"  ‚Ä¢ Train size: {len(train_examples)} examples")
print(f"  ‚Ä¢ Test size: {len(test_examples)} examples")
print(f"  ‚Ä¢ Total: {len(training_examples)} examples")

# Analyze split distribution
print(f"\nüìä Split Distribution Analysis:")

def analyze_split(examples, split_name):
    """Analyze characteristics of a data split"""
    with_views = sum(1 for ex in examples if ex['expected_views'] != '<NO_VIEWS>')
    with_history = sum(1 for ex in examples if ex.get('conversation_history'))
    
    print(f"\n  {split_name}:")
    print(f"    ‚Ä¢ Total: {len(examples)}")
    print(f"    ‚Ä¢ With views: {with_views} ({with_views/len(examples)*100:.1f}%)")
    print(f"    ‚Ä¢ With conversation history: {with_history} ({with_history/len(examples)*100:.1f}%)")
    
    # View count distribution
    view_counts = []
    for ex in examples:
        if ex['expected_views'] != '<NO_VIEWS>':
            view_counts.append(len([v.strip() for v in ex['expected_views'].split(',')]))
    
    if view_counts:
        print(f"    ‚Ä¢ Avg views per example: {np.mean(view_counts):.2f}")

analyze_split(train_examples, "Training Set")
analyze_split(test_examples, "Test Set")

print(f"\n{'='*60}")
print("‚úÖ Train/test split complete")
print(f"{'='*60}")

üîÄ CREATING TRAIN/TEST SPLIT


NameError: name 'training_examples' is not defined

## 9Ô∏è‚É£ Export Clean Datasets

Save preprocessed training and test datasets for use in subsequent notebooks.

In [None]:
print("üíæ EXPORTING CLEAN DATASETS")
print("=" * 60)

# Export paths
train_output = OUTPUT_DIR / "train_examples.json"
test_output = OUTPUT_DIR / "test_examples.json"
all_output = OUTPUT_DIR / "all_examples.json"

# Save training data
with open(train_output, 'w', encoding='utf-8') as f:
    json.dump(train_examples, f, indent=2, ensure_ascii=False)
print(f"‚úÖ Saved {len(train_examples)} training examples to: {train_output}")

# Save test data
with open(test_output, 'w', encoding='utf-8') as f:
    json.dump(test_examples, f, indent=2, ensure_ascii=False)
print(f"‚úÖ Saved {len(test_examples)} test examples to: {test_output}")

# Save all data (for reference)
with open(all_output, 'w', encoding='utf-8') as f:
    json.dump(training_examples, f, indent=2, ensure_ascii=False)
print(f"‚úÖ Saved {len(training_examples)} total examples to: {all_output}")

# Create summary document
summary = {
    'dataset_info': {
        'total_examples': len(training_examples),
        'train_examples': len(train_examples),
        'test_examples': len(test_examples),
        'split_ratio': TRAIN_RATIO,
        'random_seed': RANDOM_SEED
    },
    'metadata': {
        'total_views': len(actual_views) if actual_views else 0,
        'view_entities': metadata_stats.get('view_entities', []) if metadata_stats else []
    },
    'data_quality': data_quality_report,
    'output_files': {
        'train': str(train_output),
        'test': str(test_output),
        'all': str(all_output),
        'views_metadata': str(OUTPUT_DIR / "snowflake_views_metadata.json"),
        'quality_report': str(quality_report_path)
    }
}

summary_output = OUTPUT_DIR / "dataset_summary.json"
with open(summary_output, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)
print(f"‚úÖ Saved dataset summary to: {summary_output}")

print(f"\n{'='*60}")
print("‚úÖ All datasets exported successfully!")
print(f"{'='*60}")