# Database Dump Transformation and Cleaning

This notebook transforms the PostgreSQL dump file into cleaned pandas DataFrames for analysis.

## Setup and Dependencies

In [None]:
# # Install required dependencies
# import sys
# import subprocess

# def install_package(package):
#     """Install a package using pip."""
#     try:
#         __import__(package)
#         print(f"✓ {package} is already installed")
#     except ImportError:
#         print(f"Installing {package}...")
#         subprocess.check_call([sys.executable, "-m", "pip", "install", package, "--quiet"])
#         print(f"✓ {package} installed successfully")

# # Install required packages
# required_packages = [
#     "pandas",
#     "numpy",
#     "jupyter",
#     "ipython"
# ]

# print("Checking and installing dependencies...\n")
# for package in required_packages:
#     install_package(package)

# print("\n✓ All dependencies are ready!")
# print("\nNote: If you need to convert PostgreSQL custom format dumps, you'll also need:")
# print("  - PostgreSQL client tools (pg_restore)")
# print("  - macOS: brew install postgresql")
# print("  - Ubuntu: sudo apt-get install postgresql-client")

Checking and installing dependencies...

Installing pandas...
✓ pandas installed successfully
✓ numpy is already installed
✓ jupyter is already installed
Installing ipython...
✓ ipython installed successfully

✓ All dependencies are ready!

Note: If you need to convert PostgreSQL custom format dumps, you'll also need:
  - PostgreSQL client tools (pg_restore)
  - macOS: brew install postgresql
  - Ubuntu: sudo apt-get install postgresql-client


In [2]:
import pandas as pd
import numpy as np
import json
import re
from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

Pandas version: 3.0.0
NumPy version: 2.4.1


## Step 1: Locate and Verify Dump File

In [3]:
# Find the dump file
dump_files = [
    Path.home() / "Downloads" / "b078-20260113-215725.dump",
    Path("/workspace") / "heroku_psql_181025.dump",
    Path("./heroku_psql_181025.dump"),
    Path("./b078-20260113-215725.dump")
]

dump_file = None
for df in dump_files:
    if df.exists():
        dump_file = df
        break

if dump_file:
    print(f"Found dump file: {dump_file}")
    print(f"File size: {dump_file.stat().st_size / 1024:.2f} KB")
    
    # Check file format
    with open(dump_file, 'rb') as f:
        header = f.read(5)
        if header == b'PGDMP':
            print("Format: PostgreSQL custom format (requires pg_restore)")
            file_format = 'custom'
        else:
            print("Format: Plain SQL")
            file_format = 'sql'
else:
    print("ERROR: Could not find dump file.")
    print("Please ensure the dump file is in one of these locations:")
    for df in dump_files:
        print(f"  - {df}")

Found dump file: /Users/johndriscoll/Downloads/b078-20260113-215725.dump
File size: 248.83 KB
Format: PostgreSQL custom format (requires pg_restore)


## Manual Conversion Instructions

If automatic conversion failed, convert the dump file manually using one of these methods:

### Method 1: Using PostgreSQL 16 (Recommended)

Since PostgreSQL 16 is "keg-only" (not in PATH), use the full path:

**For Apple Silicon Macs:**
```bash
/opt/homebrew/opt/postgresql@17/bin/pg_restore --no-owner --no-privileges -f ~/Downloads/b078-20260113-215725.sql ~/Downloads/b078-20260113-215725.dump
```

**For Intel Macs:**
```bash
/usr/local/opt/postgresql@16/bin/pg_restore --no-owner --no-privileges -f ~/Downloads/b078-20260113-215725.sql ~/Downloads/b078-20260113-215725.dump
```


In [None]:
## Step 2: Convert Dump to SQL (if needed)

This cell will automatically convert the PostgreSQL custom format dump to SQL format.

In [None]:
import subprocess
import os

# Check if we need to convert
if 'dump_file' in globals() and 'file_format' in globals() and file_format == 'custom':
    print("="*60)
    print("CONVERTING CUSTOM FORMAT DUMP TO SQL")
    print("="*60)
    
    # Define output SQL file
    sql_file = dump_file.with_suffix('.sql')
    
    # Check if SQL file already exists
    if sql_file.exists():
        print(f"\n✓ SQL file already exists: {sql_file}")
        print(f"  File size: {sql_file.stat().st_size / 1024:.2f} KB")
        print("  Using existing file. Skipping conversion.")
        dump_file = sql_file
        file_format = 'sql'
    else:
        print(f"\nInput dump file: {dump_file}")
        print(f"Output SQL file: {sql_file}")
        
        # Try to find pg_restore in various locations
        pg_restore_paths = [
            # PostgreSQL 17 (newest)
            '/opt/homebrew/opt/postgresql@17/bin/pg_restore',
            '/usr/local/opt/postgresql@17/bin/pg_restore',
            # PostgreSQL 16
            '/opt/homebrew/opt/postgresql@16/bin/pg_restore',
            '/usr/local/opt/postgresql@16/bin/pg_restore',
            # PostgreSQL 15
            '/opt/homebrew/opt/postgresql@15/bin/pg_restore',
            '/usr/local/opt/postgresql@15/bin/pg_restore',
            # Standard locations
            '/opt/homebrew/bin/pg_restore',
            '/usr/local/bin/pg_restore',
            'pg_restore',  # In PATH
        ]
        
        pg_restore_path = None
        version_info = None
        
        print("\nSearching for pg_restore...")
        for path in pg_restore_paths:
            try:
                if path == 'pg_restore':
                    # Check if it's in PATH
                    result = subprocess.run(
                        ['which', 'pg_restore'],
                        capture_output=True,
                        text=True
                    )
                    if result.returncode == 0:
                        path = result.stdout.strip()
                    else:
                        continue
                
                if os.path.exists(path):
                    # Check version
                    result = subprocess.run(
                        [path, '--version'],
                        capture_output=True,
                        text=True,
                        check=True
                    )
                    pg_restore_path = path
                    version_info = result.stdout.strip()
                    print(f"✓ Found: {version_info}")
                    print(f"  Location: {path}")
                    break
            except Exception as e:
                continue
        
        if pg_restore_path:
            print(f"\nConverting dump to SQL...")
            print(f"  Command: {pg_restore_path} --no-owner --no-privileges -f {sql_file} {dump_file}")
            
            try:
                result = subprocess.run(
                    [pg_restore_path, '--no-owner', '--no-privileges', '-f', str(sql_file), str(dump_file)],
                    capture_output=True,
                    text=True,
                    timeout=300  # 5 minute timeout
                )
                
                if result.returncode == 0:
                    if sql_file.exists():
                        print(f"\n✓✓✓ CONVERSION SUCCESSFUL! ✓✓✓")
                        print(f"  Created: {sql_file}")
                        print(f"  File size: {sql_file.stat().st_size / 1024:.2f} KB")
                        dump_file = sql_file
                        file_format = 'sql'
                        print("\n✓ Ready to proceed with parsing!")
                    else:
                        print(f"\n✗ Conversion reported success but SQL file not found!")
                        print(f"  Expected location: {sql_file}")
                else:
                    error_msg = result.stderr.strip() if result.stderr else result.stdout.strip()
                    print(f"\n✗ CONVERSION FAILED")
                    print(f"  Error: {error_msg}")
                    
                    if 'unsupported version' in error_msg.lower():
                        print("\n" + "="*60)
                        print("VERSION INCOMPATIBILITY")
                        print("="*60)
                        print(f"\nThe dump file format version is not supported by {version_info}.")
                        print("\nSOLUTIONS:")
                        print("\n1. Try a newer PostgreSQL version (17 or 18)")
                        print("   brew install postgresql@17")
                        print(f"   /opt/homebrew/opt/postgresql@17/bin/pg_restore -f {sql_file} {dump_file}")
                        print("\n2. Use Docker:")
                        print(f"   docker run --rm -v ~/Downloads:/data postgres:17 \\")
                        print(f"     pg_restore -f /data/{sql_file.name} /data/{dump_file.name}")
                        print("\n3. Check if SQL file was already created manually")
                    else:
                        print("\nPlease check the error message above and try manual conversion.")
                        print("See the manual conversion instructions in the previous cell.")
            except subprocess.TimeoutExpired:
                print("\n✗ Conversion timed out after 5 minutes.")
                print("  The dump file might be very large or there's an issue.")
            except Exception as e:
                print(f"\n✗ Error during conversion: {e}")
        else:
            print("\n✗ pg_restore not found!")
            print("\nPlease install PostgreSQL client tools:")
            print("  brew install postgresql@17")
            print("\nOr use Docker (see manual conversion instructions above).")
            print("\nAfter manual conversion, set:")
            print(f"  dump_file = Path('{sql_file}')")
            print(f"  file_format = 'sql'")
            print("\nThen proceed to the next cell.")

elif 'dump_file' in globals() and 'file_format' in globals() and file_format == 'sql':
    print("✓ Dump file is already in SQL format.")
    print(f"  File: {dump_file}")
    if isinstance(dump_file, Path) and dump_file.exists():
        print(f"  File size: {dump_file.stat().st_size / 1024:.2f} KB")
    print("\nReady to proceed with parsing!")
else:
    print("⚠ Cannot convert: dump_file or file_format not set.")
    print("Please run the previous cell to locate and detect the dump file format.")

In [6]:
! /opt/homebrew/opt/postgresql@17/bin/pg_restore --no-owner --no-privileges -v -f ~/Downloads/b078-20260113-215725.sql ~/Downloads/b078-20260113-215725.dump

pg_restore: creating SCHEMA "public"
pg_restore: creating EXTENSION "pg_stat_statements"
pg_restore: creating COMMENT "EXTENSION "pg_stat_statements""
pg_restore: creating TABLE "public.alembic_version"
pg_restore: creating TABLE "public.assignment_session_activity"
pg_restore: creating TABLE "public.attention_check_question"
pg_restore: creating TABLE "public.attention_check_response"
pg_restore: creating TABLE "public.attention_check_scenarios"
pg_restore: creating TABLE "public.auth"
pg_restore: creating TABLE "public.channel"
pg_restore: creating TABLE "public.channel_member"
pg_restore: creating TABLE "public.chat"
pg_restore: creating TABLE "public.chatidtag"
pg_restore: creating TABLE "public.child_profile"
pg_restore: creating TABLE "public.config"
pg_restore: creating SEQUENCE "public.config_id_seq"
pg_restore: creating SEQUENCE OWNED BY "public.config_id_seq"
pg_restore: creating TABLE "public.consent_audit"
pg_restore: creating TABLE "public.document"
pg_restore: creating TA

In [10]:
dump_file = "~/Downloads/b078-20260113-215725.sql"

## Step 3: Parse SQL Dump and Extract Tables

We'll parse the SQL dump to extract table data into pandas DataFrames.

In [11]:
def parse_sql_dump(sql_file_path):
    """Parse SQL dump file and extract table data."""
    print(f"Reading SQL file: {sql_file_path}")
    
    with open(sql_file_path, 'rb') as f:
        content = f.read()
    
    # Try to decode
    try:
        text_content = content.decode('utf-8')
    except UnicodeDecodeError:
        print("Warning: UTF-8 decode failed, trying latin-1...")
        text_content = content.decode('latin-1', errors='ignore')
    
    tables_data = {}
    
    # Find all COPY statements
    copy_pattern = r'COPY "public"\."(\w+)"\s*\(([^)]+)\)\s+FROM stdin;'
    copy_matches = list(re.finditer(copy_pattern, text_content, re.MULTILINE))
    
    print(f"Found {len(copy_matches)} COPY statements")
    
    for i, copy_match in enumerate(copy_matches):
        table_name = copy_match.group(1)
        columns_str = copy_match.group(2)
        
        # Parse column names
        columns = [col.strip().strip('"') for col in columns_str.split(',')]
        
        # Find data section
        start_pos = copy_match.end()
        
        # Find end marker
        if i + 1 < len(copy_matches):
            end_pos = copy_matches[i + 1].start()
        else:
            end_marker = text_content.find('\\.', start_pos)
            end_pos = end_marker if end_marker != -1 else len(text_content)
        
        data_section = text_content[start_pos:end_pos]
        
        # Parse tab-separated values
        rows = []
        lines = data_section.strip().split('\n')
        
        for line in lines:
            line = line.strip()
            if not line or line.startswith('\\'):
                continue
            
            # Split by tab
            values = line.split('\t')
            
            if len(values) != len(columns):
                continue
            
            row = {}
            for col, val in zip(columns, values):
                # Handle NULL
                if val == '\\N':
                    row[col] = None
                else:
                    # Try to parse JSON
                    if val.startswith('{') or val.startswith('['):
                        try:
                            row[col] = json.loads(val)
                        except:
                            row[col] = val
                    else:
                        row[col] = val
            
            rows.append(row)
        
        if rows:
            tables_data[table_name] = pd.DataFrame(rows)
            print(f"  ✓ {table_name}: {len(rows)} rows")
    
    return tables_data

# Parse the dump
# Handle path expansion and format detection
if 'dump_file' in globals() and dump_file:
    # Convert to Path object and expand ~ if it's a string
    if isinstance(dump_file, str):
        dump_file = Path(dump_file).expanduser()
    elif not isinstance(dump_file, Path):
        dump_file = Path(dump_file)
    
    # Auto-detect format if not set
    if 'file_format' not in globals() or not file_format:
        if dump_file.suffix == '.sql':
            file_format = 'sql'
        elif dump_file.suffix == '.dump':
            file_format = 'custom'
        else:
            # Check file header
            with open(dump_file, 'rb') as f:
                header = f.read(5)
                if header == b'PGDMP':
                    file_format = 'custom'
                else:
                    file_format = 'sql'
    
    # Verify file exists
    if not dump_file.exists():
        print(f"ERROR: File not found: {dump_file}")
        print(f"Please check the path and try again.")
        raw_dataframes = {}
    elif file_format == 'sql':
        print(f"Parsing SQL dump: {dump_file}")
        raw_dataframes = parse_sql_dump(str(dump_file))
        print(f"\nTotal tables extracted: {len(raw_dataframes)}")
        print(f"Tables: {', '.join(sorted(raw_dataframes.keys()))}")
    else:
        print(f"Cannot proceed: dump file is in {file_format} format, not SQL.")
        print("Please convert to SQL format first (see conversion cells above).")
        raw_dataframes = {}
else:
    print("ERROR: dump_file variable not set.")
    print("Please set dump_file to the path of your SQL dump file.")
    print("Example: dump_file = Path('~/Downloads/b078-20260113-215725.sql')")
    raw_dataframes = {}

Cannot proceed: dump file is in custom format, not SQL.
Please convert to SQL format first (see conversion cells above).


## Step 4: Define Relevant Tables

We'll focus on these tables for analysis:

In [None]:
RELEVANT_TABLES = [
    'user',
    'chat',
    'message',
    'child_profile',
    'selection',
    'moderation_scenario',
    'moderation_session',
    'moderation_applied',
    'moderation_question_answer',
    'exit_quiz_response',
    'scenario_assignments',
    'scenarios',
    'attention_check_scenarios',
    'assignment_session_activity',
]

print("Relevant tables for analysis:")
for table in RELEVANT_TABLES:
    status = "✓" if table in raw_dataframes else "✗"
    count = len(raw_dataframes[table]) if table in raw_dataframes else 0
    print(f"  {status} {table}: {count} rows")

## Step 5: Clean and Transform Data

Now we'll clean and transform each relevant table systematically.

### 5.1: Helper Functions for Data Cleaning

In [None]:
def convert_timestamps(df, timestamp_cols=None):
    """Convert timestamp columns to datetime."""
    df = df.copy()
    
    if timestamp_cols is None:
        # Auto-detect timestamp columns
        timestamp_cols = [col for col in df.columns 
                         if 'at' in col.lower() or 'time' in col.lower()]
    
    for col in timestamp_cols:
        if col not in df.columns:
            continue
            
        try:
            # Try numeric conversion first
            if df[col].dtype in ['int64', 'float64']:
                sample_val = df[col].dropna()
                if len(sample_val) > 0:
                    val = sample_val.iloc[0]
                    # Determine unit based on magnitude
                    if val > 1e12:
                        # Nanoseconds
                        df[f'{col}_datetime'] = pd.to_datetime(df[col] / 1e9, unit='s', errors='coerce')
                    elif val > 1e9:
                        # Milliseconds
                        df[f'{col}_datetime'] = pd.to_datetime(df[col], unit='ms', errors='coerce')
                    else:
                        # Seconds
                        df[f'{col}_datetime'] = pd.to_datetime(df[col], unit='s', errors='coerce')
            else:
                # Try direct datetime conversion
                df[f'{col}_datetime'] = pd.to_datetime(df[col], errors='coerce')
        except Exception as e:
            print(f"    Warning: Could not convert {col}: {e}")
    
    return df

def clean_strings(df):
    """Clean string columns."""
    df = df.copy()
    
    for col in df.columns:
        if df[col].dtype == 'object':
            # Convert to string, handle None
            df[col] = df[col].astype(str).replace('None', None).replace('nan', None)
            # Remove null bytes
            df[col] = df[col].str.replace('\x00', '', regex=False)
    
    return df

def parse_json_column(df, col_name, new_col_name=None):
    """Parse JSON column into Python dict/list."""
    if new_col_name is None:
        new_col_name = f'{col_name}_parsed'
    
    if col_name not in df.columns:
        return df
    
    def parse_json(val):
        if isinstance(val, (dict, list)):
            return val
        if isinstance(val, str):
            if val.startswith('{') or val.startswith('['):
                try:
                    return json.loads(val)
                except:
                    return None
        return None
    
    df[new_col_name] = df[col_name].apply(parse_json)
    return df

print("Helper functions defined.")

### 5.2: Clean User Table

In [None]:
if 'user' in raw_dataframes:
    print("Cleaning user table...")
    df_users = raw_dataframes['user'].copy()
    
    # Basic cleaning
    df_users = clean_strings(df_users)
    df_users = convert_timestamps(df_users)
    
    # Parse JSON fields
    df_users = parse_json_column(df_users, 'info')
    df_users = parse_json_column(df_users, 'settings')
    
    # Convert date_of_birth if present
    if 'date_of_birth' in df_users.columns:
        df_users['date_of_birth'] = pd.to_datetime(df_users['date_of_birth'], errors='coerce')
    
    print(f"  ✓ Cleaned user table: {len(df_users)} rows, {len(df_users.columns)} columns")
    print(f"  Columns: {', '.join(df_users.columns[:10])}..." if len(df_users.columns) > 10 else f"  Columns: {', '.join(df_users.columns)}")
else:
    print("✗ User table not found")
    df_users = pd.DataFrame()

### 5.3: Clean Chat Table

In [None]:
if 'chat' in raw_dataframes:
    print("Cleaning chat table...")
    df_chats = raw_dataframes['chat'].copy()
    
    # Basic cleaning
    df_chats = clean_strings(df_chats)
    df_chats = convert_timestamps(df_chats)
    
    # Parse JSON fields
    df_chats = parse_json_column(df_chats, 'chat')
    df_chats = parse_json_column(df_chats, 'meta')
    
    # Extract message count from chat JSON
    if 'chat_parsed' in df_chats.columns:
        def count_messages(chat_data):
            if isinstance(chat_data, dict):
                history = chat_data.get('history', {})
                messages = history.get('messages', {})
                if isinstance(messages, dict):
                    return len(messages)
            return 0
        
        df_chats['message_count'] = df_chats['chat_parsed'].apply(count_messages)
    
    # Convert boolean columns
    for col in ['archived', 'pinned']:
        if col in df_chats.columns:
            df_chats[col] = df_chats[col].astype(str).str.lower() == 'true'
    
    print(f"  ✓ Cleaned chat table: {len(df_chats)} rows, {len(df_chats.columns)} columns")
    print(f"  Total messages across all chats: {df_chats['message_count'].sum() if 'message_count' in df_chats.columns else 'N/A'}")
else:
    print("✗ Chat table not found")
    df_chats = pd.DataFrame()

### 5.4: Clean Message Table

In [None]:
if 'message' in raw_dataframes:
    print("Cleaning message table...")
    df_messages = raw_dataframes['message'].copy()
    
    # Basic cleaning
    df_messages = clean_strings(df_messages)
    df_messages = convert_timestamps(df_messages)
    
    # Parse JSON fields
    df_messages = parse_json_column(df_messages, 'data')
    df_messages = parse_json_column(df_messages, 'meta')
    
    print(f"  ✓ Cleaned message table: {len(df_messages)} rows, {len(df_messages.columns)} columns")
    if 'role' in df_messages.columns:
        print(f"  Messages by role:")
        print(df_messages['role'].value_counts().to_string())
else:
    print("✗ Message table not found")
    df_messages = pd.DataFrame()

### 5.5: Clean Child Profile Table

In [None]:
if 'child_profile' in raw_dataframes:
    print("Cleaning child_profile table...")
    df_child_profiles = raw_dataframes['child_profile'].copy()
    
    # Basic cleaning
    df_child_profiles = clean_strings(df_child_profiles)
    df_child_profiles = convert_timestamps(df_child_profiles)
    
    # Parse JSON fields if any
    for col in df_child_profiles.columns:
        if df_child_profiles[col].dtype == 'object':
            sample = df_child_profiles[col].dropna().astype(str)
            if len(sample) > 0 and sample.str.startswith('{').any():
                df_child_profiles = parse_json_column(df_child_profiles, col)
    
    # Convert boolean columns
    for col in ['is_current', 'is_only_child']:
        if col in df_child_profiles.columns:
            df_child_profiles[col] = df_child_profiles[col].astype(str).str.lower() == 'true'
    
    # Convert numeric columns
    for col in ['attempt_number', 'session_number']:
        if col in df_child_profiles.columns:
            df_child_profiles[col] = pd.to_numeric(df_child_profiles[col], errors='coerce')
    
    print(f"  ✓ Cleaned child_profile table: {len(df_child_profiles)} rows, {len(df_child_profiles.columns)} columns")
    print(f"  Unique users: {df_child_profiles['user_id'].nunique() if 'user_id' in df_child_profiles.columns else 'N/A'}")
else:
    print("✗ Child profile table not found")
    df_child_profiles = pd.DataFrame()

### 5.6: Clean Selection Table

In [None]:
if 'selection' in raw_dataframes:
    print("Cleaning selection table...")
    df_selections = raw_dataframes['selection'].copy()
    
    # Basic cleaning
    df_selections = clean_strings(df_selections)
    df_selections = convert_timestamps(df_selections)
    
    # Parse JSON fields
    df_selections = parse_json_column(df_selections, 'meta')
    
    # Convert numeric columns
    for col in ['start_offset', 'end_offset']:
        if col in df_selections.columns:
            df_selections[col] = pd.to_numeric(df_selections[col], errors='coerce')
    
    print(f"  ✓ Cleaned selection table: {len(df_selections)} rows, {len(df_selections.columns)} columns")
    if 'role' in df_selections.columns:
        print(f"  Selections by role:")
        print(df_selections['role'].value_counts().to_string())
    if 'source' in df_selections.columns:
        print(f"  Selections by source:")
        print(df_selections['source'].value_counts().to_string())
else:
    print("✗ Selection table not found")
    df_selections = pd.DataFrame()

### 5.7: Clean Moderation Tables

In [None]:
# Moderation Scenario
if 'moderation_scenario' in raw_dataframes:
    print("Cleaning moderation_scenario table...")
    df_mod_scenarios = raw_dataframes['moderation_scenario'].copy()
    df_mod_scenarios = clean_strings(df_mod_scenarios)
    df_mod_scenarios = convert_timestamps(df_mod_scenarios)
    
    # Convert boolean columns
    for col in ['is_applicable']:
        if col in df_mod_scenarios.columns:
            df_mod_scenarios[col] = df_mod_scenarios[col].astype(str).str.lower() == 'true'
    
    print(f"  ✓ Cleaned moderation_scenario: {len(df_mod_scenarios)} rows")
else:
    df_mod_scenarios = pd.DataFrame()

# Moderation Session
if 'moderation_session' in raw_dataframes:
    print("Cleaning moderation_session table...")
    df_mod_sessions = raw_dataframes['moderation_session'].copy()
    df_mod_sessions = clean_strings(df_mod_sessions)
    df_mod_sessions = convert_timestamps(df_mod_sessions)
    
    # Parse JSON fields
    json_cols = ['strategies', 'custom_instructions', 'highlighted_texts', 
                 'refactored_response', 'session_metadata']
    for col in json_cols:
        if col in df_mod_sessions.columns:
            df_mod_sessions = parse_json_column(df_mod_sessions, col)
    
    # Convert boolean and numeric columns
    if 'is_final_version' in df_mod_sessions.columns:
        df_mod_sessions['is_final_version'] = df_mod_sessions['is_final_version'].astype(str).str.lower() == 'true'
    for col in ['scenario_index', 'attempt_number', 'version_number']:
        if col in df_mod_sessions.columns:
            df_mod_sessions[col] = pd.to_numeric(df_mod_sessions[col], errors='coerce')
    
    print(f"  ✓ Cleaned moderation_session: {len(df_mod_sessions)} rows")
else:
    df_mod_sessions = pd.DataFrame()

# Moderation Applied
if 'moderation_applied' in raw_dataframes:
    print("Cleaning moderation_applied table...")
    df_mod_applied = raw_dataframes['moderation_applied'].copy()
    df_mod_applied = clean_strings(df_mod_applied)
    df_mod_applied = convert_timestamps(df_mod_applied)
    
    # Parse JSON fields
    json_cols = ['strategies', 'custom_instructions', 'highlighted_texts', 'refactored_response']
    for col in json_cols:
        if col in df_mod_applied.columns:
            df_mod_applied = parse_json_column(df_mod_applied, col)
    
    if 'confirmed_preferred' in df_mod_applied.columns:
        df_mod_applied['confirmed_preferred'] = df_mod_applied['confirmed_preferred'].astype(str).str.lower() == 'true'
    if 'version_index' in df_mod_applied.columns:
        df_mod_applied['version_index'] = pd.to_numeric(df_mod_applied['version_index'], errors='coerce')
    
    print(f"  ✓ Cleaned moderation_applied: {len(df_mod_applied)} rows")
else:
    df_mod_applied = pd.DataFrame()

# Moderation Question Answer
if 'moderation_question_answer' in raw_dataframes:
    print("Cleaning moderation_question_answer table...")
    df_mod_qa = raw_dataframes['moderation_question_answer'].copy()
    df_mod_qa = clean_strings(df_mod_qa)
    df_mod_qa = convert_timestamps(df_mod_qa)
    print(f"  ✓ Cleaned moderation_question_answer: {len(df_mod_qa)} rows")
else:
    df_mod_qa = pd.DataFrame()

### 5.8: Clean Exit Quiz Table

In [None]:
if 'exit_quiz_response' in raw_dataframes:
    print("Cleaning exit_quiz_response table...")
    df_exit_quiz = raw_dataframes['exit_quiz_response'].copy()
    
    # Basic cleaning
    df_exit_quiz = clean_strings(df_exit_quiz)
    df_exit_quiz = convert_timestamps(df_exit_quiz)
    
    # Parse JSON fields
    df_exit_quiz = parse_json_column(df_exit_quiz, 'answers')
    df_exit_quiz = parse_json_column(df_exit_quiz, 'score')
    df_exit_quiz = parse_json_column(df_exit_quiz, 'meta')
    
    # Convert boolean and numeric columns
    if 'is_current' in df_exit_quiz.columns:
        df_exit_quiz['is_current'] = df_exit_quiz['is_current'].astype(str).str.lower() == 'true'
    if 'attempt_number' in df_exit_quiz.columns:
        df_exit_quiz['attempt_number'] = pd.to_numeric(df_exit_quiz['attempt_number'], errors='coerce')
    
    print(f"  ✓ Cleaned exit_quiz_response table: {len(df_exit_quiz)} rows, {len(df_exit_quiz.columns)} columns")
else:
    print("✗ Exit quiz table not found")
    df_exit_quiz = pd.DataFrame()

### 5.9: Clean Scenario Tables (if present)

In [None]:
# Scenario Assignments
if 'scenario_assignments' in raw_dataframes:
    print("Cleaning scenario_assignments table...")
    df_scenario_assignments = raw_dataframes['scenario_assignments'].copy()
    df_scenario_assignments = clean_strings(df_scenario_assignments)
    df_scenario_assignments = convert_timestamps(df_scenario_assignments)
    
    # Convert numeric columns
    numeric_cols = ['alpha', 'eligible_pool_size', 'n_assigned_before', 'weight', 
                   'sampling_prob', 'assignment_position', 'issue_any', 'duration_seconds']
    for col in numeric_cols:
        if col in df_scenario_assignments.columns:
            df_scenario_assignments[col] = pd.to_numeric(df_scenario_assignments[col], errors='coerce')
    
    print(f"  ✓ Cleaned scenario_assignments: {len(df_scenario_assignments)} rows")
else:
    df_scenario_assignments = pd.DataFrame()

# Scenarios
if 'scenarios' in raw_dataframes:
    print("Cleaning scenarios table...")
    df_scenarios = raw_dataframes['scenarios'].copy()
    df_scenarios = clean_strings(df_scenarios)
    df_scenarios = convert_timestamps(df_scenarios)
    
    # Convert boolean and numeric columns
    if 'is_active' in df_scenarios.columns:
        df_scenarios['is_active'] = df_scenarios['is_active'].astype(str).str.lower() == 'true'
    numeric_cols = ['n_assigned', 'n_completed', 'n_skipped', 'n_abandoned']
    for col in numeric_cols:
        if col in df_scenarios.columns:
            df_scenarios[col] = pd.to_numeric(df_scenarios[col], errors='coerce')
    
    print(f"  ✓ Cleaned scenarios: {len(df_scenarios)} rows")
else:
    df_scenarios = pd.DataFrame()

# Attention Check Scenarios
if 'attention_check_scenarios' in raw_dataframes:
    print("Cleaning attention_check_scenarios table...")
    df_attention_checks = raw_dataframes['attention_check_scenarios'].copy()
    df_attention_checks = clean_strings(df_attention_checks)
    df_attention_checks = convert_timestamps(df_attention_checks)
    
    if 'is_active' in df_attention_checks.columns:
        df_attention_checks['is_active'] = df_attention_checks['is_active'].astype(str).str.lower() == 'true'
    
    print(f"  ✓ Cleaned attention_check_scenarios: {len(df_attention_checks)} rows")
else:
    df_attention_checks = pd.DataFrame()

# Assignment Session Activity
if 'assignment_session_activity' in raw_dataframes:
    print("Cleaning assignment_session_activity table...")
    df_activity = raw_dataframes['assignment_session_activity'].copy()
    df_activity = clean_strings(df_activity)
    df_activity = convert_timestamps(df_activity)
    
    # Convert numeric columns
    numeric_cols = ['session_number', 'active_ms_delta', 'cumulative_ms']
    for col in numeric_cols:
        if col in df_activity.columns:
            df_activity[col] = pd.to_numeric(df_activity[col], errors='coerce')
    
    print(f"  ✓ Cleaned assignment_session_activity: {len(df_activity)} rows")
else:
    df_activity = pd.DataFrame()

## Step 6: Data Quality Summary

In [None]:
print("=" * 60)
print("DATA QUALITY SUMMARY")
print("=" * 60)

cleaned_tables = {
    'users': df_users if 'df_users' in globals() else pd.DataFrame(),
    'chats': df_chats if 'df_chats' in globals() else pd.DataFrame(),
    'messages': df_messages if 'df_messages' in globals() else pd.DataFrame(),
    'child_profiles': df_child_profiles if 'df_child_profiles' in globals() else pd.DataFrame(),
    'selections': df_selections if 'df_selections' in globals() else pd.DataFrame(),
    'moderation_scenarios': df_mod_scenarios if 'df_mod_scenarios' in globals() else pd.DataFrame(),
    'moderation_sessions': df_mod_sessions if 'df_mod_sessions' in globals() else pd.DataFrame(),
    'moderation_applied': df_mod_applied if 'df_mod_applied' in globals() else pd.DataFrame(),
    'moderation_qa': df_mod_qa if 'df_mod_qa' in globals() else pd.DataFrame(),
    'exit_quiz': df_exit_quiz if 'df_exit_quiz' in globals() else pd.DataFrame(),
    'scenario_assignments': df_scenario_assignments if 'df_scenario_assignments' in globals() else pd.DataFrame(),
    'scenarios': df_scenarios if 'df_scenarios' in globals() else pd.DataFrame(),
    'attention_checks': df_attention_checks if 'df_attention_checks' in globals() else pd.DataFrame(),
    'activity': df_activity if 'df_activity' in globals() else pd.DataFrame(),
}

for table_name, df in cleaned_tables.items():
    if len(df) > 0:
        print(f"\n{table_name.upper()}:")
        print(f"  Rows: {len(df)}")
        print(f"  Columns: {len(df.columns)}")
        print(f"  Memory: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")
        
        # Check for nulls
        null_counts = df.isnull().sum()
        if null_counts.sum() > 0:
            print(f"  Columns with nulls: {len(null_counts[null_counts > 0])}")
            top_nulls = null_counts[null_counts > 0].head(5)
            for col, count in top_nulls.items():
                pct = (count / len(df)) * 100
                print(f"    - {col}: {count} ({pct:.1f}%)")
    else:
        print(f"\n{table_name.upper()}: No data")

## Step 7: Save Cleaned Data

In [None]:
# Create output directory
output_dir = Path("data_exports")
output_dir.mkdir(exist_ok=True)

print(f"Saving cleaned DataFrames to {output_dir}/...")

saved_files = {}
for table_name, df in cleaned_tables.items():
    if len(df) > 0:
        # Save as CSV
        csv_file = output_dir / f"{table_name}.csv"
        df.to_csv(csv_file, index=False)
        
        # Save as pickle (faster loading)
        pkl_file = output_dir / f"{table_name}.pkl"
        df.to_pickle(pkl_file)
        
        saved_files[table_name] = {
            'csv': str(csv_file),
            'pkl': str(pkl_file),
            'rows': len(df)
        }
        print(f"  ✓ {table_name}: {len(df)} rows -> {csv_file.name}, {pkl_file.name}")

print(f"\n✓ Saved {len(saved_files)} tables")

# Create summary JSON
summary = {
    'extraction_date': datetime.now().isoformat(),
    'dump_file': str(dump_file) if 'dump_file' in locals() else None,
    'tables': {}
}

for table_name, df in cleaned_tables.items():
    if len(df) > 0:
        summary['tables'][table_name] = {
            'row_count': len(df),
            'column_count': len(df.columns),
            'columns': list(df.columns),
            'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024 / 1024
        }

summary_file = output_dir / "summary.json"
with open(summary_file, 'w') as f:
    json.dump(summary, f, indent=2)

print(f"✓ Summary saved to {summary_file}")

## Step 8: Quick Data Exploration

In [None]:
# Display sample data from key tables
print("Sample data from key tables:\n")

if len(df_users) > 0:
    print("USERS (first 3 rows):")
    print(df_users.head(3).to_string())
    print("\n")

if len(df_chats) > 0:
    print("CHATS (first 3 rows):")
    display_cols = ['id', 'user_id', 'title', 'created_at_datetime', 'message_count']
    available_cols = [col for col in display_cols if col in df_chats.columns]
    print(df_chats[available_cols].head(3).to_string())
    print("\n")

if len(df_selections) > 0:
    print("SELECTIONS (first 3 rows):")
    display_cols = ['id', 'user_id', 'role', 'source', 'created_at_datetime']
    available_cols = [col for col in display_cols if col in df_selections.columns]
    print(df_selections[available_cols].head(3).to_string())
    print("\n")

if len(df_child_profiles) > 0:
    print("CHILD PROFILES (first 3 rows):")
    display_cols = ['id', 'user_id', 'name', 'child_age', 'child_gender', 'is_current']
    available_cols = [col for col in display_cols if col in df_child_profiles.columns]
    print(df_child_profiles[available_cols].head(3).to_string())

## Conclusion

All relevant tables have been extracted, cleaned, and saved to the `data_exports/` directory.

You can now load the cleaned data using:
```python
import pandas as pd
df = pd.read_pickle('data_exports/table_name.pkl')
```

The data has been:
- Parsed from the PostgreSQL dump
- Cleaned (null bytes removed, strings normalized)
- Transformed (timestamps converted, JSON parsed, types corrected)
- Saved in both CSV and pickle formats for easy access