In [4]:
#!/usr/bin/env python3
"""
DHN Metadata Explorer - Simulation Data Investigation
====================================================
Explores large Dymola simulation files before processing.
Helps understand data structure and generate appropriate filters.

Usage:
    1. Run this after main_analysis.py setup
    2. Uses same config and uesgraph
    3. Provides overview of available simulation data
"""

# Same imports and config as main_analysis.py
try:
    from config_local import SCENARIOS, DEFAULT_PARAMS
    print("✅ Using local configuration")
except ImportError:
    print("⚠️ config_local.py not found - using template")
    from config_template import SCENARIOS, DEFAULT_PARAMS

import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import re
from collections import defaultdict, Counter
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# UESGraphs
from uesgraphs.uesgraph import UESGraph
import uesgraphs.analyze as analyze

# Same setup as main_analysis.py
scenario_key = "Scenario 1"  # Adjust this
scenario = SCENARIOS[scenario_key]
print(f"📋 Exploring: {scenario['name']}")
print(f"📂 Data file: {scenario['data_path']}")
print(f"Json path: {scenario['json_path']}")

✅ Using local configuration
📋 Exploring: Analysis 1
📂 Data file: X:\Projekte\EBC_ACS_JERI_0001_BMWi_TransUrbanNRW\Students\Students_Exchange\rka-lko\X drive\work\2025_07_uesgraphs parquet conversion\Sim20250722_164922\Sim20250722_164922_1\Results\Sim20250722_164922_1_inputs.gzip
Json path: X:\Projekte\EBC_ACS_JERI_0001_BMWi_TransUrbanNRW\Students\Students_Exchange\rka-lko\X drive\work\2025_07_uesgraphs parquet conversion\Sim20250722_164922\Sim20250722_164922_1\json\district_with_demand.json


In [8]:

# Load UESGraph 
uesgraph = UESGraph()
uesgraph.from_json(path=scenario["json_path"], network_type="heating")
print(f"✅ Network loaded: {len(uesgraph.nodes)} nodes, {len(uesgraph.edges)} edges")


# Get building names from uesgraph (like in get_dataframe())
building_names = []
for node in uesgraph.nodelist_building:
    if not uesgraph.nodes[node]["is_supply_heating"]:
        building_names.append(uesgraph.nodes[node]["name"])
print(f"🏢 Buildings in network: {len(building_names)}")


read nodes...
******
 input_ids were {'buildings': None, 'nodes': '84dd76a2-6254-4b54-9023-5ec728bbae40', 'pipes': None, 'supplies': None}
...finished
✅ Network loaded: 280 nodes, 279 edges
🏢 Buildings in network: 53


In [7]:

def test_parquet_import_thrift_limits(file_path):
    """
    Test different thrift limits to find minimum required size.
    
    Args:
        file_path: Path to the parquet file
    
    Returns:
        dict: Results of the successful attempt, or None if all fail
    """
    # Try progressively larger thrift limits
    limits_to_try = [
        {"size": "16MB",   "limit": 16_000_000},      # PyArrow default
        {"size": "100MB",  "limit": 100_000_000},     # Conservative increase
        {"size": "500MB",  "limit": 500_000_000},     # Current fix
        {"size": "1GB",    "limit": 1_000_000_000},   # Large files
        {"size": "2GB",    "limit": 2_000_000_000},   # Very large files
    ]
    
    print(f"🔍 Testing thrift limits for: {file_path}")
    print("=" * 60)
    
    for attempt in limits_to_try:
        try:
            print(f"🔄 Trying {attempt['size']} limit...")
            
            parquet_file = pq.ParquetFile(
                file_path,
                thrift_string_size_limit=attempt['limit'],
                thrift_container_size_limit=attempt['limit']
            )
            
            # Extract metadata
            all_columns = parquet_file.schema.names
            num_rows = parquet_file.metadata.num_rows
            file_size_mb = parquet_file.metadata.serialized_size / 1_000_000
            
            # Success!
            print(f"✅ SUCCESS with {attempt['size']} limit!")
            print(f"📊 Total columns: {len(all_columns):,}")
            print(f"📊 Total rows: {num_rows:,}")
            print(f"📊 File size: {file_size_mb:.1f} MB")
            print("=" * 60)
            
            return {
                'success': True,
                'limit_used': attempt['limit'],
                'limit_size': attempt['size'],
                'total_columns': len(all_columns),
                'total_rows': num_rows,
                'file_size_mb': file_size_mb,
                'columns': all_columns
            }
            
        except Exception as e:
            print(f"❌ Failed with {attempt['size']}: {str(e)[:100]}...")
            continue
    
    print("🚨 All thrift limits failed!")
    print("💡 Consider using alternative parquet engines (fastparquet, pandas)")
    return None


# Aufruf
result = test_parquet_import_thrift_limits(scenario["data_path"])

if result:
    print(f"🎯 Minimum required limit: {result['limit_size']}")
    print(f"📋 File has {result['total_columns']:,} columns and {result['total_rows']:,} rows")
else:
    print("❌ Could not read file with any thrift limit")

🔍 Testing thrift limits for: X:\Projekte\EBC_ACS_JERI_0001_BMWi_TransUrbanNRW\Students\Students_Exchange\rka-lko\X drive\work\2025_07_uesgraphs parquet conversion\Sim20250722_164922\Sim20250722_164922_1\Results\Sim20250722_164922_1_inputs.gzip
🔄 Trying 16MB limit...
❌ Failed with 16MB: Couldn't deserialize thrift: TProtocolException: Exceeded size limit
...
🔄 Trying 100MB limit...
❌ Failed with 100MB: Couldn't deserialize thrift: TProtocolException: Exceeded size limit
...
🔄 Trying 500MB limit...
✅ SUCCESS with 500MB limit!
📊 Total columns: 251,415
📊 Total rows: 8,761
📊 File size: 227.1 MB
🎯 Minimum required limit: 500MB
📋 File has 251,415 columns and 8,761 rows


In [13]:
def explore_simulation_metadata(result, max_display=10):
    """
    Quick overview of simulation data structure.
    
    Args:
        file_path: Path to simulation file
        max_display: Maximum items to show per category
    
    Returns:
        dict: Basic metadata info
    """
    print("🔍 Exploring simulation data structure...")
    
    if not result:
        return None
    
    all_columns = result['columns']
    print()
    
    # 1. Find buildings in simulation
    print("🏢 BUILDINGS FOUND:")
    building_pattern = re.compile(r'demandT([^.]+)')
    sim_buildings = set()
    for col in all_columns:
        match = building_pattern.search(col)
        if match:
            sim_buildings.add(f"T{match.group(1)}")
    
    print(f"   Count: {len(sim_buildings)}")
    print(f"   Sample: {list(sim_buildings)[:max_display]}")
    if len(sim_buildings) > max_display:
        print(f"   ... +{len(sim_buildings) - max_display} more")
    print()
    
    # 2. Find common variables
    print("🔧 COMMON VARIABLES:")
    from collections import Counter
    var_types = Counter()
    for col in all_columns:
        if '.' in col:
            var_name = col.split('.')[-1]
            var_types[var_name] += 1
    
    print("   Most frequent:")
    for var, count in var_types.most_common(max_display):
        print(f"     {var}: {count}x")
    print()
    
    # 3. Pump variables (user's main interest)
    print("💧 PUMP VARIABLES:")
    pump_vars = [col for col in all_columns if 'pump' in col.lower()]
    print(f"   Found: {len(pump_vars)} pump-related variables")
    if pump_vars:
        print("   Examples:")
        for var in pump_vars[:max_display]:
            print(f"     {var}")
        if len(pump_vars) > max_display:
            print(f"     ... +{len(pump_vars) - max_display} more")
    print()
    
    return {
        'buildings': sim_buildings,
        'var_types': var_types,
        'pump_vars': pump_vars,
        'total_columns': result['total_columns'],
        'total_rows': result['total_rows']
    }


# Aufruf
import re
from collections import Counter

metadata = explore_simulation_metadata(result)

if metadata:
    print("✅ Exploration completed!")
    print(f"💾 Found {len(metadata['buildings'])} buildings, {len(metadata['pump_vars'])} pump variables")

🔍 Exploring simulation data structure...

🏢 BUILDINGS FOUND:
   Count: 53
   Sample: ['T24', 'T41', 'T195', 'T138', 'T179', 'T261', 'T168', 'T37', 'T66', 'T4']
   ... +43 more

🔧 COMMON VARIABLES:
   Most frequent:
     T: 21422x
     m_flow: 17121x
     Q_flow: 16796x
     p: 15234x
     h_outflow: 12443x
     length: 10044x
     d_in: 7254x
     nParallel: 7254x
     d_out: 6696x
     m_flow_nominal: 6114x

💧 PUMP VARIABLES:
   Found: 1 pump-related variables
   Examples:
     networkModel.supplyT284.dp_nominal_pump

✅ Exploration completed!
💾 Found 53 buildings, 1 pump variables


In [14]:
import re
from collections import Counter

def analyze_buildings(all_columns, max_display=10):
    """Analyze buildings found in simulation data."""
    print("🏢 BUILDINGS FOUND:")
    building_pattern = re.compile(r'demandT([^.]+)')
    sim_buildings = set()
    for col in all_columns:
        match = building_pattern.search(col)
        if match:
            sim_buildings.add(f"T{match.group(1)}")
    
    print(f"   Count: {len(sim_buildings)}")
    print(f"   Sample: {list(sim_buildings)[:max_display]}")
    if len(sim_buildings) > max_display:
        print(f"   ... +{len(sim_buildings) - max_display} more")
    print()
    
    return sim_buildings

def analyze_system_components(all_columns, max_display=10):
    """Analyze demand, supply and pipe variables."""
    print("🏗️ SYSTEM COMPONENTS:")
    
    # Count by component type
    demand_vars = [col for col in all_columns if 'demand' in col.lower()]
    supply_vars = [col for col in all_columns if 'supply' in col.lower()]  
    pipe_vars = [col for col in all_columns if 'pipe' in col.lower()]
    
    print(f"   Demand variables: {len(demand_vars)}")
    if demand_vars:
        print(f"     Examples: {demand_vars[:3]}")
    
    print(f"   Supply variables: {len(supply_vars)}")
    if supply_vars:
        print(f"     Examples: {supply_vars[:3]}")
        
    print(f"   Pipe variables: {len(pipe_vars)}")
    if pipe_vars:
        print(f"     Examples: {pipe_vars[:3]}")
    print()
    
    return {
        'demand': demand_vars,
        'supply': supply_vars,
        'pipes': pipe_vars
    }

def analyze_pump_variables(all_columns, max_display=10):
    """Analyze pump-related variables."""
    print("💧 PUMP VARIABLES:")
    pump_vars = [col for col in all_columns if 'pump' in col.lower()]
    print(f"   Found: {len(pump_vars)} pump-related variables")
    if pump_vars:
        print("   Examples:")
        for var in pump_vars[:max_display]:
            print(f"     {var}")
        if len(pump_vars) > max_display:
            print(f"     ... +{len(pump_vars) - max_display} more")
    print()
    
    return pump_vars

def explore_simulation_metadata(result, max_display=10):
    """
    Quick overview of simulation data structure.
    
    Args:
        result: Result dict from test_parquet_thrift_limits()
        max_display: Maximum items to show per category
    
    Returns:
        dict: Basic metadata info
    """
    print("🔍 Exploring simulation data structure...")
    
    if not result:
        return None
    
    all_columns = result['columns']
    print()
    
    # Use the three analysis functions
    buildings = analyze_buildings(all_columns, max_display)
    components = analyze_system_components(all_columns, max_display) 
    pump_vars = analyze_pump_variables(all_columns, max_display)
    
    return {
        'buildings': buildings,
        'components': components,
        'pump_vars': pump_vars,
        'total_columns': result['total_columns'],
        'total_rows': result['total_rows']
    }

# Aufruf
metadata = explore_simulation_metadata(result)

if metadata:
    print("✅ Exploration completed!")
    print(f"💾 Found {len(metadata['buildings'])} buildings, {len(metadata['pump_vars'])} pump variables")
    print(f"💾 Components: {len(metadata['components']['demand'])} demand, {len(metadata['components']['supply'])} supply, {len(metadata['components']['pipes'])} pipe variables")

🔍 Exploring simulation data structure...

🏢 BUILDINGS FOUND:
   Count: 53
   Sample: ['T24', 'T41', 'T195', 'T138', 'T179', 'T261', 'T168', 'T37', 'T66', 'T4']
   ... +43 more

🏗️ SYSTEM COMPONENTS:
   Demand variables: 17879
     Examples: ['networkModel.supplyT284.heatDemand_max_supply', 'networkModel.supplyT284.demand_total.nout', 'networkModel.supplyT284.demand_total.tableOnFile']
   Supply variables: 765
     Examples: ['networkModel.supplyT284.cp_default', 'networkModel.supplyT284.m_flow_nominal_supply', 'networkModel.supplyT284.dp_nominal']
   Pipe variables: 233244
     Examples: ['networkModel.pipe10011002.allowFlowReversal', 'networkModel.pipe10011002.m_flow_nominal', 'networkModel.pipe10011002.m_flow_small']

💧 PUMP VARIABLES:
   Found: 1 pump-related variables
   Examples:
     networkModel.supplyT284.dp_nominal_pump

✅ Exploration completed!
💾 Found 53 buildings, 1 pump variables
💾 Components: 17879 demand, 765 supply, 233244 pipe variables
