In [25]:
#!/usr/bin/env python3
"""
DHN Metadata Explorer - Simulation Data Investigation
====================================================
Explores large Dymola simulation files before processing.
Helps understand data structure and generate appropriate filters.

Usage:
    1. Run this after main_analysis.py setup
    2. Uses same config and uesgraph
    3. Provides overview of available simulation data
"""

# Same imports and config as main_analysis.py
try:
    from config_local import SCENARIOS, DEFAULT_PARAMS
    print("✅ Using local configuration")
except ImportError:
    print("⚠️ config_local.py not found - using template")
    from config_template import SCENARIOS, DEFAULT_PARAMS

import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import re
from collections import defaultdict, Counter
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# UESGraphs
from uesgraphs.uesgraph import UESGraph
import uesgraphs.analyze as analyze

# Same setup as main_analysis.py
scenario_key = "Scenario 1"  # Adjust this
scenario = SCENARIOS[scenario_key]
print(f"📋 Exploring: {scenario['name']}")
print(f"📂 Data file: {scenario['data_path']}")
print(f"Json path: {scenario['json_path']}")

✅ Using local configuration
📋 Exploring: Analysis 1
📂 Data file: X:\Projekte\EBC_ACS_JERI_0001_BMWi_TransUrbanNRW\Students\Students_Exchange\rka-lko\X drive\work\2025_07_uesgraphs parquet conversion\Sim20250722_164922\Sim20250722_164922_1\Results\Sim20250722_164922_1_inputs.gzip
Json path: X:\Projekte\EBC_ACS_JERI_0001_BMWi_TransUrbanNRW\Students\Students_Exchange\rka-lko\X drive\work\2025_07_uesgraphs parquet conversion\Sim20250722_164922\Sim20250722_164922_1\json\district_with_demand.json


In [26]:

# Load UESGraph 
uesgraph = UESGraph()
uesgraph.from_json(path=scenario["json_path"], network_type="heating")
print(f"✅ Network loaded: {len(uesgraph.nodes)} nodes, {len(uesgraph.edges)} edges")


# Get building names from uesgraph (like in get_dataframe())
building_names = []
for node in uesgraph.nodelist_building:
    if not uesgraph.nodes[node]["is_supply_heating"]:
        building_names.append(uesgraph.nodes[node]["name"])
print(f"🏢 Buildings in network: {len(building_names)}")


read nodes...
******
 input_ids were {'buildings': None, 'nodes': '84dd76a2-6254-4b54-9023-5ec728bbae40', 'pipes': None, 'supplies': None}
...finished
✅ Network loaded: 280 nodes, 279 edges
🏢 Buildings in network: 53


In [27]:

def test_parquet_import_thrift_limits(file_path):
    """
    Test different thrift limits to find minimum required size.
    
    Args:
        file_path: Path to the parquet file
    
    Returns:
        dict: Results of the successful attempt, or None if all fail
    """
    # Try progressively larger thrift limits
    limits_to_try = [
        {"size": "16MB",   "limit": 16_000_000},      # PyArrow default
        {"size": "100MB",  "limit": 100_000_000},     # Conservative increase
        {"size": "500MB",  "limit": 500_000_000},     # Current fix
        {"size": "1GB",    "limit": 1_000_000_000},   # Large files
        {"size": "2GB",    "limit": 2_000_000_000},   # Very large files
    ]
    
    print(f"🔍 Testing thrift limits for: {file_path}")
    print("=" * 60)
    
    for attempt in limits_to_try:
        try:
            print(f"🔄 Trying {attempt['size']} limit...")
            
            parquet_file = pq.ParquetFile(
                file_path,
                thrift_string_size_limit=attempt['limit'],
                thrift_container_size_limit=attempt['limit']
            )
            
            # Extract metadata
            all_columns = parquet_file.schema.names
            num_rows = parquet_file.metadata.num_rows
            file_size_mb = parquet_file.metadata.serialized_size / 1_000_000
            
            # Success!
            print(f"✅ SUCCESS with {attempt['size']} limit!")
            print(f"📊 Total columns: {len(all_columns):,}")
            print(f"📊 Total rows: {num_rows:,}")
            print(f"📊 File size: {file_size_mb:.1f} MB")
            print("=" * 60)
            
            return {
                'success': True,
                'limit_used': attempt['limit'],
                'limit_size': attempt['size'],
                'total_columns': len(all_columns),
                'total_rows': num_rows,
                'file_size_mb': file_size_mb,
                'columns': all_columns
            }
            
        except Exception as e:
            print(f"❌ Failed with {attempt['size']}: {str(e)[:100]}...")
            continue
    
    print("🚨 All thrift limits failed!")
    print("💡 Consider using alternative parquet engines (fastparquet, pandas)")
    return None


# Aufruf
result = test_parquet_import_thrift_limits(scenario["data_path"])

if result:
    print(f"🎯 Minimum required limit: {result['limit_size']}")
    print(f"📋 File has {result['total_columns']:,} columns and {result['total_rows']:,} rows")
else:
    print("❌ Could not read file with any thrift limit")

🔍 Testing thrift limits for: X:\Projekte\EBC_ACS_JERI_0001_BMWi_TransUrbanNRW\Students\Students_Exchange\rka-lko\X drive\work\2025_07_uesgraphs parquet conversion\Sim20250722_164922\Sim20250722_164922_1\Results\Sim20250722_164922_1_inputs.gzip
🔄 Trying 16MB limit...
❌ Failed with 16MB: Couldn't deserialize thrift: TProtocolException: Exceeded size limit
...
🔄 Trying 100MB limit...
❌ Failed with 100MB: Couldn't deserialize thrift: TProtocolException: Exceeded size limit
...
🔄 Trying 500MB limit...
✅ SUCCESS with 500MB limit!
📊 Total columns: 251,415
📊 Total rows: 8,761
📊 File size: 227.1 MB
🎯 Minimum required limit: 500MB
📋 File has 251,415 columns and 8,761 rows


In [28]:
# Show hierarchical structure of Modelica simulation data to find variables you can use
# For example finding the 'senTem_Flow' variable
import re
from collections import defaultdict, Counter

def show_structure(result, max_examples=3):
    """
    Show hierarchical structure of Modelica simulation data.
    
    Args:
        result: Result dict from test_parquet_thrift_limits()
        max_examples: Maximum examples to show per category
    """
    if not result:
        print("❌ No data to analyze")
        return
    
    all_columns = result['columns']
    print("🌳 MODELICA STRUCTURE ANALYSIS")
    print("=" * 50)
    
    # Parse structure
    demand_components = defaultdict(set)
    supply_components = defaultdict(set)  
    pipe_components = defaultdict(set)
    
    # Analyze column patterns
    for col in all_columns:
        parts = col.split('.')
        if len(parts) < 3:
            continue
            
        # Demand side analysis
        if 'demandT' in col:
            # Extract building and component info
            demand_match = re.search(r'demandT([^.]+)\.([^.]+)', col)
            if demand_match:
                building = f"T{demand_match.group(1)}"
                component = demand_match.group(2)
                demand_components[component].add(building)
        
        # Supply side analysis  
        elif 'supplyT' in col:
            supply_match = re.search(r'supplyT[^.]*\.([^.]+)', col)
            if supply_match:
                component = supply_match.group(1)
                supply_components[component].add(col)
                
        # Pipe analysis
        elif 'pipe' in col.lower():
            pipe_match = re.search(r'(pipe[^.]*)', col)
            if pipe_match:
                pipe_name = pipe_match.group(1)
                pipe_components['pipes'].add(pipe_name)
    
    # Display structure
    print("📁 networkModel/")
    print("├── 🏢 DEMAND SIDE")
    
    if demand_components:
        # Show buildings
        all_buildings = set()
        for buildings in demand_components.values():
            all_buildings.update(buildings)
        print(f"│   ├── Buildings: {len(all_buildings)} found")
        print(f"│   │   Examples: {list(all_buildings)[:max_examples]}")
        if len(all_buildings) > max_examples:
            print(f"│   │   ... +{len(all_buildings) - max_examples} more")
        print("│   │")
        
        # Show components per building
        print("│   └── Components per building:")
        for component, buildings in sorted(demand_components.items()):
            icon = "💧" if "pump" in component.lower() else "🔥" if "heat" in component.lower() else "🔧"
            print(f"│       ├── {icon} {component} ({len(buildings)} buildings)")
    
    print("│")
    print("├── 🏭 SUPPLY SIDE")
    if supply_components:
        for component, vars in sorted(supply_components.items()):
            icon = "🔥" if any(x in component.lower() for x in ["boiler", "heat"]) else "📊"
            print(f"│   ├── {icon} {component} ({len(vars)} variables)")
    else:
        print("│   └── No supply components found")
    
    print("│")  
    print("└── 🚰 DISTRIBUTION")
    if pipe_components.get('pipes'):
        pipe_count = len(pipe_components['pipes'])
        print(f"    └── Pipes: {pipe_count} found")
        if pipe_count <= max_examples:
            for pipe in list(pipe_components['pipes'])[:max_examples]:
                print(f"        ├── {pipe}")
        else:
            for pipe in list(pipe_components['pipes'])[:max_examples]:
                print(f"        ├── {pipe}")
            print(f"        └── ... +{pipe_count - max_examples} more")
    else:
        print("    └── No pipe components found")
    
    print()


show_structure(result)




🌳 MODELICA STRUCTURE ANALYSIS
📁 networkModel/
├── 🏢 DEMAND SIDE
│   ├── Buildings: 53 found
│   │   Examples: ['T179', 'T261', 'T37']
│   │   ... +50 more
│   │
│   └── Components per building:
│       ├── 🔧 T_cold_supply (53 buildings)
│       ├── 🔧 T_dhw_supply (53 buildings)
│       ├── 🔥 T_heat_supply (53 buildings)
│       ├── 🔧 allowFlowReversal (53 buildings)
│       ├── 🔧 cp_default (53 buildings)
│       ├── 🔧 dT_Network (53 buildings)
│       ├── 🔧 demand_dhw (53 buildings)
│       ├── 🔥 demand_heat (53 buildings)
│       ├── 🔧 dp_nominal_SS (53 buildings)
│       ├── 🔧 dp_valve_fixed (53 buildings)
│       ├── 🔧 dp_valve_nominal (53 buildings)
│       ├── 🔧 hE_1_1 (53 buildings)
│       ├── 🔥 heatDemand_max (53 buildings)
│       ├── 🔧 m_flow_nominal (53 buildings)
│       ├── 🔧 port_a (53 buildings)
│       ├── 🔧 port_a1 (53 buildings)
│       ├── 🔧 port_a2 (53 buildings)
│       ├── 🔧 port_b (53 buildings)
│       ├── 🔧 port_b1 (53 buildings)
│       ├── 🔧 port_b2 (53 buil

In [29]:
# For identified variables, find specific physical variables like 'senTem_Flow.T', 'senTem_Return.p', etc. 
# And generate ready-to-use masks for them.
def find_specific_variables(result, base_variable, endings=['T', 'p', 'm_flow'], max_examples=5):
    """
    Find specific physical variables (not configuration parameters).
    
    Args:
        result: Result dict from test_parquet_thrift_limits()
        base_variable: Base variable name (e.g., 'senTem_Flow')
        endings: Physical variable endings to look for
        max_examples: Maximum examples to show
    """
    if not result:
        return None
    
    all_columns = result['columns']
    print(f"🔍 Looking for physical variables of: {base_variable}")
    print("=" * 60)
    
    found_variables = {}
    
    for ending in endings:
        pattern = f"{base_variable}.{ending}"
        matching_columns = [col for col in all_columns if pattern in col and col.endswith(f'.{ending}')]
        
        if matching_columns:
            print(f"✅ {base_variable}.{ending}: {len(matching_columns)} found")
            print(f"   Examples: {matching_columns[:max_examples]}")
            
            # Generate mask
            mask = f"networkModel.demandT{{name_bldg}}.{base_variable}.{ending}$"
            print(f"   Mask: {mask}")
            
            found_variables[f"{base_variable}_{ending}"] = {
                'mask': mask,
                'count': len(matching_columns),
                'examples': matching_columns[:max_examples]
            }
        else:
            print(f"❌ {base_variable}.{ending}: Not found")
        
        print()
    
    return found_variables


# Quick usage
print("🎯 SENSOR TEMPERATURE SEARCH:")
flow_results = find_specific_variables(result, 'senTem_Flow')
return_results = find_specific_variables(result, 'senTem_Return')

# Show ready-to-use masks
if flow_results or return_results:
    print("📝 READY-TO-USE MASKS:")
    print("=" * 30)
    
    # Combine results
    all_results = {}
    if flow_results:
        all_results.update(flow_results)
    if return_results:
        all_results.update(return_results)
    
    for var_name, info in all_results.items():
        mask_name = f"{var_name.upper()}_MASK"
        print(f"{mask_name} = '{info['mask']}'")
    
    print("\n# Most likely what you want:")
    if 'senTem_Flow_T' in all_results:
        print("FLOW_TEMP_MASK = 'networkModel.demandT{name_bldg}.senTem_Flow.T$'")
    if 'senTem_Return_T' in all_results:
        print("RETURN_TEMP_MASK = 'networkModel.demandT{name_bldg}.senTem_Return.T$'")

🎯 SENSOR TEMPERATURE SEARCH:
🔍 Looking for physical variables of: senTem_Flow
✅ senTem_Flow.T: 106 found
   Examples: ['networkModel.demandT4.senTem_Flow.T', 'networkModel.demandT4.hE_1_1.senTem_Flow.T', 'networkModel.demandT5.senTem_Flow.T', 'networkModel.demandT5.hE_1_1.senTem_Flow.T', 'networkModel.demandT12.senTem_Flow.T']
   Mask: networkModel.demandT{name_bldg}.senTem_Flow.T$

✅ senTem_Flow.p: 212 found
   Examples: ['networkModel.demandT4.senTem_Flow.port_a.p', 'networkModel.demandT4.senTem_Flow.port_b.p', 'networkModel.demandT4.hE_1_1.senTem_Flow.port_a.p', 'networkModel.demandT4.hE_1_1.senTem_Flow.port_b.p', 'networkModel.demandT5.senTem_Flow.port_a.p']
   Mask: networkModel.demandT{name_bldg}.senTem_Flow.p$

❌ senTem_Flow.m_flow: Not found

🔍 Looking for physical variables of: senTem_Return
✅ senTem_Return.T: 53 found
   Examples: ['networkModel.demandT4.senTem_Return.T', 'networkModel.demandT5.senTem_Return.T', 'networkModel.demandT12.senTem_Return.T', 'networkModel.demandT1

In [None]:
#Use this masks to load results

# Data Loading Functions from analysis.ipynb
def get_dataframe(mask, file_path, uesgraph):
    """Load data for a specific mask pattern"""
    filter_list = []
    for node in uesgraph.nodelist_building:
        if not uesgraph.nodes[node]["is_supply_heating"]:
            name_bldg = uesgraph.nodes[node]["name"]
            filter_pattern = mask.format(name_bldg=name_bldg)
            filter_list.append(filter_pattern)
    
    df = analyze.process_simulation_result(file_path=file_path, filter_list=filter_list)
    df = analyze.prepare_DataFrame(
        df, 
        base_date=datetime.strptime(DEFAULT_PARAMS["start_date"], "%Y-%m-%d"), 
        end_date=datetime.strptime(DEFAULT_PARAMS["end_date"], "%Y-%m-%d"),
        time_interval=DEFAULT_PARAMS["time_interval"]
    )
    
    # Simplify column names
    import re
    pattern = re.compile(r'T([^.]+)')
    new_columns = []
    for col in df.columns:
        match = pattern.search(col)
        if match:
            new_columns.append(f"T{match.group(1)}")
        else:
            new_columns.append(col)
    df.columns = new_columns
    
    return df

# Example usage
flow_temp_mask = "networkModel.demandT{name_bldg}.senTem_Flow.T$"
return_temp_mask = "networkModel.demandT{name_bldg}.senTem_Return.T$"
df = get_dataframe(flow_temp_mask, scenario["data_path"], uesgraph)
# Display basic info
print("📊 DataFrame loaded:"
      f" {df.shape[0]:,} rows, {df.shape[1]:,} columns")
# Display first few rows
print(df.head())

Processing: X:\Projekte\EBC_ACS_JERI_0001_BMWi_TransUrbanNRW\Students\Students_Exchange\rka-lko\X drive\work\2025_07_uesgraphs parquet conversion\Sim20250722_164922\Sim20250722_164922_1\Results\Sim20250722_164922_1_inputs.gzip
