# LeakDB

In [18]:
#Load data from all scenarios
import pickle

# Deserialize the pickled scenario data into G.
# NOTE: unpickling can execute arbitrary code, so only load a file from a trusted source.
with open("scenario_data.pkl", mode="rb") as pickle_file:
    G = pickle.load(pickle_file)

# Confirm which kind of object was loaded
print(type(G))

<class 'dict'>


In [19]:
# Check structure: show the keys stored under the top-level "Scenario" entry
scenario_keys = G["Scenario"].keys()
print(scenario_keys)

dict_keys(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])


In [30]:
# Preview the layout of the loaded data: for every top-level entry of G, list the
# files held by its leading subfolders.
for top_level_key, subfolders in G.items():
    print(f"Scenario: {top_level_key}")  # Top-level key of G

    for subfolder_name, files in subfolders.items():
        # Stop at the first subfolder numbered 4 or above. Because this is a
        # `break`, the preview relies on the keys being iterated in ascending order.
        if int(subfolder_name) >= 4:
            break

        print(f"  Subfolder: {subfolder_name}")  # Subfolder (scenario number)

        for file_name in files:
            print(f"    File: {file_name}")  # Entry name inside the subfolder

Scenario: Scenario
  Subfolder: 1
    File: Labels.csv
    File: Scenario-1_info.csv
    File: Demands
    File: Flows
    File: Leaks
    File: Pressures
  Subfolder: 2
    File: Labels.csv
    File: Scenario-2_info.csv
    File: Demands
    File: Flows
    File: Leaks
    File: Pressures
  Subfolder: 3
    File: Labels.csv
    File: Scenario-3_info.csv
    File: Demands
    File: Flows
    File: Leaks
    File: Pressures


In [None]:
import pandas as pd

# Processes demands, pressures, and flows for a given scenario in `G`.
def process_df_node_data(scenario_dict):
    """Build long-format node and link tables for a single scenario.

    Parameters
    ----------
    scenario_dict : dict
        Mapping that may contain the keys "Demands", "Pressures" and "Flows".
        Each of those is a mapping from a file name to a DataFrame. Missing
        keys are treated as empty mappings. The "Leaks" entry is not read.
        Assumes every DataFrame has a "Timestamp" column and, presumably, a
        "Value" column (the rename below targets "Value_Demand" /
        "Value_Pressure") -- confirm against the loader.

    Returns
    -------
    tuple
        ``(nodes_data, links_data)``.

        * ``nodes_data``: for every file name present in both "Demands" and
          "Pressures", the two frames merged on "Timestamp", tagged with a
          ``node_id`` column (file name without ".csv"), and concatenated.
          ``None`` if no file name is shared.
        * ``links_data``: every "Flows" frame tagged with a ``link_id`` column
          (file name without ".csv") and concatenated. ``None`` if there are
          no flow frames.

    Notes
    -----
    The input DataFrames are never modified; the identifier columns are added
    to new frames.
    """
    demands_dict = scenario_dict.get("Demands", {})
    pressures_dict = scenario_dict.get("Pressures", {})
    flows_dict = scenario_dict.get("Flows", {})

    # Gather data associated with nodes
    node_frames = []
    for file_name in sorted(demands_dict):
        # Demand files without a matching pressure file are skipped.
        if file_name not in pressures_dict:
            continue

        # Align demand and pressure readings on their shared "Timestamp" column;
        # the suffixes disambiguate columns that exist in both frames.
        merged_df = (
            demands_dict[file_name]
            .merge(pressures_dict[file_name], on="Timestamp", suffixes=("_Demand", "_Pressure"))
            .rename(columns={"Value_Demand": "Demand", "Value_Pressure": "Pressure", "Description": "Leak_Demand"})
            .assign(node_id=file_name.replace('.csv', ''))
        )
        node_frames.append(merged_df)

    # Combine all time-step data for demands and pressures
    nodes_data = pd.concat(node_frames, ignore_index=True) if node_frames else None

    # Gather data associated with pipes (links). `assign` returns a new frame,
    # so the caller's flow DataFrames do not gain a "link_id" column.
    link_frames = [
        flows_dict[file_name].assign(link_id=file_name.replace('.csv', ''))
        for file_name in sorted(flows_dict)
    ]

    # Combine all time-step data for flows
    links_data = pd.concat(link_frames, ignore_index=True) if link_frames else None

    # Return structured data
    return nodes_data, links_data

In [22]:
# Run every scenario through process_df_node_data and keep the results in one
# dictionary keyed by scenario identifier.
processed_scenarios = {}

for scenario_id, scenario_dict in G["Scenario"].items():
    scenario_nodes, scenario_links = process_df_node_data(scenario_dict)
    processed_scenarios[scenario_id] = dict(nodes=scenario_nodes, links=scenario_links)

## Process all scenarios

In [None]:
# List reserved for collected results (nothing is appended to it in this cell)
all_scenarios = []


def _to_sensor_rows(frame, id_column, value_column, sensor_type):
    """Reshape `frame` into sensor rows.

    Adds `sensor_id` (`<id>_<sensor_type>`), drops `id_column`, renames
    `value_column` to "measurement" and adds a constant `sensor_type` column.
    A new frame is returned.
    """
    return (
        frame
        .assign(sensor_id=frame[id_column] + f"_{sensor_type}")
        .drop(columns=id_column)
        .rename(columns={value_column: "measurement"})
        .assign(sensor_type=sensor_type)
    )


# Build and export one measurements table per scenario
for scenario_number, scenario_frames in processed_scenarios.items():
    nodes_scenario = scenario_frames['nodes']
    links_scenario = scenario_frames['links']

    # Demand sensors: node table without the pressure readings
    demands_scenario = _to_sensor_rows(
        nodes_scenario.drop(columns=["Pressure"]), "node_id", "Demand", "demand"
    )

    # Pressure sensors: node table without the demand readings
    pressures_scenario = _to_sensor_rows(
        nodes_scenario.drop(columns=["Demand"]), "node_id", "Pressure", "pressure"
    )

    # Flow sensors: one per link
    flows_scenario = _to_sensor_rows(links_scenario, "link_id", "Value", "flow")

    # Stack the three sensor tables into the final dataframe for this scenario
    all_scenario = pd.concat(
        [demands_scenario, pressures_scenario, flows_scenario],
        ignore_index=True,
    )

    # Identifier built from the sensor id and the timestamp of the reading
    all_scenario['unique_id'] = all_scenario['sensor_id'] + '_' + all_scenario["Timestamp"].astype(str)

    # Write this scenario's table to its own CSV file
    all_scenario.to_csv(f"measurements_{scenario_number}_LeakDB.csv")

    # Each scenario's dataframe (all_scenario) can be inspected or used here
    print(f"Processed scenario {scenario_number}")




Processed scenario 1
Processed scenario 2
Processed scenario 3
Processed scenario 4
Processed scenario 5
Processed scenario 6
Processed scenario 7
Processed scenario 8
Processed scenario 9
Processed scenario 10
