in the following notebook we will continue our efforts to design gcn for leak detection and localization

#### highlighting the baseline problem

In [1]:
import torch
import numpy as np
from torch.nn import Linear, Dropout, BatchNorm1d
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data, DataLoader
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Step 1: Load the graph data
print("Loading graph data...")
graph_data = torch.load('graph_data_new_topology.pt')
print("Graph data loaded successfully!")
print("Graph structure:", graph_data)

# Step 2: Use only the required features
print("Processing graph data for leak detection task...")
graph_data.y = graph_data.y_leak_detection  # Set the leak detection target
del graph_data.y_location_1, graph_data.y_location_2  # Remove unused labels
print("Graph data prepared. Target variable: y_leak_detection")

# Step 3: Define the GCN Model with More Layers and Dropout
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, dropout_prob=0.5):
        super(GCN, self).__init__()
        print("Initializing GCN model...")
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.conv4 = GCNConv(hidden_channels, out_channels)
        self.dropout = torch.nn.Dropout(dropout_prob)
        print("GCN model initialized successfully!")

    def forward(self, x, edge_index):
        # Add batch normalization and residual connections
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.dropout(x)
        
        # Residual connection for conv2
        x_res = x
        x = self.conv2(x, edge_index)
        x = F.relu(x + x_res)  # Add residual connection
        x = self.dropout(x)
        
        # Residual connection for conv3
        x_res = x
        x = self.conv3(x, edge_index)
        x = F.relu(x + x_res)  # Add residual connection
        x = self.dropout(x)
        
        x = self.conv4(x, edge_index)
        x = torch.sigmoid(x)  # Explicitly apply sigmoid for probabilities
        return x

# Step 4: Prepare data chunks for leak event-based training
print("Preparing time-series data chunks focused on leak events...")
time_window = 100
num_nodes = graph_data.num_nodes
num_steps = graph_data.node_features.size(1)
chunks = []

# Iterate over the time steps, focus on chunks where leak events occur
for start in range(0, num_steps - time_window + 1, time_window):
    end = start + time_window
    x_chunk = graph_data.node_features[:, start:end].T
    y_chunk = graph_data.y[start:end]
    
    # Filter only chunks with leak events (y_chunk contains leak detection labels)
    if torch.any(y_chunk == 1):  # Look for leak events (assume 1 indicates a leak)
        edge_index = graph_data.edge_index
        chunks.append(Data(x=x_chunk, edge_index=edge_index, y=y_chunk))

print(f"Prepared {len(chunks)} chunks of data with leak events within a time window of {time_window}.")

# Step 5: Split data into training and testing sets
print("Splitting data into training and testing sets...")
train_chunks, test_chunks = train_test_split(chunks, test_size=0.5, random_state=42)
train_loader = DataLoader(train_chunks, batch_size=32, shuffle=True)
test_loader = DataLoader(test_chunks, batch_size=32, shuffle=False)
print("Data split completed. Training and testing loaders created.")

# Step 6: Initialize model, optimizer, and loss function with early stopping
print("Initializing GCN model and training components...")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
in_channels = num_nodes
hidden_channels = 32  # Increased hidden channels
out_channels = 1

model = GCN(in_channels, hidden_channels, out_channels).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)  # Reduced learning rate
criterion = torch.nn.BCELoss()  # Use BCELoss instead of BCEWithLogitsLoss
print(f"Model initialized on device: {device}")

# Step 7: Training loop with early stopping
def train_model(patience=5):
    print("Starting training...")
    model.train()
    best_loss = float('inf')
    patience_counter = 0
    
    for epoch in range(5):  # Increased epochs
        total_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            out = model(batch.x.float(), batch.edge_index).view(-1)  # Model already outputs probabilities
            loss = criterion(out, batch.y.float())  # BCELoss works with probabilities
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/50, Loss: {avg_loss:.4f}")
        
        # Early stopping check
        if avg_loss < best_loss:
            best_loss = avg_loss
            patience_counter = 0
            torch.save(model.state_dict(), 'best_model.pt')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered after {epoch + 1} epochs")
                model.load_state_dict(torch.load('best_model.pt'))
                break

def test_model():
    print("\nStarting testing...")
    model.eval()
    all_true_labels = []
    all_predictions = []  # Store probabilities directly
    
    with torch.no_grad():
        for batch in test_loader:
            batch = batch.to(device)
            out = model(batch.x.float(), batch.edge_index).view(-1)  # Output is already probabilities
            all_true_labels.extend(batch.y.cpu().numpy())
            all_predictions.extend(out.cpu().numpy())  # No need for sigmoid here
    
    all_true_labels = np.array(all_true_labels)
    all_predictions = np.array(all_predictions)
    
    # Test different thresholds
    thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]
    
    for threshold in thresholds:
        print(f"\nResults for threshold = {threshold}")
        pred_labels = (all_predictions > threshold).astype(int)
        
        cm = confusion_matrix(all_true_labels, pred_labels)
        print(f"Confusion Matrix:\n{cm}")
        
        metrics = {
            'Accuracy': accuracy_score(all_true_labels, pred_labels),
            'Precision': precision_score(all_true_labels, pred_labels, zero_division=1),  # Handle zero division
            'Recall': recall_score(all_true_labels, pred_labels, zero_division=1),  # Handle zero division
            'F1 Score': f1_score(all_true_labels, pred_labels, zero_division=1)  # Handle zero division
        }
        
        for metric, value in metrics.items():
            print(f"{metric}: {value:.4f}")

if __name__ == "__main__":
    train_model()
    test_model()



Loading graph data...


  graph_data = torch.load('graph_data_new_topology.pt')


Graph data loaded successfully!
Graph structure: Data(edge_index=[2, 34], y_leak_detection=[17520000], y_location_1=[17520000], y_location_2=[17520000], node_features=[32, 17520000], num_nodes=32, edge_features=[34, 17520000])
Processing graph data for leak detection task...
Graph data prepared. Target variable: y_leak_detection
Preparing time-series data chunks focused on leak events...
Prepared 40955 chunks of data with leak events within a time window of 100.
Splitting data into training and testing sets...
Data split completed. Training and testing loaders created.
Initializing GCN model and training components...
Initializing GCN model...
GCN model initialized successfully!
Model initialized on device: cpu
Starting training...




Epoch 1/50, Loss: 2.2232
Epoch 2/50, Loss: 2.1877
Epoch 3/50, Loss: 2.1878
Epoch 4/50, Loss: 2.1877
Epoch 5/50, Loss: 2.1879

Starting testing...

Results for threshold = 0.3
Confusion Matrix:
[[      0   44127]
 [      0 2003673]]
Accuracy: 0.9785
Precision: 0.9785
Recall: 1.0000
F1 Score: 0.9891

Results for threshold = 0.4
Confusion Matrix:
[[      0   44127]
 [      0 2003673]]
Accuracy: 0.9785
Precision: 0.9785
Recall: 1.0000
F1 Score: 0.9891

Results for threshold = 0.5
Confusion Matrix:
[[      0   44127]
 [      0 2003673]]
Accuracy: 0.9785
Precision: 0.9785
Recall: 1.0000
F1 Score: 0.9891

Results for threshold = 0.6
Confusion Matrix:
[[      0   44127]
 [      0 2003673]]
Accuracy: 0.9785
Precision: 0.9785
Recall: 1.0000
F1 Score: 0.9891

Results for threshold = 0.7
Confusion Matrix:
[[      0   44127]
 [      0 2003673]]
Accuracy: 0.9785
Precision: 0.9785
Recall: 1.0000
F1 Score: 0.9891


# starting fresh

In [1]:
import torch
import numpy as np
from torch.nn import Linear, Dropout, BatchNorm1d
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data, DataLoader
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
# Step 1: Load the graph data
print("Loading graph data...")
graph_data = torch.load('graph_data_new_topology.pt')
print("Graph data loaded successfully!")
print("Graph structure:", graph_data)

Loading graph data...


  graph_data = torch.load('graph_data_new_topology.pt')


Graph data loaded successfully!
Graph structure: Data(edge_index=[2, 34], y_leak_detection=[17520000], y_location_1=[17520000], y_location_2=[17520000], node_features=[32, 17520000], num_nodes=32, edge_features=[34, 17520000])


testing with smaller dataset of 1 million data points

In [2]:

# Step 2: Clip the graph data to the first million data points
num_data_points = 1_000_000  # Define the limit for the data points

# Clipping relevant fields
graph_data.y_leak_detection = graph_data.y_leak_detection[:num_data_points]
graph_data.y_location_1 = graph_data.y_location_1[:num_data_points]
graph_data.y_location_2 = graph_data.y_location_2[:num_data_points]
graph_data.node_features = graph_data.node_features[:, :num_data_points]
graph_data.edge_features = graph_data.edge_features[:, :num_data_points]

# Verify the updated data shape
print("Updated graph data structure:")
print("y_leak_detection:", graph_data.y_leak_detection.shape)
print("y_location_1:", graph_data.y_location_1.shape)
print("y_location_2:", graph_data.y_location_2.shape)
print("node_features:", graph_data.node_features.shape)
print("edge_features:", graph_data.edge_features.shape)

Updated graph data structure:
y_leak_detection: torch.Size([1000000])
y_location_1: torch.Size([1000000])
y_location_2: torch.Size([1000000])
node_features: torch.Size([32, 1000000])
edge_features: torch.Size([34, 1000000])


In [3]:
print("Graph structure:", graph_data)

Graph structure: Data(edge_index=[2, 34], y_leak_detection=[1000000], y_location_1=[1000000], y_location_2=[1000000], node_features=[32, 1000000], num_nodes=32, edge_features=[34, 1000000])


In [3]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# Step 1: Prepare the data
print("Processing graph data for leak detection task...")
# Assume `graph_data` is already loaded from the .pt file
graph_data.y = graph_data.y_leak_detection  # Set the target for leak detection
del graph_data.y_location_1, graph_data.y_location_2  # Remove unused labels
print("Graph data prepared. Target variable: y_leak_detection")

# Step 2: Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Step 3: Down-sample the data with a sliding window approach
def downsample_with_sliding_window(features, window_size=20, overlap=10):
    stride = window_size - overlap
    num_windows = (features.shape[1] - window_size) // stride + 1
    downsampled_features = []
    for i in range(num_windows):
        start = i * stride
        end = start + window_size
        window_avg = features[:, start:end].mean(axis=1)
        downsampled_features.append(window_avg)
    return np.stack(downsampled_features, axis=1)

print("Down-sampling node and edge features...")
node_features = graph_data.node_features.cpu().numpy()
edge_features = graph_data.edge_features.cpu().numpy()

downsampled_node_features = downsample_with_sliding_window(node_features, window_size=20, overlap=10)
downsampled_edge_features = downsample_with_sliding_window(edge_features, window_size=20, overlap=10)

graph_data.node_features = torch.tensor(downsampled_node_features, dtype=torch.float).to(device)
graph_data.edge_features = torch.tensor(downsampled_edge_features, dtype=torch.float).to(device)
print("Down-sampling complete.")

# Step 4: Balance the dataset with SMOTE
print("Balancing dataset with SMOTE...")
x_flat = graph_data.node_features.T.cpu().numpy()  # Flatten features for SMOTE
y_flat = graph_data.y.cpu().numpy()

# Reshaping y_flat to match the number of samples in x_flat
desired_num_labels = x_flat.shape[0]  # Ensure y_flat has the same number of samples as x_flat
y_flat_resized = y_flat[:desired_num_labels]  # Adjust size (or repeat if necessary)

print(f"Reshaped y_flat shape: {y_flat_resized.shape}")
print(f"Number of samples in x_flat: {x_flat.shape[0]}")
print(f"Number of samples in y_flat_resized: {y_flat_resized.shape[0]}")

smote = SMOTE()
x_resampled, y_resampled = smote.fit_resample(x_flat, y_flat_resized)

# Restore node features back to graph format
graph_data.node_features = torch.tensor(x_resampled.T, dtype=torch.float).to(device)
graph_data.y = torch.tensor(y_resampled, dtype=torch.float).to(device)
print("SMOTE completed. Dataset balanced.")

# Step 5: Define the GCN model with residual connections and dropout
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, dropout_prob=0.5):
        super(GCN, self).__init__()
        print("Initializing GCN model...")
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.conv4 = GCNConv(hidden_channels, out_channels)
        self.dropout = torch.nn.Dropout(dropout_prob)
        print("GCN model initialized successfully!")

    def forward(self, x, edge_index):
        # Add batch normalization and residual connections
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.dropout(x)
        
        # Residual connection for conv2
        x_res = x
        x = self.conv2(x, edge_index)
        x = F.relu(x + x_res)  # Add residual connection
        x = self.dropout(x)
        
        # Residual connection for conv3
        x_res = x
        x = self.conv3(x, edge_index)
        x = F.relu(x + x_res)  # Add residual connection
        x = self.dropout(x)
        
        x = self.conv4(x, edge_index)
        x = torch.sigmoid(x)  # Explicitly apply sigmoid for probabilities
        return x

# Step 6: Instantiate and train the model
print("Instantiating the model...")
in_channels = graph_data.node_features.shape[0]
hidden_channels = 64
out_channels = 1

model = GCN(in_channels, hidden_channels, out_channels).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

print("Starting training...")
model.train()
for epoch in range(10):  # Train for 10 epochs (can adjust as needed)
    optimizer.zero_grad()
    output = model(graph_data.node_features, graph_data.edge_index)
    loss = F.binary_cross_entropy(output.squeeze(), graph_data.y)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/10, Loss: {loss.item():.4f}")

print("Training complete.")

# Step 7: Test the model
print("Testing the model...")
train_size = int(0.8 * graph_data.node_features.shape[1])  # 80% train, 20% test
train_features = graph_data.node_features[:, :train_size].to(device)
train_labels = graph_data.y[:train_size].to(device)
test_features = graph_data.node_features[:, train_size:].to(device)
test_labels = graph_data.y[train_size:].to(device)

model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    predictions = model(test_features, graph_data.edge_index.to(device))

# Convert predictions to binary class labels (0 or 1)
predicted_labels = (predictions.squeeze() > 0.5).float()

# Compute accuracy
accuracy = accuracy_score(test_labels.cpu().numpy(), predicted_labels.cpu().numpy())
print(f"Accuracy: {accuracy:.4f}")

# Generate confusion matrix
conf_matrix = confusion_matrix(test_labels.cpu().numpy(), predicted_labels.cpu().numpy())
print("Confusion Matrix:")
print(conf_matrix)

# Display classification report
print("Classification Report:")
print(classification_report(test_labels.cpu().numpy(), predicted_labels.cpu().numpy()))


Processing graph data for leak detection task...
Graph data prepared. Target variable: y_leak_detection
Using device: cpu
Down-sampling node and edge features...
Down-sampling complete.
Balancing dataset with SMOTE...
Reshaped y_flat shape: (99999,)
Number of samples in x_flat: 99999
Number of samples in y_flat_resized: 99999
SMOTE completed. Dataset balanced.
Instantiating the model...
Initializing GCN model...
GCN model initialized successfully!
Starting training...


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x180780 and 32x64)

# claude proposed fix

In [4]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

# Step 1: Data preparation with correct shapes
print("Processing graph data for leak detection task...")
print("\nInitial data shapes:")
print(f"Node features: {graph_data.node_features.shape}")
print(f"Edge index: {graph_data.edge_index.shape}")
print(f"Target variable shape: {graph_data.y_leak_detection.shape}")

# The key insight is that for GCN, we need:
# - Node features shape: [num_nodes, num_features]
# - Each node should have a feature vector
# - The time series should be treated as features, not nodes

def reshape_for_gcn(node_features, window_size=20, stride=10):
    """
    Reshape time series data into appropriate GCN format
    Returns: Features with shape [num_nodes, num_features]
    """
    # First, reshape to handle the time series properly
    num_nodes = node_features.shape[0]  # 32 nodes
    num_timesteps = node_features.shape[1]  # 1000000 timesteps
    
    # Calculate number of windows
    num_windows = (num_timesteps - window_size) // stride + 1
    
    # Create feature matrix where each window becomes a feature
    features = torch.zeros((num_nodes, num_windows))
    
    for i in range(num_windows):
        start_idx = i * stride
        end_idx = start_idx + window_size
        # Average over the window to create a feature
        features[:, i] = node_features[:, start_idx:end_idx].mean(dim=1)
    
    return features

# Reshape data
print("\nReshaping data for GCN...")
node_features = reshape_for_gcn(graph_data.node_features, window_size=20, stride=10)
print(f"Reshaped node features: {node_features.shape}")

# Create target variable for nodes
# We'll aggregate the time series labels to get one label per node
def aggregate_labels(y_leak_detection, num_nodes):
    """
    Aggregate time series labels to get one label per node
    Uses majority voting to determine node label
    """
    y_reshaped = y_leak_detection.view(num_nodes, -1)
    # A node is considered affected if it has leaks in more than 25% of timesteps
    threshold = 0.25
    node_labels = (y_reshaped.float().mean(dim=1) > threshold).float()
    return node_labels

y_node = aggregate_labels(graph_data.y_leak_detection, node_features.shape[0])
print(f"Node labels shape: {y_node.shape}")

# Step 2: Define the GCN model with proper dimensions
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels=64, dropout_prob=0.5):
        super(GCN, self).__init__()
        # in_channels is now the number of features per node
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, 1)  # Output one value per node
        self.dropout = torch.nn.Dropout(dropout_prob)
        self.batch_norm1 = torch.nn.BatchNorm1d(hidden_channels)
        self.batch_norm2 = torch.nn.BatchNorm1d(hidden_channels)

    def forward(self, x, edge_index):
        # First layer
        x = self.conv1(x, edge_index)
        x = self.batch_norm1(x)
        x = F.relu(x)
        x = self.dropout(x)
        
        # Second layer
        x = self.conv2(x, edge_index)
        x = self.batch_norm2(x)
        x = F.relu(x)
        x = self.dropout(x)
        
        # Output layer
        x = self.conv3(x, edge_index)
        return torch.sigmoid(x)

# Step 3: Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")

# Move data to device
node_features = node_features.to(device)
edge_index = graph_data.edge_index.to(device)
y_node = y_node.to(device)

# Initialize model
in_channels = node_features.shape[1]  # Number of features per node
model = GCN(in_channels).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Step 4: Training loop
print("\nStarting training...")
model.train()
num_epochs = 100  # Increased epochs since we have less data now

for epoch in range(num_epochs):
    optimizer.zero_grad()
    out = model(node_features, edge_index)
    loss = F.binary_cross_entropy(out.squeeze(), y_node)
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")

# Step 5: Evaluation
print("\nEvaluating model...")
model.eval()
with torch.no_grad():
    pred = model(node_features, edge_index)
    pred_labels = (pred.squeeze() > 0.5).float()
    accuracy = accuracy_score(y_node.cpu(), pred_labels.cpu())
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_node.cpu(), pred_labels.cpu()))

Processing graph data for leak detection task...

Initial data shapes:
Node features: torch.Size([32, 1000000])
Edge index: torch.Size([2, 34])
Target variable shape: torch.Size([1000000])

Reshaping data for GCN...
Reshaped node features: torch.Size([32, 99999])
Node labels shape: torch.Size([32])

Using device: cpu

Starting training...
Epoch 10/100, Loss: 1.0411
Epoch 20/100, Loss: 0.7278
Epoch 30/100, Loss: 0.7279
Epoch 40/100, Loss: 0.6703
Epoch 50/100, Loss: 0.7818
Epoch 60/100, Loss: 0.6919
Epoch 70/100, Loss: 0.6386
Epoch 80/100, Loss: 0.7120
Epoch 90/100, Loss: 0.6778
Epoch 100/100, Loss: 0.6807

Evaluating model...
Accuracy: 0.5312

Classification Report:
              precision    recall  f1-score   support

         0.0       0.50      0.07      0.12        15
         1.0       0.53      0.94      0.68        17

    accuracy                           0.53        32
   macro avg       0.52      0.50      0.40        32
weighted avg       0.52      0.53      0.42        32


we have a working model, lets improve it with chatgpt

In [1]:
import torch
import numpy as np
from torch.nn import Linear, Dropout, BatchNorm1d
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data, DataLoader
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
# Step 1: Load the graph data
print("Loading graph data...")
graph_data = torch.load('graph_data_new_topology.pt')
print("Graph data loaded successfully!")
print("Graph structure:", graph_data)

Loading graph data...


  graph_data = torch.load('graph_data_new_topology.pt')


Graph data loaded successfully!
Graph structure: Data(edge_index=[2, 34], y_leak_detection=[17520000], y_location_1=[17520000], y_location_2=[17520000], node_features=[32, 17520000], num_nodes=32, edge_features=[34, 17520000])


In [2]:

# Step 2: Clip the graph data to the first million data points
num_data_points = 1_000_000  # Define the limit for the data points

# Clipping relevant fields
graph_data.y_leak_detection = graph_data.y_leak_detection[:num_data_points]
graph_data.y_location_1 = graph_data.y_location_1[:num_data_points]
graph_data.y_location_2 = graph_data.y_location_2[:num_data_points]
graph_data.node_features = graph_data.node_features[:, :num_data_points]
graph_data.edge_features = graph_data.edge_features[:, :num_data_points]

# Verify the updated data shape
print("Updated graph data structure:")
print("y_leak_detection:", graph_data.y_leak_detection.shape)
print("y_location_1:", graph_data.y_location_1.shape)
print("y_location_2:", graph_data.y_location_2.shape)
print("node_features:", graph_data.node_features.shape)
print("edge_features:", graph_data.edge_features.shape)
print("Graph structure:", graph_data)

Updated graph data structure:
y_leak_detection: torch.Size([1000000])
y_location_1: torch.Size([1000000])
y_location_2: torch.Size([1000000])
node_features: torch.Size([32, 1000000])
edge_features: torch.Size([34, 1000000])
Graph structure: Data(edge_index=[2, 34], y_leak_detection=[1000000], y_location_1=[1000000], y_location_2=[1000000], node_features=[32, 1000000], num_nodes=32, edge_features=[34, 1000000])


In [11]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import MessagePassing
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Step 1: Data preparation with correct shapes
print("Processing graph data for leak detection task...")

# Print initial shapes
print("\nInitial data shapes:")
print(f"Node features: {graph_data.node_features.shape}")
print(f"Edge features: {graph_data.edge_features.shape}")
print(f"Edge index: {graph_data.edge_index.shape}")
print(f"Target variable shape: {graph_data.y_leak_detection.shape}")

# Reshape node features for GCN
def reshape_for_gcn(node_features, window_size=20, stride=10):
    """
    Reshape node time series data into appropriate GCN format.
    Returns: Features with shape [num_nodes, num_features].
    """
    num_nodes = node_features.shape[0]
    num_timesteps = node_features.shape[1]
    num_windows = (num_timesteps - window_size) // stride + 1
    
    features = torch.zeros((num_nodes, num_windows))
    for i in range(num_windows):
        start_idx = i * stride
        end_idx = start_idx + window_size
        features[:, i] = node_features[:, start_idx:end_idx].mean(dim=1)
    
    return features

# Reshape edge features for GCN
def reshape_edge_features(edge_features, window_size=20, stride=10):
    """
    Reshape edge time series data into appropriate GCN format.
    Returns: Features with shape [num_edges, num_features].
    """
    num_edges = edge_features.shape[0]
    num_timesteps = edge_features.shape[1]
    num_windows = (num_timesteps - window_size) // stride + 1
    
    features = torch.zeros((num_edges, num_windows))
    for i in range(num_windows):
        start_idx = i * stride
        end_idx = start_idx + window_size
        features[:, i] = edge_features[:, start_idx:end_idx].mean(dim=1)
    
    return features

# Reshape node and edge features
print("\nReshaping data for GCN...")
node_features = reshape_for_gcn(graph_data.node_features, window_size=20, stride=10)
edge_features = reshape_edge_features(graph_data.edge_features, window_size=20, stride=10)
print(f"Reshaped node features: {node_features.shape}")
print(f"Reshaped edge features: {edge_features.shape}")

# Aggregate time-series labels for nodes
def aggregate_labels(y_leak_detection, num_nodes):
    """
    Aggregate time series labels to get one label per node.
    Uses majority voting to determine node label.
    """
    y_reshaped = y_leak_detection.view(num_nodes, -1)
    threshold = 0.25
    node_labels = (y_reshaped.float().mean(dim=1) > threshold).float()
    return node_labels

y_node = aggregate_labels(graph_data.y_leak_detection, node_features.shape[0])
print(f"Node labels shape: {y_node.shape}")

# Step 2: Define the GCN model with edge features
class GCNWithEdgeFeatures(MessagePassing):
    def __init__(self, in_channels, edge_channels, hidden_channels=64, dropout_prob=0.5):
        super(GCNWithEdgeFeatures, self).__init__(aggr='add')  # Aggregation method (sum)
        self.node_mlp = torch.nn.Sequential(
            torch.nn.Linear(in_channels, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout_prob),
        )
        self.edge_mlp = torch.nn.Sequential(
            torch.nn.Linear(edge_channels, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout_prob),
        )
        self.final_mlp = torch.nn.Linear(hidden_channels, 1)
    
    def forward(self, x, edge_index, edge_attr):
        """
        x: Node features [num_nodes, in_channels]
        edge_index: Edge indices [2, num_edges]
        edge_attr: Edge features [num_edges, edge_channels]
        """
        x = self.propagate(edge_index, x=x, edge_attr=edge_attr)
        return torch.sigmoid(self.final_mlp(x))
    
    def message(self, x_j, edge_attr):
        """
        x_j: Features of source nodes [num_edges, hidden_channels]
        edge_attr: Edge features [num_edges, edge_channels]
        """
        edge_messages = self.edge_mlp(edge_attr)
        node_messages = self.node_mlp(x_j)
        return edge_messages + node_messages

# Step 3: Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")

# Move data to device
node_features = node_features.to(device)
edge_index = graph_data.edge_index.to(device)
edge_features = edge_features.to(device)
y_node = y_node.to(device)

# Initialize model
in_channels = node_features.shape[1]  # Number of features per node
edge_channels = edge_features.shape[1]  # Number of features per edge
model = GCNWithEdgeFeatures(in_channels, edge_channels).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1, weight_decay=0.01)

# Step 4: Training loop
print("\nStarting training...")
model.train()
num_epochs = 1000

for epoch in range(num_epochs):
    optimizer.zero_grad()
    out = model(node_features, edge_index, edge_features)
    loss = F.binary_cross_entropy(out.squeeze(), y_node)
    loss.backward()
    optimizer.step()
    
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")

# Step 5: Evaluation
print("\nEvaluating model...")
model.eval()
with torch.no_grad():
    pred = model(node_features, edge_index, edge_features)
    pred_labels = (pred.squeeze() > 0.5).float()
    accuracy = accuracy_score(y_node.cpu(), pred_labels.cpu())
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_node.cpu(), pred_labels.cpu(), zero_division=0))

Processing graph data for leak detection task...

Initial data shapes:
Node features: torch.Size([32, 1000000])
Edge features: torch.Size([34, 1000000])
Edge index: torch.Size([2, 34])
Target variable shape: torch.Size([1000000])

Reshaping data for GCN...
Reshaped node features: torch.Size([32, 99999])
Reshaped edge features: torch.Size([34, 99999])
Node labels shape: torch.Size([32])

Using device: cpu

Starting training...
Epoch 10/1000, Loss: 40.6562
Epoch 20/1000, Loss: 40.6415
Epoch 30/1000, Loss: 40.6369
Epoch 40/1000, Loss: 40.6373
Epoch 50/1000, Loss: 40.6394
Epoch 60/1000, Loss: 40.6407
Epoch 70/1000, Loss: 40.6403
Epoch 80/1000, Loss: 40.6397
Epoch 90/1000, Loss: 40.6396
Epoch 100/1000, Loss: 40.6398
Epoch 110/1000, Loss: 40.6399
Epoch 120/1000, Loss: 40.6398
Epoch 130/1000, Loss: 40.6398
Epoch 140/1000, Loss: 40.6398
Epoch 150/1000, Loss: 40.6398
Epoch 160/1000, Loss: 40.6398
Epoch 170/1000, Loss: 40.6398
Epoch 180/1000, Loss: 40.6398
Epoch 190/1000, Loss: 40.6398
Epoch 200

we got good code we are almost there now we just have to use claude to add the deep learning techniuqes

In [13]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import MessagePassing
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Enhanced GCN Model with improved structure and handling
class EnhancedGCNWithEdgeFeatures(MessagePassing):
    def __init__(self, in_channels, edge_channels, hidden_channels=128, num_layers=3, dropout_prob=0.5):
        """
        Enhanced GCN architecture with fixed dimensions for time series data.

        Args:
            in_channels: Number of input features per node (e.g., reshaped time series windows)
            edge_channels: Number of features per edge (e.g., reshaped time series windows)
            hidden_channels: Size of hidden layers
            num_layers: Number of message passing layers
            dropout_prob: Dropout probability
        """
        super(EnhancedGCNWithEdgeFeatures, self).__init__(aggr='add')  # Aggregation by addition
        self.num_layers = num_layers

        # Initial feature projection to hidden dimensions
        self.node_projection = torch.nn.Linear(in_channels, hidden_channels)
        self.edge_projection = torch.nn.Linear(edge_channels, hidden_channels)

        # Node processing layers
        self.node_transforms = torch.nn.ModuleList([
            torch.nn.Sequential(
                torch.nn.Linear(hidden_channels, hidden_channels),
                torch.nn.LayerNorm(hidden_channels),
                torch.nn.ReLU(),
                torch.nn.Dropout(dropout_prob)
            ) for _ in range(num_layers)
        ])

        # Edge processing layers
        self.edge_transforms = torch.nn.ModuleList([
            torch.nn.Sequential(
                torch.nn.Linear(hidden_channels, hidden_channels),
                torch.nn.LayerNorm(hidden_channels),
                torch.nn.ReLU(),
                torch.nn.Dropout(dropout_prob)
            ) for _ in range(num_layers)
        ])

        # Attention mechanism for node pairs
        self.attention = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels * 2, hidden_channels),
            torch.nn.Tanh(),
            torch.nn.Linear(hidden_channels, 1)
        )

        # Skip connection transforms
        self.skip_transforms = torch.nn.ModuleList([
            torch.nn.Linear(hidden_channels, hidden_channels)
            for _ in range(num_layers - 1)
        ])

        # Final prediction layers
        self.final_layers = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels, hidden_channels // 2),
            torch.nn.LayerNorm(hidden_channels // 2),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout_prob),
            torch.nn.Linear(hidden_channels // 2, 1)
        )

    def forward(self, x, edge_index, edge_attr):
        """
        Forward pass with proper dimension handling.
        """
        # Initial projection to hidden dimension
        x = self.node_projection(x)  # [num_nodes, hidden_channels]
        edge_attr = self.edge_projection(edge_attr)  # [num_edges, hidden_channels]

        # Initialize previous layer output
        previous_layer = None

        # Process through multiple layers
        for i in range(self.num_layers):
            # Transform node features
            current_x = self.node_transforms[i](x if i == 0 else previous_layer)

            # Add skip connection if not first layer
            if i > 0:
                current_x = current_x + self.skip_transforms[i - 1](previous_layer)

            # Message passing with transformed edge features
            current_x = self.propagate(
                edge_index,
                x=current_x,
                edge_attr=self.edge_transforms[i](edge_attr)
            )

            # Store current layer output
            previous_layer = current_x

        # Final prediction
        return torch.sigmoid(self.final_layers(current_x))

    def message(self, x_i, x_j, edge_attr):
        """
        Message function with attention for properly shaped tensors.
        """
        # Compute attention weights
        attention_input = torch.cat([x_i, x_j], dim=-1)  # [num_edges, hidden_channels * 2]
        attention_weights = torch.softmax(self.attention(attention_input), dim=-1)

        # Combine node and edge features
        combined_features = x_j + edge_attr  # [num_edges, hidden_channels]

        return attention_weights * combined_features


# Modified training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")

# Assuming node_features, edge_index, edge_features, and y_node are already prepared
node_features = node_features.to(device)
edge_index = graph_data.edge_index.to(device)
edge_features = edge_features.to(device)
y_node = y_node.to(device)

# Initialize enhanced model
in_channels = node_features.shape[1]
edge_channels = edge_features.shape[1]
model = EnhancedGCNWithEdgeFeatures(
    in_channels=in_channels,
    edge_channels=edge_channels,
    hidden_channels=128,
    num_layers=3,
    dropout_prob=0.5
).to(device)

# Training setup
optimizer = torch.optim.Adam(model.parameters(), lr=0.1, weight_decay=0.01)

# Training loop
print("\nStarting training...")
model.train()
num_epochs = 1000

for epoch in range(num_epochs):
    optimizer.zero_grad()
    out = model(node_features, edge_index, edge_features)
    loss = F.binary_cross_entropy(out.squeeze(), y_node)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item():.4f}")

# Evaluation
print("\nEvaluating model...")
model.eval()
with torch.no_grad():
    pred = model(node_features, edge_index, edge_features)
    pred_labels = (pred.squeeze() > 0.5).float()
    accuracy = accuracy_score(y_node.cpu(), pred_labels.cpu())
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_node.cpu(), pred_labels.cpu()))


Using device: cpu

Starting training...
Epoch 10/1000, Loss: 0.6578
Epoch 20/1000, Loss: 0.6581
Epoch 30/1000, Loss: 0.6442
Epoch 40/1000, Loss: 0.6245
Epoch 50/1000, Loss: 0.6506
Epoch 60/1000, Loss: 0.6331
Epoch 70/1000, Loss: 0.6260
Epoch 80/1000, Loss: 0.6529
Epoch 90/1000, Loss: 0.6989
Epoch 100/1000, Loss: 0.6570
Epoch 110/1000, Loss: 0.7361
Epoch 120/1000, Loss: 0.7242
Epoch 130/1000, Loss: 0.6800
Epoch 140/1000, Loss: 0.6696
Epoch 150/1000, Loss: 0.6349
Epoch 160/1000, Loss: 0.6712
Epoch 170/1000, Loss: 0.6841
Epoch 180/1000, Loss: 0.6536
Epoch 190/1000, Loss: 0.6698
Epoch 200/1000, Loss: 0.6683
Epoch 210/1000, Loss: 0.6883
Epoch 220/1000, Loss: 0.6783
Epoch 230/1000, Loss: 0.7249
Epoch 240/1000, Loss: 1.1553
Epoch 250/1000, Loss: 0.7067
Epoch 260/1000, Loss: 0.6221
Epoch 270/1000, Loss: 0.7414
Epoch 280/1000, Loss: 0.6811
Epoch 290/1000, Loss: 0.6840
Epoch 300/1000, Loss: 0.6505
Epoch 310/1000, Loss: 0.6704
Epoch 320/1000, Loss: 0.7273
Epoch 330/1000, Loss: 0.6438
Epoch 340/1

more advanced optimization

In [14]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import MessagePassing
from sklearn.metrics import accuracy_score, classification_report
import numpy as np


# Enhanced GCN Model with improved structure and handling
class EnhancedGCNWithEdgeFeatures(MessagePassing):
    def __init__(self, in_channels, edge_channels, hidden_channels=128, num_layers=3, dropout_prob=0.5):
        super(EnhancedGCNWithEdgeFeatures, self).__init__(aggr='add')
        self.num_layers = num_layers

        self.node_projection = torch.nn.Linear(in_channels, hidden_channels)
        self.edge_projection = torch.nn.Linear(edge_channels, hidden_channels)

        self.node_transforms = torch.nn.ModuleList([
            torch.nn.Sequential(
                torch.nn.Linear(hidden_channels, hidden_channels),
                torch.nn.LayerNorm(hidden_channels),
                torch.nn.ReLU(),
                torch.nn.Dropout(dropout_prob)
            ) for _ in range(num_layers)
        ])

        self.edge_transforms = torch.nn.ModuleList([
            torch.nn.Sequential(
                torch.nn.Linear(hidden_channels, hidden_channels),
                torch.nn.LayerNorm(hidden_channels),
                torch.nn.ReLU(),
                torch.nn.Dropout(dropout_prob)
            ) for _ in range(num_layers)
        ])

        self.attention = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels * 2, hidden_channels),
            torch.nn.Tanh(),
            torch.nn.Linear(hidden_channels, 1)
        )

        self.skip_transforms = torch.nn.ModuleList([
            torch.nn.Linear(hidden_channels, hidden_channels)
            for _ in range(num_layers - 1)
        ])

        self.final_layers = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels, hidden_channels // 2),
            torch.nn.LayerNorm(hidden_channels // 2),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout_prob),
            torch.nn.Linear(hidden_channels // 2, 1)
        )

    def forward(self, x, edge_index, edge_attr):
        x = self.node_projection(x)
        edge_attr = self.edge_projection(edge_attr)
        previous_layer = None

        for i in range(self.num_layers):
            current_x = self.node_transforms[i](x if i == 0 else previous_layer)
            if i > 0:
                current_x = current_x + self.skip_transforms[i - 1](previous_layer)
            current_x = self.propagate(
                edge_index,
                x=current_x,
                edge_attr=self.edge_transforms[i](edge_attr)
            )
            previous_layer = current_x

        return torch.sigmoid(self.final_layers(current_x))

    def message(self, x_i, x_j, edge_attr):
        attention_input = torch.cat([x_i, x_j], dim=-1)
        attention_weights = torch.softmax(self.attention(attention_input), dim=-1)
        combined_features = x_j + edge_attr
        return attention_weights * combined_features


# Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")

# Assuming node_features, edge_index, edge_features, and y_node are already prepared
node_features = node_features.to(device)
edge_index = graph_data.edge_index.to(device)
edge_features = edge_features.to(device)
y_node = y_node.to(device)

# Initialize model
in_channels = node_features.shape[1]
edge_channels = edge_features.shape[1]
model = EnhancedGCNWithEdgeFeatures(
    in_channels=in_channels,
    edge_channels=edge_channels,
    hidden_channels=128,
    num_layers=3,
    dropout_prob=0.5
).to(device)

# Training setup
optimizer = torch.optim.AdamW(model.parameters(), lr=0.01, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, verbose=True)

# Early stopping setup
patience = 20
best_loss = float('inf')
early_stop_counter = 0

# Training loop
num_epochs = 1000
train_losses, val_losses = [], []

print("\nStarting training...")
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    out = model(node_features, edge_index, edge_features)
    train_loss = F.binary_cross_entropy(out.squeeze(), y_node)
    train_loss.backward()
    optimizer.step()

    # Validation (simulated here; replace with actual validation data)
    model.eval()
    with torch.no_grad():
        val_out = model(node_features, edge_index, edge_features)  # Replace with validation data
        val_loss = F.binary_cross_entropy(val_out.squeeze(), y_node)  # Replace y_node with validation labels

    train_losses.append(train_loss.item())
    val_losses.append(val_loss.item())

    # Scheduler and early stopping
    scheduler.step(val_loss)
    if val_loss < best_loss:
        best_loss = val_loss
        early_stop_counter = 0
        torch.save(model.state_dict(), 'best_model.pth')  # Save the best model
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print(f"\nEarly stopping triggered at epoch {epoch + 1}")
            break

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss.item():.4f}, Val Loss: {val_loss.item():.4f}")

# Load the best model
model.load_state_dict(torch.load('best_model.pth'))

# Evaluation
print("\nEvaluating model...")
model.eval()
with torch.no_grad():
    pred = model(node_features, edge_index, edge_features)
    pred_labels = (pred.squeeze() > 0.5).float()
    accuracy = accuracy_score(y_node.cpu(), pred_labels.cpu())
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_node.cpu(), pred_labels.cpu()))


Using device: cpu

Starting training...




Epoch 10/1000, Train Loss: 0.6026, Val Loss: 0.5185
Epoch 20/1000, Train Loss: 0.4443, Val Loss: 0.4999
Epoch 30/1000, Train Loss: 0.5010, Val Loss: 0.4794
Epoch 40/1000, Train Loss: 0.4827, Val Loss: 0.4789
Epoch 50/1000, Train Loss: 0.5706, Val Loss: 0.4571
Epoch 60/1000, Train Loss: 0.4620, Val Loss: 0.4260
Epoch 70/1000, Train Loss: 0.4308, Val Loss: 0.4351
Epoch 80/1000, Train Loss: 0.4321, Val Loss: 0.4286

Early stopping triggered at epoch 84

Evaluating model...
Accuracy: 0.7500

Classification Report:
              precision    recall  f1-score   support

         0.0       0.67      0.93      0.78        15
         1.0       0.91      0.59      0.71        17

    accuracy                           0.75        32
   macro avg       0.79      0.76      0.75        32
weighted avg       0.80      0.75      0.74        32



  model.load_state_dict(torch.load('best_model.pth'))


# perfect now lets perform hyperparamaeter tuning

In [4]:
import torch
import numpy as np
from torch_geometric.data import Data
from torch_geometric.nn import MessagePassing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load and prepare the graph data
print("Loading graph data...")
graph_data = torch.load('graph_data_new_topology.pt')
print("Graph data loaded successfully!")

Loading graph data...


  graph_data = torch.load('graph_data_new_topology.pt')


Graph data loaded successfully!


In [12]:
# Step 2: Clip the graph data to the first million data points
num_data_points = 1_000_000  # Define the limit for the data points
graph_data.y_leak_detection = graph_data.y_leak_detection[:num_data_points]
graph_data.y_location_1 = graph_data.y_location_1[:num_data_points]
graph_data.y_location_2 = graph_data.y_location_2[:num_data_points]
graph_data.node_features = graph_data.node_features[:, :num_data_points]
graph_data.edge_features = graph_data.edge_features[:, :num_data_points]

# Verify the updated data shape
print("Updated graph data structure:")
print(f"y_leak_detection: {graph_data.y_leak_detection.shape}")
print(f"y_location_1: {graph_data.y_location_1.shape}")
print(f"y_location_2: {graph_data.y_location_2.shape}")
print(f"node_features: {graph_data.node_features.shape}")
print(f"edge_features: {graph_data.edge_features.shape}")

# Step 3: Reshape node and edge features for GCN
def reshape_for_gcn(data, window_size=20, stride=10):
    """
    Reshape time-series data for GCN format. 
    Returns reshaped data with shape [num_elements, num_windows].
    """
    num_elements, num_timesteps = data.shape
    num_windows = (num_timesteps - window_size) // stride + 1
    
    reshaped_data = torch.zeros((num_elements, num_windows))
    for i in range(num_windows):
        start_idx = i * stride
        end_idx = start_idx + window_size
        reshaped_data[:, i] = data[:, start_idx:end_idx].mean(dim=1)
    
    return reshaped_data

# Reshape node and edge features
print("\nReshaping data for GCN...")
node_features = reshape_for_gcn(graph_data.node_features, window_size=20, stride=10)
edge_features = reshape_for_gcn(graph_data.edge_features, window_size=20, stride=10)
print(f"Reshaped node features: {node_features.shape}")
print(f"Reshaped edge features: {edge_features.shape}")

# Step 4: Aggregate labels for nodes using majority voting
def aggregate_labels(y_leak_detection, num_nodes, threshold=0.25):
    """
    Aggregate time-series labels for each node using majority voting.
    """
    y_reshaped = y_leak_detection.view(num_nodes, -1)
    node_labels = (y_reshaped.float().mean(dim=1) > threshold).float()
    return node_labels

y_node = aggregate_labels(graph_data.y_leak_detection, node_features.shape[0])
print(f"Node labels shape: {y_node.shape}")

Updated graph data structure:
y_leak_detection: torch.Size([1000000])
y_location_1: torch.Size([1000000])
y_location_2: torch.Size([1000000])
node_features: torch.Size([32, 1000000])
edge_features: torch.Size([34, 1000000])

Reshaping data for GCN...
Reshaped node features: torch.Size([32, 99999])
Reshaped edge features: torch.Size([34, 99999])
Node labels shape: torch.Size([32])


In [16]:
graph_data.y_leak_detection.view(node_features.shape[0], -1).shape

torch.Size([32, 31250])

In [17]:
1000000 / 32

31250.0

In [2]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import MessagePassing
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import optuna

# Enhanced GCN Model with improved structure and handling
class EnhancedGCNWithEdgeFeatures(MessagePassing):
    def __init__(self, in_channels, edge_channels, hidden_channels=128, num_layers=3, dropout_prob=0.5):
        super(EnhancedGCNWithEdgeFeatures, self).__init__(aggr='add')
        self.num_layers = num_layers

        self.node_projection = torch.nn.Linear(in_channels, hidden_channels)
        self.edge_projection = torch.nn.Linear(edge_channels, hidden_channels)

        self.node_transforms = torch.nn.ModuleList([
            torch.nn.Sequential(
                torch.nn.Linear(hidden_channels, hidden_channels),
                torch.nn.LayerNorm(hidden_channels),
                torch.nn.ReLU(),
                torch.nn.Dropout(dropout_prob)
            ) for _ in range(num_layers)
        ])

        self.edge_transforms = torch.nn.ModuleList([
            torch.nn.Sequential(
                torch.nn.Linear(hidden_channels, hidden_channels),
                torch.nn.LayerNorm(hidden_channels),
                torch.nn.ReLU(),
                torch.nn.Dropout(dropout_prob)
            ) for _ in range(num_layers)
        ])

        self.attention = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels * 2, hidden_channels),
            torch.nn.Tanh(),
            torch.nn.Linear(hidden_channels, 1)
        )

        self.skip_transforms = torch.nn.ModuleList([
            torch.nn.Linear(hidden_channels, hidden_channels)
            for _ in range(num_layers - 1)
        ])

        self.final_layers = torch.nn.Sequential(
            torch.nn.Linear(hidden_channels, hidden_channels // 2),
            torch.nn.LayerNorm(hidden_channels // 2),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout_prob),
            torch.nn.Linear(hidden_channels // 2, 1)
        )

    def forward(self, x, edge_index, edge_attr):
        x = self.node_projection(x)
        edge_attr = self.edge_projection(edge_attr)
        previous_layer = None

        for i in range(self.num_layers):
            current_x = self.node_transforms[i](x if i == 0 else previous_layer)
            if i > 0:
                current_x = current_x + self.skip_transforms[i - 1](previous_layer)
            current_x = self.propagate(
                edge_index,
                x=current_x,
                edge_attr=self.edge_transforms[i](edge_attr)
            )
            previous_layer = current_x

        return torch.sigmoid(self.final_layers(current_x))

    def message(self, x_i, x_j, edge_attr):
        attention_input = torch.cat([x_i, x_j], dim=-1)
        attention_weights = torch.softmax(self.attention(attention_input), dim=-1)
        combined_features = x_j + edge_attr
        return attention_weights * combined_features


# Objective function for Optuna optimization
def objective(trial):
    # Define hyperparameters to optimize
    hidden_channels = trial.suggest_int('hidden_channels', 64, 256, step=64)
    num_layers = trial.suggest_int('num_layers', 2, 5)
    dropout_prob = trial.suggest_float('dropout_prob', 0.2, 0.5)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)
    weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-2)

    # Initialize model
    model = EnhancedGCNWithEdgeFeatures(
        in_channels=node_features.shape[1],
        edge_channels=edge_features.shape[1],
        hidden_channels=hidden_channels,
        num_layers=num_layers,
        dropout_prob=dropout_prob
    ).to(device)

    # Optimizer and scheduler setup
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, verbose=True)

    # Early stopping setup
    patience = 20
    best_loss = float('inf')
    early_stop_counter = 0

    # Training loop
    num_epochs = 100
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        out = model(node_features, edge_index, edge_features)
        train_loss = F.binary_cross_entropy(out.squeeze(), y_node)
        train_loss.backward()
        optimizer.step()

        # Validation (simulated here; replace with actual validation data)
        model.eval()
        with torch.no_grad():
            val_out = model(node_features, edge_index, edge_features)  # Replace with validation data
            val_loss = F.binary_cross_entropy(val_out.squeeze(), y_node)  # Replace y_node with validation labels

        # Scheduler and early stopping
        scheduler.step(val_loss)
        if val_loss < best_loss:
            best_loss = val_loss
            early_stop_counter = 0
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                break

    # Return the validation loss for Optuna optimization
    return best_loss


# Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")

# Assuming node_features, edge_index, edge_features, and y_node are already prepared
node_features = node_features.to(device)
edge_index = graph_data.edge_index.to(device)
edge_features = edge_features.to(device)
y_node = y_node.to(device)

# Optuna hyperparameter tuning
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Print the best hyperparameters
print("Best hyperparameters: ", study.best_params)

# Initialize model with the best hyperparameters
best_params = study.best_params
model = EnhancedGCNWithEdgeFeatures(
    in_channels=node_features.shape[1],
    edge_channels=edge_features.shape[1],
    hidden_channels=best_params['hidden_channels'],
    num_layers=best_params['num_layers'],
    dropout_prob=best_params['dropout_prob']
).to(device)

# Final training with best hyperparameters
optimizer = torch.optim.AdamW(model.parameters(), lr=best_params['learning_rate'], weight_decay=best_params['weight_decay'])
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, verbose=True)

# Early stopping setup
patience = 20
best_loss = float('inf')
early_stop_counter = 0

# Training loop
num_epochs = 1000
train_losses, val_losses = [], []

print("\nStarting training...")
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    out = model(node_features, edge_index, edge_features)
    train_loss = F.binary_cross_entropy(out.squeeze(), y_node)
    train_loss.backward()
    optimizer.step()

    # Validation (simulated here; replace with actual validation data)
    model.eval()
    with torch.no_grad():
        val_out = model(node_features, edge_index, edge_features)  # Replace with validation data
        val_loss = F.binary_cross_entropy(val_out.squeeze(), y_node)  # Replace y_node with validation labels

    train_losses.append(train_loss.item())
    val_losses.append(val_loss.item())

    # Scheduler and early stopping
    scheduler.step(val_loss)
    if val_loss < best_loss:
        best_loss = val_loss
        early_stop_counter = 0
        torch.save(model.state_dict(), 'best_model.pth')  # Save the best model
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print(f"\nEarly stopping triggered at epoch {epoch + 1}")
            break

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss.item():.4f}, Val Loss: {val_loss.item():.4f}")

# Load the best model
model.load_state_dict(torch.load('best_model.pth'))

# Evaluation
print("\nEvaluating model...")
model.eval()
with torch.no_grad():
    pred = model(node_features, edge_index, edge_features)
    pred_labels = (pred.squeeze() > 0.5).float()
    accuracy = accuracy_score(y_node.cpu(), pred_labels.cpu())
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_node.cpu(), pred_labels.cpu()))

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-01-15 16:33:08,986] A new study created in memory with name: no-name-e7849f93-e607-4ea6-b32c-cb9cfd24b468
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-5, 1e-2)



Using device: cuda


[I 2025-01-15 16:33:10,749] Trial 0 finished with value: 0.42835503816604614 and parameters: {'hidden_channels': 128, 'num_layers': 2, 'dropout_prob': 0.2340553769186014, 'learning_rate': 0.0003653196180549422, 'weight_decay': 1.6177545066464346e-05}. Best is trial 0 with value: 0.42835503816604614.
[I 2025-01-15 16:33:11,707] Trial 1 finished with value: 0.32135555148124695 and parameters: {'hidden_channels': 64, 'num_layers': 4, 'dropout_prob': 0.26259500016281323, 'learning_rate': 0.0007833955064294884, 'weight_decay': 0.003855047514345294}. Best is trial 1 with value: 0.32135555148124695.
[I 2025-01-15 16:33:12,887] Trial 2 finished with value: 0.4708130955696106 and parameters: {'hidden_channels': 192, 'num_layers': 3, 'dropout_prob': 0.3510274381452122, 'learning_rate': 1.3437554034674018e-05, 'weight_decay': 6.691516311445385e-05}. Best is trial 1 with value: 0.32135555148124695.
[I 2025-01-15 16:33:14,230] Trial 3 finished with value: 0.3552675247192383 and parameters: {'hidden

Best hyperparameters:  {'hidden_channels': 256, 'num_layers': 5, 'dropout_prob': 0.29322510010713654, 'learning_rate': 0.00047694845293874173, 'weight_decay': 1.807500817650948e-05}

Starting training...
Epoch 10/1000, Train Loss: 0.4862, Val Loss: 0.4335
Epoch 20/1000, Train Loss: 0.3338, Val Loss: 0.3449
Epoch 30/1000, Train Loss: 0.3743, Val Loss: 0.2774
Epoch 40/1000, Train Loss: 0.3263, Val Loss: 0.3012
Epoch 50/1000, Train Loss: 0.2323, Val Loss: 0.2254
Epoch 60/1000, Train Loss: 0.2097, Val Loss: 0.2141
Epoch 70/1000, Train Loss: 0.3080, Val Loss: 0.2086
Epoch 80/1000, Train Loss: 0.2222, Val Loss: 0.2110
Epoch 90/1000, Train Loss: 0.1938, Val Loss: 0.1909
Epoch 100/1000, Train Loss: 0.2805, Val Loss: 0.2117
Epoch 110/1000, Train Loss: 0.2021, Val Loss: 0.1913
Epoch 120/1000, Train Loss: 0.2560, Val Loss: 0.1813
Epoch 130/1000, Train Loss: 0.2017, Val Loss: 0.1720
Epoch 140/1000, Train Loss: 0.2074, Val Loss: 0.1697
Epoch 150/1000, Train Loss: 0.1944, Val Loss: 0.1721
Epoch 160/

  model.load_state_dict(torch.load('best_model.pth'))


now training the model with the percet hyperparametes


visualize it