# In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from torch_geometric.datasets import AirfRANS
from torch_geometric.transforms import Compose, KNNGraph, RadiusGraph
from sklearn.preprocessing import StandardScaler
import os # For checking DATA_ROOT

# --- Dataset root directory (adjust if you mounted Google Drive) ---
# DATA_ROOT is passed as `data_root` to load_and_preprocess_airfrans() further down.
# For Colab, a local path is usually fine, but note that it is temporary storage.
# If you mount Google Drive for persistence, point this path at the mounted folder instead.
DATA_ROOT = './data/AirfRANS'

# --- Custom Normalization Class (from previous steps) ---
class CustomNormalize:
    """Callable transform that rescales ``data.x`` and ``data.y`` with fitted scalers.

    Each scaler only needs a ``transform(array)`` method that accepts the NumPy
    view of a tensor and returns an array-like result (e.g. a fitted
    ``StandardScaler``).
    """

    def __init__(self, x_scaler, y_scaler):
        """Store the scaler used for ``data.x`` and the one used for ``data.y``."""
        self.x_scaler = x_scaler
        self.y_scaler = y_scaler

    @staticmethod
    def _rescale(scaler, tensor):
        """Run ``tensor`` through ``scaler.transform`` and wrap the result as float32."""
        # The scaler works on NumPy arrays, so move to CPU and convert first.
        as_array = tensor.cpu().numpy()
        return torch.tensor(scaler.transform(as_array), dtype=torch.float)

    def __call__(self, data):
        """Overwrite ``data.x`` and ``data.y`` in place and return the same object."""
        data.x = self._rescale(self.x_scaler, data.x)
        data.y = self._rescale(self.y_scaler, data.y)
        return data

# --- Data Loading and Preprocessing Function (from previous steps) ---
# Added U_infinity_val return for later use in Cp calculation.
# We'll stick to KNNGraph (k=15) for analysis, based on our previous discussion to skip problematic RadiusGraph.
def load_and_preprocess_airfrans(graph_transform_type='knn', k=15, r=0.05, data_root='./data/AirfRANS'):
    """Load the AirfRANS 'full' task with graph construction and feature scaling.

    The training split is first loaded without any transform so that two
    ``StandardScaler`` objects (one for ``x``, one for ``y``) can be fitted
    incrementally. The train and test splits are then re-created with a
    transform pipeline of ``[graph transform, CustomNormalize]``.

    Args:
        graph_transform_type: ``'knn'`` to build edges with ``KNNGraph(k=k)``
            or ``'radius'`` to build them with ``RadiusGraph(r=r)``.
        k: Number of neighbours; only used when ``graph_transform_type == 'knn'``.
        r: Connection radius; only used when ``graph_transform_type == 'radius'``.
        data_root: Root directory handed to ``AirfRANS``.

    Returns:
        A 5-tuple ``(train_dataset, test_dataset, x_scaler, y_scaler, U_infinity)``.
        ``U_infinity`` is the Euclidean norm of the first two feature columns
        of node 0 in the first raw training sample.
        NOTE(review): those columns are presumably the inlet velocity
        components, and the value describes that one sample only - confirm
        before using it for Cp on other samples.

    Raises:
        ValueError: If ``graph_transform_type`` is neither ``'knn'`` nor ``'radius'``.
    """
    print(f"Loading AirfRANS dataset with {graph_transform_type} graph transform...")

    # Choose the edge-construction transform; anything else is rejected before
    # any dataset is loaded.
    if graph_transform_type == 'knn':
        edge_builder = KNNGraph(k=k)
        builder_note = f"Using KNNGraph with k={k}"
    elif graph_transform_type == 'radius':
        # Kept for completeness of the signature; the radius graph has been
        # avoided so far because of resource issues.
        edge_builder = RadiusGraph(r=r)
        builder_note = f"Using RadiusGraph with r={r}"
    else:
        raise ValueError("graph_transform_type must be 'knn' or 'radius'")
    print(builder_note)

    # Untransformed training split: source for U_infinity and for scaler fitting.
    raw_train = AirfRANS(root=data_root, task='full', train=True, transform=None)

    # Pragmatic approximation: read the two leading feature values of the very
    # first node of the very first sample and take their magnitude.
    head_sample = raw_train[0]
    avg_inlet_vx = head_sample.x[0, 0].item()
    avg_inlet_vy = head_sample.x[0, 1].item()
    U_infinity = np.sqrt(avg_inlet_vx**2 + avg_inlet_vy**2)
    print(f"Approximated U_infinity from first sample: {U_infinity:.4f}")

    x_scaler = StandardScaler()
    y_scaler = StandardScaler()

    print("Fitting scalers incrementally using partial_fit()...")
    raw_count = len(raw_train)
    for seen, sample in enumerate(raw_train, start=1):
        # partial_fit updates the scaler statistics one sample at a time, so
        # the node features of all samples never need to be stacked together.
        x_scaler.partial_fit(sample.x.numpy())
        y_scaler.partial_fit(sample.y.numpy())
        if seen % 100 == 0 or seen == raw_count:
            print(f"  Processed {seen}/{raw_count} samples for scaler fitting.")

    normalizer = CustomNormalize(x_scaler, y_scaler)

    # Both splits share the scalers fitted on the training split; each gets
    # its own Compose pipeline (graph construction first, then normalization).
    train_dataset_preprocessed, test_dataset_preprocessed = (
        AirfRANS(
            root=data_root,
            task='full',
            train=is_train,
            transform=Compose([edge_builder, normalizer]),
        )
        for is_train in (True, False)
    )

    print(f"Loaded {len(train_dataset_preprocessed)} training samples after preprocessing.")
    print(f"Loaded {len(test_dataset_preprocessed)} test samples after preprocessing.")

    return train_dataset_preprocessed, test_dataset_preprocessed, x_scaler, y_scaler, U_infinity

# --- In-depth Data Analysis - Part 1: Initial Setup and Dataset Statistics ---

print("\n--- Starting In-depth Data Analysis - Part 1: Initial Setup and Dataset Statistics ---")

# Load data with KNNGraph (k=15 chosen as a reasonable default for analysis).
# The returned datasets carry the graph-construction + normalization transform;
# only the training split and its companions are kept here, the test split is discarded.
train_dataset_analysis, _, x_scaler_analysis, y_scaler_analysis, U_infinity_analysis = load_and_preprocess_airfrans(graph_transform_type='knn', k=15, data_root=DATA_ROOT)

### 1. Dataset Statistics
print("\n### 1. Dataset Statistics ###")

# Collect every per-graph statistic in a SINGLE pass over the dataset.
# The dataset transform (KNN graph construction + normalization) is applied
# each time a sample is accessed, so iterating once instead of three times
# avoids rebuilding every graph for each statistic.
total_nodes = 0
total_edges = 0
min_nodes = float('inf')
max_nodes = float('-inf')
for data in train_dataset_analysis:
    total_nodes += data.num_nodes
    total_edges += data.num_edges
    min_nodes = min(min_nodes, data.num_nodes)
    max_nodes = max(max_nodes, data.num_nodes)

avg_nodes_per_graph = total_nodes / len(train_dataset_analysis)
avg_edges_per_graph = total_edges / len(train_dataset_analysis)

print(f"Total training samples: {len(train_dataset_analysis)}")
print(f"Total nodes across all training graphs: {total_nodes}")
print(f"Total edges across all training graphs: {total_edges}")
print(f"Average nodes per graph: {avg_nodes_per_graph:.2f}")
print(f"Average edges per graph: {avg_edges_per_graph:.2f}")
# Edges divided by nodes gives the mean number of edges per node.
print(f"Average node degree (overall): {avg_edges_per_graph / avg_nodes_per_graph:.2f} (This should be 'k' if KNNGraph is used)")

# Smallest and largest graph sizes, gathered in the same pass as the totals.
print(f"Min nodes per graph: {min_nodes}")
print(f"Max nodes per graph: {max_nodes}")

print("\n--- Part 1: Initial Setup and Dataset Statistics Complete ---")
print("You should now see basic statistics about your AirfRANS dataset.")