In [1]:
import joblib
import numpy as np
import os
import pandas as pd
import torch
import torch.nn as nn

from pathlib import Path
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer, StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from torch_geometric.data import Data
from tqdm import tqdm

torch.set_printoptions(sci_mode=False)

DATASET_PATH = r"E:\gnn_data\processed_step_data_full_node_features"

In [28]:
pipeline = Pipeline([
    ("log", FunctionTransformer(np.log1p, validate=True)),
    ("scale", StandardScaler())
])
pipeline_r = Pipeline([
    ("log", FunctionTransformer(np.log1p, validate=True)),
    ("scale", RobustScaler())
])
data = pd.read_csv(r"./data/synced_dataset_final.csv")

In [29]:
features = [
    "faces", "edges", "vertices", "quantity",
    "height", "width", "depth", "volume", "area",
    "bbox_height", "bbox_width", "bbox_depth", "bbox_volume",
    "bbox_area",
]
X = data[features].copy()
X_scaled = pipeline.fit_transform(X)
scaled_df = pd.DataFrame(X_scaled, columns=features)

X_scaled_r = pipeline_r.fit_transform(X)
scaled_df_r = pd.DataFrame(X_scaled_r, columns=features)

In [26]:
scaled_df.describe()

Unnamed: 0,faces,edges,vertices,quantity,height,width,depth,volume,area,bbox_height,bbox_width,bbox_depth,bbox_volume,bbox_area
count,63097.0,63097.0,63097.0,63097.0,63097.0,63097.0,63097.0,63097.0,63097.0,63097.0,63097.0,63097.0,63097.0,63097.0
mean,-1.234218e-16,3.162122e-16,-6.297217e-16,1.801779e-16,1.945921e-16,1.594574e-16,-2.333304e-16,-2.2972680000000002e-17,-1.927903e-16,1.094581e-16,6.720635e-16,-7.567471000000001e-17,3.040502e-17,-7.567471000000001e-17
std,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008
min,-2.787949,-3.000279,-3.018315,-0.6360161,-3.11937,-3.276087,-3.258024,-4.271474,-5.221088,-3.471897,-3.549728,-3.055757,-4.42995,-3.055757
25%,-0.6849822,-0.6546401,-0.6548404,-0.6360161,-0.6060144,-0.6956969,-0.7059594,-0.5814437,-0.5868495,-0.7020979,-0.6088806,-0.6112691,-0.5722838,-0.6112691
50%,-0.1343924,-0.09216015,-0.08796461,-0.3544402,-0.05200321,0.03966434,-0.007311508,-0.01287707,-0.03127788,0.005758808,0.07637761,-0.04075629,0.03674235,-0.04075629
75%,0.6034417,0.6281141,0.6242656,0.2339673,0.6866943,0.6864316,0.6762617,0.6666359,0.6564459,0.6539315,0.6303359,0.6876625,0.6659182,0.6876625
max,3.386978,3.340234,3.701113,5.250487,3.982437,3.722163,3.688466,3.172017,3.941093,3.549163,3.461221,3.808779,4.305456,3.808779


In [31]:
scaled_df_r.describe()

Unnamed: 0,faces,edges,vertices,quantity,height,width,depth,volume,area,bbox_height,bbox_width,bbox_depth,bbox_volume,bbox_area
count,63097.0,63097.0,63097.0,63097.0,63097.0,63097.0,63097.0,63097.0,63097.0,63097.0,63097.0,63097.0,63097.0,63097.0
mean,0.104308,0.071846,0.06877,0.40741,0.040228,-0.028698,0.00529,0.010318,0.025157,-0.004247,-0.061634,0.031377,-0.029674,0.031377
std,0.776148,0.779579,0.781802,1.149456,0.773576,0.723527,0.723479,0.801237,0.80432,0.737453,0.806968,0.76987,0.807629,0.76987
min,-2.059537,-2.26709,-2.290936,-0.323657,-2.372821,-2.399018,-2.351804,-3.412119,-4.174237,-2.564587,-2.926128,-2.321139,-3.607402,-2.321139
25%,-0.427336,-0.438494,-0.443181,-0.323657,-0.428566,-0.53205,-0.505453,-0.455553,-0.446854,-0.522007,-0.552977,-0.439217,-0.491863,-0.439217
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.572664,0.561506,0.556819,0.676343,0.571434,0.46795,0.494547,0.544447,0.553146,0.477993,0.447023,0.560783,0.508137,0.560783
max,2.733084,2.6758,2.962285,6.442568,3.12092,2.664368,2.673796,2.551835,3.195034,2.613073,2.731438,2.963616,3.44751,2.963616


In [19]:
data["features"] = scaled_df.values.tolist()

In [21]:
non_feature_cols = data.drop(columns=features)
new_df = pd.concat([non_feature_cols, scaled_df], axis=1)
new_df

Unnamed: 0,item_id,step_file,technology_id,material_id,post_processing_id,download_file_url,technology_name,is_cnc,multiclass_labels,graphml_file,...,height,width,depth,volume,area,bbox_height,bbox_width,bbox_depth,bbox_volume,bbox_area
0,100035,E:\gnn_data\step_files\100035_MakerVerse_Sampl...,feb0f26f-94a5-4be2-9d40-761bb2857ab6,813,[1000],https://prod-mv-user-files-upload.s3.eu-centra...,CNC (Metal),1,2,E:\gnn_data\graphml_files\100035_MakerVerse_Sa...,...,1.100018,0.944946,0.903087,1.299111,1.311256,1.045717,0.655937,1.452008,1.425970,1.452008
1,100036,E:\gnn_data\step_files\100036_MakerVerse_Sampl...,feb0f26f-94a5-4be2-9d40-761bb2857ab6,813,[1000],https://prod-mv-user-files-upload.s3.eu-centra...,CNC (Metal),1,2,E:\gnn_data\graphml_files\100036_MakerVerse_Sa...,...,1.100018,0.944946,0.903087,1.299111,1.311256,1.045717,0.655937,1.452008,1.425970,1.452008
2,100040,E:\gnn_data\step_files\100040_00048125_Stator_...,feb0f26f-94a5-4be2-9d40-761bb2857ab6,813,[1000],https://prod-mv-user-files-upload.s3.eu-centra...,CNC (Metal),1,2,E:\gnn_data\graphml_files\100040_00048125_Stat...,...,1.100018,0.944946,0.903087,1.299111,1.311256,1.045717,0.655937,1.452008,1.425970,1.452008
3,100041,E:\gnn_data\step_files\100041_00048495_Table_L...,feb0f26f-94a5-4be2-9d40-761bb2857ab6,1311,[277],https://prod-mv-user-files-upload.s3.eu-centra...,CNC (Metal),1,2,E:\gnn_data\graphml_files\100041_00048495_Tabl...,...,0.773427,0.040955,0.092603,0.661516,0.190447,0.166351,-0.245134,1.103236,0.465420,1.103236
4,100042,E:\gnn_data\step_files\100042_00048569_Seal_Ho...,feb0f26f-94a5-4be2-9d40-761bb2857ab6,1315,[277],https://prod-mv-user-files-upload.s3.eu-centra...,CNC (Metal),1,2,E:\gnn_data\graphml_files\100042_00048569_Seal...,...,0.385735,-0.103938,0.199120,-0.009061,0.102957,0.281921,-0.389559,0.689213,0.266657,0.689213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63286,99989,E:\gnn_data\step_files\99989_MakerVerse_Sample...,feb0f26f-94a5-4be2-9d40-761bb2857ab6,804,[277],https://prod-mv-user-files-upload.s3.eu-centra...,CNC (Metal),1,2,E:\gnn_data\graphml_files\99989_MakerVerse_Sam...,...,1.100018,0.944946,0.903087,1.299111,1.311256,1.045717,0.655937,1.452008,1.425970,1.452008
63287,99990,E:\gnn_data\step_files\99990_240332A01.step,feb0f26f-94a5-4be2-9d40-761bb2857ab6,1301,[277],https://prod-mv-user-files-upload.s3.eu-centra...,CNC (Metal),1,2,E:\gnn_data\graphml_files\99990_240332A01.graphml,...,0.695243,0.448623,0.227919,0.751649,0.331204,0.313167,0.161217,1.019742,0.687079,1.019742
63288,99991,E:\gnn_data\step_files\99991_MakerVerse_Sample...,feb0f26f-94a5-4be2-9d40-761bb2857ab6,813,[1000],https://prod-mv-user-files-upload.s3.eu-centra...,CNC (Metal),1,2,E:\gnn_data\graphml_files\99991_MakerVerse_Sam...,...,1.100018,0.944946,0.903087,1.299111,1.311256,1.045717,0.655937,1.452008,1.425970,1.452008
63289,99993,E:\gnn_data\step_files\99993_MakerVerse_Sample...,feb0f26f-94a5-4be2-9d40-761bb2857ab6,813,[1000],https://prod-mv-user-files-upload.s3.eu-centra...,CNC (Metal),1,2,E:\gnn_data\graphml_files\99993_MakerVerse_Sam...,...,1.100018,0.944946,0.903087,1.299111,1.311256,1.045717,0.655937,1.452008,1.425970,1.452008


In [22]:
new_df.to_csv(r"./data/synced_dataset_final_scaled.csv", index=False)

In [39]:
data = torch.load(
    r"E:\gnn_data\processed_step_data_global_features\100044_00048893_Fixture1_step_001.pt",
    weights_only=False)

In [37]:
data["global_features"] = [1,2,3,4,5]

In [41]:
data.x

tensor([[  228.0000,    21.0000,     0.0297,  ...,     0.0000,     0.0006,
             2.0952],
        [  287.0000,     2.0000,     0.0028,  ...,     0.0021,     0.0011,
             0.0000],
        [   17.0000,     1.0000,     0.0014,  ...,     0.0000,     0.0006,
             0.0000],
        ...,
        [  146.0000,     1.0000,     0.0014,  ...,     0.0041,     0.0015,
             0.0000],
        [  146.0000,     1.0000,     0.0014,  ...,     0.0041,     0.0015,
             0.0000],
        [  146.0000,     1.0000,     0.0014,  ...,     0.0041,     0.0015,
             0.0000]])

In [25]:
min_linear_features = torch.full((4,), float('inf'))
max_linear_features = torch.full((4,), float('-inf'))

linear_features = data.x[:, 2:6]
max_linear_features = torch.max(max_linear_features,
                                linear_features.max(dim=0).values)
min_linear_features = torch.min(min_linear_features,
                                linear_features.min(dim=0).values)
max_linear_features, min_linear_features
# max_linear_features.shape, linear_features.max(dim=0).values.shape

(tensor([    0.0262,     0.0000,     0.0000,     0.0000]),
 tensor([    0.0000,     0.0000,     0.0000,     0.0000]))

In [27]:
def get_global_min_max(dataset_path):
    # Initialize min and max for log-transformed features (Node Degree, Avg Neighbor Degree)
    # These are features at index 0 and 5
    min_log_features = torch.full((2,), float('inf'))
    max_log_features = torch.full((2,), float('-inf'))

    # Initialize min and max for linearly scaled features (Degree Centrality, Betweenness, Closeness, PageRank)
    # These are features at index 1, 2, 3, 4
    min_linear_features = torch.full((4,), float('inf'))
    max_linear_features = torch.full((4,), float('-inf'))
    processed_files_count = 0

    for filename in tqdm(os.listdir(dataset_path)):
        if filename.endswith(".pt"):
            try:
                file_path = os.path.join(dataset_path, filename)
                data = torch.load(file_path, weights_only=False)
                if hasattr(data, "x") and data.x is not None:
                    node_degree = data.x[:,
                                  1:2]  # Assuming first column is node degree
                    avg_neighbor_degree = data.x[:, 6: 7]
                    log_node_degree = torch.log1p(node_degree)
                    log_avg_neighbor_degree = torch.log1p(avg_neighbor_degree)
                    # Update min/max for log features
                    min_log_features[0] = torch.min(min_log_features[0],
                                                    log_node_degree.min())
                    max_log_features[0] = torch.max(max_log_features[0],
                                                    log_node_degree.max())
                    min_log_features[1] = torch.min(min_log_features[1],
                                                    log_avg_neighbor_degree.min())
                    max_log_features[1] = torch.max(max_log_features[1],
                                                    log_avg_neighbor_degree.max())

                    # Linear features (Degree Centrality, Betweenness, Closeness, PageRank)
                    linear_features = data.x[:, 2:6]
                    max_linear_features = torch.max(max_linear_features[0],
                                                    linear_features.max(
                                                        dim=0).values)
                    min_linear_features = torch.min(min_linear_features[1],
                                                    linear_features.min(
                                                        dim=0).values)

                    processed_files_count += 1
                else:
                    print(
                        f"Invalid data in {filename}: 'x' attribute is missing or None")
            except Exception as e:
                print(f"Error processing {filename}: {e}")
    return min_log_features, max_log_features, min_linear_features, max_linear_features, processed_files_count

In [28]:
min_log_features, max_log_features, min_linear_features, max_linear_features, processed_files_count = get_global_min_max(
    DATASET_PATH)

100%|██████████| 64580/64580 [02:42<00:00, 397.88it/s] 


In [29]:
min_log_features, max_log_features, min_linear_features, max_linear_features

(tensor([0., 0.]),
 tensor([10.6986, 10.0054]),
 tensor([0., 0., 0., 0.]),
 tensor([0.9438, 0.9438, 0.9438, 0.9438]))

In [33]:
def initialize_minmax_scaler(min_vals: torch.Tensor, max_vals: torch.Tensor,
                             feature_range: tuple = (-1, 1)):
    """
    Initializes a MinMaxScaler with pre-defined min and max values.
    Handles cases where data_range_ might be zero to prevent division by zero.
    """
    scaler = MinMaxScaler(feature_range=feature_range)
    min_np = min_vals.cpu().numpy()
    max_np = max_vals.cpu().numpy()

    scaler.data_min_ = min_np
    scaler.data_max_ = max_np
    scaler.data_range_ = scaler.data_max_ - scaler.data_min_

    # If data_range_ is zero (min_np == max_np), set scale_ to 1.0 to avoid division by zero.
    scaler.scale_ = np.where(scaler.data_range_ == 0,
                             (feature_range[1] - feature_range[0]) / 2,
                             # Map to midpoint if range is zero
                             (feature_range[1] - feature_range[
                                 0]) / scaler.data_range_)

    # The min_ attribute is used in the transformation formula: X_scaled = X_std * (max - min) + min
    # where X_std = (X - data_min_) / data_range_
    # So, X_scaled = (X - data_min_) * scale_ + feature_range[0]
    # This implies min_ = feature_range[0] - data_min_ * scale_
    scaler.min_ = feature_range[0] - scaler.data_min_ * scaler.scale_

    return scaler

In [66]:
def apply_feature_scaling(
        directory_path: str,
        min_log_values: torch.Tensor,
        max_log_values: torch.Tensor,
        min_linear_values: torch.Tensor,
        max_linear_values: torch.Tensor,
        feature_range: tuple = (-1, 1),
        output_directory: str = None
        # Default to overwrite files in directory_path
):
    log_scaler = initialize_minmax_scaler(min_log_values, max_log_values,
                                          feature_range)
    print(
        f"Log-transform Scaler (Features 0, 5) initialized with feature_range={feature_range}")
    print(f"  data_min_: {log_scaler.data_min_}")
    print(f"  data_max_: {log_scaler.data_max_}")
    print(f"  scale_: {log_scaler.scale_}")
    print(f"  min_ (offset): {log_scaler.min_}")

    # Initialize the scaler for linearly scaled features (1, 2, 3, 4)
    linear_scaler = initialize_minmax_scaler(min_linear_values,
                                             max_linear_values, feature_range)
    print(
        f"\nLinear Scaler (Features 1,2,3,4) initialized with feature_range={feature_range}")
    print(f"  data_min_: {linear_scaler.data_min_}")
    print(f"  data_max_: {linear_scaler.data_max_}")
    print(f"  scale_: {linear_scaler.scale_}")
    print(f"  min_ (offset): {linear_scaler.min_}")

    # Set up output directory
    if output_directory and not os.path.exists(output_directory):
        os.makedirs(output_directory)
        print(f"\nCreated output directory: {output_directory}")
    elif output_directory and output_directory == directory_path:
        print(
            "\nWarning: output_directory is the same as directory_path. Files will be overwritten.")
    elif not output_directory:
        print(
            "\nWarning: No output_directory specified. Files will be overwritten in the original directory.")

    processed_files_count = 0
    skipped_files_count = 0

    print(f"Applying scaling to files in '{directory_path}'...")
    for filename in tqdm(os.listdir(directory_path)):
        if filename.endswith(".pt"):
            input_file_path = os.path.join(directory_path, filename)
            output_file_path = os.path.join(
                output_directory if output_directory else directory_path,
                filename)

            try:
                data = torch.load(input_file_path, weights_only=False)

                if (hasattr(data, "x") and hasattr(data, "global_features")
                        and data.x is not None and data.global_features is not None
                        and data.x.shape[1] >= 6):
                    # Ensure data.x is on CPU for numpy conversion and to avoid device issues
                    original_x = data.x.cpu()
                    original_device = data.x.device  # Store original device to move back

                    # --- Process Features 0 (Node Degree) and 5 (Average Neighbor Degree) ---
                    # Extract both features as a single (N, 2) array for the log_scaler
                    log_features_orig = original_x[:, [1, 6]].numpy()
                    log_transformed_features = np.log1p(log_features_orig)
                    scaled_log_features_np = log_scaler.transform(
                        log_transformed_features)

                    # Split the scaled log features back into individual tensors
                    scaled_degree_torch = torch.from_numpy(
                        scaled_log_features_np[:, 0:1]).to(original_device)
                    scaled_avg_degree_torch = torch.from_numpy(
                        scaled_log_features_np[:, 1:2]).to(original_device)

                    # --- Process Features 1, 2, 3, 4 (Centralities, PageRank) ---
                    linear_features_orig = original_x[:, 2:6].numpy()
                    scaled_linear_features_np = linear_scaler.transform(
                        linear_features_orig)
                    scaled_linear_features_torch = torch.from_numpy(
                        scaled_linear_features_np).to(original_device)

                    # --- Concatenate all scaled features and original embeddings ---
                    # Ensure the order is correct: [F0_scaled, F1-4_scaled, F5_scaled, Embeddings]
                    data.x = torch.cat((
                        original_x[:, 0:1].to(original_device),
                        # Node ID or other feature
                        scaled_degree_torch,
                        scaled_linear_features_torch,
                        scaled_avg_degree_torch,
                        original_x[:, 7:].to(original_device)
                    # Original embeddings (features 6 to 37)
                    ), dim=1)

                    original_global = data.global_features.to("cpu")

                    # Save the modified Data object
                    torch.save(data, output_file_path)
                    processed_files_count += 1

                else:
                    print(
                        f"Skipping '{filename}': 'data.x' not found or has fewer than 6 features.")
                    skipped_files_count += 1

            except Exception as e:
                print(f"Error processing '{filename}': {e}")
                skipped_files_count += 1

    print(
        f"\nScaling complete. Processed {processed_files_count} files, skipped {skipped_files_count} files.")
    if not output_directory:
        print("Original files have been overwritten.")
    elif output_directory != directory_path:
        print(
            f"Scaled files saved to '{output_directory}'. Original files remain in '{directory_path}'.")

In [67]:
apply_feature_scaling(
    DATASET_PATH,
    min_log_features, max_log_features,
    min_linear_features, max_linear_features,
    feature_range=(-1, 1),
    output_directory=r"E:\gnn_data\processed_step_data_full_node_features_scaled")  # Set to None to overwrite files in DATASET_PATH)

Log-transform Scaler (Features 0, 5) initialized with feature_range=(-1, 1)
  data_min_: [0. 0.]
  data_max_: [10.69856  10.005435]
  scale_: [0.18694106 0.19989136]
  min_ (offset): [-1. -1.]

Linear Scaler (Features 1,2,3,4) initialized with feature_range=(-1, 1)
  data_min_: [0. 0. 0. 0.]
  data_max_: [0.9437746 0.9437746 0.9437746 0.9437746]
  scale_: [2.1191502 2.1191502 2.1191502 2.1191502]
  min_ (offset): [-1. -1. -1. -1.]
Applying scaling to files in 'E:\gnn_data\processed_step_data_full_node_features'...


100%|██████████| 64580/64580 [12:27<00:00, 86.37it/s] 


Scaling complete. Processed 64579 files, skipped 0 files.
Scaled files saved to 'E:\gnn_data\processed_step_data_full_node_features_scaled'. Original files remain in 'E:\gnn_data\processed_step_data_full_node_features'.





In [2]:
pt_files_dir = r"E:\gnn_data\processed_step_data_global_features"

print("Loading all graphs and concatenating node features...")

all_node_features = []
all_files = [f for f in os.listdir(pt_files_dir) if f.endswith('.pt')]

for i, file_name in tqdm(enumerate(all_files)):
    file_path = os.path.join(pt_files_dir, file_name)
    try:
        data = torch.load(file_path, weights_only=False)
        if data.x is not None:
            all_node_features.append(data.x[:, 1:].numpy())
    except Exception as e:
        print(f"Error loading {file_name}: {e}. Skipping...")
        continue

    if (i + 1) % 5000 == 0:
        print(f"  Processed {i + 1}/{len(all_files)} files...")


Loading all graphs and concatenating node features...


5026it [00:34, 156.57it/s]

  Processed 5000/63043 files...


10030it [01:07, 156.95it/s]

  Processed 10000/63043 files...


15021it [01:40, 145.66it/s]

  Processed 15000/63043 files...


20030it [02:13, 155.49it/s]

  Processed 20000/63043 files...


25026it [02:48, 151.31it/s]

  Processed 25000/63043 files...


30024it [03:21, 134.02it/s]

  Processed 30000/63043 files...


35024it [03:54, 149.79it/s]

  Processed 35000/63043 files...


40243it [04:16, 1356.62it/s]

  Processed 40000/63043 files...


45240it [04:20, 1288.16it/s]

  Processed 45000/63043 files...


50272it [04:24, 1404.02it/s]

  Processed 50000/63043 files...


55014it [04:44, 154.83it/s] 

  Processed 55000/63043 files...


60212it [04:53, 1214.97it/s]

  Processed 60000/63043 files...


63043it [04:55, 213.20it/s] 


In [3]:
if all_node_features:
    concatenated_features = np.vstack(all_node_features)
    print(f"Shape of concatenated features for fitting: {concatenated_features.shape}")

    # Initialize and fit the RobustScaler on the full dataset
    pipeline_robust = Pipeline([
        ('log1p_transform', FunctionTransformer(np.log1p, validate=True)), # validate=True for safety
        ('robust_scale', RobustScaler())
    ])

    pipeline_std = Pipeline([
        ('log1p_transform', FunctionTransformer(np.log1p, validate=True)), # validate=True for safety
        ('standard_scale', StandardScaler())
    ])
    pipeline_minmax = Pipeline([
        ('log1p_transform', FunctionTransformer(np.log1p, validate=True)), # validate=True for safety
        ('minmax_scale', MinMaxScaler(feature_range=(-5, 5)))
    ])

    # pipeline_robust.fit(concatenated_features)
    pipeline_std.fit(concatenated_features)
    pipeline_minmax.fit(concatenated_features)

    print("Scaler fitting complete on all 60K samples.")
    # robust_scaler = pipeline_robust.named_steps['robust_scale']
    std_scaler = pipeline_std.named_steps['standard_scale']
    minmax_scaler = pipeline_minmax.named_steps['minmax_scale']
    # print(f"Robust Scaler parameters: Median={final_scaler.center_}, IQR={final_scaler.scale_}")
    print(f"Standard Scaler parameters: Mean={std_scaler.mean_}, Std={std_scaler.scale_}")
    print(f"MinMax Scaler parameters: Min={minmax_scaler.data_min_}, Max={minmax_scaler.data_max_}")

    # robust_scaler_filename = "node_feature_log_robust_scaler.pkl"
    std_scaler_filename = "node_feature_log_std_scaler.pkl"
    minmax_scaler_filename = "node_feature_log_minmax_scaler.pkl"
    # --- Save the fitted scaler ---
    # with open(robust_scaler_filename, 'wb') as f:
    #     joblib.dump(pipeline_robust, f)
    with open(std_scaler_filename, 'wb') as f:
        joblib.dump(pipeline_std, f)
    with open(minmax_scaler_filename, 'wb') as f:
        joblib.dump(pipeline_minmax, f)
    # print(f"Fitted robust scaler saved to {robust_scaler_filename}")
    print(f"Fitted standard scaler saved to {std_scaler_filename}")
    print(f"Fitted minmax scaler saved to {minmax_scaler_filename}")
else:
    print("No node features found to fit a scaler on.")

Shape of concatenated features for fitting: (321257441, 6)
Scaler fitting complete on all 60K samples.
Standard Scaler parameters: Mean=[1.03862734e+00 4.54070689e-04 9.79995726e-06 5.81540038e-04
 1.96116633e-04 3.99148620e-01], Std=[4.39583700e-01 1.39641275e-03 2.07801287e-04 1.22655925e-03
 4.52087688e-04 5.91216076e-01]
MinMax Scaler parameters: Min=[0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 3.675917e-06
 0.000000e+00], Max=[10.69856     0.6646317   0.08863613  0.18032777  0.07316519 10.005435  ]
Fitted standard scaler saved to node_feature_log_std_scaler.pkl
Fitted minmax scaler saved to node_feature_log_minmax_scaler.pkl


In [4]:
huge_df = pd.DataFrame(concatenated_features)

In [5]:
huge_df.describe()

Unnamed: 0,0,1,2,3,4,5
count,321257400.0,321257400.0,321257400.0,321257400.0,321257400.0,321257400.0
mean,2.303436,0.0004551718,9.821415e-06,0.0005824661,0.0001962381,0.9043913
std,13.17859,0.001366356,0.0002082126,0.001149,0.0004234017,8.258645
min,0.0,0.0,0.0,0.0,3.675924e-06,0.0
25%,1.0,4.169968e-05,0.0,7.243353e-05,2.468981e-05,0.0
50%,1.0,0.0001077412,0.0,0.000171403,5.754097e-05,0.0
75%,3.0,0.0003541076,0.0,0.0005213547,0.0001704606,1.0
max,44291.0,0.9437746,0.09268299,0.1976098,0.07590825,22145.5


In [6]:
# robust_scaled_huge_df = pd.DataFrame(pipeline_robust.transform(huge_df))
std_scaled_huge_df = pd.DataFrame(pipeline_std.transform(huge_df))
minmax_scaled_huge_df = pd.DataFrame(pipeline_minmax.transform(huge_df))

In [7]:
std_scaled_huge_df.describe()

Unnamed: 0,0,1,2,3,4,5
count,321257400.0,321257400.0,321257400.0,321257400.0,321257400.0,321257400.0
mean,1.053394e-07,1.938603e-08,5.869062e-08,-3.489477e-08,-1.127562e-08,6.984958e-08
std,0.6326603,0.9271398,0.9918067,0.8875435,0.8987713,0.637973
min,-2.362752,-0.3251694,-0.04716023,-0.4741231,-0.4256712,-0.6751315
25%,-0.7859257,-0.295308,-0.04716023,-0.415071,-0.37919,-0.6751315
50%,-0.7859257,-0.2480178,-0.04716023,-0.3343921,-0.3065275,-0.6751315
75%,0.7909006,-0.07163049,-0.04716023,-0.04917922,-0.05678216,0.4972777
max,21.97518,475.6313,426.4956,146.5451,161.4047,16.24835


In [8]:
minmax_scaled_huge_df.describe()

Unnamed: 0,0,1,2,3,4,5
count,321257400.0,321257400.0,321257400.0,321257400.0,321257400.0,321257400.0
mean,-4.029235,-4.993742,-4.999185,-4.968224,-4.97424,-4.601065
std,0.9212027,1.292732,1.292732,1.292732,1.292732,1.292732
min,-5.0,-5.0,-5.0,-5.0,-5.0,-5.0
25%,-4.352112,-4.999372,-5.0,-4.995984,-4.997128,-5.0
50%,-4.352112,-4.998379,-5.0,-4.990496,-4.992638,-5.0
75%,-3.704223,-4.994673,-5.0,-4.971096,-4.977205,-4.30723
max,5.0,5.0,5.0,5.0,4.999999,5.0


In [9]:
sample_data = torch.load(r"E:\gnn_data\processed_step_data_global_features\23595_AM75_H3_8018_1_REVA_F1_STEERING_HUB_ADAPTER_AFT__1_.pt", weights_only=False)
sample_data_scaled = pipeline_minmax.transform(sample_data.x[:, 1:].cpu())

In [10]:
pd.DataFrame(sample_data_scaled).describe()

Unnamed: 0,0,1,2,3,4,5
count,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0
mean,-3.97717,-4.987749,-4.999624,-4.947418,-4.953863,-4.527994
std,0.40989,0.016212,0.001731,0.020786,0.022181,0.616264
min,-4.352112,-4.994864,-5.0,-5.0,-4.977741,-5.0
25%,-4.352112,-4.994864,-5.0,-4.962147,-4.970557,-5.0
50%,-3.973121,-4.98973,-5.0,-4.949824,-4.959081,-5.0
75%,-3.704223,-4.984597,-5.0,-4.943977,-4.943008,-3.901984
max,-0.869687,-4.584564,-4.953756,-4.852027,-4.838858,-0.607938


In [11]:
pd.DataFrame(sample_data.x).describe()

Unnamed: 0,0,1,2,3,4,5,6
count,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0
mean,149.879517,2.387713,0.000815,3e-06,0.000949,0.000341,1.023968
std,104.724152,3.186995,0.001088,1.5e-05,0.000375,0.000162,2.337204
min,3.0,1.0,0.000341,0.0,0.0,0.000167,0.0
25%,59.0,1.0,0.000341,0.0,0.000683,0.000219,0.0
50%,146.0,2.0,0.000683,0.0,0.000905,0.000303,0.0
75%,240.0,3.0,0.001024,0.0,0.001011,0.000421,2.0
max,386.0,82.0,0.027996,0.00041,0.002672,0.001183,80.0


In [13]:
DATASET_PATH = r"E:\gnn_data\processed_step_data_global_features"
DATASET_SCALED_PATH = r"E:\gnn_data\processed_step_data_global_features_scaled"
os.makedirs(DATASET_SCALED_PATH, exist_ok=True)

processed_files_count = 0
skipped_files_count = 0

for filename in tqdm(os.listdir(DATASET_PATH)):
    if filename.endswith(".pt"):
        input_file_path = os.path.join(DATASET_PATH, filename)
        output_file_path = os.path.join(DATASET_SCALED_PATH, filename)
        try:
            data = torch.load(input_file_path, weights_only=False)
            if hasattr(data, "x") and data.x is not None:
                node_features = data.x[:, 1:]
                scaled_features = pipeline_minmax.transform(node_features.cpu().numpy())
                data.x[:, 1:] = torch.tensor(scaled_features,
                                              dtype=data.x.dtype)
                torch.save(data, output_file_path)
                processed_files_count += 1
            else:
                print(
                    f"Invalid data in {filename}: 'x' attribute is missing or None")
                skipped_files_count += 1
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            skipped_files_count += 1

100%|██████████| 63044/63044 [08:57<00:00, 117.37it/s]


In [14]:
original_data = torch.load(
    r"E:\gnn_data\processed_step_data_global_features\100044_00048893_Fixture1_step_001.pt",
    weights_only=False)
scaled_data = torch.load(
    r"E:\gnn_data\processed_step_data_global_features_scaled\100044_00048893_Fixture1_step_001.pt",
    weights_only=False)

In [15]:
pd.DataFrame(original_data.x).describe()

Unnamed: 0,0,1,2,3,4,5,6
count,707.0,707.0,707.0,707.0,707.0,707.0,707.0
mean,192.606796,2.282885,0.003234,0.000165,0.00363,0.001414,0.93757
std,109.293488,1.575945,0.002232,0.000824,0.001661,0.000651,1.306762
min,3.0,1.0,0.001416,0.0,0.0,0.000569,0.0
25%,122.0,1.0,0.001416,0.0,0.002601,0.000927,0.0
50%,176.0,2.0,0.002833,4.3e-05,0.003238,0.00118,1.0
75%,257.0,2.5,0.003541,0.00017,0.004309,0.001969,1.0
max,386.0,21.0,0.029745,0.01429,0.011957,0.004066,20.0


In [16]:
pd.DataFrame(scaled_data.x).describe()

Unnamed: 0,0,1,2,3,4,5,6
count,707.0,707.0,707.0,707.0,707.0,707.0,707.0
mean,192.606796,-3.957569,-4.951463,-4.981407,-4.799116,-4.807339,-4.491939
std,109.293488,0.337237,0.033287,0.092368,0.091617,0.088879,0.527424
min,3.0,-4.352112,-4.978703,-5.0,-5.0,-4.922684,-5.0
25%,122.0,-4.352112,-4.978703,-5.0,-4.855954,-4.873787,-5.0
50%,176.0,-3.973121,-4.957437,-4.995193,-4.820753,-4.839366,-4.30723
75%,257.0,-3.838672,-4.946819,-4.980771,-4.761563,-4.731618,-4.30723
max,386.0,-2.110786,-4.558985,-3.399233,-4.340846,-4.445936,-1.957131


In [80]:
global_df = pd.read_csv(r"./data/synced_dataset_final_scaled.csv")
OUTPUT_DIR = Path(r"E:\gnn_data\processed_step_data_global_features_combined")
features = [
    "faces", "edges", "vertices", "quantity",
    "height", "width", "depth", "volume", "area",
    "bbox_height", "bbox_width", "bbox_depth", "bbox_volume",
    "bbox_area",
]
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
for file in tqdm(list(Path(DATASET_SCALED_PATH).glob("*.*pt"))[:]):
    try:
        data = torch.load(file, weights_only=False)
        file_id = int(file.stem.split("_")[0])
        data.global_features = torch.tensor(global_df[global_df.item_id == file_id][features].values, dtype=data.x.dtype)
        output_file_path = OUTPUT_DIR / file.name
        torch.save(data, output_file_path)
    except Exception as e:
        print(f"Error loading {file.name}: {e}")

100%|██████████| 63043/63043 [10:44<00:00, 97.89it/s] 


In [23]:
with open(r"E:\gnn_data\processed_step_data_global_features\dataset_mapping.pkl", "rb") as f:
    dataset_mapping = joblib.load(f)
dataset_mapping["total_failed"]

54

In [73]:
sample_path = Path(r"E:\gnn_data\processed_step_data_global_features_combined\99987_Pusher.pt")
sample = torch.load(Path(sample_path), weights_only=False)
sample.x

tensor([[228.0000,  -3.9731,  -4.9775,  ...,  -5.0000,  -4.9536,  -3.6145],
        [331.0000,  -3.9731,  -4.9775,  ...,  -5.0000,  -4.9536,  -3.9020],
        [  3.0000,  -3.7042,  -4.9662,  ...,  -4.9584,  -4.9337,  -3.7479],
        ...,
        [181.0000,  -3.9731,  -4.9775,  ...,  -4.9288,  -4.8628,  -4.3072],
        [182.0000,  -3.9731,  -4.9775,  ...,  -4.9273,  -4.8367,  -5.0000],
        [ 72.0000,  -3.9731,  -4.9775,  ...,  -4.8754,  -4.7694,  -5.0000]])

In [47]:
torch.tensor(global_df[global_df.item_id == 100041][features].values.tolist()).shape

torch.Size([1, 14])

In [59]:
global_df[global_df.item_id == 100035][features]

Unnamed: 0,faces,edges,vertices,quantity,height,width,depth,volume,area,bbox_height,bbox_width,bbox_depth,bbox_volume,bbox_area
0,216.0,637.0,423.0,1,163.662476,185.610681,163.662476,963533.514386,208702.010756,163.662476,185.610681,163.662476,4971657.0,163.662476


In [79]:
synced_dataset_final_scaled = pd.read_csv(r"./data/synced_dataset_final_scaled.csv")
synced_dataset_final_scaled[features].describe()

Unnamed: 0,faces,edges,vertices,quantity,height,width,depth,volume,area,bbox_height,bbox_width,bbox_depth,bbox_volume,bbox_area
count,63291.0,63291.0,63291.0,63291.0,63291.0,63291.0,63291.0,63291.0,63291.0,63291.0,63291.0,63291.0,63291.0,63291.0
mean,-3.754175e-16,2.694384e-16,-5.388768e-18,5.568394e-17,5.0295170000000006e-17,-2.658459e-16,-2.290226e-16,1.445986e-16,-3.493718e-16,-1.760331e-16,-2.568646e-16,1.167566e-16,-2.332887e-16,1.167566e-16
std,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008
min,-2.788245,-3.000654,-3.01868,-0.6230655,-3.119035,-3.275045,-3.254481,-4.267127,-5.218936,-3.468265,-3.550652,-3.054336,-4.427959,-3.054336
25%,-0.6858142,-0.6419232,-0.6555887,-0.6230655,-0.6054304,-0.6965314,-0.7107615,-0.5799896,-0.5882728,-0.7033272,-0.6069614,-0.6097639,-0.5726225,-0.6097639
50%,-0.1353647,-0.09308507,-0.08880481,-0.3547146,-0.05136448,0.0409551,-0.006880988,-0.01419641,-0.03143936,0.00772936,0.07719783,-0.03923133,0.03755414,-0.03923133
75%,0.6022812,0.6270531,0.62331,0.2060566,0.6870038,0.683204,0.6755456,0.6657674,0.6566541,0.6556717,0.6313362,0.68663,0.6668648,0.68663
max,3.385108,3.33866,3.699658,9.585701,3.983474,3.723741,3.689107,3.173189,3.942813,3.549576,3.463669,3.810437,4.307408,3.810437


In [1]:
import joblib
with open(r"E:\gnn_data\pyg_data_v2\dataset_mapping.pkl", "rb") as f:
    dataset_mapping = joblib.load(f)

In [14]:
from pathlib import Path
graph_files = set(file.stem for file in Path(r"E:\gnn_data\graphml_files").glob("*.graphml"))
step_files = set(file.stem for file in Path(r"E:\gnn_data\step_files").glob("*.*"))
unsed_step = Path(r"E:\gnn_data\step_broken")
difference = list(step_files - graph_files)
for file in Path(r"E:\gnn_data\step_files").glob("*.*"):
    if file.stem in difference:
        file.rename(unsed_step / file.name)

In [18]:
with open(r"E:\gnn_data\step_negative_area.txt", "r") as f:
    negative_area_files = [item.split(" - ")[0] for item in f.read().splitlines()]

In [26]:
file_stems = [Path(file).stem for file in negative_area_files]
step_file_tocheck = []
for file in Path(r"E:\gnn_data\step_files").glob("*.*"):
    if file.stem in file_stems and file.exists():
        step_file_tocheck.append(file.stem)

In [27]:
len(file_stems), len(step_file_tocheck)

(256, 245)

In [28]:
with open(r"E:\gnn_data\graphml_files\failed_files.pkl", "rb") as f:
    failed_files = joblib.load(f)
failed_files

[]