In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd
from scipy.spatial import Voronoi
from sklearn.model_selection import train_test_split

In [None]:
#Useful functions

def create_ffn(hidden_units, dropout_rate, input_shape=None, name=None):
    """Build a feed-forward network as a ``keras.Sequential`` model.

    Every entry of ``hidden_units`` contributes one
    BatchNormalization -> Dropout -> Dense(GELU) stage, in that order.

    Args:
        hidden_units: iterable with the width of each Dense layer.
        dropout_rate: rate given to every Dropout layer.
        input_shape: optional shape; when provided an explicit Input layer is
            placed in front of the stack.
        name: optional name of the resulting Sequential model.

    Returns:
        The assembled ``keras.Sequential`` model.
    """
    stack = [] if input_shape is None else [layers.Input(shape=input_shape)]
    for width in hidden_units:
        stack.extend(
            (
                layers.BatchNormalization(),
                layers.Dropout(dropout_rate),
                layers.Dense(width, activation=tf.nn.gelu),
            )
        )
    return keras.Sequential(stack, name=name)

def create_gru(hidden_units, dropout_rate):
    """Build a stack of GRU layers used to combine node information.

    The model takes sequences of length 2 whose feature size is
    ``hidden_units[0]`` and applies one GRU per entry of ``hidden_units``.
    Every GRU returns the full sequence, so the output still has a time axis.

    Args:
        hidden_units: list with the number of units of each GRU layer.
        dropout_rate: rate used for both input and recurrent dropout.

    Returns:
        A functional ``keras.Model`` mapping the input sequence to the output
        sequence of the last GRU.
    """
    shared_options = dict(
        activation="tanh",
        recurrent_activation="sigmoid",
        return_sequences=True,
        dropout=dropout_rate,
        recurrent_dropout=dropout_rate,
    )
    sequence_in = keras.layers.Input(shape=(2, hidden_units[0]))
    sequence_out = sequence_in
    for width in hidden_units:
        sequence_out = layers.GRU(units=width, **shared_options)(sequence_out)
    return keras.Model(inputs=sequence_in, outputs=sequence_out)

#Convolution layer

class GraphConvLayer(layers.Layer):
    """Graph convolution (message-passing) layer.

    For every edge a message is prepared from the neighbour representation with
    a feed-forward network; the messages arriving at each node are aggregated
    ("sum", "mean" or "max") and the aggregate is combined with the node's own
    representation ("concat", "add" or "gru") to produce the node embedding.
    """

    def __init__(self, hidden_units, dropout_rate=0.2, aggregation_type="mean",
                 combination_type="concat", normalize=False, *args, **kwargs):
        """Create the layer and its internal networks.

        Args:
            hidden_units: list of layer widths used by the message network and
                by the update network.
            dropout_rate: dropout rate passed to the internal networks.
            aggregation_type: "sum", "mean" or "max".
            combination_type: "concat", "add" or "gru".
            normalize: when True the embeddings are L2-normalised along the
                last axis.
            *args, **kwargs: forwarded to ``layers.Layer``.
        """
        super(GraphConvLayer, self).__init__(*args, **kwargs)
        self.aggregation_type = aggregation_type
        self.combination_type = combination_type
        self.normalize = normalize
        self.hidden_units = hidden_units
        self.dropout_rate = dropout_rate

        # FFN that prepares the messages
        self.ffn_prepare = create_ffn(hidden_units, dropout_rate, name="ffn_prepare")
        # Update function: either a GRU or an FFN
        if self.combination_type == "gru":
            self.update_fn = create_gru(hidden_units, dropout_rate)
        else:
            self.update_fn = create_ffn(hidden_units, dropout_rate, name="update_ffn")

    def build(self, input_shape):
        """Hook for variable initialisation; currently only defers to the base class."""
        super(GraphConvLayer, self).build(input_shape)

    def prepare(self, node_representations, weights=None):
        """Turn neighbour representations into messages, optionally scaled per edge.

        Args:
            node_representations: tensor passed through the message FFN.
            weights: optional per-edge weights; each message is multiplied by
                its weight (broadcast over the last axis).

        Returns:
            The message tensor.
        """
        messages = self.ffn_prepare(node_representations)
        if weights is not None:
            messages = messages * tf.expand_dims(weights, -1)
        return messages

    def aggregate(self, node_indices, neighbour_messages, node_representations):
        """Aggregate the messages received by each node.

        Args:
            node_indices: [num_edges] index of the receiving node of each message.
            neighbour_messages: [num_edges, representation_dim] messages.
            node_representations: [num_nodes, representation_dim]; only used to
                read the number of nodes.

        Returns:
            Tensor [num_nodes, representation_dim]. Nodes that receive no
            message get a zero vector for every aggregation type.

        Raises:
            ValueError: if ``aggregation_type`` is not "sum", "mean" or "max".
        """
        # The number of nodes can vary between images, so it is read dynamically
        num_nodes = tf.shape(node_representations)[0]
        if self.aggregation_type == "sum":
            aggregated_message = tf.math.unsorted_segment_sum(
                neighbour_messages, node_indices, num_segments=num_nodes)
        elif self.aggregation_type == "mean":
            aggregated_message = tf.math.unsorted_segment_mean(
                neighbour_messages, node_indices, num_segments=num_nodes)
        elif self.aggregation_type == "max":
            aggregated_message = tf.math.unsorted_segment_max(
                neighbour_messages, node_indices, num_segments=num_nodes)
            # unsorted_segment_max fills segments without any entry with the
            # lowest representable value of the dtype. Zero those rows so that
            # nodes without incoming messages behave as with "sum"/"mean"
            # instead of injecting huge negative values into the network.
            messages_per_node = tf.math.unsorted_segment_sum(
                tf.ones_like(node_indices, dtype=neighbour_messages.dtype),
                node_indices, num_segments=num_nodes)
            has_message = tf.expand_dims(messages_per_node > 0, axis=-1)
            aggregated_message = tf.where(
                has_message, aggregated_message, tf.zeros_like(aggregated_message))
        else:
            raise ValueError(f"Invalid aggregation type: {self.aggregation_type}.")
        return aggregated_message

    def update(self, node_representations, aggregated_messages):
        """Combine node representations with their aggregated messages.

        Args:
            node_representations: [num_nodes, representation_dim].
            aggregated_messages: [num_nodes, representation_dim].

        Returns:
            Node embeddings produced by the update network.

        Raises:
            ValueError: if ``combination_type`` is not "gru", "concat" or "add".
        """
        if self.combination_type == "gru":
            # Sequence of length 2: (own representation, aggregated message)
            h = tf.stack([node_representations, aggregated_messages], axis=1)
        elif self.combination_type == "concat":
            h = tf.concat([node_representations, aggregated_messages], axis=1)
        elif self.combination_type == "add":
            h = node_representations + aggregated_messages
        else:
            raise ValueError(f"Invalid combination type: {self.combination_type}.")
        node_embeddings = self.update_fn(h)
        if self.combination_type == "gru":
            # Keep only the last step of the GRU output sequence
            node_embeddings = node_embeddings[:, -1]
        if self.normalize:
            node_embeddings = tf.nn.l2_normalize(node_embeddings, axis=-1)
        return node_embeddings

    def call(self, inputs):
        """Process the inputs to produce the node embeddings.

        Args:
            inputs: a tuple of three elements
                (node_representations, edges, edge_weights):
                - node_representations: tensor with shape (num_nodes, feature_dim)
                - edges: tensor with shape (num_edges, 2); column 0 is the node
                  that receives the message, column 1 the neighbour sending it
                - edge_weights: tensor with shape (num_edges,); in this
                  notebook all edges have the same weight, so it is an array
                  of ones

        Returns:
            node_embeddings of shape [num_nodes, representation_dim].
        """
        node_representations, edges, edge_weights = inputs
        # Split the source and target indices
        source_indexes = edges[:, 0]
        target_indexes = edges[:, 1]
        # Obtain the neighbour (target) representations
        neighbour_representations = tf.gather(node_representations, target_indexes)
        neighbour_messages = self.prepare(neighbour_representations, edge_weights)
        aggregated_messages = self.aggregate(source_indexes, neighbour_messages, node_representations)
        return self.update(node_representations, aggregated_messages)

In [None]:
#Node Classifier model 

class GNNNodeClassifier(tf.keras.Model):
    """Node classifier: FFN preprocessing, two residual graph convolutions,
    FFN postprocessing and a Dense layer that outputs per-node logits."""

    def __init__(self, num_classes, hidden_units, aggregation_type="mean",
                 combination_type="concat", dropout_rate=0.2, normalize=True, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Options shared by both graph convolution layers
        conv_options = dict(
            hidden_units=hidden_units,
            dropout_rate=dropout_rate,
            aggregation_type=aggregation_type,
            combination_type=combination_type,
            normalize=normalize,
        )
        # Node features are transformed before any message passing
        self.preprocess = create_ffn(hidden_units, dropout_rate, name="preprocess")
        self.conv1 = GraphConvLayer(name="graph_conv1", **conv_options)
        self.conv2 = GraphConvLayer(name="graph_conv2", **conv_options)
        self.postprocess = create_ffn(hidden_units, dropout_rate, name="postprocess")
        # Last layer: one logit per class for each node
        self.compute_logits = layers.Dense(units=num_classes, name="logits")

    def call(self, inputs):
        """Compute per-node logits for a batch of graphs.

        ``inputs`` is a tuple
        (node_features, edges, edge_weights, node_indices) with:
            - node_features: tensor with shape (batch_size, num_nodes, feature_dim)
            - edges: tensor with shape (batch_size, num_edges, 2)
            - edge_weights: tensor with shape (batch_size, num_edges)
            - node_indices: tensor with shape (batch_size, num_nodes)
        Each entry along the batch axis corresponds to one graph/image.

        Returns a tensor with shape (batch_size, num_nodes, num_classes).
        """

        def _logits_for_graph(graph):
            # graph holds the tensors of a single image, without the batch axis
            features, graph_edges, weights, indices = graph
            hidden = self.preprocess(features)  # (num_nodes, hidden_dim)
            # Two graph convolutions, each wrapped in a skip connection
            hidden = hidden + self.conv1((hidden, graph_edges, weights))
            hidden = hidden + self.conv2((hidden, graph_edges, weights))
            hidden = self.postprocess(hidden)
            # Pick the requested nodes and score them: (num_nodes, num_classes)
            return self.compute_logits(tf.gather(hidden, indices))

        node_features, edges, edge_weights, node_indices = inputs
        # tf.map_fn runs the per-graph function over the batch axis
        return tf.map_fn(
            _logits_for_graph,
            (node_features, edges, edge_weights, node_indices),
            fn_output_signature=tf.float32,
        )


In [None]:
#Functions for extracting data from the dataframe and building the dataset
def extract_graph_data(df, image_id, symmetrize=True):
    """Extract the graph of a single image from the dataframe.

    The rows whose 'image_id' equals ``image_id`` are the nodes. Edges connect
    the points that share a ridge in the Voronoi tessellation of the 'x'/'y'
    columns.

    Args:
        df: dataframe with at least the columns 'image_id', 'x', 'y' and
            'activity'; every column other than 'image_id', 'x', 'y',
            'activity', 'label' and 'type' is used as a node feature.
        image_id: value of 'image_id' selecting the image.
        symmetrize: when True (default) every Voronoi neighbour pair (i, j) is
            emitted in both directions, (i, j) and (j, i). ``ridge_points``
            lists each pair only once, and a consumer that reads column 0 as
            the receiving node and column 1 as the sending node would
            otherwise pass messages in one arbitrary direction only. Pass
            False to get the raw ``ridge_points`` pairs.

    Returns:
        Tuple (node_features, edges, edge_weights, node_indexes, labels):
            - node_features: float32 array (num_nodes, num_features)
            - edges: int32 array (num_edges, 2)
            - edge_weights: float32 array of ones (num_edges,)
            - node_indexes: int32 array 0..num_nodes-1
            - labels: int32 array (num_nodes,) taken from 'activity'
    """
    df_img = df[df['image_id'] == image_id].reset_index(drop=True)
    num_nodes = df_img.shape[0]

    points = df_img[['x', 'y']].to_numpy()
    vor = Voronoi(points)
    # Pairs of point indices separated by a Voronoi ridge; the reshape keeps a
    # well-formed (0, 2) array when there are no ridges at all
    edges = np.asarray(vor.ridge_points, dtype=np.int32).reshape(-1, 2)
    if symmetrize:
        # Add the reversed pairs so that both endpoints see each other
        edges = np.concatenate([edges, edges[:, ::-1]], axis=0)
    num_edges = edges.shape[0]

    # All edges carry the same weight
    edge_weights = np.ones((num_edges,), dtype=np.float32)

    feature_cols = [col for col in df_img.columns if col not in ['image_id', 'x', 'y', 'activity','label','type']]
    node_features = df_img[feature_cols].to_numpy().astype(np.float32)

    labels = df_img['activity'].to_numpy().astype(np.int32)
    # Node indexes: just from 0 to num_nodes-1
    node_indexes = np.arange(num_nodes, dtype=np.int32)

    return node_features, edges, edge_weights, node_indexes, labels

#Creates a tf.data.Dataset from a dataframe
def create_graph_dataset(df, batch_size, feature_dim):
    """Create a batched ``tf.data.Dataset`` with one graph per image.

    Args:
        df: dataframe with an 'image_id' column plus the columns required by
            ``extract_graph_data``.
        batch_size: number of graphs per batch.
        feature_dim: number of features per node; graphs whose feature matrix
            has a different width are rejected.

    Returns:
        A dataset yielding
        ((node_features, edges, edge_weights, node_indices), labels).
        Within a batch, shorter graphs are padded: node features, edges, edge
        weights and node indices with 0, labels with -1. With more than one
        graph per batch the consumer has to account for that padding.
    """
    # Split the dataframe by image once instead of filtering the whole frame
    # for every image; sort=False keeps the images in order of first
    # appearance, i.e. the order of df['image_id'].unique()
    per_image = df.groupby('image_id', sort=False)

    def gen():
        for img_id, df_img in per_image:
            node_features, edges, edge_weights, node_indices, labels = extract_graph_data(df_img, img_id)
            # Fail loudly on a wrong feature count: reshaping to
            # (-1, feature_dim) could silently mix values of different nodes
            if node_features.ndim != 2 or node_features.shape[1] != feature_dim:
                raise ValueError(
                    f"Image {img_id}: node features have shape {node_features.shape}, "
                    f"expected (num_nodes, {feature_dim})."
                )
            # edges with shape: (num_edges, 2)
            edges = np.reshape(edges, (-1, 2))
            edge_weights = np.reshape(edge_weights, (-1,))
            node_indices = np.reshape(node_indices, (-1,))
            labels = np.reshape(labels, (-1,))
            yield (node_features, edges, edge_weights, node_indices), labels

    dataset = tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            (
                tf.TensorSpec(shape=(None, feature_dim), dtype=tf.float32),  # node_features
                tf.TensorSpec(shape=(None, 2), dtype=tf.int32),              # edges
                tf.TensorSpec(shape=(None,), dtype=tf.float32),              # edge_weights
                tf.TensorSpec(shape=(None,), dtype=tf.int32),                # node_indices
            ),
            tf.TensorSpec(shape=(None,), dtype=tf.int32)  # labels
        )
    )
    # Use padded_batch to handle graphs with a varying number of nodes and edges
    dataset = dataset.padded_batch(
        batch_size,
        padded_shapes=(
            (
                tf.TensorShape([None, feature_dim]),  # node_features
                tf.TensorShape([None, 2]),              # edges
                tf.TensorShape([None]),                # edge_weights
                tf.TensorShape([None]),                # node_indices
            ),
            tf.TensorShape([None])  # labels
        ),
        padding_values=(
            (
                tf.constant(0, dtype=tf.float32),
                tf.constant(0, dtype=tf.int32),
                tf.constant(0, dtype=tf.float32),
                tf.constant(0, dtype=tf.int32),
            ),
            tf.constant(-1, dtype=tf.int32)
        )
    )
    return dataset

In [None]:
#Load the input data
density=0.008
fa=100
particles_per_image=1000   # consecutive rows that belong to the same image
max_rows=2_000_000         # number of trajectory rows kept for the join

# Particle positions: whitespace-separated columns label, type, x, y
input_file=f'phia{density}/traj_phia{density}-T05-Fa{fa}-tau1.dat'
# Raw string: '\s' is a regex token, not a Python string escape
df=pd.read_csv(input_file, sep=r'\s+',names=["label", "type", "x", "y"])
cols_names=['area', 'perimeter', 'neighbours', 'max neighbour distance',
       'min neighbour distance', 'max vertices distance',
       'min vertices distance', 'max vertices-point distance',
       'min vertices-point distance', 'distance to center', 'activity',
       'particle type']
# Per-particle features, one row per particle
input_file2=f"phia{density}/particles-features-{density}-Fa{fa}.txt"
df2=pd.read_csv(input_file2, sep=r'\s+',names=cols_names)

#Create a dataframe that includes both, the voronoi features and the particle positions
# (the two frames are aligned on their row index)
df=df[0:max_rows].join(df2)
#Add a column with the id of each image: every block of particles_per_image rows is one image
df['image_id']=np.floor(df.index/particles_per_image)

In [None]:
# Node features: every column except identifiers, coordinates and the target
# NOTE(review): 'particle type' stays in the feature set; confirm it does not
# encode the 'activity' label.
feature_cols = [
    col for col in df.columns
    if col not in ('image_id', 'x', 'y', 'activity', 'label', 'type')
]
feature_dim = len(feature_cols)

# Model parameters
num_classes = 2
hidden_units = [64, 64]
dropout_rate = 0.2
aggregation_type = "mean"
combination_type = "concat"
normalize = True
batch_size = 1  # one graph per batch, so padded_batch never has to pad

# Split by image so all particles of an image fall on the same side
images_ids = df['image_id'].unique()
train_images_ids, test_images_ids = train_test_split(
    images_ids,
    test_size=0.2,
    random_state=50,
)
train_df = df.loc[df['image_id'].isin(train_images_ids)].reset_index(drop=True)
test_df = df.loc[df['image_id'].isin(test_images_ids)].reset_index(drop=True)

# Create training and testing datasets
train_dataset = create_graph_dataset(train_df, batch_size, feature_dim)
test_dataset = create_graph_dataset(test_df, batch_size, feature_dim)

# Instantiate and compile the GNN model
model = GNNNodeClassifier(
    num_classes=num_classes,
    hidden_units=hidden_units,
    aggregation_type=aggregation_type,
    combination_type=combination_type,
    dropout_rate=dropout_rate,
    normalize=normalize,
)
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

In [None]:
# Train the model: 10 passes over the training graphs
model.fit(x=train_dataset, epochs=10)

Epoch 1/10
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 16ms/step - accuracy: 0.9941 - loss: 0.0215
Epoch 2/10
[1m   9/1600[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m23s[0m 15ms/step - accuracy: 0.9943 - loss: 0.0193

  self.gen.throw(typ, value, traceback)


[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 16ms/step - accuracy: 0.9940 - loss: 0.0215
Epoch 3/10
[1m   9/1600[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m24s[0m 15ms/step - accuracy: 0.9940 - loss: 0.0194

  self.gen.throw(typ, value, traceback)


[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 17ms/step - accuracy: 0.9941 - loss: 0.0214
Epoch 4/10
[1m   7/1600[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m30s[0m 19ms/step - accuracy: 0.9942 - loss: 0.0193

  self.gen.throw(typ, value, traceback)


[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 17ms/step - accuracy: 0.9941 - loss: 0.0213
Epoch 5/10
[1m   9/1600[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m24s[0m 15ms/step - accuracy: 0.9943 - loss: 0.0198

  self.gen.throw(typ, value, traceback)


[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 16ms/step - accuracy: 0.9941 - loss: 0.0211
Epoch 6/10
[1m   9/1600[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m23s[0m 14ms/step - accuracy: 0.9945 - loss: 0.0197

  self.gen.throw(typ, value, traceback)


[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 17ms/step - accuracy: 0.9942 - loss: 0.0210
Epoch 7/10
[1m   9/1600[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m23s[0m 15ms/step - accuracy: 0.9947 - loss: 0.0192

  self.gen.throw(typ, value, traceback)


[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 16ms/step - accuracy: 0.9943 - loss: 0.0206
Epoch 8/10
[1m   9/1600[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m23s[0m 15ms/step - accuracy: 0.9945 - loss: 0.0196

  self.gen.throw(typ, value, traceback)


[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 16ms/step - accuracy: 0.9943 - loss: 0.0205
Epoch 9/10
[1m   9/1600[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m24s[0m 15ms/step - accuracy: 0.9943 - loss: 0.0202

  self.gen.throw(typ, value, traceback)


[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 16ms/step - accuracy: 0.9944 - loss: 0.0203
Epoch 10/10
[1m   9/1600[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m24s[0m 15ms/step - accuracy: 0.9954 - loss: 0.0188

  self.gen.throw(typ, value, traceback)


[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 16ms/step - accuracy: 0.9944 - loss: 0.0200


  self.gen.throw(typ, value, traceback)


<keras.src.callbacks.history.History at 0x244daa288e0>

In [None]:
#Evaluate the model on the held-out images
# evaluate() returns [loss, accuracy] for the loss and metric set in compile()
accuracy = model.evaluate(x=test_dataset)
print(accuracy[1])


[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.9937 - loss: 0.0231
0.9935999512672424


  self.gen.throw(typ, value, traceback)
