In [1]:
import numpy as np
import torch 

<p align="center">
  <img src="images/caffeine.png" alt="drawing" width="500" align="left"/>
</p>

#### Define a graph 

* You need to define node features and Edge features for the graph 

* The node features has shape of: ~ (N, node_features)

* And the edge features dictionary has shape of: ~ (N, N, edge_features)

In [2]:
# usally we create this for all the molecules  in the problem (here we are showing for one molecule)
# So we need to update the ATOM_TYPE and BOND_TYPE lists based on the all elements and all type of bonds we are going to work on 
# also here we are considering one node feature (atom) and one edge feature (bond type), but in general we have more than one

ATOM_TYPES = {'C': 0, 'N': 1, 'O': 2} # nodes encoding 
BOND_TYPES = {'S': 0, 'D': 1, 'A': 2} # edges encoding 

# caffeine nodes 
sym_caffeine_nodes = {1: {'atom_type': 'C'},
                    2: {'atom_type': 'N'},
                    3: {'atom_type': 'C'},
                    4: {'atom_type': 'O'},
                    5: {'atom_type': 'C'},
                    6: {'atom_type': 'N'},
                    7: {'atom_type': 'C'},
                    8: {'atom_type': 'C'},
                    9: {'atom_type': 'N'},
                    10: {'atom_type': 'C'},
                    11: {'atom_type': 'N'},
                    12: {'atom_type': 'C'},
                    13: {'atom_type': 'C'},
                    14: {'atom_type': 'O'}}

# caffeine edges 
sym_caffeine_edges = {frozenset({1,2}): {'bond_type': 'S'},
                    frozenset({2,3}): {'bond_type': 'S'},
                    frozenset({2,13}): {'bond_type': 'S'},
                    frozenset({3,4}): {'bond_type': 'D'},
                    frozenset({3,5}): {'bond_type': 'S'},
                    frozenset({5,6}): {'bond_type': 'A'},
                    frozenset({5,10}): {'bond_type': 'A'},
                    frozenset({6,7}): {'bond_type': 'S'},
                    frozenset({6,8}): {'bond_type': 'A'},
                    frozenset({8,9}): {'bond_type': 'A'},
                    frozenset({9,10}): {'bond_type': 'A'},
                    frozenset({10,11}): {'bond_type': 'S'},
                    frozenset({11,12}): {'bond_type': 'S'},
                    frozenset({11,13}): {'bond_type': 'S'},
                    frozenset({13,14}): {'bond_type': 'D'}}

#### Node features 

1. We need to first encode the categorical node features into integers 
2. Then we need to create embeddings for (each) encoded integers
3. Finally the numerical node features will be stacked to have shape of: (N, node_feat_dim)

In [3]:
# encode node attributes 
enc_caffeine_nodes = {}
for node_idx, node_attributes in sym_caffeine_nodes.items():
    sym_atom_type = node_attributes['atom_type']
    encoded_atom_type = ATOM_TYPES[sym_atom_type]
    enc_caffeine_nodes[node_idx] = {'atom_type': encoded_atom_type}

enc_caffeine_nodes

{1: {'atom_type': 0},
 2: {'atom_type': 1},
 3: {'atom_type': 0},
 4: {'atom_type': 2},
 5: {'atom_type': 0},
 6: {'atom_type': 1},
 7: {'atom_type': 0},
 8: {'atom_type': 0},
 9: {'atom_type': 1},
 10: {'atom_type': 0},
 11: {'atom_type': 1},
 12: {'atom_type': 0},
 13: {'atom_type': 0},
 14: {'atom_type': 2}}

In [4]:
# encode edge attributes 
enc_caffeine_edges = {}
for edge, edge_attributes in sym_caffeine_edges.items():
    sym_bond_type = edge_attributes['bond_type']
    encoded_bond_type = BOND_TYPES[sym_bond_type]
    enc_caffeine_edges[edge] = {'bond_type': encoded_bond_type}
    
enc_caffeine_edges

{frozenset({1, 2}): {'bond_type': 0},
 frozenset({2, 3}): {'bond_type': 0},
 frozenset({2, 13}): {'bond_type': 0},
 frozenset({3, 4}): {'bond_type': 1},
 frozenset({3, 5}): {'bond_type': 0},
 frozenset({5, 6}): {'bond_type': 2},
 frozenset({5, 10}): {'bond_type': 2},
 frozenset({6, 7}): {'bond_type': 0},
 frozenset({6, 8}): {'bond_type': 2},
 frozenset({8, 9}): {'bond_type': 2},
 frozenset({9, 10}): {'bond_type': 2},
 frozenset({10, 11}): {'bond_type': 0},
 frozenset({11, 12}): {'bond_type': 0},
 frozenset({11, 13}): {'bond_type': 0},
 frozenset({13, 14}): {'bond_type': 1}}

In [5]:
# node embeddings (for catergorical data)
num_embeddings = len(ATOM_TYPES)
embedding_dim = 16  # This is a hyper parameter

# one embedding vector for one atom type (i.e. if the atom is same, then the embedding vector will be same )
ATOM_TYPE_EMBEDDINGS = torch.randn((num_embeddings, embedding_dim))
ATOM_TYPE_EMBEDDINGS

tensor([[-2.9771e+00,  7.2485e-01, -1.2857e+00,  9.5681e-01,  7.5506e-01,
          1.9142e+00, -1.5479e-01, -1.0595e+00,  1.8215e+00,  1.8802e+00,
          3.9737e-02,  3.7848e-01,  1.0095e+00, -1.7395e+00,  2.8441e-01,
         -2.6490e-01],
        [ 2.0475e+00,  7.0039e-01,  1.3285e+00, -1.5199e+00, -3.7156e-01,
         -1.0798e+00, -3.3433e-01, -1.7535e+00, -1.7819e+00, -9.9619e-01,
          3.1049e-01, -8.2550e-01,  2.3600e-01,  8.9366e-01, -1.5740e-01,
         -9.2283e-04],
        [ 3.8197e-02,  4.1376e-02, -3.5007e-01, -1.0524e-01, -3.2507e-01,
          1.5805e+00, -3.6525e-01,  5.5840e-01, -6.1863e-01, -1.0399e+00,
          1.0893e+00,  1.8452e+00,  3.2104e-01, -1.3400e+00, -2.3157e-02,
         -1.5424e-01]])

In [6]:
# embedded caffeine nodes 
embedded_caffeine_nodes = {}
for node_idx, encoded_node_features in enc_caffeine_nodes.items():
    encoded_atom_type = encoded_node_features['atom_type']
    embedded_atom_type = ATOM_TYPE_EMBEDDINGS[encoded_atom_type]  
    embedded_node_features = {'atom_type': embedded_atom_type}
    embedded_caffeine_nodes[node_idx] = embedded_node_features

# print first 3 
i=0
for idx,embedded_node_feats in embedded_caffeine_nodes.items():
    if i<3:
        print(idx,embedded_node_feats)
    i+=1

1 {'atom_type': tensor([-2.9771,  0.7249, -1.2857,  0.9568,  0.7551,  1.9142, -0.1548, -1.0595,
         1.8215,  1.8802,  0.0397,  0.3785,  1.0095, -1.7395,  0.2844, -0.2649])}
2 {'atom_type': tensor([ 2.0475e+00,  7.0039e-01,  1.3285e+00, -1.5199e+00, -3.7156e-01,
        -1.0798e+00, -3.3433e-01, -1.7535e+00, -1.7819e+00, -9.9619e-01,
         3.1049e-01, -8.2550e-01,  2.3600e-01,  8.9366e-01, -1.5740e-01,
        -9.2283e-04])}
3 {'atom_type': tensor([-2.9771,  0.7249, -1.2857,  0.9568,  0.7551,  1.9142, -0.1548, -1.0595,
         1.8215,  1.8802,  0.0397,  0.3785,  1.0095, -1.7395,  0.2844, -0.2649])}


In [7]:
# now we want to create tensors (here we are stacking node fatures for each node)
# Please read slide #1 in the GNN powerpoint 
caffeine_node_features_stack = dict()
for node_idx,embedded_features in sorted(embedded_caffeine_nodes.items()):
    for varibale_name, embedding in embedded_features.items():
        if varibale_name not in caffeine_node_features_stack:
            caffeine_node_features_stack[varibale_name] = []
        caffeine_node_features_stack[varibale_name].append(embedding)
        
stacked_node_features = dict()
for variable_name,stack in caffeine_node_features_stack.items():
    feature_tensor = torch.stack(stack)
    stacked_node_features[variable_name] = feature_tensor

stacked_node_features['atom_type'].shape

torch.Size([14, 16])

In [8]:
# let's combine the steps involving encoding and embedding (seperatly)

# STEP 1: ENCODING 
enc_caffeine_nodes_stack = {'atom_type': []}
for node_idx, node_attributes in sym_caffeine_nodes.items():
    sym_atom_type = node_attributes['atom_type']
    encoded_atom_type = ATOM_TYPES[sym_atom_type]
    enc_caffeine_nodes_stack['atom_type'].append(encoded_atom_type)
enc_caffeine_nodes_stack

{'atom_type': [0, 1, 0, 2, 0, 1, 0, 0, 1, 0, 1, 0, 0, 2]}

In [9]:
# STEP2: EMBEDDING 
emb_caffeine_nodes_stack = {}
for variable_name, encoded_stack in enc_caffeine_nodes_stack.items():
    enc_tensor = torch.tensor(encoded_stack, dtype=torch.long)
    emb_tensor = ATOM_TYPE_EMBEDDINGS[enc_tensor] 
    emb_caffeine_nodes_stack[variable_name] = emb_tensor
emb_caffeine_nodes_stack

{'atom_type': tensor([[-2.9771e+00,  7.2485e-01, -1.2857e+00,  9.5681e-01,  7.5506e-01,
           1.9142e+00, -1.5479e-01, -1.0595e+00,  1.8215e+00,  1.8802e+00,
           3.9737e-02,  3.7848e-01,  1.0095e+00, -1.7395e+00,  2.8441e-01,
          -2.6490e-01],
         [ 2.0475e+00,  7.0039e-01,  1.3285e+00, -1.5199e+00, -3.7156e-01,
          -1.0798e+00, -3.3433e-01, -1.7535e+00, -1.7819e+00, -9.9619e-01,
           3.1049e-01, -8.2550e-01,  2.3600e-01,  8.9366e-01, -1.5740e-01,
          -9.2283e-04],
         [-2.9771e+00,  7.2485e-01, -1.2857e+00,  9.5681e-01,  7.5506e-01,
           1.9142e+00, -1.5479e-01, -1.0595e+00,  1.8215e+00,  1.8802e+00,
           3.9737e-02,  3.7848e-01,  1.0095e+00, -1.7395e+00,  2.8441e-01,
          -2.6490e-01],
         [ 3.8197e-02,  4.1376e-02, -3.5007e-01, -1.0524e-01, -3.2507e-01,
           1.5805e+00, -3.6525e-01,  5.5840e-01, -6.1863e-01, -1.0399e+00,
           1.0893e+00,  1.8452e+00,  3.2104e-01, -1.3400e+00, -2.3157e-02,
          -1.54

#### Edge features 

In [10]:
# bond type embedding 
num_embeddings = len(BOND_TYPES)
embedding_dim = 16  # This is a hyper parameter, you can set it to whatever value you like (though 1 will work poorly)
BOND_TYPE_EMBEDDINGS = torch.randn((num_embeddings, embedding_dim))
BOND_TYPE_EMBEDDINGS

tensor([[-0.4365, -0.3588, -0.5799, -0.3170, -0.6660, -0.4050, -0.7571, -1.0767,
          0.2133, -0.1208,  0.2787,  0.9280,  1.0981, -0.7721, -0.6524,  1.7539],
        [ 0.5037,  0.7694,  0.2166, -0.2285, -0.5482,  0.5331,  1.2060,  0.2541,
          0.2591,  1.3566,  2.0879, -1.1296, -1.7258, -0.8251, -0.3775, -0.1348],
        [-0.2887, -0.1910, -0.1301,  0.5817, -0.1459,  1.9326,  0.8524, -0.0810,
          1.8143, -0.6092, -0.2389, -1.7950, -0.1198, -0.8289,  0.4891, -1.9112]])

In [11]:
# create emplty (filled with zeros) caffine edge feature tensor 
caffeine_nodes = sym_caffeine_nodes.keys()
n_nodes = len(caffeine_nodes)
caffeine_nodes_indices = {node_idx: i for i, node_idx in enumerate(sorted(caffeine_nodes))} # to get indx -> int 
caffeine_edge_features_tensor = torch.zeros((n_nodes, n_nodes, embedding_dim), dtype=BOND_TYPE_EMBEDDINGS.dtype)

In [12]:
# embedded caffeine edges 
embedded_caffeine_edges = {}

for edge_idx, encoded_edge_features in enc_caffeine_edges.items():
    encoded_bond_type = encoded_edge_features['bond_type']
    embedded_bond_type = ATOM_TYPE_EMBEDDINGS[encoded_bond_type]
    embedded_bond_features = {'bond_type':embedded_bond_type}
    embedded_caffeine_edges[edge_idx] = embedded_bond_features
    
# print first 3  
i=0
for idx,embedded_edge_feats in embedded_caffeine_edges.items():
    if i<3:
        print(idx,embedded_edge_feats)
    i+=1

frozenset({1, 2}) {'bond_type': tensor([-2.9771,  0.7249, -1.2857,  0.9568,  0.7551,  1.9142, -0.1548, -1.0595,
         1.8215,  1.8802,  0.0397,  0.3785,  1.0095, -1.7395,  0.2844, -0.2649])}
frozenset({2, 3}) {'bond_type': tensor([-2.9771,  0.7249, -1.2857,  0.9568,  0.7551,  1.9142, -0.1548, -1.0595,
         1.8215,  1.8802,  0.0397,  0.3785,  1.0095, -1.7395,  0.2844, -0.2649])}
frozenset({2, 13}) {'bond_type': tensor([-2.9771,  0.7249, -1.2857,  0.9568,  0.7551,  1.9142, -0.1548, -1.0595,
         1.8215,  1.8802,  0.0397,  0.3785,  1.0095, -1.7395,  0.2844, -0.2649])}


In [13]:
for (node_u,node_v),features in embedded_caffeine_edges.items():
    node_u_idx = caffeine_nodes_indices[node_u]
    node_v_idx = caffeine_nodes_indices[node_v]
    caffeine_edge_features_tensor[node_u_idx,node_v_idx] = features['bond_type']
    
# has shape of : (N,N,d_edge_feat)
caffeine_edge_features_tensor.shape

torch.Size([14, 14, 16])

#### Creating Categorical and continuos  variable class 

* This makes easy to handle categorical data and continuous data effectively 
* For example if we want to define the "NULL" category this makes it easier, we don't need to handle it separately every time when we define categorical variable 

In [14]:
# class for continous data 
class ContinuousVariable:
    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return f'{self.name}'

    def __eq__(self, other):
        return self.name == other.name

    def __hash__(self):
        return hash(self.name)

In [15]:
# class for categorical data 

class CategoricalVariable:
    def __init__(self, name, values, add_null_value=True):
        self.name = name
        self.has_null_value = add_null_value
        if self.has_null_value:
            self.null_value = None
            values = (None,) + tuple(values)
        self.values = tuple(values)
        self.value_to_idx_mapping = {v: i for i, v in enumerate(values)}
        self.inv_value_to_idx_mapping = {i: v for v, i in
                                            self.value_to_idx_mapping.items()}

        if self.has_null_value:
            self.null_value_idx = self.value_to_idx_mapping[self.null_value]

    def get_null_idx(self):
        if self.has_null_value:
            return self.null_value_idx
        else:
            raise RuntimeError(f"Categorical variable {self.name} has no null value")

    def value_to_idx(self, value):
        return self.value_to_idx_mapping[value]

    def idx_to_value(self, idx):
        return self.inv_value_to_idx_mapping[idx]

    def __len__(self):
        return len(self.values)

    def __repr__(self):
        return f'{self.name}'

    def __eq__(self, other):
        return self.name == other.name and self.values == other.values

    def __hash__(self):
        return hash((self.name, self.values))

#### Encoding multiple node and edge attributes

* So far we encoded one node feature (atom type) and one edge feature (bond type)

* Let's try to generalize this for multiple node features (ex. 'atom_type', 'is_aromatic', 'num_hydrogen' and etc) and multiple edge features.

In [16]:
# node features (here we have 4 features)
ATOM_TYPE_VARIABLE = CategoricalVariable('atom_type', ['C', 'N', 'O'])
ATOM_IS_AROMATIC_VARIABLE = CategoricalVariable('is_aromatic', [True, False])
ATOM_HYDROGENS_VARIABLE = ContinuousVariable('num_hydrogens')
ATOM_VARIABLES = [ATOM_TYPE_VARIABLE, ATOM_IS_AROMATIC_VARIABLE, ATOM_HYDROGENS_VARIABLE]

# let's put node features in the node dictionary 
caffeine_nodes = {1: {ATOM_TYPE_VARIABLE: 'C', ATOM_IS_AROMATIC_VARIABLE: False, ATOM_HYDROGENS_VARIABLE: 3},
                2: {ATOM_TYPE_VARIABLE: 'N', ATOM_IS_AROMATIC_VARIABLE: False, ATOM_HYDROGENS_VARIABLE: 0},
                3: {ATOM_TYPE_VARIABLE: 'C', ATOM_IS_AROMATIC_VARIABLE: False, ATOM_HYDROGENS_VARIABLE: 0},
                4: {ATOM_TYPE_VARIABLE: 'O', ATOM_IS_AROMATIC_VARIABLE: False, ATOM_HYDROGENS_VARIABLE: 0},
                5: {ATOM_TYPE_VARIABLE: 'C', ATOM_IS_AROMATIC_VARIABLE: True, ATOM_HYDROGENS_VARIABLE: 0},
                6: {ATOM_TYPE_VARIABLE: 'N', ATOM_IS_AROMATIC_VARIABLE: True, ATOM_HYDROGENS_VARIABLE: 0},
                7: {ATOM_TYPE_VARIABLE: 'C', ATOM_IS_AROMATIC_VARIABLE: False, ATOM_HYDROGENS_VARIABLE: 3},
                8: {ATOM_TYPE_VARIABLE: 'C', ATOM_IS_AROMATIC_VARIABLE: True, ATOM_HYDROGENS_VARIABLE: 1},
                9: {ATOM_TYPE_VARIABLE: 'N', ATOM_IS_AROMATIC_VARIABLE: True, ATOM_HYDROGENS_VARIABLE: 0},
                10: {ATOM_TYPE_VARIABLE: 'C', ATOM_IS_AROMATIC_VARIABLE: True, ATOM_HYDROGENS_VARIABLE: 0},
                11: {ATOM_TYPE_VARIABLE: 'N', ATOM_IS_AROMATIC_VARIABLE: False, ATOM_HYDROGENS_VARIABLE: 0},
                12: {ATOM_TYPE_VARIABLE: 'C', ATOM_IS_AROMATIC_VARIABLE: False, ATOM_HYDROGENS_VARIABLE: 3},
                13: {ATOM_TYPE_VARIABLE: 'C', ATOM_IS_AROMATIC_VARIABLE: False, ATOM_HYDROGENS_VARIABLE: 0},
                14: {ATOM_TYPE_VARIABLE: 'O', ATOM_IS_AROMATIC_VARIABLE: False, ATOM_HYDROGENS_VARIABLE: 0}}

# edge features (we have 3 here)
BOND_TYPES_VARIABLE = CategoricalVariable('bond_type', ('S', 'D', 'A'))
BOND_IS_AROMATIC_VARIABLE = CategoricalVariable('is_aromatic', (True, False))
BOND_VARIABLES = [BOND_TYPES_VARIABLE, BOND_IS_AROMATIC_VARIABLE]

# let's put the edge features in feature dictionary 
caffeine_edges = {frozenset({1,2}): {BOND_TYPES_VARIABLE: 'S', BOND_IS_AROMATIC_VARIABLE: False},
                frozenset({2,3}): {BOND_TYPES_VARIABLE: 'S', BOND_IS_AROMATIC_VARIABLE: False},
                frozenset({2,13}): {BOND_TYPES_VARIABLE: 'S', BOND_IS_AROMATIC_VARIABLE: False},
                frozenset({3,4}): {BOND_TYPES_VARIABLE: 'D', BOND_IS_AROMATIC_VARIABLE: False},
                frozenset({3,5}): {BOND_TYPES_VARIABLE: 'S', BOND_IS_AROMATIC_VARIABLE: False},
                frozenset({5,6}): {BOND_TYPES_VARIABLE: 'A', BOND_IS_AROMATIC_VARIABLE: True},
                frozenset({5,10}): {BOND_TYPES_VARIABLE: 'A', BOND_IS_AROMATIC_VARIABLE: True},
                frozenset({6,7}): {BOND_TYPES_VARIABLE: 'S', BOND_IS_AROMATIC_VARIABLE: False},
                frozenset({6,8}): {BOND_TYPES_VARIABLE: 'A', BOND_IS_AROMATIC_VARIABLE: True},
                frozenset({8,9}): {BOND_TYPES_VARIABLE: 'A', BOND_IS_AROMATIC_VARIABLE: True},
                frozenset({9,10}): {BOND_TYPES_VARIABLE: 'A', BOND_IS_AROMATIC_VARIABLE: True},
                frozenset({10,11}): {BOND_TYPES_VARIABLE: 'S', BOND_IS_AROMATIC_VARIABLE: False},
                frozenset({11,12}): {BOND_TYPES_VARIABLE: 'S', BOND_IS_AROMATIC_VARIABLE: False},
                frozenset({11,13}): {BOND_TYPES_VARIABLE: 'S', BOND_IS_AROMATIC_VARIABLE: False},
                frozenset({13,14}): {BOND_TYPES_VARIABLE: 'D', BOND_IS_AROMATIC_VARIABLE: False}}

In [17]:
# atom and bond variables (categorical and continous)

CATEGORICAL_ATOM_VARIABLES = [var for var in ATOM_VARIABLES if isinstance(var, CategoricalVariable)]
CONTINUOUS_ATOM_VARIABLES = [var for var in ATOM_VARIABLES if isinstance(var, ContinuousVariable)]

CATEGORICAL_BOND_VARIABLES = [var for var in BOND_VARIABLES if isinstance(var, CategoricalVariable)]
CONTINUOUS_BOND_VARIABLES = [var for var in BOND_VARIABLES if isinstance(var, ContinuousVariable)]

In [18]:
# per variable embedding 

from torch.nn import Embedding

def make_embedding(var, embedding_dim):
    num_embeddings = len(var)
    if var.has_null_value:
        pad_idx = var.get_null_idx()
        embedding = Embedding(num_embeddings, embedding_dim, padding_idx=pad_idx)
    else:
        embedding = Embedding(num_embeddings, embedding_dim)
    return embedding

In [19]:
# embedding for catergorical variables 

embedding_dim = 16 # This is a hyper parameter

ATOM_EMBEDDINGS = { var:make_embedding(var, embedding_dim) for var in CATEGORICAL_ATOM_VARIABLES }
BOND_EMBEDDINGS = { var: make_embedding(var, embedding_dim) for var in CATEGORICAL_BOND_VARIABLES }

In [20]:
ATOM_EMBEDDINGS

{atom_type: Embedding(4, 16, padding_idx=0),
 is_aromatic: Embedding(3, 16, padding_idx=0)}

In [21]:
# stacking features for multiple variables 

stacked_encoded_categorical_node_features = {var: [] for var in CATEGORICAL_ATOM_VARIABLES}
stacked_continuous_node_features = {var: [] for var in CONTINUOUS_ATOM_VARIABLES}

for node_idx, features in sorted(caffeine_nodes.items()):
  for var in CATEGORICAL_ATOM_VARIABLES:
    symbolic_value = features[var]
    encoded_value = var.value_to_idx(symbolic_value)
    stacked_encoded_categorical_node_features[var].append(encoded_value)
  for var in CONTINUOUS_ATOM_VARIABLES:
    value = features[var]
    stacked_continuous_node_features[var].append(value)

stacked_encoded_categorical_node_features, stacked_continuous_node_features

({atom_type: [1, 2, 1, 3, 1, 2, 1, 1, 2, 1, 2, 1, 1, 3],
  is_aromatic: [2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2]},
 {num_hydrogens: [3, 0, 0, 0, 0, 0, 3, 1, 0, 0, 0, 3, 0, 0]})

In [22]:
n_nodes = len(caffeine_nodes)
caffeine_nodes_indices = {node_idx: i for i, node_idx in enumerate(sorted(caffeine_nodes.keys()))}

# stacked catogorical edge features encoding 
stacked_encoded_categorical_edge_features = {}

for var in CATEGORICAL_BOND_VARIABLES:
    pairwise_tensor = torch.zeros((n_nodes,n_nodes),dtype=torch.long)
    for (node_u,node_v), features in caffeine_edges.items():
         node_u_idx = caffeine_nodes_indices[node_u]
         node_v_idx = caffeine_nodes_indices[node_v]
         symbolic_value = features[var]
         encoded_value = var.value_to_idx(symbolic_value)
         pairwise_tensor[node_u_idx,node_v_idx] = encoded_value
         pairwise_tensor[node_v_idx,node_u_idx] = encoded_value
         
    stacked_encoded_categorical_edge_features[var] = pairwise_tensor
    
stacked_encoded_categorical_edge_features


{bond_type: tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
         [0, 1, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0],
         [0, 0, 0, 0, 3, 0, 1, 3, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 0, 0, 0],
         [0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 1, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
         [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0]]),
 is_aromatic: tensor([[0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0],
         [0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 2, 0

In [23]:
# stacked continous edge feature 
stacked_continuous_edge_features = {}

for var in CONTINUOUS_BOND_VARIABLES:
    pairwise_tensor = torch.zeros((n_nodes,n_nodes),dtype=torch.float32)
    for (node_u,node_v), features in caffeine_edges.items():
        node_u_idx = caffeine_nodes_indices[node_u]
        node_v_idx = caffeine_nodes_indices[node_v]
        value = features[var]
        pairwise_tensor[node_u_idx, node_v_idx] = value
        pairwise_tensor[node_v_idx, node_u_idx] = value
    stacked_encoded_categorical_edge_features[var] = pairwise_tensor
    
stacked_continuous_edge_features

{}

In [24]:
# lets embed the stacked encoded bond features 

embedded_node_features = dict()
for var, encoded_features in stacked_encoded_categorical_node_features.items():
  tensor_features = torch.tensor(encoded_features, dtype=torch.long)
  embedding = ATOM_EMBEDDINGS[var]
  embedded_features = embedding(tensor_features)
  embedded_node_features[var] = embedded_features

# for the continous variables we don't need embedding 
continuous_node_features = dict()
for var, features in stacked_continuous_node_features.items():
    continuous_node_features[var] = torch.tensor(features, dtype=torch.float32)
    

In [25]:
# embed the steacked encoded edge features 
embedded_edge_features = dict()
for var, encoded_features in stacked_encoded_categorical_edge_features.items():
    tensor_features = torch.tensor(encoded_features, dtype=torch.long)
    embedding = BOND_EMBEDDINGS[var]
    embedded_features = embedding(tensor_features)
    embedded_edge_features[var] = embedded_features

# for the continous edge features no embedding is needed 
continuous_edge_features = dict()
for var, features in stacked_continuous_edge_features.items():
    continuous_edge_features[var] = torch.tensor(features, dtype=torch.float32)

  tensor_features = torch.tensor(encoded_features, dtype=torch.long)


#### Let's examine the shapes of encoded (and embedded if categorical) bond and edge features 

In [26]:
# categorical node features 
for name,value in embedded_node_features.items():
    print(f'var_name = "{name}" and shape = {value.shape}')

var_name = "atom_type" and shape = torch.Size([14, 16])
var_name = "is_aromatic" and shape = torch.Size([14, 16])


In [27]:
# continuous node features 

for name,value in continuous_node_features.items():
    print(f'var_name = "{name}" and shape = {value.shape}')

var_name = "num_hydrogens" and shape = torch.Size([14])


In [28]:
# categorical edge features 
for name,value in embedded_edge_features.items():
    print(f'var_name = "{name}" and shape = {value.shape}')

var_name = "bond_type" and shape = torch.Size([14, 14, 16])
var_name = "is_aromatic" and shape = torch.Size([14, 14, 16])


In [29]:
# continuous edge features 
for name,value in continuous_node_features.items():
    print(f'var_name = "{name}" and shape = {value.shape}')

var_name = "num_hydrogens" and shape = torch.Size([14])


In [30]:
# combine embedded and continous node features

# stack along 0 dim and sum along 0 dim -> sum the node embeddings 
stacked_node_embedding = torch.stack(tuple(embedded_node_features.values()))  
aggregated_node_embeddings = torch.sum(stacked_node_embedding,dim=0)          

# stack cts features along last axis -> concatanting 
continuous_node_features_stacked = torch.stack(tuple(continuous_node_features.values()), dim=-1) 

# concatante agg. node embeddings with stacked cts features to form node feature tensor 
node_features = torch.concat([aggregated_node_embeddings, continuous_node_features_stacked], dim=-1)
node_features.shape

torch.Size([14, 17])

In [31]:
# combine embedded and continous node features 

# stack edge embeddings and aggregate (for. ex sum)
stacked_edge_embeddings = torch.stack(tuple(embedded_edge_features.values()))
aggregated_edge_embeddings = torch.sum(stacked_edge_embeddings, dim=0)

# stack cts variable 
# continuous_edge_features_stacked = torch.stack(tuple(continuous_edge_features.values()), dim=-1) 
# edge_features = torch.concat([aggregated_edge_embeddings,continuous_edge_features_stacked])

# since we don't have cts edge features 
edge_features = aggregated_edge_embeddings
edge_features.shape

torch.Size([14, 14, 16])