In [1]:
import numpy as np
import torch 

<p align="center">
  <img src="images/caffeine.png" alt="drawing" width="500" align="left"/>
</p>

#### Define a graph 

* You need to define node features and Edge features for the graph 

* The node features has shape of: ~ (N, node_features)

* And the edge features dictionary has shape of: ~ (N, N, edge_features)

In [2]:
# usally we create this for all the molecules  in the problem (here we are showing for one molecule)
# So we need to update the ATOM_TYPE and BOND_TYPE lists based on the all elements and all type of bonds we are going to work on 
# also here we are considering one node feature (atom) and one edge feature (bond type), but in general we have more than one

ATOM_TYPES = {'C': 0, 'N': 1, 'O': 2} # nodes encoding 
BOND_TYPES = {'S': 0, 'D': 1, 'A': 2} # edges encoding 

# caffeine nodes 
sym_caffeine_nodes = {1: {'atom_type': 'C'},
                    2: {'atom_type': 'N'},
                    3: {'atom_type': 'C'},
                    4: {'atom_type': 'O'},
                    5: {'atom_type': 'C'},
                    6: {'atom_type': 'N'},
                    7: {'atom_type': 'C'},
                    8: {'atom_type': 'C'},
                    9: {'atom_type': 'N'},
                    10: {'atom_type': 'C'},
                    11: {'atom_type': 'N'},
                    12: {'atom_type': 'C'},
                    13: {'atom_type': 'C'},
                    14: {'atom_type': 'O'}}

# caffeine edges 
sym_caffeine_edges = {frozenset({1,2}): {'bond_type': 'S'},
                    frozenset({2,3}): {'bond_type': 'S'},
                    frozenset({2,13}): {'bond_type': 'S'},
                    frozenset({3,4}): {'bond_type': 'D'},
                    frozenset({3,5}): {'bond_type': 'S'},
                    frozenset({5,6}): {'bond_type': 'A'},
                    frozenset({5,10}): {'bond_type': 'A'},
                    frozenset({6,7}): {'bond_type': 'S'},
                    frozenset({6,8}): {'bond_type': 'A'},
                    frozenset({8,9}): {'bond_type': 'A'},
                    frozenset({9,10}): {'bond_type': 'A'},
                    frozenset({10,11}): {'bond_type': 'S'},
                    frozenset({11,12}): {'bond_type': 'S'},
                    frozenset({11,13}): {'bond_type': 'S'},
                    frozenset({13,14}): {'bond_type': 'D'}}

#### Node features 

1. We need to first encode the categorical node features into integers 
2. Then we need to create embeddings for (each) encoded integers
3. Finally the numerical node features will be stacked to have shape of: (N, node_feat_dim)

In [3]:
# encode node attributes 
enc_caffeine_nodes = {}
for node_idx, node_attributes in sym_caffeine_nodes.items():
    sym_atom_type = node_attributes['atom_type']
    encoded_atom_type = ATOM_TYPES[sym_atom_type]
    enc_caffeine_nodes[node_idx] = {'atom_type': encoded_atom_type}

enc_caffeine_nodes

{1: {'atom_type': 0},
 2: {'atom_type': 1},
 3: {'atom_type': 0},
 4: {'atom_type': 2},
 5: {'atom_type': 0},
 6: {'atom_type': 1},
 7: {'atom_type': 0},
 8: {'atom_type': 0},
 9: {'atom_type': 1},
 10: {'atom_type': 0},
 11: {'atom_type': 1},
 12: {'atom_type': 0},
 13: {'atom_type': 0},
 14: {'atom_type': 2}}

In [4]:
# encode edge attributes 
enc_caffeine_edges = {}
for edge, edge_attributes in sym_caffeine_edges.items():
    sym_bond_type = edge_attributes['bond_type']
    encoded_bond_type = BOND_TYPES[sym_bond_type]
    enc_caffeine_edges[edge] = {'bond_type': encoded_bond_type}
    
enc_caffeine_edges

{frozenset({1, 2}): {'bond_type': 0},
 frozenset({2, 3}): {'bond_type': 0},
 frozenset({2, 13}): {'bond_type': 0},
 frozenset({3, 4}): {'bond_type': 1},
 frozenset({3, 5}): {'bond_type': 0},
 frozenset({5, 6}): {'bond_type': 2},
 frozenset({5, 10}): {'bond_type': 2},
 frozenset({6, 7}): {'bond_type': 0},
 frozenset({6, 8}): {'bond_type': 2},
 frozenset({8, 9}): {'bond_type': 2},
 frozenset({9, 10}): {'bond_type': 2},
 frozenset({10, 11}): {'bond_type': 0},
 frozenset({11, 12}): {'bond_type': 0},
 frozenset({11, 13}): {'bond_type': 0},
 frozenset({13, 14}): {'bond_type': 1}}

In [5]:
# node embeddings (for catergorical data)
num_embeddings = len(ATOM_TYPES)
embedding_dim = 16  # This is a hyper parameter

# one embedding vector for one atom type (i.e. if the atom is same, then the embedding vector will be same )
ATOM_TYPE_EMBEDDINGS = torch.randn((num_embeddings, embedding_dim))
ATOM_TYPE_EMBEDDINGS

tensor([[ 0.5915,  0.2338, -0.5236, -0.5252, -0.2997, -0.2778, -0.8636,  2.6273,
          1.1535, -1.4986,  1.0591,  1.6145,  0.7571, -0.2929,  0.4991,  0.0614],
        [ 0.8430, -0.3465,  0.5837,  1.3363, -0.6648, -0.5628,  1.2693, -1.4810,
         -1.0072,  0.8442,  0.1683, -0.4431,  1.2046,  1.9453,  1.3180, -0.3411],
        [-0.7672, -1.8820, -1.9126,  0.9441,  0.8713,  0.2552, -0.7308, -0.5687,
         -1.7632, -0.4655, -0.0700, -0.5780, -1.9135,  0.4369, -0.0833,  0.5947]])

In [6]:
# embedded caffeine nodes 
embedded_caffeine_nodes = {}
for node_idx, encoded_node_features in enc_caffeine_nodes.items():
    encoded_atom_type = encoded_node_features['atom_type']
    embedded_atom_type = ATOM_TYPE_EMBEDDINGS[encoded_atom_type]  
    embedded_node_features = {'atom_type': embedded_atom_type}
    embedded_caffeine_nodes[node_idx] = embedded_node_features

# print first 3 
i=0
for idx,embedded_node_feats in embedded_caffeine_nodes.items():
    if i<3:
        print(idx,embedded_node_feats)
    i+=1

1 {'atom_type': tensor([ 0.5915,  0.2338, -0.5236, -0.5252, -0.2997, -0.2778, -0.8636,  2.6273,
         1.1535, -1.4986,  1.0591,  1.6145,  0.7571, -0.2929,  0.4991,  0.0614])}
2 {'atom_type': tensor([ 0.8430, -0.3465,  0.5837,  1.3363, -0.6648, -0.5628,  1.2693, -1.4810,
        -1.0072,  0.8442,  0.1683, -0.4431,  1.2046,  1.9453,  1.3180, -0.3411])}
3 {'atom_type': tensor([ 0.5915,  0.2338, -0.5236, -0.5252, -0.2997, -0.2778, -0.8636,  2.6273,
         1.1535, -1.4986,  1.0591,  1.6145,  0.7571, -0.2929,  0.4991,  0.0614])}


In [11]:
# now we want to create tensors (here we are stacking node fatures for each node)
# Please read slide #1 in the GNN powerpoint 
caffeine_node_features_stack = dict()
for node_idx,embedded_features in sorted(embedded_caffeine_nodes.items()):
    for varibale_name, embedding in embedded_features.items():
        if varibale_name not in caffeine_node_features_stack:
            caffeine_node_features_stack[varibale_name] = []
        caffeine_node_features_stack[varibale_name].append(embedding)
        
stacked_node_features = dict()
for variable_name,stack in caffeine_node_features_stack.items():
    feature_tensor = torch.stack(stack)
    stacked_node_features[variable_name] = feature_tensor

stacked_node_features['atom_type'].shape

torch.Size([14, 16])

In [9]:
# let's combine the steps involving encoding and embedding (seperatly)

# STEP 1: ENCODING 
enc_caffeine_nodes_stack = {'atom_type': []}
for node_idx, node_attributes in sym_caffeine_nodes.items():
    sym_atom_type = node_attributes['atom_type']
    encoded_atom_type = ATOM_TYPES[sym_atom_type]
    enc_caffeine_nodes_stack['atom_type'].append(encoded_atom_type)
enc_caffeine_nodes_stack

{'atom_type': [0, 1, 0, 2, 0, 1, 0, 0, 1, 0, 1, 0, 0, 2]}

In [10]:
# STEP2: EMBEDDING 
emb_caffeine_nodes_stack = {}
for variable_name, encoded_stack in enc_caffeine_nodes_stack.items():
    enc_tensor = torch.tensor(encoded_stack, dtype=torch.long)
    emb_tensor = ATOM_TYPE_EMBEDDINGS[enc_tensor] 
    emb_caffeine_nodes_stack[variable_name] = emb_tensor
emb_caffeine_nodes_stack

{'atom_type': tensor([[ 0.5915,  0.2338, -0.5236, -0.5252, -0.2997, -0.2778, -0.8636,  2.6273,
           1.1535, -1.4986,  1.0591,  1.6145,  0.7571, -0.2929,  0.4991,  0.0614],
         [ 0.8430, -0.3465,  0.5837,  1.3363, -0.6648, -0.5628,  1.2693, -1.4810,
          -1.0072,  0.8442,  0.1683, -0.4431,  1.2046,  1.9453,  1.3180, -0.3411],
         [ 0.5915,  0.2338, -0.5236, -0.5252, -0.2997, -0.2778, -0.8636,  2.6273,
           1.1535, -1.4986,  1.0591,  1.6145,  0.7571, -0.2929,  0.4991,  0.0614],
         [-0.7672, -1.8820, -1.9126,  0.9441,  0.8713,  0.2552, -0.7308, -0.5687,
          -1.7632, -0.4655, -0.0700, -0.5780, -1.9135,  0.4369, -0.0833,  0.5947],
         [ 0.5915,  0.2338, -0.5236, -0.5252, -0.2997, -0.2778, -0.8636,  2.6273,
           1.1535, -1.4986,  1.0591,  1.6145,  0.7571, -0.2929,  0.4991,  0.0614],
         [ 0.8430, -0.3465,  0.5837,  1.3363, -0.6648, -0.5628,  1.2693, -1.4810,
          -1.0072,  0.8442,  0.1683, -0.4431,  1.2046,  1.9453,  1.3180, -0.3411

#### Edge features 

In [26]:
# bond type embedding 
num_embeddings = len(BOND_TYPES)
embedding_dim = 16  # This is a hyper parameter, you can set it to whatever value you like (though 1 will work poorly)
BOND_TYPE_EMBEDDINGS = torch.randn((num_embeddings, embedding_dim))
BOND_TYPE_EMBEDDINGS

tensor([[-0.0578, -0.2147, -0.4536,  0.0059,  0.4652,  0.4706,  0.2161,  0.7681,
          0.1815,  0.5205,  0.5546,  0.1349,  0.8154,  1.2754,  0.2409,  0.7473],
        [-0.5989,  0.1858, -1.6824,  0.9686, -1.0566, -0.8747, -0.3959, -1.4704,
         -0.5375,  0.1018, -1.3207,  0.3371,  0.1343,  0.7643,  1.1319, -0.2975],
        [ 0.1107,  0.1844,  0.2405,  0.2023, -0.6089, -0.1008, -0.4015, -0.2590,
         -1.0013,  0.4503,  0.0435,  1.4452,  0.5089, -1.1479,  0.3041, -1.1514]])

In [27]:
# create emplty (filled with zeros) caffine edge feature tensor 
caffeine_nodes = sym_caffeine_nodes.keys()
n_nodes = len(caffeine_nodes)
caffeine_nodes_indices = {node_idx: i for i, node_idx in enumerate(sorted(caffeine_nodes))} # to get indx -> int 
caffeine_edge_features_tensor = torch.zeros((n_nodes, n_nodes, embedding_dim), dtype=BOND_TYPE_EMBEDDINGS.dtype)

In [28]:
# embedded caffeine edges 
embedded_caffeine_edges = {}

for edge_idx, encoded_edge_features in enc_caffeine_edges.items():
    encoded_bond_type = encoded_edge_features['bond_type']
    embedded_bond_type = ATOM_TYPE_EMBEDDINGS[encoded_bond_type]
    embedded_bond_features = {'bond_type':embedded_bond_type}
    embedded_caffeine_edges[edge_idx] = embedded_bond_features
    
# print first 3  
i=0
for idx,embedded_edge_feats in embedded_caffeine_edges.items():
    if i<3:
        print(idx,embedded_edge_feats)
    i+=1

frozenset({1, 2}) {'bond_type': tensor([ 0.5915,  0.2338, -0.5236, -0.5252, -0.2997, -0.2778, -0.8636,  2.6273,
         1.1535, -1.4986,  1.0591,  1.6145,  0.7571, -0.2929,  0.4991,  0.0614])}
frozenset({2, 3}) {'bond_type': tensor([ 0.5915,  0.2338, -0.5236, -0.5252, -0.2997, -0.2778, -0.8636,  2.6273,
         1.1535, -1.4986,  1.0591,  1.6145,  0.7571, -0.2929,  0.4991,  0.0614])}
frozenset({2, 13}) {'bond_type': tensor([ 0.5915,  0.2338, -0.5236, -0.5252, -0.2997, -0.2778, -0.8636,  2.6273,
         1.1535, -1.4986,  1.0591,  1.6145,  0.7571, -0.2929,  0.4991,  0.0614])}


In [32]:
for (node_u,node_v),features in embedded_caffeine_edges.items():
    node_u_idx = caffeine_nodes_indices[node_u]
    node_v_idx = caffeine_nodes_indices[node_v]
    caffeine_edge_features_tensor[node_u_idx,node_v_idx] = features['bond_type']
    
# has shape of : (N,N,d_edge_feat)
caffeine_edge_features_tensor.shape

torch.Size([14, 14, 16])

#### Creating Categorical and continuos  variable class 

* This makes easy to handle categorical data and continuous data effectively 
* For example if we want to define the "NULL" category this makes it easier, we don't need to handle it separately every time when we define categorical variable 

In [33]:
# class for continous data 
class ContinuousVariable:
    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return f''

    def __eq__(self, other):
        return self.name == other.name

    def __hash__(self):
        return hash(self.name)

In [34]:
# class for categorical data 

class CategoricalVariable:
    def __init__(self, name, values, add_null_value=True):
        self.name = name
        self.has_null_value = add_null_value
        if self.has_null_value:
            self.null_value = None
            values = (None,) + tuple(values)
        self.values = tuple(values)
        self.value_to_idx_mapping = {v: i for i, v in enumerate(values)}
        self.inv_value_to_idx_mapping = {i: v for v, i in
                                            self.value_to_idx_mapping.items()}

        if self.has_null_value:
            self.null_value_idx = self.value_to_idx_mapping[self.null_value]

    def get_null_idx(self):
        if self.has_null_value:
            return self.null_value_idx
        else:
            raise RuntimeError(f"Categorical variable {self.name} has no null value")

    def value_to_idx(self, value):
        return self.value_to_idx_mapping[value]

    def idx_to_value(self, idx):
        return self.inv_value_to_idx_mapping[idx]

    def __len__(self):
        return len(self.values)

    def __repr__(self):
        return f''

    def __eq__(self, other):
        return self.name == other.name and self.values == other.values

    def __hash__(self):
        return hash((self.name, self.values))

#### Encoding multiple node and edge attributes

* So far we encoded one node feature (atom type) and one edge feature (bond type)

* Let's try to generalize this for multiple node features (ex. 'atom_type', 'is_aromatic', 'num_hydrogen' and etc) and multiple edge features.

In [35]:
# node features (here we have 4 features)
ATOM_TYPE_VARIABLE = CategoricalVariable('atom_type', ['C', 'N', 'O'])
ATOM_IS_AROMATIC_VARIABLE = CategoricalVariable('is_aromatic', [True, False])
ATOM_HYDROGENS_VARIABLE = ContinuousVariable('num_hydrogens')
ATOM_VARIABLES = [ATOM_TYPE_VARIABLE, ATOM_IS_AROMATIC_VARIABLE, ATOM_HYDROGENS_VARIABLE]

# let's put node features in the node dictionary 
caffeine_nodes = {1: {ATOM_TYPE_VARIABLE: 'C', ATOM_IS_AROMATIC_VARIABLE: False, ATOM_HYDROGENS_VARIABLE: 3},
                2: {ATOM_TYPE_VARIABLE: 'N', ATOM_IS_AROMATIC_VARIABLE: False, ATOM_HYDROGENS_VARIABLE: 0},
                3: {ATOM_TYPE_VARIABLE: 'C', ATOM_IS_AROMATIC_VARIABLE: False, ATOM_HYDROGENS_VARIABLE: 0},
                4: {ATOM_TYPE_VARIABLE: 'O', ATOM_IS_AROMATIC_VARIABLE: False, ATOM_HYDROGENS_VARIABLE: 0},
                5: {ATOM_TYPE_VARIABLE: 'C', ATOM_IS_AROMATIC_VARIABLE: True, ATOM_HYDROGENS_VARIABLE: 0},
                6: {ATOM_TYPE_VARIABLE: 'N', ATOM_IS_AROMATIC_VARIABLE: True, ATOM_HYDROGENS_VARIABLE: 0},
                7: {ATOM_TYPE_VARIABLE: 'C', ATOM_IS_AROMATIC_VARIABLE: False, ATOM_HYDROGENS_VARIABLE: 3},
                8: {ATOM_TYPE_VARIABLE: 'C', ATOM_IS_AROMATIC_VARIABLE: True, ATOM_HYDROGENS_VARIABLE: 1},
                9: {ATOM_TYPE_VARIABLE: 'N', ATOM_IS_AROMATIC_VARIABLE: True, ATOM_HYDROGENS_VARIABLE: 0},
                10: {ATOM_TYPE_VARIABLE: 'C', ATOM_IS_AROMATIC_VARIABLE: True, ATOM_HYDROGENS_VARIABLE: 0},
                11: {ATOM_TYPE_VARIABLE: 'N', ATOM_IS_AROMATIC_VARIABLE: False, ATOM_HYDROGENS_VARIABLE: 0},
                12: {ATOM_TYPE_VARIABLE: 'C', ATOM_IS_AROMATIC_VARIABLE: False, ATOM_HYDROGENS_VARIABLE: 3},
                13: {ATOM_TYPE_VARIABLE: 'C', ATOM_IS_AROMATIC_VARIABLE: False, ATOM_HYDROGENS_VARIABLE: 0},
                14: {ATOM_TYPE_VARIABLE: 'O', ATOM_IS_AROMATIC_VARIABLE: False, ATOM_HYDROGENS_VARIABLE: 0}}

# edge features (we have 3 here)
BOND_TYPES_VARIABLE = CategoricalVariable('bond_type', ('S', 'D', 'A'))
BOND_IS_AROMATIC_VARIABLE = CategoricalVariable('is_aromatic', (True, False))
BOND_VARIABLES = [BOND_TYPES_VARIABLE, BOND_IS_AROMATIC_VARIABLE]

# let's put the edge features in feature dictionary 
caffeine_edges = {frozenset({1,2}): {BOND_TYPES_VARIABLE: 'S', BOND_IS_AROMATIC_VARIABLE: False},
                frozenset({2,3}): {BOND_TYPES_VARIABLE: 'S', BOND_IS_AROMATIC_VARIABLE: False},
                frozenset({2,13}): {BOND_TYPES_VARIABLE: 'S', BOND_IS_AROMATIC_VARIABLE: False},
                frozenset({3,4}): {BOND_TYPES_VARIABLE: 'D', BOND_IS_AROMATIC_VARIABLE: False},
                frozenset({3,5}): {BOND_TYPES_VARIABLE: 'S', BOND_IS_AROMATIC_VARIABLE: False},
                frozenset({5,6}): {BOND_TYPES_VARIABLE: 'A', BOND_IS_AROMATIC_VARIABLE: True},
                frozenset({5,10}): {BOND_TYPES_VARIABLE: 'A', BOND_IS_AROMATIC_VARIABLE: True},
                frozenset({6,7}): {BOND_TYPES_VARIABLE: 'S', BOND_IS_AROMATIC_VARIABLE: False},
                frozenset({6,8}): {BOND_TYPES_VARIABLE: 'A', BOND_IS_AROMATIC_VARIABLE: True},
                frozenset({8,9}): {BOND_TYPES_VARIABLE: 'A', BOND_IS_AROMATIC_VARIABLE: True},
                frozenset({9,10}): {BOND_TYPES_VARIABLE: 'A', BOND_IS_AROMATIC_VARIABLE: True},
                frozenset({10,11}): {BOND_TYPES_VARIABLE: 'S', BOND_IS_AROMATIC_VARIABLE: False},
                frozenset({11,12}): {BOND_TYPES_VARIABLE: 'S', BOND_IS_AROMATIC_VARIABLE: False},
                frozenset({11,13}): {BOND_TYPES_VARIABLE: 'S', BOND_IS_AROMATIC_VARIABLE: False},
                frozenset({13,14}): {BOND_TYPES_VARIABLE: 'D', BOND_IS_AROMATIC_VARIABLE: False}}