In [1]:
import dgl

ModuleNotFoundError: No module named 'dgl'

In [2]:
import dgl
import numpy as np
import torch

In [3]:
g = dgl.graph(([0, 0, 0, 0, 0], [1, 2, 3, 4, 5]), num_nodes=6)
# Equivalently, PyTorch LongTensors also work.
#g = dgl.graph((torch.LongTensor([0, 0, 0, 0, 0]), torch.LongTensor([1, 2, 3, 4, 5])), num_nodes=6)

# You can omit the number of nodes argument if you can tell the number of nodes from the edge list alone.
#g = dgl.graph(([0, 0, 0, 0, 0], [1, 2, 3, 4, 5]))

In [4]:
g

Graph(num_nodes=6, num_edges=5,
      ndata_schemes={}
      edata_schemes={})

In [5]:
print(g.edges())

(tensor([0, 0, 0, 0, 0]), tensor([1, 2, 3, 4, 5]))


In [22]:
print(g.nodes())

tensor([0, 1, 2, 3, 4, 5])


In [23]:
## Assign node features to graph nodes

In [24]:
g.ndata['cs_13C'] = 120 * torch.randn(6,1)

In [25]:
g.ndata['cs_1H'] = 8 * torch.randn(6,1)

In [26]:
print(g.ndata['cs_1H'])

tensor([[ 2.7110],
        [-0.4332],
        [-1.6889],
        [-4.5109],
        [-4.4433],
        [-2.4425]])


In [27]:
g.ndata['cs_13C']

tensor([[-133.0151],
        [-151.0247],
        [-205.0526],
        [-187.3692],
        [-171.7678],
        [ 190.0140]])

In [28]:
g.edata['bonding'] = torch.randn(5,1)

In [29]:
g.edata['bonding']

tensor([[-1.0553],
        [-0.8824],
        [-1.0031],
        [ 0.1669],
        [-2.2609]])

In [30]:
g.out_degrees(0)

5

## Setup DGL

In [31]:
# Contruct a two-layer GNN model
import dgl.nn as dglnn
import torch.nn as nn
import torch.nn.functional as F

class SAGE(nn.Module):
    def __init__(self, in_feats, hid_feats, out_feats):
        super().__init__()
        self.conv1 = dglnn.SAGEConv(
            in_feats=in_feats, out_feats=hid_feats, aggregator_type='mean')
        self.conv2 = dglnn.SAGEConv(
            in_feats=hid_feats, out_feats=out_feats, aggregator_type='mean')

    def forward(self, graph, inputs):
        # inputs are features of nodes
        h = self.conv1(graph, inputs)
        h = F.relu(h)
        h = self.conv2(graph, h)
        return h

In [None]:
model = SAGE()

## Get nodes (atoms) and edges (bonds)

In [1]:
from src.fileio import unpickle_variable

In [2]:
atomdata = unpickle_variable("/home/northja/datasets/bmrb/processed/all_atoms.pkl")

FileNotFoundError: [Errno 2] No such file or directory: '/home/northja/datasets/bmrb/processed/all_atoms.pkl'

In [107]:
atomdata

Unnamed: 0,res_idx,res_id,atom_type,element,chem_shift,cs_error,x,y,z
0,1,MET,N,N,,,1.329,-0.0,-0.0
1,1,MET,CA,C,,,2.093,-0.001002,-1.242
2,1,MET,C,C,,,1.764001,-1.231001,-2.082
3,1,MET,O,O,,,2.258,-2.326,-1.816
4,1,MET,CB,C,,,3.593,0.041,-0.944
...,...,...,...,...,...,...,...,...,...
1915,128,LYS,HE2,H,,,33.168999,16.424999,-2.317
1916,128,LYS,HE3,H,,,31.486,16.77,-1.921
1917,128,LYS,HZ1,H,,,33.162998,18.924,-3.002
1918,128,LYS,HZ2,H,,,31.493999,18.686001,-3.146


In [108]:
import pandas as pd

In [109]:
bonds = unpickle_variable("/home/northja/datasets/bmrb/processed/bondlist.pkl")

In [110]:
bonds

array([[1504, 1488, 1487, ..., 1919, 1917, 1914],
       [1496, 1482, 1481, ..., 1906, 1906, 1904]])

## Assemble into dataset

In [39]:
import dgl
import torch
import numpy as np

In [12]:
# make bidirectional

u, v = torch.tensor(bonds[0]), torch.tensor(bonds[1])

g = dgl.graph((u, v))

In [13]:
gr = dgl.to_bidirected(g)

In [14]:
gr

Graph(num_nodes=1920, num_edges=3866,
      ndata_schemes={}
      edata_schemes={})

In [15]:
g

Graph(num_nodes=1920, num_edges=1933,
      ndata_schemes={}
      edata_schemes={})

In [None]:
gr

### One-hot encode data

Desired form: (NO ATOM TYPE)
- res_id (one-hot, will need to compute over whole dataset before scaling)
- element (one-hot, will need to compute over whole dataset before scaling)

In [111]:
onehot_element = pd.get_dummies(atomdata['element'])

In [112]:
onehot_residue = pd.get_dummies(atomdata['res_id'])

### Collect into feature tensor

In [113]:
shifts_and_positions = atomdata[["chem_shift", "x", "y", "z"]] 

In [114]:
features = pd.concat([shifts_and_positions, onehot_residue, onehot_element], axis=1)

In [115]:
features.dropna()

Unnamed: 0,chem_shift,x,y,z,ALA,ARG,ASN,ASP,CYS,GLN,GLU,GLY,HIS,ILE,LEU,LYS,MET,PHE,PRO,SER,THR,TRP,TYR,VAL,C,H,N,O,S
211,63.671,12.55,12.282,-24.191999,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
212,177.309,14.054,12.171,-23.968,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
214,32.107,11.854,12.774,-22.921,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
224,123.453,14.682,13.294,-23.634001,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
225,55.521,16.118999,13.323,-23.393,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1893,7.983,28.215,16.215,3.665,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1898,126.154,31.768999,15.3,3.396,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
1899,57.615,32.737999,15.385,2.309,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
1902,33.733,32.327999,16.476,1.317,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0


In [116]:
features["chem_shift"].where(features["chem_shift"] == torch.nan)

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
1915    NaN
1916    NaN
1917    NaN
1918    NaN
1919    NaN
Name: chem_shift, Length: 1920, dtype: object

In [117]:
fs = features

In [118]:
fs["has_cs"] = fs["chem_shift"] == np.nan

In [119]:
fs["has_cs"] = fs["has_cs"].astype(int)

In [120]:
fs

Unnamed: 0,chem_shift,x,y,z,ALA,ARG,ASN,ASP,CYS,GLN,GLU,GLY,HIS,ILE,LEU,LYS,MET,PHE,PRO,SER,THR,TRP,TYR,VAL,C,H,N,O,S,has_cs
0,,1.329,-0.0,-0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
1,,2.093,-0.001002,-1.242,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
2,,1.764001,-1.231001,-2.082,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
3,,2.258,-2.326,-1.816,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
4,,3.593,0.041,-0.944,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1915,,33.168999,16.424999,-2.317,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1916,,31.486,16.77,-1.921,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1917,,33.162998,18.924,-3.002,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1918,,31.493999,18.686001,-3.146,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [121]:
sum(fs["has_cs"])

0

In [122]:
fs = fs.fillna(0)

In [123]:
fs["has_cs"] = fs["chem_shift"] != 0

In [124]:
fs["has_cs"] = fs["has_cs"].astype(int)

In [125]:
fs

Unnamed: 0,chem_shift,x,y,z,ALA,ARG,ASN,ASP,CYS,GLN,GLU,GLY,HIS,ILE,LEU,LYS,MET,PHE,PRO,SER,THR,TRP,TYR,VAL,C,H,N,O,S,has_cs
0,0,1.329000,-3.241003e-07,-1.086373e-07,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,2.093000,-1.001682e-03,-1.242000e+00,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
2,0,1.764001,-1.231001e+00,-2.082000e+00,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,2.258000,-2.326000e+00,-1.816000e+00,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,3.593000,4.100004e-02,-9.440004e-01,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1915,0,33.168999,1.642500e+01,-2.317000e+00,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1916,0,31.486000,1.677000e+01,-1.921000e+00,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1917,0,33.162998,1.892400e+01,-3.002000e+00,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1918,0,31.493999,1.868600e+01,-3.146000e+00,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [126]:
sum(fs["has_cs"])

505

In [132]:
feature_tensor = fs.to_numpy().astype(float)

In [133]:
feature_tensor

array([[ 0.00000000e+00,  1.32899964e+00, -3.24100256e-07, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  2.09299970e+00, -1.00168213e-03, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  1.76400054e+00, -1.23100126e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 0.00000000e+00,  3.31629982e+01,  1.89239998e+01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  3.14939995e+01,  1.86860008e+01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  3.25649986e+01,  1.77479992e+01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [134]:
from src.fileio import pickle_variable

In [135]:
pickle_variable(feature_tensor, "/home/northja/datasets/bmrb/processed/feature_tensor.pkl")

### Assemble into graph

In [69]:
gr

Graph(num_nodes=1920, num_edges=3866,
      ndata_schemes={}
      edata_schemes={})

In [None]:
gr

In [71]:
gr_copy = gr

In [None]:
gr_copy.ndata[""]

In [73]:
fsdict = fs.to_dict()

In [80]:
fs.columns.values

array(['chem_shift', 'x', 'y', 'z', 'ALA', 'ARG', 'ASN', 'ASP', 'CYS',
       'GLN', 'GLU', 'GLY', 'HIS', 'ILE', 'LEU', 'LYS', 'MET', 'PHE',
       'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL', 'C', 'H', 'N', 'O', 'S',
       'has_cs'], dtype=object)

In [79]:
fs.to_numpy().T

array([[0, 0, 0, ..., 0, 0, 0],
       [1.328999638557434, 2.0929996967315674, 1.7640005350112915, ...,
        33.16299819946289, 31.493999481201172, 32.564998626708984],
       [-3.241002559661865e-07, -0.0010016821324825287,
        -1.2310012578964233, ..., 18.923999786376953, 18.68600082397461,
        17.74799919128418],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=object)

In [92]:
feature_cols = list(zip(fs.columns.values, fs.to_numpy().astype(float).T))

In [102]:
feature_cols

[('chem_shift', array([0., 0., 0., ..., 0., 0., 0.])),
 ('x',
  array([ 1.32899964,  2.0929997 ,  1.76400054, ..., 33.1629982 ,
         31.49399948, 32.56499863])),
 ('y',
  array([-3.24100256e-07, -1.00168213e-03, -1.23100126e+00, ...,
          1.89239998e+01,  1.86860008e+01,  1.77479992e+01])),
 ('z',
  array([-1.08637323e-07, -1.24200022e+00, -2.08199978e+00, ...,
         -3.00199986e+00, -3.14599967e+00, -4.06099987e+00])),
 ('ALA', array([0., 0., 0., ..., 0., 0., 0.])),
 ('ARG', array([0., 0., 0., ..., 0., 0., 0.])),
 ('ASN', array([0., 0., 0., ..., 0., 0., 0.])),
 ('ASP', array([0., 0., 0., ..., 0., 0., 0.])),
 ('CYS', array([0., 0., 0., ..., 0., 0., 0.])),
 ('GLN', array([0., 0., 0., ..., 0., 0., 0.])),
 ('GLU', array([0., 0., 0., ..., 0., 0., 0.])),
 ('GLY', array([0., 0., 0., ..., 0., 0., 0.])),
 ('HIS', array([0., 0., 0., ..., 0., 0., 0.])),
 ('ILE', array([0., 0., 0., ..., 0., 0., 0.])),
 ('LEU', array([0., 0., 0., ..., 0., 0., 0.])),
 ('LYS', array([0., 0., 0., ..., 1.,

In [100]:
from src.fileio import pickle_variable

In [101]:
pickle_variable(variable=feature_cols, varname="features.pkl")

In [93]:
for k,v in feature_cols:
    gr_copy.ndata[k] = torch.tensor(v)

In [94]:
gr_copy

Graph(num_nodes=1920, num_edges=3866,
      ndata_schemes={'chem_shift': Scheme(shape=(), dtype=torch.float64), 'x': Scheme(shape=(), dtype=torch.float64), 'y': Scheme(shape=(), dtype=torch.float64), 'z': Scheme(shape=(), dtype=torch.float64), 'ALA': Scheme(shape=(), dtype=torch.float64), 'ARG': Scheme(shape=(), dtype=torch.float64), 'ASN': Scheme(shape=(), dtype=torch.float64), 'ASP': Scheme(shape=(), dtype=torch.float64), 'CYS': Scheme(shape=(), dtype=torch.float64), 'GLN': Scheme(shape=(), dtype=torch.float64), 'GLU': Scheme(shape=(), dtype=torch.float64), 'GLY': Scheme(shape=(), dtype=torch.float64), 'HIS': Scheme(shape=(), dtype=torch.float64), 'ILE': Scheme(shape=(), dtype=torch.float64), 'LEU': Scheme(shape=(), dtype=torch.float64), 'LYS': Scheme(shape=(), dtype=torch.float64), 'MET': Scheme(shape=(), dtype=torch.float64), 'PHE': Scheme(shape=(), dtype=torch.float64), 'PRO': Scheme(shape=(), dtype=torch.float64), 'SER': Scheme(shape=(), dtype=torch.float64), 'THR': Scheme(shape=

In [None]:
+