In [2]:
import torch
import torch_geometric
import numpy as np
from torch_geometric.data import Data

In [6]:
import networkx as nx

In [1]:
CONVERTED_TO_PYG_DATA = 0

### Read target

In [28]:
import pandas as pd

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('train.csv')

In [32]:
min(train_df.bandgap_energy_ev)

0.0001

In [29]:
train_df.head()

Unnamed: 0,id,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,formation_energy_ev_natom,bandgap_energy_ev
0,1,33,80.0,0.625,0.375,0.0,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,0.068,3.4387
1,2,194,80.0,0.625,0.375,0.0,6.184,6.1838,23.6287,90.0186,89.998,120.0025,0.249,2.921
2,3,227,40.0,0.8125,0.1875,0.0,9.751,5.6595,13.963,90.9688,91.1228,30.5185,0.1821,2.7438
3,4,167,30.0,0.75,0.0,0.25,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,0.2172,3.3492
4,5,194,80.0,0.0,0.625,0.375,6.6614,6.6612,24.5813,89.996,90.0006,119.9893,0.0505,1.3793


### Graph structures

In [3]:
from torch_geometric.data import Dataset

In [4]:
import torch
import torch_geometric
from torch.nn import Embedding, Module
from torch_geometric.nn import GCNConv
import networkx as nx
import torch.nn.functional as F

import pandas as pd
from torch.nn import Sequential, Linear, ReLU, GRU
from torch_geometric.nn import NNConv, Set2Set

import pickle
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data

In [41]:
def construct_pyg_pickle(input_filename, output_filename, is_train_data=True):
    """
    is_train_data: are we generating geometric data for tain or test?
    """
    elem_to_ix = {"Al": 0, "O": 1, "Ga": 2, "In": 3}
    nuclear_charge = {"Al": 13, "O": 8, "Ga": 31, "In": 49}
    list_of_data = []
    with open(input_filename, 'rb') as f:
        train = pickle.load(f)
    for k in train.keys():
        dict_node_element = nx.get_node_attributes(train[k], 'element')
        dict_edge_length = nx.get_edge_attributes(train[k], 'length')
        # copy length attribute from networkx to pyg
        
        assert (list(dict_edge_length.keys()) == list(train[k].edges))  # make sure that length of edge is mapped to the correct edges
        assert (list(dict_node_element.keys()) == list(train[k].nodes))  # make sure that element is mapped to the correct edges

        # for embedding later
        attrx = {}
        for n in dict_node_element:
            attrx[n] = {}
            attrx[n]['element'] = elem_to_ix[dict_node_element[n]]
            attrx[n]['z'] = nuclear_charge[dict_node_element[n]]
        # attrx == {0: {'element': 1}, 6: {'element': 2)}

        nx.set_node_attributes(train[k], attrx)
        pytorch_graph = torch_geometric.utils.from_networkx(train[k])
        # copy length attribute from networkx to pyg
        if is_train_data:
            pytorch_graph.bandgap_energy_ev = train_df[train_df['id'] == k]['bandgap_energy_ev'].values            
            pytorch_graph.formation_energy_ev_natom = train_df[train_df['id'] == k]['formation_energy_ev_natom'].values
        list_of_data.append(pytorch_graph)

    with open(output_filename, 'wb') as f:
        pickle.dump(list_of_data, f)
    return list_of_data

In [42]:
if not CONVERTED_TO_PYG_DATA:
    train = construct_pyg_pickle("train_struct.pickle", "train_struct_pyg.pickle")
else:
    with open("train_struct_pyg.pickle", 'rb') as f:
        train = pickle.load(f)

In [40]:
# same as above, for test
if not CONVERTED_TO_PYG_DATA:
    test = construct_pyg_pickle("test_struct.pickle", "test_struct_pyg.pickle", is_train_data=False)
else:
    with open("test_struct_pyg.pickle", 'rb') as f:
        test = pickle.load(f)

In [25]:
train

{1069: Data(edge_index=[2, 352], element=[80], xyz=[80, 3], z=[80], length=[352], num_nodes=80, bandgap_energy_ev=[1]),
 1855: Data(edge_index=[2, 420], element=[80], xyz=[80, 3], z=[80], length=[420], num_nodes=80, bandgap_energy_ev=[1]),
 797: Data(edge_index=[2, 106], element=[30], xyz=[30, 3], z=[30], length=[106], num_nodes=30, bandgap_energy_ev=[1]),
 2288: Data(edge_index=[2, 402], element=[80], xyz=[80, 3], z=[80], length=[402], num_nodes=80, bandgap_energy_ev=[1]),
 909: Data(edge_index=[2, 126], element=[38], xyz=[38, 3], z=[38], length=[126], num_nodes=38, bandgap_energy_ev=[1]),
 135: Data(edge_index=[2, 116], element=[30], xyz=[30, 3], z=[30], length=[116], num_nodes=30, bandgap_energy_ev=[1]),
 307: Data(edge_index=[2, 376], element=[80], xyz=[80, 3], z=[80], length=[376], num_nodes=80, bandgap_energy_ev=[1]),
 763: Data(edge_index=[2, 354], element=[80], xyz=[80, 3], z=[80], length=[354], num_nodes=80, bandgap_energy_ev=[1]),
 551: Data(edge_index=[2, 374], element=[80],

In [51]:
EMBED_DIM = 3
DIM = 32

In [52]:
class NomadGCN(torch.nn.Module):
    
    def __init__(self):
        super().__init__()
        self.elem_to_ix = {"Al": 0, "O": 1, "Ga": 2, "In": 3}
        self.embed = Embedding(len(self.elem_to_ix), EMBED_DIM)
        self.lin0 = torch.nn.Linear(EMBED_DIM, DIM)

        # nn = Sequential(Linear(5, 128), ReLU(), Linear(128, DIM * DIM))   # previous
        nn = Sequential(Linear(1, 128), ReLU(), Linear(128, DIM * DIM))
        self.conv = NNConv(DIM, DIM, nn, aggr='mean')
        self.gru = GRU(DIM, DIM)
        self.set2set = Set2Set(DIM, processing_steps=3)
        self.lin1 = torch.nn.Linear(2 * DIM, DIM)
        self.lin2 = torch.nn.Linear(DIM, 1)
    
    def forward(self, data):
        import pdb; pdb.set_trace()
        data.x = torch.tensor([self.elem_to_ix[w] for w in data.nodes])
        out = F.relu(self.lin0(self.embed(data.x)))
        h = out.unsqueeze(0)
        for i in range(3):
            m = F.relu(self.conv(out, data.edge_index, data.length))  # data.length is data.edge_attr
            out, h = self.gru(m.unsqueeze(0), h)
            out = out.squeeze(0)
        out = self.set2set(out, data.batch)
        out = F.relu(self.lin1(out))
        out = self.lin2(out)
        return out.view(-1)        

In [53]:
def train_nn():
    for i in range(EPOCH):
        for data in nomad_loader:
            ouput = net(data)
            loss = mse(output, label)

In [54]:
class NomadDataset(Dataset):
    
    def __init__(self, data: dict):
        super().__init__(None, None, None, None)
        self.data = data
        self.data_indices = list(self.data.keys())
        
    def len(self):
        return len(self.data)

    def get(self, idx: int):
        return self.data[self.data_indices[idx]]

In [55]:
# dataset = NomadDataset()
nomad_loader = DataLoader(list(train.values()), shuffle=True)            
net = NomadGCN()
train_nn()



> [0;32m<ipython-input-52-b982f9ad56f6>[0m(19)[0;36mforward[0;34m()[0m
[0;32m     17 [0;31m    [0;32mdef[0m [0mforward[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mdata[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     18 [0;31m        [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 19 [0;31m        [0mdata[0m[0;34m.[0m[0mx[0m [0;34m=[0m [0mtorch[0m[0;34m.[0m[0mtensor[0m[0;34m([0m[0;34m[[0m[0mself[0m[0;34m.[0m[0melem_to_ix[0m[0;34m[[0m[0mw[0m[0;34m][0m [0;32mfor[0m [0mw[0m [0;32min[0m [0mdata[0m[0;34m.[0m[0mnodes[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     20 [0;31m        [0mout[0m [0;34m=[0m [0mF[0m[0;34m.[0m[0mrelu[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mlin0[0m[0;34m([0m[0mself[0m[0;34m.[0m[0membed[0m[0;34m([0m[0mdata[0m[0;34m.[0m[0mx[0m[0;34m)[0m[0;34m)

ipdb>  c


AttributeError: 'GlobalStorage' object has no attribute 'nodes'