In [1]:
### INSTALLATION FOR COLAB USAGE
import torch
pytorch_version = f"torch-{torch.__version__}.html"
!pip install --no-index torch-scatter -f https://pytorch-geometric.com/whl/$pytorch_version
!pip install --no-index torch-sparse -f https://pytorch-geometric.com/whl/$pytorch_version
!pip install --no-index torch-cluster -f https://pytorch-geometric.com/whl/$pytorch_version
!pip install --no-index torch-spline-conv -f https://pytorch-geometric.com/whl/$pytorch_version
!pip install torch-geometric
!pip install torchmetrics

Looking in links: https://pytorch-geometric.com/whl/torch-2.8.0+cu126.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.8.0%2Bcu126/torch_scatter-2.1.2%2Bpt28cu126-cp312-cp312-linux_x86_64.whl (10.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m128.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt28cu126
Looking in links: https://pytorch-geometric.com/whl/torch-2.8.0+cu126.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.8.0%2Bcu126/torch_sparse-0.6.18%2Bpt28cu126-cp312-cp312-linux_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt28cu126
Looking in links: https://pytorch-geometric.com/whl/torch-2.8.0+cu126.html
Collecting torch-cluster
 

In [2]:
### FILE LOADING FOR COLAB USAGE
!rm -rf /content/*
!ls /content

from google.colab import files
uploaded = files.upload()

!unzip train.zip -d /
!unzip test.zip -d /

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /train/1451/geometry.xyz  
   creating: /train/1452/
  inflating: /train/1452/geometry.xyz  
   creating: /train/1453/
  inflating: /train/1453/geometry.xyz  
   creating: /train/1454/
  inflating: /train/1454/geometry.xyz  
   creating: /train/1455/
  inflating: /train/1455/geometry.xyz  
   creating: /train/1456/
  inflating: /train/1456/geometry.xyz  
   creating: /train/1457/
  inflating: /train/1457/geometry.xyz  
   creating: /train/1458/
  inflating: /train/1458/geometry.xyz  
   creating: /train/1459/
  inflating: /train/1459/geometry.xyz  
   creating: /train/146/
  inflating: /train/146/geometry.xyz  
   creating: /train/1460/
  inflating: /train/1460/geometry.xyz  
   creating: /train/1461/
  inflating: /train/1461/geometry.xyz  
   creating: /train/1462/
  inflating: /train/1462/geometry.xyz  
   creating: /train/1463/
  inflating: /train/1463/geometry.xyz  
   creating: /train/1464/
  inflating: 

This notebook implements a pipeline to convert atomic coordinates from DFT output files into graph representations. It uses PyTorch Geometric to build and train a GNN model to predict groung state (GS) energies and band gap (BG) energies of semiconductors. It was written to approach the competition Nomad2018 Predicting Transparent Conductors hosted by Kaggle. It requires the file semiconductors_pipe_funcs.py where the pipelines are stored.

In [3]:
### PYTHON LIBRARIES
import numpy as np
import pandas as pd
import seaborn as sns
import random
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.data import Batch, Dataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn import Sequential, GCNConv, Linear
from torch_geometric.nn import global_mean_pool, global_add_pool
from torch.nn import Softmax, ReLU as Softmax, ReLU
from torchmetrics import MeanSquaredLogError as MSLE, MeanSquaredError as MSE

### MY UTILITIES LIBRARY
from semiconductors_pipe_funcs import *

### SET DEVICE
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device", device)

Using device cuda


In [4]:
### SET SEED
SEED = 42
set_seed(SEED)

In [5]:
### DATA LOADING AD PREPROCESSING

train = pd.read_csv("train.csv", index_col='id')
#test = pd.read_csv("test.csv", index_col='id')

## Rename columns, one-hot encode spacegroup, rescale lattice parameters and separate targets (`E`, `Bandgap`)
X_train, y_train = my_pipeline(train)
#X_test, y_test = my_pipeline(test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_data['Bandgap'] = 0.1*y_data['Bandgap']


In [7]:
### GRAPHS CONSTRUCTION

## It gathers the element information and atomic coordinates from the DFT output files to build the graphs.
 ## Since the Nomad2018 database contains structures with widely different supercell sizes,
## all structures have been expanded to 240 atoms using the unit lattice vectors,
## so all structures are finite but are treated on the same footing.
## Nodes have 6 features: atomic number, electronegativity, and a 4 one-hot encoding of the element.
## Edges are build between nodes using a maximum interatomic distance d_max (given in Angstroms). Edge weights are set to 1/r**2.
## 12 graph attributes are 3 element concentrations, 3 rescaled lattice vectors and 6 one-hot enoded spacegroup.

d_max = 4.0
data_list = create_datalist(X_train,y_train, d_max=d_max)
#test_list = create_datalist(X_test,y_test, d_max=5.0)

creating list of Data objects using d_max=4.0


  graph_attr = torch.tensor([X.iloc[i]],dtype=torch.float32)
  ys = torch.tensor([y.iloc[i]],dtype=torch.float32)


In [10]:
torch.save(data_list,"data_list_dmax{d_max}.pt")

In [11]:
# Wrap data in a data loader
data_size = len(data_list)

data_loader = DataLoader(data_list[:int(data_size * 0.8)], batch_size=64, shuffle=True)
test_loader = DataLoader(data_list[int(data_size * 0.8):], batch_size=64, shuffle=True)

In [12]:
num_node_features = data_list[0].num_node_features
graph_attr_size = data_list[0].graph_attr.size(1)

In [13]:
### MODEL DEFINITION

embedding_size = 64

class GCN_Gfeatured(torch.nn.Module):
    def __init__(self):
        super(GCN_Gfeatured, self).__init__()

        ## NODE LEVEL CONVOLUTIONAL LAYERS
        self.node_encoder = GCNConv(num_node_features, embedding_size)
        self.conv1 = GCNConv(embedding_size, embedding_size)
        #self.conv2 = GCNConv(embedding_size, embedding_size)

        ## GRAPH FEATURE LAYERS
        self.graph1 = Linear(graph_attr_size, embedding_size)
        self.graph2 = Linear(embedding_size, embedding_size)

        # MIXED NODE-GRAPH LEVEL LAYERS
        self.mix = Linear(embedding_size*2, embedding_size)
        self.mix2 = Linear(embedding_size, embedding_size)
        self.out = Linear(embedding_size, 1)

    def forward(self, x, edge_index, edge_weight, graph_attr, batch_index):
        ## NODE LAYERS
        hidden = self.node_encoder(x, edge_index, edge_weight = edge_weight.squeeze())
        hidden = F.relu(hidden)

        hidden = self.conv1(hidden, edge_index, edge_weight = edge_weight.squeeze())
        hidden = F.relu(hidden)

        #hidden = self.conv2(hidden, edge_index, edge_weight = edge_weight.squeeze())
        #hidden = F.relu(hidden)

        ## POOLING NODES
        pooled = global_add_pool(hidden, batch_index)

        ## GLOBAL GRAPH LAYERS
        graph_features = self.graph1(graph_attr)
        graph_features = F.relu(graph_features)

        graph_features = self.graph2(graph_features)
        graph_features = F.relu(graph_features)

        ##COMBINE NODE + GRAPH FEATURES
        hidden = torch.cat([pooled, graph_features], dim=1)

        mix = self.mix(hidden)
        F.relu(mix)
        mix = self.mix2(mix)
        F.relu(mix)
        out = self.out(mix)

        return out


In [15]:
## MODEL, ERROR FUNCTION AND OPTOMIZER FOR GS ENERGY
set_seed(42)

model_E = GCN_Gfeatured()
print(model_E)
loss_fn = MSE()
optimizer = torch.optim.Adam(model_E.parameters(), lr=0.001)

## MODEL TO GPU
model_E = model_E.to(device)
loss_fn = loss_fn.to(device)

GCN_Gfeatured(
  (node_encoder): GCNConv(6, 64)
  (conv1): GCNConv(64, 64)
  (graph1): Linear(12, 64, bias=True)
  (graph2): Linear(64, 64, bias=True)
  (mix): Linear(128, 64, bias=True)
  (mix2): Linear(64, 64, bias=True)
  (out): Linear(64, 1, bias=True)
)


In [20]:
### TRAINING FOR GROUND STATE ENERGY

def train(data_loader):
    model_E.train()
    total_loss = 0
    for batch in data_loader:
      ## USE GPU
      #batch = dtype_2check(batch)
      batch.to(device)
      ## RESET GRADIENTS
      optimizer.zero_grad()
      # MODEL FEED
      pred = model_E(batch.x, batch.edge_index, batch.edge_weight, batch.graph_attr, batch.batch)
      ## LOSS AND GRADIENT CALCULATION
      E_real = batch.y[:,0].reshape([-1,1])
      #E_gap_real = batch.y[:,1].reshape([-1,1])
      loss = loss_fn(pred,E_real)
      #loss = loss_fn(pred, batch.y)
      loss.backward()
      ## UPDATE BY GRADIENT
      optimizer.step()
      total_loss += loss.item() * batch.num_graphs
      return total_loss / len(data_loader.dataset)


epochs = 600
for epoch in range(epochs):
    loss = train(data_loader)
    if epoch % 100 == 0:
      print(f"Epoch {epoch:04d} | Train Loss: {loss:.5f}")

Epoch 0000 | Train Loss: 0.00011
Epoch 0100 | Train Loss: 0.00009
Epoch 0200 | Train Loss: 0.00008
Epoch 0300 | Train Loss: 0.00004
Epoch 0400 | Train Loss: 0.00012
Epoch 0500 | Train Loss: 0.00008


In [21]:
### TESTING MODEL FOR GS ENERGY

test_batch = next(iter(test_loader))
with torch.no_grad():
    test_batch.to(device)
    pred = model_E(test_batch.x.float(), test_batch.edge_index, test_batch.edge_weight, test_batch.graph_attr, test_batch.batch)
    pred = pred.cpu()
    df = pd.DataFrame()
    df["E_real"] = test_batch.y[:,0].tolist()
    df["E_pred"] = pred[:,0]


In [22]:
## MODEL, ERROR FUNCTION AND OPTOMIZER FOR GAP ENERGY
set_seed(42)

model_Egap = GCN_Gfeatured()
#print(model)
loss_fn = MSE()
optimizer = torch.optim.Adam(model_Egap.parameters(), lr=0.001)

## MODEL TO GPU
model_Egap = model_Egap.to(device)
loss_fn = loss_fn.to(device)

In [27]:
### TRAINING MODEL FOR GAP ENERGY

def train(data_loader):
    model_Egap.train()
    total_loss = 0
    for batch in data_loader:
      ## USE GPU
      #batch = dtype_2check(batch)
      batch.to(device)
      ## RESET GRADIENTS
      optimizer.zero_grad()
      # MODEL FEED
      pred = model_Egap(batch.x, batch.edge_index, batch.edge_weight, batch.graph_attr, batch.batch)
      ## LOSS AND GRADIENT CALCULATION
      #E_real = batch.y[:,0].reshape([-1,1])
      Egap_real = 10.0*batch.y[:,1].reshape([-1,1])
      loss = loss_fn(pred,Egap_real)
      #loss = loss_fn(pred, batch.y)
      loss.backward()
      ## UPDATE BY GRADIENT
      optimizer.step()
      total_loss += loss.item() * batch.num_graphs
      return total_loss / len(data_loader.dataset)


epochs = 600
for epoch in range(epochs):
    loss = train(data_loader)
    if epoch % 100 == 0:
      print(f"Epoch {epoch:04d} | Train Loss: {loss:.5f}")


Epoch 0000 | Train Loss: 0.00153
Epoch 0100 | Train Loss: 0.00268
Epoch 0200 | Train Loss: 0.00182
Epoch 0300 | Train Loss: 0.00188
Epoch 0400 | Train Loss: 0.00220
Epoch 0500 | Train Loss: 0.00176


In [28]:
### TESTING MODEL FOR GAP ENERGY

test_batch = next(iter(test_loader))
with torch.no_grad():
    test_batch.to(device)
    pred = model_Egap(test_batch.x.float(), test_batch.edge_index, test_batch.edge_weight, test_batch.graph_attr, test_batch.batch)
    pred = pred.cpu()
    df["Egap_real"] = test_batch.y[:,1].tolist()
    df["Egap_pred"] = pred[:,0]


In [29]:
# ERROR COMPUTATION FOR KAGGLE COMPETITION

df["E_log_err"] = ( np.log(df["E_pred"]+1) - np.log(df["E_real"]+1)) **2
df["Egap_log_err"] = ( np.log(df["Egap_pred"]+1) - np.log(df["Egap_real"]+1)) **2

my_error = np.sqrt((df["E_log_err"]+df["Egap_log_err"]).sum(axis=0) / len(df))

my_error

np.float64(0.9527172182927559)

In [None]:
### FILE PREPARATION FOR KAGGLE SUBMISSION

# test_batch = next(iter(test_loader))
# with torch.no_grad():
#     test_batch.to(device)
#     pred = model(test_batch.x.float(), test_batch.edge_index, test_batch.edge_weight, test_batch.graph_attr, test_batch.batch)
#     df = pd.DataFrame()
#     df["E_real"] = test_batch.y[:,0].tolist()
#     df["E_pred"] = pred[:,0].tolist()
#     df["Egap_real"] = test_batch.y[:,1].tolist()
#     df["Egap_pred"] = pred[:,1].tolist()

#     #df["formation_energy_ev_natom"] = pred[:,0].tolist()
#     #df["bandgap_energy_ev"] = pred[:,1].tolist()


# #df.drop(['id'], axis=1, inplace=True)
# df.insert(loc=0, column='id', value=np.arange(len(df))+1)
# df.to_csv('submission.csv', header=True, index=False)

# from google.colab import files

# files.download('submission.csv')