In [None]:
import json
import os
import numpy as np
import pandas as pd
from pymatgen.entries.computed_entries import ComputedStructureEntry
import pickle
from torch_geometric.data import Data
import torch

In [None]:
# Parse every JSON entry found in ``database/`` and collect the usable ones
# into a single table.
#
# Paths are built explicitly instead of using os.chdir(): if anything raises
# half-way through the loop, the kernel's working directory is left untouched,
# so the cell can simply be re-run and the later ``database/...`` writes still
# resolve correctly.
DATABASE_DIR = "database"
all_files = os.listdir(DATABASE_DIR)

data = []     # one record per entry that carries "rpa_similarity"
missing = []  # mat_ids of entries without "rpa_similarity" and at most 8 sites
for filename in sorted(all_files):
    if ".json" not in filename:
        continue

    with open(os.path.join(DATABASE_DIR, filename), "r") as file:
        try:
            entry = json.load(file)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # Unparseable file: report its name and keep going. Only decoding
            # problems are swallowed; KeyboardInterrupt etc. still propagate.
            print(filename)
            continue

    # The file handle is already closed here; only the parsed dict is needed.
    cse = ComputedStructureEntry.from_dict(entry)
    if "rpa_similarity" in cse.data:
        # Flatten structure, data and parameters into one record (one row).
        data.append({"structure": cse.structure} | cse.data | cse.parameters)
    elif cse.data["nsites"] <= 8:
        missing.append(cse.data["mat_id"])

data_df = pd.DataFrame(data)

In [None]:
# Row filters, expressed as boolean masks over the loaded table:
#   * indirect IPA gap below 0.5 -> material is close to being a metal
#   * direct IPA gap above 10    -> gap too large to yield reasonable information
#   * mat_id agm005546161        -> very rare YAMBO error
nearly_metallic = data_df["ipa_indirect_gap"] < 0.5
gap_too_wide = data_df["ipa_direct_gap"] > 10
yambo_failure = data_df.mat_id == "agm005546161"

# A row is removed as soon as any one of the criteria flags it.
rejected = nearly_metallic | gap_too_wide | yambo_failure
data_df = data_df.drop(data_df[rejected].index)

In [None]:
# pre-process spectral data
def _split_tensor_components(frame, quantity):
    """Split the per-direction entries of ``quantity`` into flat column lists.

    Reads ``frame["rpa_<quantity>"]`` and ``frame["ipa_<quantity>"]`` row by
    row. Each value is expected to be a mapping with at least an ``"xx"`` key
    and optionally ``"yy"`` / ``"zz"``; a direction that is absent from the
    RPA mapping falls back to ``"xx"`` for both RPA and IPA.

    Rows whose RPA or IPA value is a float (the missing-value placeholder,
    e.g. NaN) get ``np.nan`` in all six output columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding the ``rpa_<quantity>`` and ``ipa_<quantity>`` columns.
    quantity : str
        Column suffix, e.g. ``"epsI"`` or ``"epsR"``.

    Returns
    -------
    dict[str, list]
        Maps ``"<rpa|ipa>_<quantity>_<xx|yy|zz>"`` to a list with one item per
        row of ``frame``, in row order.
    """
    axes = ("xx", "yy", "zz")
    columns = {
        f"{level}_{quantity}_{axis}": []
        for level in ("rpa", "ipa")
        for axis in axes
    }
    for _, row in frame.iterrows():
        rpa = row[f"rpa_{quantity}"]
        ipa = row[f"ipa_{quantity}"]
        # isinstance (not ``type(...) == float``) so float subclasses such as
        # np.float64 are recognised as missing too.
        if isinstance(rpa, float) or isinstance(ipa, float):
            for values in columns.values():
                values.append(np.nan)
            continue
        for axis in axes:
            # The RPA mapping decides which directions are available.
            key = axis if axis in rpa else "xx"
            columns[f"rpa_{quantity}_{axis}"].append(rpa[key])
            columns[f"ipa_{quantity}_{axis}"].append(ipa[key])
    return columns


# Each quantity is checked against its own columns (the epsR pass previously
# tested the epsI column for missing data).
for quantity in ("epsI", "epsR"):
    for column, values in _split_tensor_components(data_df, quantity).items():
        data_df[column] = values

In [None]:
# Show pandas' summary statistics of the table as this cell's output.
data_df.describe()

In [None]:
# Persist the pre-processed table as a pickle next to the JSON entries.
with open("database/data_300_rpa.pckl", "wb+") as table_dump:
    pickle.dump(data_df, table_dump)

In [None]:
# create graphs from preset cutoff length and gaussian expanded edges (target = RPA)
CUTOFF_RADIUS = 5  # radius handed to structure.get_all_neighbors
GAUSSIAN_WIDTH = 10  # exponent factor of the expansion; also sets its prefactor
# Centres of the Gaussian expansion; loop-invariant, so built only once.
GAUSSIAN_CENTERS = np.linspace(0, CUTOFF_RADIUS, 51)


def _direction_average(entry, prefix):
    """Return the mean of the ``<prefix>_xx``, ``_yy`` and ``_zz`` values of a row.

    np.asarray keeps the sum element-wise even if the components are stored as
    plain lists (``list + list`` would concatenate and ``float * list`` raise).
    """
    xx, yy, zz = (
        np.asarray(entry[f"{prefix}_{axis}"]) for axis in ("xx", "yy", "zz")
    )
    return 1 / 3 * (xx + yy + zz)


def _flatten_neighbors(all_nbrs):
    """Flatten per-site neighbour lists into parallel edge lists.

    Returns ``(sources, targets, distances)`` where ``sources[k]`` is the
    position of the site in ``all_nbrs`` and ``targets[k]`` / ``distances[k]``
    are items 2 / 1 of its k-th neighbour record. Order is site by site,
    neighbour by neighbour.
    """
    sources, targets, distances = [], [], []
    for site, nbrs in enumerate(all_nbrs):
        for nbr in nbrs:
            sources.append(site)
            # presumably nbr[2] is the neighbour's site index and nbr[1] its
            # distance (pymatgen neighbour layout) -- confirm against the
            # installed pymatgen version
            targets.append(nbr[2])
            distances.append(nbr[1])
    return sources, targets, distances


def _atom_features(species):
    """One-hot encode an element by group (8 classes) and row (5 classes).

    Groups 1-2 map to classes 0-1; larger group numbers are shifted down by
    10, so groups 13-18 map to classes 2-7.
    NOTE(review): with this arithmetic groups 11 and 12 land on classes 0 and
    1 (same as groups 1 and 2) and groups 3-10 yield a negative class index;
    rows above 5 do not fit the 5 row classes. Looks like the data set is
    assumed to contain only main-group elements of rows 1-5 -- confirm.
    """
    group = species.group - 1
    if group > 1:
        group -= 10
    row = species.row - 1
    return torch.hstack(
        [
            torch.nn.functional.one_hot(
                torch.tensor(group, dtype=torch.int64), num_classes=8
            ),
            torch.nn.functional.one_hot(
                torch.tensor(row, dtype=torch.int64), num_classes=5
            ),
        ]
    )


graphs = []
for data_idx, entry in data_df.iterrows():
    if data_idx % 100 == 0:
        print(data_idx)
    # Rows without spectra carry a float placeholder (NaN) instead of data.
    if isinstance(entry["rpa_epsI_xx"], float):
        continue
    structure = entry["structure"]

    # Direction-averaged imaginary part: RPA is the target, IPA is stored
    # alongside it on the graph.
    epsi = _direction_average(entry, "rpa_epsI")
    epsi_ipa = _direction_average(entry, "ipa_epsI")

    # Edge list: one directed edge per (site, neighbour) pair within the cutoff.
    sources, targets, distances = _flatten_neighbors(
        structure.get_all_neighbors(CUTOFF_RADIUS)
    )
    edges = torch.stack(
        (
            torch.tensor(sources, dtype=torch.long),
            torch.tensor(targets, dtype=torch.long),
        )
    )

    # Gaussian expansion of every edge length on GAUSSIAN_CENTERS; the
    # broadcast yields shape (n_edges, n_centres) directly.
    offsets = np.asarray(distances, dtype=float)[:, None] - GAUSSIAN_CENTERS[None, :]
    edge_attr = torch.tensor(
        np.sqrt(GAUSSIAN_WIDTH / np.pi) * np.exp(-GAUSSIAN_WIDTH * offsets**2),
        dtype=torch.float,
    )

    # Node features: one row per site, in structure order.
    self_fea = torch.vstack([_atom_features(sp) for sp in structure.species])

    try:
        graph = Data(
            x=self_fea.to(torch.float32),
            edge_index=edges,
            edge_attr=edge_attr,
            y=torch.tensor(epsi, dtype=torch.float),
            ipa=torch.tensor(epsi_ipa, dtype=torch.float),
        )
    except Exception:
        # Best effort: report the material that could not be converted and
        # continue. KeyboardInterrupt/SystemExit are no longer swallowed.
        print(entry["mat_id"])
        continue

    graph["mat_id"] = entry["mat_id"]
    graphs.append(graph)

with open("database/graphs_300_rpa.pckl", "wb+") as file:
    pickle.dump(graphs, file)