In [None]:
import json
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pymatgen.entries.computed_entries import ComputedStructureEntry
import pickle
from torch_geometric.data import Data
import torch
import networkx as nx
import torch_geometric
from pymatgen.analysis.dimensionality import get_dimensionality_cheon
import scipy
import math

In [None]:
# Read every JSON-serialised ComputedStructureEntry found in the database
# directory and collect structure, computed data and calculation parameters
# into a single DataFrame (one row per entry).
database_dir = "database_300"
all_files = os.listdir(database_dir)


data = []

for filename in sorted(all_files):
    if ".json" not in filename:
        continue

    # Paths are built explicitly instead of chdir-ing into the directory, so an
    # exception half-way through the loop cannot leave the kernel stranded in
    # the wrong working directory.
    with open(os.path.join(database_dir, filename), "r") as file:
        try:
            entry = json.load(file)
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors:
            # report the unreadable file and move on to the next one.
            print(filename)
            continue

    cse = ComputedStructureEntry.from_dict(entry)
    if "ipa_epsI_0" in cse.data:
        data.append({"structure": cse.structure} | cse.data | cse.parameters)
    else:
        # entries without an "ipa_epsI_0" record are reported and left out
        print(filename)
data_df = pd.DataFrame(data)

In [None]:
# Select the rows to discard with boolean masks and keep everything else.
# The masks are negated (rather than inverting the comparisons) so that rows
# whose gap is NaN are kept.

# materials which are close to being metals
near_metallic = data_df["ipa_indirect_gap"] < 0.5
# materials whose gap is too large to yield reasonable information
gap_too_large = data_df["ipa_direct_gap"] > 10
# very rare YAMBO error
yambo_error = data_df.mat_id == "agm005546161"

data_df = data_df[~(near_metallic | gap_too_large | yambo_error)]

In [None]:
# Persist the filtered table. The output directory is created first so that
# the dump does not fail with FileNotFoundError when "data/" is missing.
os.makedirs("data", exist_ok=True)
with open("data/data_300.pckl", "wb+") as file:
    pickle.dump(data_df, file)

In [None]:
# Im(\varepsilon)
# create graphs from preset cutoff length and gaussian expanded edges
#
# One torch_geometric ``Data`` object is built per row of ``data_df``:
#   x          - per-atom one-hot encoding of (remapped) group and row
#   edge_index - directed (atom, neighbour) pairs for all neighbours in the cutoff
#   edge_attr  - neighbour distance expanded on a fixed grid of Gaussians
#   y          - average of the three ipa_epsI_* components
graphs = []
for position, (_, entry) in enumerate(data_df.iterrows()):
    # Progress is reported by loop position: the index labels of data_df can
    # have gaps once rows have been dropped, which makes them a poor counter.
    if position % 100 == 0:
        print(position)

    structure = entry["structure"]

    # Average the three components; whenever component 1 or 2 is not stored as
    # an array, component 0 is used in its place.
    epsi0 = entry["ipa_epsI_0"]
    epsi1 = entry["ipa_epsI_1"] if isinstance(entry["ipa_epsI_1"], np.ndarray) else epsi0
    epsi2 = entry["ipa_epsI_2"] if isinstance(entry["ipa_epsI_2"], np.ndarray) else epsi0
    epsi = 1 / 3 * (epsi0 + epsi1 + epsi2)

    # all_nbrs[i] lists the neighbours of site i within the cutoff; each
    # neighbour is indexed as [1] -> distance and [2] -> index of the
    # neighbouring site (pymatgen neighbour layout: site, distance, index, image).
    all_nbrs = structure.get_all_neighbors(5)
    source_idx = [site for site, nbrs in enumerate(all_nbrs) for _ in nbrs]
    target_idx = [nbr[2] for nbrs in all_nbrs for nbr in nbrs]
    distances = np.array([nbr[1] for nbrs in all_nbrs for nbr in nbrs], dtype=float)
    edges = torch.tensor([source_idx, target_idx], dtype=torch.long)

    # Gaussian expansion of every distance on 51 centres between 0 and 5,
    # computed by broadcasting -> shape (num_edges, 51).
    x_vals = np.linspace(0, 5, 51)
    edge_attr = np.sqrt(10 / np.pi) * np.exp(
        -10 * (distances[:, None] - x_vals[None, :]) ** 2
    )
    edge_attr = torch.tensor(edge_attr, dtype=torch.float)

    # encode atom information by group and row
    # 0-based group indices above 2 are shifted down by 10, i.e. groups 1-2
    # land in slots 0-1 and groups 13-18 in slots 2-7.
    # NOTE(review): group 3 would also land in slot 2, groups 4-12 become
    # negative and rows beyond 5 exceed num_classes; one_hot raises for the
    # latter two. Presumably the database holds none of these - confirm.
    species = structure.species  # fetched once instead of once per atom
    group_idx = [specie.group - 1 for specie in species]
    group_idx = [idx - 10 if idx > 2 else idx for idx in group_idx]
    row_idx = [specie.row - 1 for specie in species]
    self_fea = torch.hstack(
        [
            torch.nn.functional.one_hot(
                torch.tensor(group_idx, dtype=torch.int64), num_classes=8
            ),
            torch.nn.functional.one_hot(
                torch.tensor(row_idx, dtype=torch.int64), num_classes=5
            ),
        ]
    )

    try:
        graph = Data(
            x=self_fea.to(torch.float32),
            edge_index=edges,
            edge_attr=edge_attr,
            y=torch.tensor(epsi, dtype=torch.float),
        )
    except Exception:
        # best effort: report the material whose graph could not be assembled
        # and skip it (KeyboardInterrupt is no longer swallowed here)
        print(entry["mat_id"])
        continue

    graph["mat_id"] = entry["mat_id"]
    graphs.append(graph)

# make sure the output directory exists before writing the result
os.makedirs("graphs", exist_ok=True)
with open("graphs/graphs_300_eps.pckl", "wb+") as file:
    pickle.dump(graphs, file)

In [None]:
# Re(n)
# create graphs from preset cutoff length and gaussian expanded edges
#
# One torch_geometric ``Data`` object is built per row of ``data_df``:
#   x          - per-atom one-hot encoding of (remapped) group and row
#   edge_index - directed (atom, neighbour) pairs for all neighbours in the cutoff
#   edge_attr  - neighbour distance expanded on a fixed grid of Gaussians
#   y          - sqrt((|eps| + epsr) / 2) from the averaged ipa_epsR_* / ipa_epsI_*
graphs = []
for position, (_, entry) in enumerate(data_df.iterrows()):
    # Progress is reported by loop position: the index labels of data_df can
    # have gaps once rows have been dropped, which makes them a poor counter.
    if position % 100 == 0:
        print(position)

    structure = entry["structure"]

    # Average the three components of the imaginary and real parts; whenever
    # component 1 or 2 is not stored as an array, component 0 is used instead.
    epsi0 = entry["ipa_epsI_0"]
    epsi1 = entry["ipa_epsI_1"] if isinstance(entry["ipa_epsI_1"], np.ndarray) else epsi0
    epsi2 = entry["ipa_epsI_2"] if isinstance(entry["ipa_epsI_2"], np.ndarray) else epsi0
    epsr0 = entry["ipa_epsR_0"]
    epsr1 = entry["ipa_epsR_1"] if isinstance(entry["ipa_epsR_1"], np.ndarray) else epsr0
    epsr2 = entry["ipa_epsR_2"] if isinstance(entry["ipa_epsR_2"], np.ndarray) else epsr0

    epsr = 1 / 3 * (epsr0 + epsr1 + epsr2)
    epsi = 1 / 3 * (epsi0 + epsi1 + epsi2)

    # all_nbrs[i] lists the neighbours of site i within the cutoff; each
    # neighbour is indexed as [1] -> distance and [2] -> index of the
    # neighbouring site (pymatgen neighbour layout: site, distance, index, image).
    all_nbrs = structure.get_all_neighbors(5)
    source_idx = [site for site, nbrs in enumerate(all_nbrs) for _ in nbrs]
    target_idx = [nbr[2] for nbrs in all_nbrs for nbr in nbrs]
    distances = np.array([nbr[1] for nbrs in all_nbrs for nbr in nbrs], dtype=float)
    edges = torch.tensor([source_idx, target_idx], dtype=torch.long)

    # Gaussian expansion of every distance on 51 centres between 0 and 5,
    # computed by broadcasting -> shape (num_edges, 51).
    x_vals = np.linspace(0, 5, 51)
    edge_attr = np.sqrt(10 / np.pi) * np.exp(
        -10 * (distances[:, None] - x_vals[None, :]) ** 2
    )
    edge_attr = torch.tensor(edge_attr, dtype=torch.float)

    # encode atom information by group and row
    # 0-based group indices above 2 are shifted down by 10, i.e. groups 1-2
    # land in slots 0-1 and groups 13-18 in slots 2-7.
    # NOTE(review): group 3 would also land in slot 2, groups 4-12 become
    # negative and rows beyond 5 exceed num_classes; one_hot raises for the
    # latter two. Presumably the database holds none of these - confirm.
    species = structure.species  # fetched once instead of once per atom
    group_idx = [specie.group - 1 for specie in species]
    group_idx = [idx - 10 if idx > 2 else idx for idx in group_idx]
    row_idx = [specie.row - 1 for specie in species]
    self_fea = torch.hstack(
        [
            torch.nn.functional.one_hot(
                torch.tensor(group_idx, dtype=torch.int64), num_classes=8
            ),
            torch.nn.functional.one_hot(
                torch.tensor(row_idx, dtype=torch.int64), num_classes=5
            ),
        ]
    )

    try:
        graph = Data(
            x=self_fea.to(torch.float32),
            edge_index=edges,
            edge_attr=edge_attr,
            # real part of the refractive index: sqrt((|eps| + Re eps) / 2)
            y=torch.tensor(
                np.sqrt(0.5 * (np.sqrt(epsr**2 + epsi**2) + epsr)),
                dtype=torch.float,
            ),
        )
    except Exception:
        # best effort: report the material whose graph could not be assembled
        # and skip it (KeyboardInterrupt is no longer swallowed here)
        print(entry["mat_id"])
        continue

    graph["mat_id"] = entry["mat_id"]
    graphs.append(graph)

# make sure the output directory exists before writing the result
os.makedirs("graphs", exist_ok=True)
with open("graphs/graphs_300_n.pckl", "wb+") as file:
    pickle.dump(graphs, file)

In [None]:
# Read every JSON-serialised ComputedStructureEntry found in the database
# directory and collect structure, computed data and calculation parameters
# into a single DataFrame (one row per entry).
database_dir = "database_100"
all_files = os.listdir(database_dir)


data = []

for filename in sorted(all_files):
    if ".json" not in filename:
        continue

    # Paths are built explicitly instead of chdir-ing into the directory, so an
    # exception half-way through the loop cannot leave the kernel stranded in
    # the wrong working directory.
    with open(os.path.join(database_dir, filename), "r") as file:
        try:
            entry = json.load(file)
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors:
            # report the unreadable file and move on to the next one.
            print(filename)
            continue

    cse = ComputedStructureEntry.from_dict(entry)
    if "ipa_epsI_0" in cse.data:
        data.append({"structure": cse.structure} | cse.data | cse.parameters)
    else:
        # entries without an "ipa_epsI_0" record are reported and left out
        print(filename)
data_df = pd.DataFrame(data)

In [None]:
# Select the rows to discard with boolean masks and keep everything else.
# The masks are negated (rather than inverting the comparisons) so that rows
# whose gap is NaN are kept.

# materials which are close to being metals
near_metallic = data_df["ipa_indirect_gap"] < 0.5
# materials whose gap is too large to yield reasonable information
gap_too_large = data_df["ipa_direct_gap"] > 10
# very rare YAMBO error
yambo_error = data_df.mat_id == "agm005546161"

data_df = data_df[~(near_metallic | gap_too_large | yambo_error)]

In [None]:
# Persist the filtered table. The output directory is created first so that
# the dump does not fail with FileNotFoundError when "data/" is missing.
os.makedirs("data", exist_ok=True)
with open("data/data_100.pckl", "wb+") as file:
    pickle.dump(data_df, file)

In [None]:
# Im(\varepsilon)
# create graphs from preset cutoff length and gaussian expanded edges
#
# One torch_geometric ``Data`` object is built per row of ``data_df``:
#   x          - per-atom one-hot encoding of (remapped) group and row
#   edge_index - directed (atom, neighbour) pairs for all neighbours in the cutoff
#   edge_attr  - neighbour distance expanded on a fixed grid of Gaussians
#   y          - average of the three ipa_epsI_* components
graphs = []
for position, (_, entry) in enumerate(data_df.iterrows()):
    # Progress is reported by loop position: the index labels of data_df can
    # have gaps once rows have been dropped, which makes them a poor counter.
    if position % 100 == 0:
        print(position)

    structure = entry["structure"]

    # Average the three components; whenever component 1 or 2 is not stored as
    # an array, component 0 is used in its place.
    epsi0 = entry["ipa_epsI_0"]
    epsi1 = entry["ipa_epsI_1"] if isinstance(entry["ipa_epsI_1"], np.ndarray) else epsi0
    epsi2 = entry["ipa_epsI_2"] if isinstance(entry["ipa_epsI_2"], np.ndarray) else epsi0
    epsi = 1 / 3 * (epsi0 + epsi1 + epsi2)

    # all_nbrs[i] lists the neighbours of site i within the cutoff; each
    # neighbour is indexed as [1] -> distance and [2] -> index of the
    # neighbouring site (pymatgen neighbour layout: site, distance, index, image).
    all_nbrs = structure.get_all_neighbors(5)
    source_idx = [site for site, nbrs in enumerate(all_nbrs) for _ in nbrs]
    target_idx = [nbr[2] for nbrs in all_nbrs for nbr in nbrs]
    distances = np.array([nbr[1] for nbrs in all_nbrs for nbr in nbrs], dtype=float)
    edges = torch.tensor([source_idx, target_idx], dtype=torch.long)

    # Gaussian expansion of every distance on 51 centres between 0 and 5,
    # computed by broadcasting -> shape (num_edges, 51).
    x_vals = np.linspace(0, 5, 51)
    edge_attr = np.sqrt(10 / np.pi) * np.exp(
        -10 * (distances[:, None] - x_vals[None, :]) ** 2
    )
    edge_attr = torch.tensor(edge_attr, dtype=torch.float)

    # encode atom information by group and row
    # 0-based group indices above 2 are shifted down by 10, i.e. groups 1-2
    # land in slots 0-1 and groups 13-18 in slots 2-7.
    # NOTE(review): group 3 would also land in slot 2, groups 4-12 become
    # negative and rows beyond 5 exceed num_classes; one_hot raises for the
    # latter two. Presumably the database holds none of these - confirm.
    species = structure.species  # fetched once instead of once per atom
    group_idx = [specie.group - 1 for specie in species]
    group_idx = [idx - 10 if idx > 2 else idx for idx in group_idx]
    row_idx = [specie.row - 1 for specie in species]
    self_fea = torch.hstack(
        [
            torch.nn.functional.one_hot(
                torch.tensor(group_idx, dtype=torch.int64), num_classes=8
            ),
            torch.nn.functional.one_hot(
                torch.tensor(row_idx, dtype=torch.int64), num_classes=5
            ),
        ]
    )

    try:
        graph = Data(
            x=self_fea.to(torch.float32),
            edge_index=edges,
            edge_attr=edge_attr,
            y=torch.tensor(epsi, dtype=torch.float),
        )
    except Exception:
        # best effort: report the material whose graph could not be assembled
        # and skip it (KeyboardInterrupt is no longer swallowed here)
        print(entry["mat_id"])
        continue

    graph["mat_id"] = entry["mat_id"]
    graphs.append(graph)

# make sure the output directory exists before writing the result
os.makedirs("graphs", exist_ok=True)
with open("graphs/graphs_100_eps.pckl", "wb+") as file:
    pickle.dump(graphs, file)

In [None]:
# Re(n)
# create graphs from preset cutoff length and gaussian expanded edges
#
# One torch_geometric ``Data`` object is built per row of ``data_df``:
#   x          - per-atom one-hot encoding of (remapped) group and row
#   edge_index - directed (atom, neighbour) pairs for all neighbours in the cutoff
#   edge_attr  - neighbour distance expanded on a fixed grid of Gaussians
#   y          - sqrt((|eps| + epsr) / 2) from the averaged ipa_epsR_* / ipa_epsI_*
graphs = []
for position, (_, entry) in enumerate(data_df.iterrows()):
    # Progress is reported by loop position: the index labels of data_df can
    # have gaps once rows have been dropped, which makes them a poor counter.
    if position % 100 == 0:
        print(position)

    structure = entry["structure"]

    # Average the three components of the imaginary and real parts; whenever
    # component 1 or 2 is not stored as an array, component 0 is used instead.
    epsi0 = entry["ipa_epsI_0"]
    epsi1 = entry["ipa_epsI_1"] if isinstance(entry["ipa_epsI_1"], np.ndarray) else epsi0
    epsi2 = entry["ipa_epsI_2"] if isinstance(entry["ipa_epsI_2"], np.ndarray) else epsi0
    epsr0 = entry["ipa_epsR_0"]
    epsr1 = entry["ipa_epsR_1"] if isinstance(entry["ipa_epsR_1"], np.ndarray) else epsr0
    epsr2 = entry["ipa_epsR_2"] if isinstance(entry["ipa_epsR_2"], np.ndarray) else epsr0

    epsr = 1 / 3 * (epsr0 + epsr1 + epsr2)
    epsi = 1 / 3 * (epsi0 + epsi1 + epsi2)

    # all_nbrs[i] lists the neighbours of site i within the cutoff; each
    # neighbour is indexed as [1] -> distance and [2] -> index of the
    # neighbouring site (pymatgen neighbour layout: site, distance, index, image).
    all_nbrs = structure.get_all_neighbors(5)
    source_idx = [site for site, nbrs in enumerate(all_nbrs) for _ in nbrs]
    target_idx = [nbr[2] for nbrs in all_nbrs for nbr in nbrs]
    distances = np.array([nbr[1] for nbrs in all_nbrs for nbr in nbrs], dtype=float)
    edges = torch.tensor([source_idx, target_idx], dtype=torch.long)

    # Gaussian expansion of every distance on 51 centres between 0 and 5,
    # computed by broadcasting -> shape (num_edges, 51).
    x_vals = np.linspace(0, 5, 51)
    edge_attr = np.sqrt(10 / np.pi) * np.exp(
        -10 * (distances[:, None] - x_vals[None, :]) ** 2
    )
    edge_attr = torch.tensor(edge_attr, dtype=torch.float)

    # encode atom information by group and row
    # 0-based group indices above 2 are shifted down by 10, i.e. groups 1-2
    # land in slots 0-1 and groups 13-18 in slots 2-7.
    # NOTE(review): group 3 would also land in slot 2, groups 4-12 become
    # negative and rows beyond 5 exceed num_classes; one_hot raises for the
    # latter two. Presumably the database holds none of these - confirm.
    species = structure.species  # fetched once instead of once per atom
    group_idx = [specie.group - 1 for specie in species]
    group_idx = [idx - 10 if idx > 2 else idx for idx in group_idx]
    row_idx = [specie.row - 1 for specie in species]
    self_fea = torch.hstack(
        [
            torch.nn.functional.one_hot(
                torch.tensor(group_idx, dtype=torch.int64), num_classes=8
            ),
            torch.nn.functional.one_hot(
                torch.tensor(row_idx, dtype=torch.int64), num_classes=5
            ),
        ]
    )

    try:
        graph = Data(
            x=self_fea.to(torch.float32),
            edge_index=edges,
            edge_attr=edge_attr,
            # real part of the refractive index: sqrt((|eps| + Re eps) / 2)
            y=torch.tensor(
                np.sqrt(0.5 * (np.sqrt(epsr**2 + epsi**2) + epsr)),
                dtype=torch.float,
            ),
        )
    except Exception:
        # best effort: report the material whose graph could not be assembled
        # and skip it (KeyboardInterrupt is no longer swallowed here)
        print(entry["mat_id"])
        continue

    graph["mat_id"] = entry["mat_id"]
    graphs.append(graph)

# make sure the output directory exists before writing the result
os.makedirs("graphs", exist_ok=True)
with open("graphs/graphs_100_n.pckl", "wb+") as file:
    pickle.dump(graphs, file)