In [1]:
from jarvis.core.atoms import crop_square
import matplotlib.pyplot as plt
import os, glob
from jarvis.analysis.stem.convolution_apprx import STEMConv
from jarvis.db.figshare import data, get_jid_data
import matplotlib.pyplot as plt
from jarvis.core.atoms import Atoms, ase_to_atoms, get_supercell_dims
from jarvis.core.lattice import get_2d_lattice
from sklearn.model_selection import train_test_split
from collections import defaultdict
from jarvis.db.jsonutils import loadjson, dumpjson
from jarvis.core.atoms import Atoms
from pymatgen.core import Structure
from pymatgen.core.lattice import Lattice
import numpy as np

import json


In [2]:
# Load the train and test splits from their local JSON files.

from datasets import load_dataset

# Both files are requested with split="train". The generator is consumed in
# order by the tuple unpacking, so the train file is read first, then the test file.
data_train, data_test = (
    load_dataset("json", data_files=json_path, split="train")
    for json_path in (
        "./alpaca_mbj_bandgap_train.json",
        "./alpaca_mbj_bandgap_test.json",
    )
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Transform data to pandas DataFrames, keeping only the first rows of each split.
# The sample sizes are named so they can be tuned in one place.
N_TRAIN_SAMPLES = 5000
N_TEST_SAMPLES = 1000

# .copy() makes each subset an independent frame: the image-generation cell
# further down adds columns to these frames, and that should never operate on
# a slice of the full exported table.
train_df = data_train.to_pandas().head(N_TRAIN_SAMPLES).copy()
test_df = data_test.to_pandas().head(N_TEST_SAMPLES).copy()

# The CSV files are written by the image-generation cell, once the `id` and
# `image` columns exist.

In [4]:
def str_to_structure(gen_str):
    """
    Parse a generated crystal description string into a pymatgen Structure.

    Layout of ``gen_str`` after blank / whitespace-only lines are dropped:

        line 0            lattice lengths, whitespace separated
        line 1            lattice angles, whitespace separated
        lines 2, 4, ...   one species label per line
        lines 3, 5, ...   the coordinates of the species on the preceding line

    Parsing is whitespace tolerant: any run of spaces/tabs separates numbers,
    surrounding whitespace (including a trailing ``\\r`` from CRLF input) is
    stripped, and whitespace-only lines do not shift the species/coordinate
    alternation.

    Args:
        gen_str: The textual structure description.

    Returns:
        A ``Structure`` built from the parsed lattice, species and coordinates;
        coordinates are passed as fractional (``coords_are_cartesian=False``).

    Raises:
        IndexError: If fewer than two non-blank lines are present.
        ValueError: If a length, angle or coordinate token is not a number.
    """
    # Strip first, then filter, so "   " or "\r" lines are treated as blank.
    lines = [ln.strip() for ln in gen_str.splitlines() if ln.strip()]

    # str.split() with no argument splits on any whitespace run and never
    # yields empty tokens, unlike split(" ").
    lengths = [float(tok) for tok in lines[0].split()]
    angles = [float(tok) for tok in lines[1].split()]
    species = lines[2::2]
    coords = [[float(tok) for tok in ln.split()] for ln in lines[3::2]]

    structure = Structure(
        # Lengths and angles are concatenated and forwarded positionally.
        lattice=Lattice.from_parameters(*(lengths + angles)),
        species=species,
        coords=coords,
        coords_are_cartesian=False,
    )

    return structure

def structure_to_atoms(structure):
    """
    Convert a pymatgen structure into a JARVIS atoms object.

    Args:
        structure: The pymatgen structure to convert.

    Returns:
        The result of ``jarvis.core.atoms.pmg_to_atoms(structure)``.
    """
    # Import the converter by its full dotted path. A bare `import jarvis`
    # does not by itself guarantee that the `jarvis.core.atoms` submodule is
    # loaded, so attribute access on the package would depend on some other
    # import having run first.
    from jarvis.core.atoms import pmg_to_atoms

    return pmg_to_atoms(structure)

def atom_to_image(atoms, size=224):
    """
    Render an Atoms object as a square image.

    Pipeline: compute supercell dimensions with ``get_supercell_dims``, build
    the supercell, crop it with ``crop_square``, then run
    ``STEMConv.simulate_surface`` with ``output_size=[size, size]`` and return
    the first element of its result.
    """
    # The same value is used for both the supercell sizing and the crop.
    box = 50
    supercell = atoms.make_supercell_matrix(
        get_supercell_dims(atoms=atoms, enforce_c_size=box)
    )
    cropped = crop_square(supercell, csize=box)
    simulator = STEMConv(output_size=[size, size])
    # simulate_surface returns a sequence; only its first entry is the image used here.
    return simulator.simulate_surface(atoms=cropped)[0]

# Convert a stored pixel list (or its JSON encoding) back into a 2D image array.
def list_to_image(img_list, size=224):
    """
    Convert a stored pixel list to a 2D image of given size.

    Args:
        img_list: Either a (possibly nested) list of pixel values, or a JSON
            string/bytes encoding of such a list (the form produced by
            ``json.dumps(image.tolist())``).
        size: Edge length of the square image; the data must contain exactly
            ``size * size`` values.

    Returns:
        A NumPy array of shape ``(size, size)``.

    Raises:
        ValueError: If the data cannot be reshaped to ``(size, size)`` or the
            JSON text is invalid.
    """
    # JSON text must be decoded *before* building the array; json.loads
    # cannot be applied to a NumPy array.
    if isinstance(img_list, (str, bytes, bytearray)):
        img_list = json.loads(img_list)
    return np.array(img_list).reshape(size, size)

In [5]:
def _render_split(df, split, size=224):
    """
    Render one image per row of ``df`` and write the images plus a CSV.

    For every row, the ``response`` string is parsed into a structure,
    converted to atoms, and rendered to an image. Each image is saved as
    ``{split}/{split}_image_{index}.png`` and also stored JSON-encoded in an
    ``image`` column; the row index is stored in an ``id`` column. The
    resulting frame is written to ``alpaca_mbj_bandgap_{split}.csv``.

    Args:
        df: DataFrame with a ``response`` column holding structure strings.
        split: Split name ("train" or "test"); used for the output directory,
            the image file prefix and the CSV file name.
        size: Edge length passed to ``atom_to_image``.

    Returns:
        A copy of ``df`` with the ``id`` and ``image`` columns added.
    """
    # plt.imsave does not create missing directories.
    os.makedirs(split, exist_ok=True)

    ids = []
    images = []
    for i, row in df.iterrows():
        structure = str_to_structure(row['response'])
        atoms = structure_to_atoms(structure)
        image = atom_to_image(atoms, size=size)

        # Save the image on disk.
        plt.imsave(f"{split}/{split}_image_{i}.png", image, cmap='gray')

        ids.append(i)
        # Store the image as a JSON-encoded nested list so it survives the CSV round trip.
        images.append(json.dumps(image.tolist()))

    # Assign whole columns at once: filling a brand-new column cell by cell
    # via .loc starts it out as NaN, which turns the integer ids into floats.
    out = df.copy()
    out['id'] = ids
    out['image'] = images

    out.to_csv(f"alpaca_mbj_bandgap_{split}.csv", index=False)
    return out


train_df = _render_split(train_df, "train", size=224)
test_df = _render_split(test_df, "test", size=224)