# Downloading and cleaning up Kim band-gap data from JARVIS

## Downloading data

In [6]:
from jarvis.db.jsonutils import dumpjson
from jarvis.db.jsonutils import loadjson
import numpy as np
from jarvis.db.figshare import data
from jarvis.core.atoms import Atoms

In [2]:
# Fetch the 'snumat' entry through jarvis.db.figshare.data. The log printed
# by this cell identifies it as the SNUMAT hybrid-functional dataset (10k).
# Later cells iterate over `dataset` and read each record by string key.
dataset = data('snumat')

Obtaining SNUMAT Hybrid functional dataset 10k...
Reference:https://www.nature.com/articles/s41597-020-00723-8
Loading the zipfile...
Loading completed.


In [3]:
# Re-key the downloaded records by ICSD number. Each entry keeps the four
# band-gap values, three differences derived from them, and the structure.
snum_icsd_master = {}
for mat in dataset:
    # Named `icsd_id`, not `id`: a notebook-level `id` would shadow the
    # builtin for every cell executed afterwards in the same kernel.
    icsd_id = mat['ICSD_number']

    gga_gap = mat['Band_gap_GGA']
    hse_gap = mat['Band_gap_HSE']
    gga_opt = mat['Band_gap_GGA_optical']
    hse_opt = mat['Band_gap_HSE_optical']

    # Key order matters: it is the order later cells display and serialize.
    snum_icsd_master[icsd_id] = {
        "GGA_gap": gga_gap,
        "HSE_gap": hse_gap,
        "GGA_opt": gga_opt,
        "HSE_opt": hse_opt,
        # HSE gap minus GGA gap.
        "Correction": hse_gap - gga_gap,
        # "optical" gap minus gap, per functional.
        "GGA_offset": gga_opt - gga_gap,
        "HSE_offset": hse_opt - hse_gap,
        'atoms': mat['atoms'],
    }

## Cleaning up data - removing lanthanides, actinides, and deuterium

In [4]:
# Element symbols to exclude: the lanthanide series (La-Lu), the actinide
# series (Ac-Lr), and deuterium.
lns = [
    'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd',
    'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu',
]
acts = [
    'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm',
    'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr',
]
deut = ['D']

# Merged into a single set so overlap checks against a structure's species
# are one operation.
banned = {*lns, *acts, *deut}

In [7]:
# Keep only the entries whose structure shares no element with `banned`.
clean_dict = {}
for icsd_id, entry in snum_icsd_master.items():
    # Rebuild the structure object to ask it for its distinct species.
    species = Atoms.from_dict(entry['atoms']).uniq_species
    if not banned.isdisjoint(species):
        continue  # at least one banned element is present
    clean_dict[icsd_id] = entry

## Converting to MPIDs, averaging where necessary

In [8]:
# Load the lookup used below to translate an ICSD number into an MPID.
# The path is relative: the notebook must be run from a directory whose
# parent contains 'hybrid_gaps/icsd_to_mp.json'.
icsd_to_mp = loadjson('../hybrid_gaps/icsd_to_mp.json')

In [9]:
# Group the cleaned ICSD entries by MPID. Several ICSD entries can map to the
# same MPID, so every numeric property is collected into a list per MPID.

# Properties pooled per MPID. The tuple order is the key order of each entry,
# which later cells rely on when displaying and serializing.
pooled_fields = (
    'GGA_gap', 'HSE_gap', 'GGA_opt', 'HSE_opt',
    'Correction', 'GGA_offset', 'HSE_offset',
)

multiples = {}
for icsd, mat in clean_dict.items():
    if icsd not in icsd_to_mp:
        # No MPID in the lookup for this ICSD number: the entry is not
        # carried over into the MPID-keyed view.
        continue
    mpid = icsd_to_mp[icsd]

    entry = multiples.get(mpid)
    if entry is None:
        # First ICSD entry seen for this MPID: start empty lists and keep
        # this entry's structure as the one stored under 'atoms'.
        entry = {field: [] for field in pooled_fields}
        entry['atoms'] = mat['atoms']
        entry['icsds'] = []
        multiples[mpid] = entry

    for field in pooled_fields:
        entry[field].append(mat[field])
    entry['icsds'].append(icsd)

In [12]:
# Collapse each MPID's pooled lists to their mean. The list of contributing
# ICSD numbers is kept as-is; the structure stored under 'atoms' is dropped.
unique_mp_dict = {}
not_averaged = ('atoms', 'icsds')
for mpid, pooled in multiples.items():
    averaged = {'icsds': pooled['icsds']}
    averaged.update(
        (name, np.mean(values))
        for name, values in pooled.items()
        if name not in not_averaged
    )
    unique_mp_dict[mpid] = averaged

## Saving ICSD and MPID dicts to JSON files

In [11]:
# Write both views to JSON files (paths are relative to the working directory):
# the ICSD-keyed entries that passed the element filter, and the MPID-keyed
# averages together with the ICSD numbers that contributed to each.
dumpjson(clean_dict, 'clean_Kim_data.json')
dumpjson(unique_mp_dict, 'MPID_Kim_data.json')

Note - when presented with a set of ICSDs corresponding to the same MPID, we have ignored the `atoms` object and will use the structure from the Materials Project instead. We can use the subtly different ICSD structures if training with the ICSD dataset, though we will need to be careful about data leakage: two very similar ICSD structures of the same material (e.g. the GaAs example below) should be in the same subset - e.g. both in training - for realistic validation performance.

In [18]:
# MPID of the most stable polymorph of GaAs; shows one averaged entry.
unique_mp_dict['mp-2534']

{'icsds': ['41981', '610533'],
 'GGA_gap': 0.18221700000000002,
 'HSE_gap': 0.9537105,
 'GGA_opt': 0.18221700000000002,
 'HSE_opt': 0.9537105,
 'Correction': 0.7714935,
 'GGA_offset': 0.0,
 'HSE_offset': 0.0}

In [14]:
# First of the two ICSD entries listed under 'mp-2534'.
clean_dict['41981']

{'GGA_gap': 0.183234,
 'HSE_gap': 0.954898,
 'GGA_opt': 0.183234,
 'HSE_opt': 0.954898,
 'Correction': 0.771664,
 'GGA_offset': 0.0,
 'HSE_offset': 0.0,
 'atoms': {'lattice_mat': [[0.0, 2.8768807790576263, 2.8768807790576263],
   [2.8768807790576263, 0.0, 2.8768807790576263],
   [2.8768807790576263, 2.8768807790576263, 0.0]],
  'coords': [[0.0, 0.0, 0.0], [0.25, 0.25, 0.25]],
  'elements': ['Ga', 'As'],
  'abc': [4.068523, 4.068523, 4.068523],
  'angles': [60.0, 60.0, 60.0],
  'cartesian': False,
  'props': ['', '']}}

In [16]:
# Second ICSD entry listed under 'mp-2534', for comparison with '41981'.
clean_dict['610533']

{'GGA_gap': 0.1812,
 'HSE_gap': 0.952523,
 'GGA_opt': 0.1812,
 'HSE_opt': 0.952523,
 'Correction': 0.771323,
 'GGA_offset': 0.0,
 'HSE_offset': 0.0,
 'atoms': {'lattice_mat': [[0.0, 2.877146038384684, 2.877146038384684],
   [2.877146038384684, 0.0, 2.877146038384684],
   [2.877146038384684, 2.877146038384684, 0.0]],
  'coords': [[0.0, 0.0, 0.0], [0.25, 0.25, 0.25]],
  'elements': ['As', 'Ga'],
  'abc': [4.068905, 4.068905, 4.068905],
  'angles': [60.0, 60.0, 60.0],
  'cartesian': False,
  'props': ['', '']}}

Subtly different lattice parameters and gaps, but effectively the same material twice.