Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve unphysical (greater than 1) occupancy handling in CifParser and add missing site label if not check_occu #3819

Merged
merged 21 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 11 additions & 9 deletions pymatgen/io/cif.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,9 +323,9 @@ def __init__(
"""
Args:
filename (PathLike): CIF file, gzipped or bzipped CIF files are fine too.
occupancy_tolerance (float): If total occupancy of a site is between 1 and occupancy_tolerance, the
occupancies will be scaled down to 1.
site_tolerance (float): This tolerance is used to determine if two sites are in the same position,
occupancy_tolerance (float): If total occupancy of a site is between
1 and occupancy_tolerance, it will be scaled down to 1.
site_tolerance (float): This tolerance is used to determine if two sites are at the same position,
in which case they will be combined to a single disordered site. Defaults to 1e-4.
frac_tolerance (float): This tolerance is used to determine is a coordinate should be rounded to an ideal
value. e.g. 0.6667 is rounded to 2/3. This is desired if symmetry operations are going to be applied.
Expand Down Expand Up @@ -1027,11 +1027,11 @@ def get_matching_coord(

# Get occupancy
try:
occu = str2float(data["_atom_site_occupancy"][idx])
occu: float = str2float(data["_atom_site_occupancy"][idx])
except (KeyError, ValueError):
occu = 1

# If check_occu is True or the occupancy is greater than 0, create comp_d
# If don't check_occu or the occupancy is greater than 0, create comp_dict
if not check_occu or occu > 0:
# Create site coordinate
coord: Vector3D = (
Expand Down Expand Up @@ -1073,7 +1073,7 @@ def get_matching_coord(

if any(occu > 1 for occu in _sum_occupancies):
msg = (
f"Some occupancies ({_sum_occupancies}) sum to > 1! If they are within "
f"Some occupancies ({filter(lambda x: x<=1, _sum_occupancies)}) sum to > 1! If they are within "
"the occupancy_tolerance, they will be rescaled. "
f"The current occupancy_tolerance is set to: {self._occupancy_tolerance}"
)
Expand Down Expand Up @@ -1149,7 +1149,10 @@ def get_matching_coord(
all_species_noedit = all_species.copy() # save copy before scaling in case of check_occu=False, used below
for idx, species in enumerate(all_species):
total_occu = sum(species.values())
if 1 < total_occu <= self._occupancy_tolerance:
if check_occu and total_occu > self._occupancy_tolerance:
raise ValueError(f"Occupancy {total_occu} exceeded tolerance.")
Copy link
Contributor Author

@DanielYang59 DanielYang59 May 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would help capture the occupancy > tolerance error.

Without this, if an occupancy is greater than tolerance, it would not be scaled, and passed directly into Structure. The Exception raised by Structure (because of unphysical occupancy) would be compressed and replaced with a general message which doesn't show the reason for failure:

pymatgen/pymatgen/io/cif.py

Lines 1291 to 1301 in bb68c78

for idx, data in enumerate(self._cif.data.values()):
try:
if struct := self._get_structure(data, primitive, symmetrized, check_occu=check_occu):
structures.append(struct)
except (KeyError, ValueError) as exc:
msg = f"No structure parsed for section {idx + 1} in CIF.\n{exc}"
if on_error == "raise":
raise ValueError(msg) from exc
if on_error == "warn":
warnings.warn(msg)

For example the following error message provided in #3816:

Some occupancies ([2.0, 2.0, 2.0, 1.0, 1.0]) sum to > 1! If they are within the occupancy_tolerance, they will be rescaled. The current occupancy_tolerance is set to: 1.0
No structure parsed for section 1 in CIF.
...
in CifParser.parse_structures(self, primitive, symmetrized, check_occu, on_error)
   1219     warnings.warn("Issues encountered while parsing CIF: " + "\n".join(self.warnings))
   1221 if len(structures) == 0:
-> 1222     raise ValueError("Invalid CIF file with no structures!")
   1223 return structures

ValueError: Invalid CIF file with no structures!


if total_occu > 1:
all_species[idx] = species / total_occu

if all_species and len(all_species) == len(all_coords) and len(all_species) == len(all_magmoms):
Expand Down Expand Up @@ -1198,6 +1201,7 @@ def get_matching_coord(
all_coords[idx],
lattice,
properties=site_properties,
label=all_labels[idx],
skip_checks=True,
)

Expand Down Expand Up @@ -1278,8 +1282,6 @@ def parse_structures(
"in the CIF file as is. If you want the primitive cell, please set primitive=True explicitly.",
UserWarning,
)
if not check_occu: # added in https://github.com/materialsproject/pymatgen/pull/2836
warnings.warn("Structures with unphysical site occupancies are not compatible with many pymatgen features.")
Copy link
Contributor Author

@DanielYang59 DanielYang59 May 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This warning should not be raised just because if not check_occu, it should really check the occupancy and only warn if the occupancy is "unphysical".

The following code has already done this, so I would suggest removing it:

pymatgen/pymatgen/io/cif.py

Lines 1011 to 1018 in 2e1c301

if any(occu > 1 for occu in sum_occu):
msg = (
f"Some occupancies ({sum_occu}) sum to > 1! If they are within "
"the occupancy_tolerance, they will be rescaled. "
f"The current occupancy_tolerance is set to: {self._occupancy_tolerance}"
)
warnings.warn(msg)
self.warnings.append(msg)


if primitive and symmetrized:
raise ValueError(
Expand Down
15 changes: 13 additions & 2 deletions tests/io/test_cif.py
Original file line number Diff line number Diff line change
Expand Up @@ -731,17 +731,28 @@ def test_empty(self):
cb2 = CifBlock.from_str(str(cb))
assert cb == cb2

def test_bad_cif(self):
def test_bad_occu(self):
filepath = f"{TEST_FILES_DIR}/cif/bad_occu.cif"
parser = CifParser(filepath)
with pytest.raises(
ValueError, match="No structure parsed for section 1 in CIF.\nSpecies occupancies sum to more than 1!"
Copy link
Contributor Author

@DanielYang59 DanielYang59 May 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previous error message might be misleading, as this fails because the occupancy is greater than tolerance, not 1.

ValueError, match="No structure parsed for section 1 in CIF.\nOccupancy 1.556 exceeded tolerance."
):
parser.parse_structures(on_error="raise")
parser = CifParser(filepath, occupancy_tolerance=2)
struct = parser.parse_structures()[0]
assert struct[0].species["Al3+"] == approx(0.778)

def test_not_check_occu(self):
# Test large occupancy with check_occu turned off
with open(f"{TEST_FILES_DIR}/cif/site_type_symbol_test.cif") as cif_file:
cif_str = cif_file.read()
cif_str = cif_str.replace("Te Te 1.0000", "Te_label Te 10.0", 1)

structs = CifParser.from_str(cif_str).parse_structures(check_occu=False)

assert len(structs) > 0
assert set(structs[0].labels) == {"Te_label", "Ge"}

def test_one_line_symm(self):
cif_file = f"{TEST_FILES_DIR}/cif/OneLineSymmP1.cif"
parser = CifParser(cif_file)
Expand Down