In [20]:
import os
import csv
import re
from pymatgen.core import Structure
from pymatgen.symmetry.groups import SpaceGroup
from pymatgen.core import Composition

In [None]:
# Path to the folder containing the CIF files
folder_path = "data/cif"
output_csv = "cif.csv"

# Export one CSV row per "*.cif" file found directly inside folder_path.
# Files that pymatgen cannot read or analyse are reported and skipped.
with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Header row; the order must match the data row assembled in the loop below
    writer.writerow([
        "cod_database_code",               # CIF file name (extension included)
        "chemical_formula_structural",     # structure.formula
        "chemical_name_mineral",           # filled with the reduced formula
        "symmetry_cell_setting",           # crystal system, lower-cased
        "symmetry_lattice_setting",        # space group symbol
        "space_group_it_number",           # space group number
        "lattice_angle_alpha",             # Cell_Angle_Alpha
        "lattice_angle_beta",              # Cell_Angle_Beta
        "lattice_angle_gamma",             # Cell_Angle_Gamma
        "lattice_length_a",                # Cell_Length_A
        "lattice_length_b",                # Cell_Length_B
        "lattice_length_c",                # Cell_Length_C
        "lattice_volume"                   # Cell_Volume
    ])

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the CSV identical from one run to the next.
    for cif_file in sorted(os.listdir(folder_path)):
        if not cif_file.endswith(".cif"):
            continue

        cif_path = os.path.join(folder_path, cif_file)
        try:
            # Read the structure from the CIF file using pymatgen
            structure = Structure.from_file(cif_path)

            # Determine the space group once and reuse both values for the
            # row below instead of repeating the symmetry search per column.
            space_group_symbol, space_group_it_number = structure.get_space_group_info()

            # Use the SpaceGroup class to access the crystal system
            space_group = SpaceGroup(space_group_symbol)

            lattice = structure.lattice
            alpha, beta, gamma = lattice.angles

            writer.writerow([
                cif_file,                               # cod_database_code
                structure.formula,                      # chemical_formula_structural
                structure.composition.reduced_formula,  # chemical_name_mineral
                space_group.crystal_system.lower(),     # symmetry_cell_setting
                space_group_symbol,                     # symmetry_lattice_setting
                space_group_it_number,                  # space_group_it_number
                alpha,                                  # lattice_angle_alpha
                beta,                                   # lattice_angle_beta
                gamma,                                  # lattice_angle_gamma
                lattice.a,                              # lattice_length_a
                lattice.b,                              # lattice_length_b
                lattice.c,                              # lattice_length_c
                lattice.volume,                         # lattice_volume
            ])

        except Exception as e:
            # Best effort: report the failing file and continue with the next one
            print(f"Error processing {cif_file}: {e}")

print(f"CSV file has been created: {output_csv}")

In [25]:
# Path to the folder containing the RRUFF files (only "*.txt" entries are read below)
ruff_folder_path = "data/ruff"
# Destination CSV; it is opened in write mode below, so an existing file is replaced
output_csv = "ruff.csv"

# Helper that normalises a raw formula string before it is handed to pymatgen
def clean_formula(formula):
    """Normalise a raw formula string so pymatgen's ``Composition`` can parse it.

    Underscores and equals signs are removed and every ``OH`` is split into
    ``O H``. Parentheses are deliberately kept: they carry the group
    multipliers (``Ca5(PO4)3F``), and dropping them would fuse a subscript
    with the multiplier that follows it (``PO43``), silently changing the
    stoichiometry.

    Args:
        formula: Raw formula text, e.g. ``"Ca_5_(PO_4_)_3_F"``.

    Returns:
        The cleaned formula string, e.g. ``"Ca5(PO4)3F"``.
    """
    # Remove unwanted markup characters (_ and =)
    formula = re.sub(r'[_=]', '', formula)
    # Separate hydroxyl O and H (OH -> O H)
    formula = formula.replace('OH', 'O H')
    return formula

def _first_group(pattern, text, flags=0):
    """Return group 1 of the first match of ``pattern`` in ``text``, or None if there is no match."""
    match = re.search(pattern, text, flags)
    return match.group(1) if match else None


# Export one CSV row per "*.txt" file found directly inside ruff_folder_path.
# Header fields that are absent from a file are written as empty cells.
with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Header row; the order must match the data row written in the loop below
    writer.writerow([
        "ruff_id",                     # value of ##RRUFFID
        "chemical_formula_structural", # normalised ##IDEAL CHEMISTRY formula
        "chemical_name_mineral",       # value of ##NAMES
        "symmetry_cell_setting",       # crystal system
        "symmetry_lattice_setting",    # crystal system again (no separate value is parsed)
        "space_group_it_number",       # always empty for these files
        "lattice_angle_alpha",         # Cell_Angle_Alpha
        "lattice_angle_beta",          # Cell_Angle_Beta
        "lattice_angle_gamma",         # Cell_Angle_Gamma
        "lattice_length_a",            # Cell_Length_A
        "lattice_length_b",            # Cell_Length_B
        "lattice_length_c",            # Cell_Length_C
        "lattice_volume"               # Cell_Volume
    ])

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order of the CSV identical from one run to the next.
    for ruff_file in sorted(os.listdir(ruff_folder_path)):
        if not ruff_file.endswith(".txt"):
            continue

        ruff_path = os.path.join(ruff_folder_path, ruff_file)
        try:
            # Only ASCII header fields are extracted, so undecodable bytes
            # elsewhere in the file are replaced instead of aborting the record.
            with open(ruff_path, 'r', encoding='utf-8', errors='replace') as ruff_file_obj:
                ruff_data = ruff_file_obj.read()

            # Extract data using regular expressions (None when a field is missing)
            ruff_id = _first_group(r"##RRUFFID=(\S+)", ruff_data)
            chemical_formula_raw = _first_group(r"##IDEAL CHEMISTRY=([\w\(\)_\d\+\-]+)", ruff_data)
            chemical_name = _first_group(r"##NAMES=(\S+)", ruff_data)
            cell_params = _first_group(r"##CELL PARAMETERS=(.+?)crystal system:", ruff_data, re.DOTALL)
            crystal_system = _first_group(r"crystal system: (\S+)", ruff_data)

            if cell_params is not None:
                cell_params = cell_params.strip()

            # Normalise the formula. A formula pymatgen rejects must not cost
            # the whole record, so the failure is reported and the cell is
            # left empty while the remaining fields are still written.
            formula = None
            if chemical_formula_raw:
                try:
                    formula = Composition(clean_formula(chemical_formula_raw)).formula
                except Exception as e:
                    print(f"Could not parse formula {chemical_formula_raw!r} in {ruff_file}: {e}")

            # Parse "name: value" pairs. The value pattern admits parentheses,
            # so a parenthesised suffix on a number is kept verbatim as text.
            if cell_params:
                cell_dict = dict(re.findall(r"([a-z]+):\s?([\d\.\(\)]+)", cell_params))
            else:
                cell_dict = {}

            space_group_it_number = None  # Assuming RUFF files don't have space group info

            # Write the data to the CSV file
            writer.writerow([
                ruff_id,                    # ruff_id
                formula,                    # chemical_formula_structural (normalised formula)
                chemical_name,              # chemical_name_mineral
                crystal_system,             # symmetry_cell_setting
                crystal_system,             # symmetry_lattice_setting (same as symmetry_cell_setting)
                space_group_it_number,      # space_group_it_number
                cell_dict.get('alpha'),     # lattice_angle_alpha
                cell_dict.get('beta'),      # lattice_angle_beta
                cell_dict.get('gamma'),     # lattice_angle_gamma
                cell_dict.get('a'),         # lattice_length_a
                cell_dict.get('b'),         # lattice_length_b
                cell_dict.get('c'),         # lattice_length_c
                cell_dict.get('volume'),    # lattice_volume
            ])

        except Exception as e:
            # Best effort: report the failing file and continue with the next one
            print(f"Error processing {ruff_file}: {e}")

print(f"CSV file has been created: {output_csv}")

CSV file has been created: ruff.csv
