# Featurize HOMO/LUMO

In [11]:
import os
import itertools
import math

def find_alpha_occ_and_virt_eigenvalues(file_path):
    """Return the last occupied and first virtual alpha eigenvalue of a log.

    The file is scanned top to bottom.  Every line containing
    ``' Alpha  occ.'`` updates the remembered "last occupied" value (the
    final whitespace-separated token of that line).  The first line that
    starts with ``' Alpha virt.'`` after at least one occupied line ends the
    scan; its fifth token is taken as the first virtual value.

    NOTE(review): the scan stops at the first occupied/virtual pair, so any
    later orbital listing in the same file is ignored -- confirm that this
    is the listing that should be featurized.

    Args:
        file_path: Path of the log file to read.

    Returns:
        ``(last_occupied, first_virtual)`` as the raw strings found in the
        file, or ``(None, None)`` if no such pair exists or any error occurs
        (the error is printed, not raised).
    """
    try:
        with open(file_path, 'r') as file:
            last_occupied = None

            for line in file:
                if ' Alpha  occ.' in line:
                    # Keep overwriting: only the final occupied value matters
                    last_occupied = line.split()[-1]
                    continue

                if last_occupied is None or not line.startswith(' Alpha virt.'):
                    continue

                # Tokens 0-3 are the line label; token 4 is the first value
                return last_occupied, line.split()[4]

            # No occupied line followed by a virtual line was found
            return None, None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None, None

def process_files_in_folder(folder_path, output_file_path):
    """Write the occupied/virtual eigenvalue pair of every .log file to a text file.

    Each file in ``folder_path`` whose name ends with ``.log`` is passed to
    ``find_alpha_occ_and_virt_eigenvalues`` and one line of the form
    ``File: <name> ,   <occupied> ,    <virtual>`` is written to
    ``output_file_path`` (which is created or overwritten).

    Args:
        folder_path: Directory to scan (not recursive).
        output_file_path: Path of the text file to write.

    Returns:
        None.  Any exception is printed rather than raised.
    """
    try:
        with open(output_file_path, 'w') as output_file:
            log_names = (name for name in os.listdir(folder_path) if name.endswith(".log"))

            for filename in log_names:
                occupied, virtual = find_alpha_occ_and_virt_eigenvalues(
                    os.path.join(folder_path, filename)
                )
                # One record per file: name, last occupied, first virtual
                output_file.write(f"File: {filename} ,   {occupied} ,    {virtual}\n")
    except Exception as e:
        print(f"An error occurred: {e}")

# Directory that is scanned for the .log files to featurize
folder_path = '../Data/DFT_structures/'
# Text file that receives one "File: <name> , <occupied> , <virtual>" line per .log file
output_file_path = '../Data/HOMOLUMO.txt'

process_files_in_folder(folder_path, output_file_path)


# Featurize polarizability

In [8]:
def extract_polarizabilities(file_path):
    """Collect exact and isotropic polarizability entries from a log file.

    Lines starting with ``'  Exact polarizability'`` contribute every token
    after the second one, converted to ``float``.  Lines starting with
    ``' Isotropic polarizability'`` contribute every token after the third
    one, kept as raw strings.  Values from repeated lines are accumulated in
    file order.

    Args:
        file_path: Path of the log file to read.

    Returns:
        A 2-tuple ``(exact, isotropic)`` where ``exact`` is the float values
        re-rendered with ``str`` and joined by ``', '`` and ``isotropic`` is
        a list of string tokens.  ``(None, None)`` is returned if any error
        occurs (the error is printed, not raised).
    """
    try:
        exact_values = []
        isotropic_tokens = []

        with open(file_path, 'r') as file:
            for line in file:
                if line.startswith('  Exact polarizability'):
                    # Skip the two label tokens, keep the numbers
                    exact_values += [float(token) for token in line.split()[2:]]
                elif line.startswith(' Isotropic polarizability'):
                    # Skip the three label tokens, keep the rest verbatim
                    isotropic_tokens += line.split()[3:]

        exact_text = ', '.join(str(value) for value in exact_values)
        return exact_text, isotropic_tokens
    except Exception as e:
        print(f"An error occurred: {e}")
        return None, None

def process_files_in_folder(folder_path, output_file_path):
    """Write the polarizabilities of every .log file in a folder to a text file.

    Each file in ``folder_path`` whose name ends with ``.log`` is passed to
    ``extract_polarizabilities``.  For every file that yields an isotropic
    entry, one line of the form
    ``File: <name> ,  <exact values>,  <isotropic value>`` is written to
    ``output_file_path`` (which is created or overwritten).  The isotropic
    value is the third token collected from the isotropic line(s).

    A file that cannot be parsed, or that has no isotropic polarizability
    line, is reported on stdout and skipped; it no longer aborts the scan of
    the remaining files.

    Args:
        folder_path: Directory to scan (not recursive).
        output_file_path: Path of the text file to write.

    Returns:
        None.  I/O errors on the folder or output file are printed rather
        than raised.
    """
    try:
        with open(output_file_path, 'w') as output_file:
            for filename in os.listdir(folder_path):
                if not filename.endswith(".log"):
                    continue

                file_path = os.path.join(folder_path, filename)
                exact_polarizabilities, isotropic_polarizabilities = extract_polarizabilities(file_path)

                # extract_polarizabilities returns (None, None) on failure and
                # a short list when no isotropic line was present; indexing
                # [2] blindly would raise and end the whole loop.
                if isotropic_polarizabilities is None or len(isotropic_polarizabilities) < 3:
                    print(f"An error occurred processing file {filename}: no isotropic polarizability found")
                    continue

                output_file.write(f"File: {filename} ,  {exact_polarizabilities},  {isotropic_polarizabilities[2]}\n")

    except OSError as e:
        # Unreadable folder or unwritable output file
        print(f"An error occurred: {e}")

# Text file that receives one polarizability line per .log file
output_file_path = '../Data/n_polar.txt'

# NOTE(review): this scans the current working directory ("./") rather than
# the '../Data/DFT_structures/' folder used by the other cells -- confirm
# that the polarizability logs really live next to the notebook.
process_files_in_folder("./", output_file_path)

# Featurize size and shape

In [12]:


def calculate_distance(point1, point2):
    """Return the Euclidean distance between two points in 3D.

    Only the first three components (indices 0, 1 and 2) of each point are
    used.

    Args:
        point1: Indexable with numeric x, y, z at positions 0-2.
        point2: Indexable with numeric x, y, z at positions 0-2.

    Returns:
        The straight-line distance as a float.
    """
    dx = point1[0] - point2[0]
    dy = point1[1] - point2[1]
    dz = point1[2] - point2[2]
    return math.sqrt(dx**2 + dy**2 + dz**2)

def calculate_max_distance(coordinates):
    """Return the largest pairwise distance within a set of points.

    Every unordered pair of points is measured with ``calculate_distance``.

    Args:
        coordinates: Iterable of 3D points.

    Returns:
        The greatest distance found, or ``0.0`` when there are fewer than
        two points.
    """
    longest = 0.0

    for first, second in itertools.combinations(coordinates, 2):
        separation = calculate_distance(first, second)
        if separation > longest:
            longest = separation

    return longest


def calculate_bounding_box_dimensions(coordinates):
    """Return the axis-aligned bounding-box edge lengths, largest first.

    Args:
        coordinates: Iterable of points, each unpackable into exactly three
            numbers ``(x, y, z)``.

    Returns:
        A list of three extents (max minus min along x, y and z) sorted in
        descending order.  With no points at all the running minima and
        maxima keep their infinite start values, so every extent is
        ``-inf``.
    """
    lows = [float('inf')] * 3
    highs = [float('-inf')] * 3

    for point in coordinates:
        x, y, z = point  # enforces exactly three components per point
        for axis, value in enumerate((x, y, z)):
            lows[axis] = min(lows[axis], value)
            highs[axis] = max(highs[axis], value)

    extents = [high - low for low, high in zip(lows, highs)]
    return sorted(extents, reverse=True)

def read_coordinates_from_log(log_file_path):
    """Read the last Cartesian coordinate table from a .log file.

    The file is searched backwards for the final line containing the
    ``Center / Atomic / Atomic / Coordinates (Angstroms)`` table header.
    The atom rows begin three lines below that header and run up to the
    next line that consists solely of dashes.  Rows that do not split into
    exactly six tokens are ignored; the last three tokens of each accepted
    row are parsed as x, y, z.

    Args:
        log_file_path: Path of the log file to parse.

    Returns:
        A list of ``(x, y, z)`` float tuples, one per table row.

    Raises:
        ValueError: If the file contains no coordinate table header, if the
            table is not closed by a dashed separator line, or if a
            coordinate token is not a valid float.
        OSError: If the file cannot be opened or read.
    """
    header = " Center     Atomic      Atomic             Coordinates (Angstroms)"

    with open(log_file_path, 'r') as log_file:
        lines = log_file.readlines()

    # Walk backwards so the last table in the file wins.  A default is given
    # to next() so a missing header becomes a descriptive ValueError instead
    # of a bare StopIteration with an empty message.
    header_index = next(
        (i for i in range(len(lines) - 1, -1, -1) if header in lines[i]),
        None,
    )
    if header_index is None:
        raise ValueError(f"No coordinate table header found in {log_file_path}")

    # Skip the header line, the column-title line and the dashed rule
    start_index = header_index + 3

    # The table ends at the first dashes-only line after the atom rows
    end_index = next(
        (i for i in range(start_index, len(lines)) if set(lines[i].strip()) == {"-"}),
        None,
    )
    if end_index is None:
        raise ValueError(f"Coordinate table in {log_file_path} is not terminated by a dashed line")

    coordinates = []
    for line in lines[start_index:end_index]:
        values = line.split()
        if len(values) == 6:
            # Columns 4-6 hold x, y, z
            x, y, z = map(float, values[3:])
            coordinates.append((x, y, z))

    return coordinates

In [13]:
def process_files_in_folder(folder_path):
    """Compute size and shape descriptors for every .log file in a folder.

    For each file whose name ends with ``.log`` the coordinates are read
    with ``read_coordinates_from_log`` and reduced to the largest pairwise
    distance and the sorted bounding-box dimensions.  A file that raises
    during processing is reported on stdout and left out of the result.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A list of dicts with the keys ``'filename'``, ``'max_distance'`` and
        ``'dimensions'``, in directory-listing order.
    """
    results = []
    log_names = [name for name in os.listdir(folder_path) if name.endswith(".log")]

    for filename in log_names:
        file_path = os.path.join(folder_path, filename)

        try:
            coordinates = read_coordinates_from_log(file_path)
            results.append({
                'filename': filename,
                'max_distance': calculate_max_distance(coordinates),
                'dimensions': calculate_bounding_box_dimensions(coordinates),
            })
        except Exception as e:
            print(f"An error occurred processing file {filename}: {e}")

    return results

# Directory that is scanned for the .log files to featurize
folder_path = '../Data/DFT_structures'

results = process_files_in_folder(folder_path)

# Write one comma-separated line per file: the file name, its max_distance,
# then the three entries of its 'dimensions' list in stored order
output_file_path = '../Data/results_dist.txt'
with open(output_file_path, 'w') as output_file:
    for result in results:
        output_file.write(f"File: {result['filename']}, {result['max_distance']}, {result['dimensions'][0]}, {result['dimensions'][1]}, {result['dimensions'][2]}\n")


# Featurize %Vbur

In [16]:
import dbstep.Dbstep as db




# Whitespace-separated table; column 2 holds a structure file name and
# column 4 the atom ID handed to DBSTEP for that structure
filename_list_file = "../Data/Atom_IDs.txt"

# Folder containing the structure files named in the table
directory_path = "../Data/DFT_structures"
# Text file that receives one buried-volume line per structure
output_file = "../Data/Vbur_cat.txt"

# Parse the table once, skipping blank lines, so that file names and atom IDs
# are guaranteed to come from the same rows
with open(filename_list_file, "r") as filename_list:
    id_rows = [line.split() for line in filename_list if line.strip()]
target_filenames = [row[1] for row in id_rows]
AIDs = [row[3] for row in id_rows]

with open(output_file, "w") as file:
    for target_filename, AID in zip(target_filenames, AIDs):
        file_path = os.path.join(directory_path, target_filename)

        # NOTE(review): AID is passed as a second positional argument.
        # Confirm against the DBSTEP documentation that the atom is honoured
        # this way rather than needing a keyword such as atom1=AID.
        result = db.dbstep(file_path, AID, commandline=True, volume=True, measure='classic', r=7.5).bur_vol

        file.write(f"Results for file {target_filename}:    {result}\n")

      R/Å     %V_Bur     %S_Bur
     7.50      11.11       0.00
      R/Å     %V_Bur     %S_Bur
     7.50      13.54       0.00
      R/Å     %V_Bur     %S_Bur
     7.50      11.31       0.00
      R/Å     %V_Bur     %S_Bur
     7.50      10.69       0.00
      R/Å     %V_Bur     %S_Bur
     7.50       9.28       0.00
      R/Å     %V_Bur     %S_Bur
     7.50      10.69       0.00
      R/Å     %V_Bur     %S_Bur
     7.50       7.95       0.00
      R/Å     %V_Bur     %S_Bur
     7.50       8.30       0.00
      R/Å     %V_Bur     %S_Bur
     7.50      11.02       0.00
      R/Å     %V_Bur     %S_Bur
     7.50      11.85       0.00
      R/Å     %V_Bur     %S_Bur
     7.50       8.86       0.00
      R/Å     %V_Bur     %S_Bur
     7.50      11.58       0.00
      R/Å     %V_Bur     %S_Bur
     7.50      15.80       0.00
      R/Å     %V_Bur     %S_Bur
     7.50      11.87       0.00
      R/Å     %V_Bur     %S_Bur
     7.50      12.25       0.00
      R/Å     %V_Bur     %S_Bur
     7.5

After featurization is complete, the generated files have to be merged manually, and a SMILES string has to be added to each entry to link it to the original database. The final data are in catalyst_sterimol_data.csv