# Data Collection and Cleaning

The challenge we wanted to face was determining potential hazards given only the SMILES representation of the target molecule. This meant that the only data we could use to compare our target molecule with the molecules in our model had to be extracted from the SMILES string. This ended up being the largest challenge of our project, and we believe that where our project truly shines is in the data collection. This is where the bulk of our time and work went. While attempting to solve the problem, we realized that it would be best to create Python files to help keep track of all of the logic. Our presentation explains what is going on best (https://docs.google.com/presentation/d/1mM9jVTphG-KGLMMnJ64jOobA0GGqeCZog6zPpdtnB7w/edit#slide=id.g27324927fda_0_110).

## Conversion of SMILES to groupings

**sub_group_from_smiles.py**

File that handles the conversion of SMILES strings to substituents

In [None]:
import ring_handling_w_rdkit as ring_handle



def smiles_to_sub_groups(SMILES_string : str) -> tuple|bool:
    """
    Input: SMILES_string smiles representation of a molecule in string format
    Output: tuple containing tuple of all subgroups of the given molecule at index 0,
            and tuple of all combinations of adjacent subgroups at index 1
    Ex Input: "CC1=C(C(=CC=C1)[N+](=O)[O-])N"
    Ex Output: "(('0CCCCCC', 'C', '[N+]', '=O', '[O-]', 'N'), (('CCCCCC', 'C'), ('[N+]', 'CCCCCC'), ('[N+]', '=O'), ('[O-]', '[N+]'), ('N', 'CCCCCC')))"
    Note: subgroups that represent rings will have "0" at index 0 as a marker
    """

    if type(SMILES_string) != str:
        print("SMILES_string is of wrong type")
        return False

    try:
        sub_group_index_list = subgroups_from_SMILES(SMILES_string)
        test_neighbors_dict = determine_group_neighbors(SMILES_string, sub_group_index_list)
        cleaned_dict = clean_subgroup_dict(test_neighbors_dict, SMILES_string)
        group_list = get_group_list(cleaned_dict)
        combination_groups = create_group_combinations(cleaned_dict)

        return group_list, combination_groups

    except ValueError:
        print("Invalid SMILES_string")
        return False



def subgroups_from_SMILES(SMILES_string: str) -> list:
    """
    Partition the characters of a SMILES string into named groups of string indices.

    Input: SMILES_string is a string of the SMILES representation
    Output: list of (group_name, [string indices]) tuples. Ring groups are listed first and are named
            "1", "2", ... (the str of the ring number). Every other group is named with consecutive
            characters starting at "a", and each name is used for exactly one group. The list is
            passed through clean_subgroup_index_list() before it is returned.
    Raises: ValueError if ring_handle.ring_handling_rdkit() returns False for the string
    """
    subgroup_index_list = []
    ring_index_lookup = {}  # string index -> ring numbers, only filled when rings are present

    if "1" in SMILES_string:  # a ring-closure digit "1" is used as the sign that rings may exist
        ring_indices_lists = ring_handle.ring_handling_rdkit(SMILES_string)
        if ring_indices_lists is False:
            # ring_handling_rdkit reports an unparsable string with False. Raise ValueError here so
            # that callers catching ValueError handle it, instead of a TypeError from iterating a bool.
            raise ValueError("Invalid SMILES_string")

        ring_index_lookup = ring_handle.create_dict_of_ring_indices(ring_indices_lists)
        for ring_num, ring_indices in enumerate(ring_indices_lists, start=1):
            subgroup_index_list.append((str(ring_num), ring_indices))

    current_sub_group = []
    current_group_ascii = 97  # ord("a")
    current_group_name = chr(current_group_ascii)
    last_index = len(SMILES_string) - 1

    for i, char in enumerate(SMILES_string):
        if i in ring_index_lookup or char.isdigit():
            # Ring members already belong to a ring group, and digits are never part of a group.
            # Whatever was collected before this character becomes its own group.
            # The name is advanced AFTER the group is stored so the next group cannot reuse it.
            if current_sub_group:
                subgroup_index_list.append((current_group_name, current_sub_group))
                current_group_ascii += 1
                current_group_name = chr(current_group_ascii)
            current_sub_group = []
            continue

        if char in ("(", ")"):
            if char == "(":
                current_sub_group.append(i)  # including the opening paren helps with finding neighbors later
            subgroup_index_list.append((current_group_name, current_sub_group))

            # a closing paren starts the next group
            current_sub_group = [i] if char == ")" else []

            current_group_ascii += 1
            current_group_name = chr(current_group_ascii)
        else:  # wanted character types to represent a subgroup
            current_sub_group.append(i)

        if i == last_index:  # for items at the very end of the string
            subgroup_index_list.append((current_group_name, current_sub_group))

    return clean_subgroup_index_list(subgroup_index_list, SMILES_string)

def clean_subgroup_index_list(subgroup_index_list: list, SMILES_string: str) -> list:
    """
    Drop groups that carry no useful information.

    Input: subgroup_index_list is a list of (group_name, [string indices]) tuples. SMILES_string is the
           string the indices refer to
    Output: the same list object, modified in place, without groups whose index list is empty and
            without single-index groups whose character is not a letter
    """

    def is_unwanted(entry) -> bool:
        indices = entry[1]  # entry[0] is the group name
        if not isinstance(indices, list):
            print(f"Error: Expected list, got {type(indices)} at {entry}")

        index_count = len(indices)
        if index_count == 0:
            return True
        # a lone bond symbol, paren, etc. is not a helpful group on its own
        return index_count == 1 and not SMILES_string[indices[0]].isalpha()

    # slice assignment keeps the caller's list object, matching in-place removal
    subgroup_index_list[:] = [entry for entry in subgroup_index_list if not is_unwanted(entry)]
    return subgroup_index_list


def create_group_dict(subgroup_index_list: list) -> dict:
    """
    Index the subgroups by their symbol and give each one an empty neighbor list.

    Input: subgroup_index_list is a sequence of subgroups, each holding the group's symbol at [0] and
           its indices list at [1]
    Output: dictionary in the format {group_symbol: [subgroup, []]}, where the second item is a fresh
            list that will later hold neighbor symbols. If a symbol occurs more than once, the last
            subgroup with that symbol is the one kept.
    """

    return {entry[0]: [entry, []] for entry in subgroup_index_list}



def grouping_bridges_dict(SMILES_string: str) -> dict:
    """
    Pair every opening parenthesis with its matching closing parenthesis.

    Input: SMILES string
    Output: dict mapping the index of each "(" to the index of its matching ")". Parentheses without
            a partner are left out. Keys are in ascending order.
    Ex Input: "Cc(c(o)ccccc)C"
    Ex Output: {2: 12, 4: 6}
    """
    open_paren_stack = []  # indices of "(" that have not been closed yet
    matches = {}

    # Single pass: the most recent unclosed "(" is always the partner of the next ")"
    for index, char in enumerate(SMILES_string):
        if char == "(":
            open_paren_stack.append(index)
        elif char == ")" and open_paren_stack:
            matches[open_paren_stack.pop()] = index

    # Inner pairs close first, so sort to present the pairs by opening index
    return dict(sorted(matches.items()))



def determine_group_neighbors(SMILES_string: str, subgroup_index_list: list) -> dict:
    """
    Work out which subgroups touch or overlap each other in the SMILES string.

    Input: SMILES_string is a string of the SMILES representation. subgroup_index_list is a list of lists of subgroup indices.
           subgroup_index_list should have been created by subgroups_from_SMILES()
    Output: dict mapping subgroups to adjacent/ overlapping subgroups. Connections are one directional. dict follows
            the format of {group_symbol: [[group_symbol, [group_indices]], [neighbor_list]]}
    Note: the index lists inside subgroup_index_list are modified in place. For every "(" index a group
          contains, the index of the matching ")" is appended to that same group's index list.
    """
    group_bridge_dict = grouping_bridges_dict(SMILES_string)
    subgroup_dict = create_group_dict(subgroup_index_list)
    for this_subgroup in subgroup_index_list:
        this_subgroup_symbol = this_subgroup[0]
        this_subgroup_indices = this_subgroup[1]

        # The list grows while it is being iterated. Only ")" indices are appended, and the check
        # below looks the index up among the "(" keys of group_bridge_dict.
        for i in this_subgroup_indices:
            if i in group_bridge_dict:
                # if i is a key in this dict, then it is a start paren "(" index
                # "(" index maps to corresponding ")" index
                bridged_index = group_bridge_dict[i]

                # add to list so that this group, and the group originally containing the ")" index are marked as neighbors
                this_subgroup_indices.append(bridged_index)


        # Compare this group against every group (including itself, which is skipped below)
        for other_subgroup in subgroup_index_list:

            for index in this_subgroup_indices:
                neighbor_index_back = index - 1
                neighbor_index_front = index + 1

                other_subgroup_index_list = other_subgroup[1]
                other_subgroup_symbol = other_subgroup[0]
                other_subgroup_neighbors = subgroup_dict[other_subgroup_symbol][1]

                # A ")" index in this group is never used as the starting point of a comparison
                if SMILES_string[index] == ")":
                    continue

                # Do not count the other group as a neighbor just because its ")" directly follows this index
                if neighbor_index_front in other_subgroup_index_list and SMILES_string[neighbor_index_front] == ")":
                    continue

                # A group is not its own neighbor
                if this_subgroup_symbol == other_subgroup_symbol:
                    break

                elif  this_subgroup_symbol in other_subgroup_neighbors:
                    # We want each neighbor connection to be one directional
                    # so that there are no duplicate tuples later on.

                    break

                elif (index in other_subgroup_index_list
                        or neighbor_index_back in other_subgroup_index_list
                        or neighbor_index_front in other_subgroup_index_list):
                    # Then they are neighbors
                    # Example subgroup: key -> [(key, [index_list]), [neighbor_symbol_list]

                    subgroup_dict[this_subgroup_symbol][1].append(other_subgroup_symbol)
                    break # stop checking indices for current other_subgroup

    return subgroup_dict



def clean_subgroup_dict(subgroup_dict: dict, SMILES_string: str) -> dict:
    """
    Replace each group's index list with the characters it stands for.

    Input: subgroup_dict should have been created by determine_group_neighbors() and follows the format of
           {group_symbol: [[group_symbol, [group_indices]], [neighbor_list]]}, SMILES_string is a string
    Output: dict of the format {group_symbol: [group_string, [neighbor_symbols]]}. Parentheses and digits
            are left out of group_string. A group whose indices include a digit is treated as a ring
            and gets "0" placed in front of its group_string as a marker.
    """

    cleaned_subgroup_dict = {}
    for group_key, subgroup_info in subgroup_dict.items():
        group_index_list = subgroup_info[0][1]
        group_neighbor_list = subgroup_info[1]

        kept_chars = []
        is_ring = False  # set when a digit is seen; digits themselves are not kept
        for index in group_index_list:
            this_char = SMILES_string[index]

            if this_char in ("(", ")"):
                continue
            if this_char.isdigit():
                is_ring = True
                continue
            kept_chars.append(this_char)

        group_string = "".join(kept_chars)
        if is_ring:
            # Prefix the whole group string with the marker; using only the last character read
            # would throw away the rest of the ring.
            group_string = "0" + group_string

        cleaned_subgroup_dict[group_key] = [group_string, group_neighbor_list]

    return cleaned_subgroup_dict



def get_group_list(cleaned_subgroup_dict: dict) -> tuple:
    """
    Collect the non-empty group strings, marking the ones that belong to rings.

    Input: cleaned_subgroup_dict should have been created by clean_subgroup_dict() and follow the format
           {group_symbol: [group_string, [neighbor_symbols]]}
    Output: tuple of all subgroups of the format (group_string_1, group_string_2, ... ). Groups with an
            empty group_string are left out.
    Note: subgroups that represent rings will have "0" at index 0 as a marker. A group is treated as a
          ring when its key is an int or a str made only of digits, since ring groups are keyed by
          their ring number. A string that already starts with "0" is not marked a second time.
    """

    group_list = []
    for group_key, group_info in cleaned_subgroup_dict.items():
        group_string = group_info[0]

        if not group_string:
            continue

        # Ring keys may arrive as ints or as their str form ("1", "2", ...); accept both
        is_ring_key = type(group_key) is int or (isinstance(group_key, str) and group_key.isdigit())
        if is_ring_key and not group_string.startswith("0"):
            group_string = "0" + group_string

        group_list.append(group_string)

    return tuple(group_list)



def create_group_combinations(cleaned_subgroup_dict: dict) -> tuple:
    """
    Pair every group with each of its recorded neighbors.

    Input: cleaned_subgroup_dict this dictionary should have already been processed by clean_subgroup_dict().
            Format should match {group_symbol: [group_string, [neighbor_symbols]]}
    Output: tuple of (group_string, neighbor_group_string) tuples, one per recorded neighbor link.
            Links where either side is "", "=" or "#" are left out.

    Ex Output: (('CCCCCC', 'C'), ('[N+]', 'CCCCCC'), ('[N+]', '=O'), ('[O-]', '[N+]'), ('N', 'CCCCCC'))
    """

    unwanted_strings = ("", "=", "#")
    pairs = []

    for info in cleaned_subgroup_dict.values():
        group_string, neighbor_keys = info[0], info[1]
        if group_string in unwanted_strings:
            continue

        # look up each neighbor's string; neighbors that escaped cleaning are dropped
        neighbor_strings = (cleaned_subgroup_dict[key][0] for key in neighbor_keys)
        pairs.extend(
            (group_string, neighbor_string)
            for neighbor_string in neighbor_strings
            if neighbor_string not in unwanted_strings
        )

    return tuple(pairs)

ModuleNotFoundError: No module named 'ring_handling_w_rdkit'

**ring_handling_w_rdkit.py**

File that helps sub_group_from_smiles.py by handling cases where a SMILES string contains a ring. When there is a ring, atoms must be treated as nodes in a map and more complex logic is needed. RDKit was used to pull these rings, and then more logic was used to convert the objects it returns into indices of the string.

In [None]:
from rdkit import Chem
from rdkit.Chem import rdmolops



def ring_handling_rdkit(SMILES_string: str) -> list | bool:
    """
    Find the rings of a molecule and express them as positions in the SMILES string.

    Input: SMILES_string is a string of the SMILES representation
    Output: list with one list of string indices per ring found by RDKit. Returns False (after printing
            a message) if RDKit cannot build a molecule from the string
    """

    molecule = Chem.MolFromSmiles(SMILES_string, sanitize=False)
    if molecule is None:
        print("Invalid SMILES string.")
        return False

    # Each ring comes back from RDKit as a sequence of atom idx values; copy them into plain lists.
    # NOTE(review): the meaning of the second positional argument (False) should be confirmed
    # against the installed RDKit version.
    atom_idx_rings = [list(ring) for ring in rdmolops.GetSymmSSSR(molecule, False)]

    # Translate atom idx values into positions within the SMILES string
    idx_to_index_dict = map_idx_to_string_index(SMILES_string)
    return [
        idx_to_index_of_rings(ring_idxs, idx_to_index_dict, SMILES_string)
        for ring_idxs in atom_idx_rings
    ]



def map_idx_to_string_index(SMILES_string: str) -> dict:
    """
    Number the atoms of a SMILES string in the order they are written.

    Input: SMILES_string is a string of the SMILES representation
    Output: returns a dictionary that maps IDX values (0, 1, 2, ... in order of appearance) to the
            index in SMILES_string where that atom's symbol starts
    Note: one atom is counted per bracket expression (so the "H" in "[nH]" or the "a" in "[Na+]" is
          not a separate atom), "Cl" and "Br" outside brackets count as one atom each, and the
          wildcard "*" counts as an atom.
    """

    idx_to_index_dict = {}
    idx = 0
    position = 0
    length = len(SMILES_string)

    while position < length:
        character = SMILES_string[position]

        if character == "[":
            # A bracket expression describes exactly one atom. Its symbol is the first letter (or "*")
            # after any leading isotope digits.
            closing = SMILES_string.find("]", position)
            if closing == -1:
                closing = length  # unterminated bracket: read to the end of the string
            for inner in range(position + 1, closing):
                if SMILES_string[inner].isalpha() or SMILES_string[inner] == "*":
                    idx_to_index_dict[idx] = inner
                    idx += 1
                    break
            position = closing + 1

        elif character.isalpha() or character == "*":
            idx_to_index_dict[idx] = position
            idx += 1
            # "Cl" and "Br" are the only two-letter symbols allowed outside brackets
            position += 2 if SMILES_string.startswith(("Cl", "Br"), position) else 1

        else:
            position += 1

    return idx_to_index_dict



def idx_to_index_of_rings(ring_idx_list: list, idx_to_index_dict: dict, SMILES_string: str) -> list:
    """
    Convert one ring's atom idx values into positions within the SMILES string.

    Input: ring_idx_list list of one ring's idx values. idx_to_index_dict dictionary
           that maps idx values to indices within the smiles string. SMILES_string is the string
           those indices refer to
    Output: list of indices of the ring. For every ring atom, the index of a "(" directly after it and
            the index of a ")" directly before it are included as well
    Raises: KeyError if an idx value is missing from idx_to_index_dict
    """

    index_list = []
    last_index = len(SMILES_string) - 1

    for idx in ring_idx_list:
        index = idx_to_index_dict[idx]
        index_list.append(index)

        # Only look at the following character when there is one
        if index < last_index and SMILES_string[index + 1] == "(":
            index_list.append(index + 1)
        # Only look at the preceding character when there is one
        if index >= 1 and SMILES_string[index - 1] == ")":
            index_list.append(index - 1)

    return index_list



def create_dict_of_ring_indices(ring_indices_list: list) -> dict:
    """
    Record, for every string index, which rings it belongs to.

    Input: ring_indices_list created by ring_handling_rdkit
    Output: dict that maps each index to a list of ring numbers. Rings are numbered from 1 in the
            order they appear in ring_indices_list, so an index shared by several rings lists them all
    """

    ring_index_dict = {}
    for ring_num, ring_index_list in enumerate(ring_indices_list, start=1):
        for string_index in ring_index_list:
            ring_index_dict.setdefault(string_index, []).append(ring_num)
    return ring_index_dict

ModuleNotFoundError: No module named 'rdkit'

## Creation of DataTables and Data Collection




**Hazard_and_Subgroup_Column_Initialization.py**

File used to create pd.DataFrames in the format needed for proper study of the data

In [None]:
import pandas as pd
import re
import hashlib

import sub_groups_from_smiles as subgroups
import Data_Collection_from_Pubchem as pubchem_coll

import warnings

# Silence pandas' PerformanceWarning.
# The underlying performance problems are to be looked into later; for now the code works.
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
# Silence pandas' SettingWithCopyWarning as well.
warnings.simplefilter('ignore', category=pd.errors.SettingWithCopyWarning)
# FutureWarning gets raised by .applymap(); per the original note, the pandas version on Google Colab crashes when .map() is used instead
warnings.simplefilter('ignore', category=FutureWarning)



def create_dataframe_from_cids(compound_IDs: list,
                              save_to_csv: bool=True, csv_name: str= 'compound_data.csv', wait_time: float=1,
                              drop_empty_hazard_rows: bool=True) -> pd.DataFrame|bool:
    """
    Fetch data for a list of compound IDs and build the working DataFrame from it.

    Input: compound_IDs list of compound IDs to fetch. save_to_csv bool to determine whether to
           save the dataframe to csv_name or not, default is True. wait_time is passed on to
           pubchem_coll.fetch_compound_data() as the wait between requests. drop_empty_hazard_rows is a
           bool that is True by default; it drops rows whose 'Hazards' value is the empty string. This is
           due to the inability to differentiate between chemicals that have no hazards because they are
           safe and chemicals that have no hazards because there simply is not enough data.
    Output: pd.DataFrame of compound data after initialize_grouping_data() has added its columns. The csv
            (when saved) still contains the "smiles_group_hash_list" and
            "smiles_group_combination_hash_list" columns; the returned DataFrame has them removed.
            Returns False if a ValueError is raised.
    Note: compound IDs whose fetch raises KeyError are skipped.
    """

    hash_list_columns = ["smiles_group_hash_list", "smiles_group_combination_hash_list"]

    try:
        fetched_records = []
        for compound_ID in compound_IDs:
            try:
                record = pubchem_coll.fetch_compound_data(compound_ID, wait_time)
            except KeyError:
                continue
            fetched_records.append(record)

        compound_df = pd.DataFrame(fetched_records)

        # Adds the hazard columns and grouping columns in place
        initialize_grouping_data(compound_df)

        # Rows without any hazard text cannot be told apart from rows that simply lack data
        if drop_empty_hazard_rows:
            has_hazard_text = compound_df['Hazards'] != ''
            compound_df = compound_df[has_hazard_text]

        # Save before the hash-list columns are removed so the csv keeps them
        if save_to_csv:
            compound_df.to_csv(csv_name, index=False)

        return compound_df.drop(columns=hash_list_columns, errors='ignore')
    except ValueError:
        return False



def generate_grouping_hash_lists_from_df(df: pd.DataFrame,
                                         Canonical_SMILES_column_name: str="Canonical SMILES",
                                         create_columns: bool=True) -> bool:
    """
    Fill in the subgroup hash lists for every row of df, in place.

    Input: df is a pd.DataFrame including a column of Canonical SMILES strings. Canonical_SMILES_column_name is the
           name of that column, "Canonical SMILES" by default. create_columns is a bool (True by default) that
           decides whether the columns "smiles_group_hash_list", "smiles_group_combination_hash_list", and
           "could_not_collect_grouping_data" are (re)initialized first.
    Output: True once every row has been processed, False if a ValueError is raised. Rows whose hash lists
            could not be generated get "could_not_collect_grouping_data" set to 1 and are otherwise left alone.
    """

    try:
        if create_columns:
            row_count = len(df)
            # every row needs its own list object
            df["smiles_group_hash_list"] = [[] for _ in range(row_count)]
            df["smiles_group_combination_hash_list"] = [[] for _ in range(row_count)]
            df["could_not_collect_grouping_data"] = 0

        for row_label, row in df.iterrows():
            hash_lists = generate_grouping_hash_lists_from_SMILES_string(row[Canonical_SMILES_column_name])

            if hash_lists is False:
                df.at[row_label, "could_not_collect_grouping_data"] = 1
            else:
                df.at[row_label, "smiles_group_hash_list"] = hash_lists[0]
                df.at[row_label, "smiles_group_combination_hash_list"] = hash_lists[1]

        return True
    except ValueError:
        return False



def generate_grouping_hash_lists_from_SMILES_string(canonical_smiles: str) -> tuple|bool:
    """
    Hash every subgroup and every subgroup combination of one molecule.

    Input: canonical_smiles of type str
    Output: tuple containing a list of subgroup hashes at index 0, and a list of subgroup combination
            hashes at index 1. False if subgroups.smiles_to_sub_groups() gave back nothing usable
    """
    group_info = subgroups.smiles_to_sub_groups(canonical_smiles)

    # group data could not be pulled
    if not group_info:
        return False

    group_strings, group_pairs = group_info[0], group_info[1]

    group_hashes = [hash_smiles_group(group) for group in group_strings]
    pair_hashes = [hash_smiles_group_combination(pair) for pair in group_pairs]

    return group_hashes, pair_hashes



def convert_smiles_to_dataframe(canonical_smiles: str) -> pd.DataFrame:
    """
    Build a one-row table of subgroup and subgroup-combination counts for a single molecule.

    Input: canonical_smiles is a string.
    Output: A pd.DataFrame with one row referring to canonical_smiles. There is one column per distinct
            hash (subgroup hashes and subgroup combination hashes together), and each value is the
            number of times that hash occurred.
    Note: if the hash lists cannot be generated the helper returns False, and indexing it here raises
          a TypeError.
    """

    hash_lists = generate_grouping_hash_lists_from_SMILES_string(canonical_smiles)

    occurrence_counts = {}
    for hash_key in hash_lists[0] + hash_lists[1]:
        occurrence_counts[hash_key] = occurrence_counts.get(hash_key, 0) + 1

    # single-item lists give a DataFrame with exactly one row
    return pd.DataFrame({hash_key: [count] for hash_key, count in occurrence_counts.items()})



def fit_dataframes(this_smiles_df: pd.DataFrame, cleaned_main_df: pd.DataFrame) -> tuple:
    """
    Give a single-molecule DataFrame the same columns, in the same order, as the filtered main DataFrame.

    Input: this_smiles_df is a pd.DataFrame. cleaned_main_df is a pd.DataFrame
    Output: tuple containing the adjusted this_smiles_df at index 0 and the filtered main DataFrame at
            index 1. Columns of this_smiles_df that the main DataFrame lacks are dropped, columns it is
            missing are added with the value zero, and both DataFrames end with identical column order.
    Note: the column labels of both input DataFrames are converted to str in place.
    """

    # ensure all columns are strings
    this_smiles_df.columns = this_smiles_df.columns.astype(str)
    cleaned_main_df.columns = cleaned_main_df.columns.astype(str)

    # NOTE(review): per the original comment this removes main columns that are not in this_smiles_df
    # and sum to less than 2; the helper is defined elsewhere
    cleaned_filtered_main_df = filter_columns_by_sum_and_input(main_df=cleaned_main_df, second_df=this_smiles_df)

    # Keep only the columns this_smiles_df shares with cleaned_main_df
    shared_columns = this_smiles_df.columns.intersection(cleaned_main_df.columns)
    this_smiles_df = this_smiles_df[shared_columns]

    # Add every column the filtered main DataFrame has that this_smiles_df does not, filled with zero
    target_columns = cleaned_filtered_main_df.columns
    absent_columns = [column for column in target_columns if column not in this_smiles_df.columns]
    for column in absent_columns:
        this_smiles_df[column] = 0

    # Reorder the columns of this_smiles_df to match the filtered main DataFrame
    this_smiles_df = this_smiles_df[target_columns]

    return (this_smiles_df, cleaned_filtered_main_df)



def hash_smiles_group(subgroup: str,
                      rotation_matters: bool=False) -> int|bool:
    """
    Input: subgroup is string representation of a subgroup. These should have been
           created by subgroups.smiles_to_sub_groups(). rotation_matters is a bool that represents whether
           groups other than rings that maintain the same order but are shifted can map to the same group, is False by
           default
    Output: hash representing the input subgroup

    Ex rotation_matters: if False, "ABCDE" and "DEABC" will map to the same hash and thus be counted as the same
                        group. If True, they will map to different hash. Regardless of rotation_matters,
                        "0ABCDE" and "0DEABC" will map to the same hash as they both represent a ring of the same
                        components in the same order. The "0" at index 0 marks the subgroup as a ring.
    """

    # check if subgroup is empty
    if subgroup == "":
        return False

    is_ring = False
    if subgroup[0] == "0":
        is_ring = True
        subgroup = subgroup[1:]

    if not rotation_matters or is_ring:
        rotations = [subgroup[i:] + subgroup[:i] for i in range(len(subgroup))]
        pre_hashed_key = str(is_ring) + min(rotations)

    else:
        pre_hashed_key = str(is_ring) + subgroup

    hashed = consistent_hash(pre_hashed_key)

    return str(hashed)



def hash_smiles_group_combination(group_combo: tuple,
                                  rotation_matters: bool=False) -> int:
    """
    Produce the hash key used to identify a pair of adjacent subgroups.

    Input: group_combo holds two subgroups of the same molecule at index 0 and index 1. These should have
           been created by subgroups.smiles_to_sub_groups(). rotation_matters is a bool that is False by
           default and is passed on to hash_smiles_group() for both subgroups.
    Output: the result of consistent_hash() converted to a str. The same key is produced whichever
            order the two subgroups are given in.
    """

    first_hash = hash_smiles_group(group_combo[0], rotation_matters)
    second_hash = hash_smiles_group(group_combo[1], rotation_matters)

    # choosing the smaller of the two concatenations makes the key independent of argument order
    order_independent_key = min(first_hash + second_hash, second_hash + first_hash)

    return str(consistent_hash(order_independent_key))



def create_grouping_columns(df: pd.DataFrame,
                            group_hashs_list_column_name: str="smiles_group_hash_list") -> bool:
    """
    Turn each row's list of subgroup hashes into per-hash count columns, in place.

    Input: df is a pd.DataFrame containing a column that contains a list of subgroup hashes.
           group_hashs_list_column_name is the name of that column, "smiles_group_hash_list" by default
    Output: bool of True if df was successfully mutated to include and update subgroup hash columns,
            False if a ValueError was raised. After a successful call, df has one column per distinct
            hash and each cell holds how many times that hash appears in the row's list.
    """

    try:
        for index, row in df.iterrows():
            for hash_key in row[group_hashs_list_column_name]:
                # First time this hash is seen anywhere: start its column at zero for all rows
                if hash_key not in df.columns:
                    df[hash_key] = 0

                df.at[index, hash_key] += 1

        # Report success explicitly so the result matches the documented bool contract
        return True
    except ValueError:
        return False



def initialize_grouping_data(df: pd.DataFrame,
                             Canonical_SMILES_column_name: str = "Canonical SMILES") -> bool:
    """
    Add the hazard columns, hash-list columns and subgroup count columns to df, in place.

    Input: DataFrame containing a column with Canonical SMILES strings. Canonical_SMILES_column_name is the name
           of the column containing the Canonical SMILES representations.
    Output: True if df was successfully mutated to include hazard columns and subgroup columns, False if
            any of the three steps reported failure or raised a ValueError
    Note: all three steps are always attempted, even if an earlier one reports failure.
    """

    try:
        step_results = (
            split_hazard_data(df),
            generate_grouping_hash_lists_from_df(df, Canonical_SMILES_column_name),
            create_grouping_columns(df),
        )
    except ValueError:
        return False

    # The helpers catch their own ValueError and signal it by returning False, so their results
    # must be inspected. Compare with "is False" so a helper returning None is not taken as a failure.
    return not any(result is False for result in step_results)



def split_hazard_data(df: pd.DataFrame,
                      hazard_column_name: str="Hazards") -> bool:
    """
    Expand the comma-separated hazards text into one 0/1 column per hazard, in place.

    Input: pd.DataFrame containing a column that contains compound hazards. hazard_column_name is the
           name of that column, "Hazards" by default
    Output: True if df was successfully mutated to include a column for each type of hazard, False if a
            ValueError was raised
    Note: all whitespace is removed from the hazards text before it is split on commas, so a hazard
          name made of several words becomes one run-together column name. Rows whose value is not a
          str are skipped. Empty names (from an empty string or a stray comma) do not create a column.
    """

    try:
        for index, row in df.iterrows():
            hazards_string = row[hazard_column_name]
            if not isinstance(hazards_string, str):
                continue

            for hazard in re.sub(r'\s+', '', hazards_string).split(","):
                # "".split(",") gives [""]; an empty name is not a hazard
                if not hazard:
                    continue

                # First time this hazard is seen: start its column at 0 (False) for all rows
                if hazard not in df.columns:
                    df[hazard] = 0

                df.at[index, hazard] = 1  # 1 represents True for this row

        return True
    except ValueError:
        return False



def update_existing_ids_dataframe_from_cids(main_df: pd.DataFrame, new_compound_IDs: list,
                                            save_to_csv: bool=True, csv_name: str="compound_data.csv",
                                            overwrite_old_data: bool=False) -> pd.DataFrame|bool:
    """
    Fetch data for new compound IDs and merge it into an existing DataFrame.

    Input: main_df is the pd.DataFrame to be updated. new_compound_IDs is a list of compound IDs to fetch.
           save_to_csv bool to determine whether to save the merged dataframe to csv_name or not, default
           is True. overwrite_old_data is a bool that determines whether data for compound IDs already in
           main_df can be overwritten by new data. overwrite_old_data is False by default
    Output: The merged pd.DataFrame returned by update_existing_dataframe_from_dataframe(), or False
            (after printing a message) if the new data could not be created or the merge raised a
            ValueError
    """

    try:
        # the intermediate DataFrame is never written to disk; only the merged result may be
        new_df = create_dataframe_from_cids(new_compound_IDs, save_to_csv=False)

    except ValueError:
        print("Issue with new compound IDs")
        return False

    if new_df is False:
        # create_dataframe_from_cids catches its own ValueError and returns False instead of
        # raising, so the failure has to be detected here before the result is used as a DataFrame
        print("Issue with new compound IDs")
        return False

    try:
        return update_existing_dataframe_from_dataframe(main_df, new_df, save_to_csv=save_to_csv, csv_name=csv_name, overwrite_old_data=overwrite_old_data)

    except ValueError:
        print("Issue combining dataframes")
        return False



def update_existing_dataframe_from_dataframe(main_df: pd.DataFrame, second_df: pd.DataFrame,
                                             save_to_csv: bool=True, csv_name: str="compound_data.csv",
                                             overwrite_old_data: bool=False) -> pd.DataFrame|bool:
    """
    Input: main_df is a pd.DataFrame that will be mutated to include new data. second_df is a pd.DataFrame that will be
           used to update main_df. save_to_csv bool to determine whether to save dataframe to csv or not, default is
           True. overwrite_old_data is a bool that determine whether data for old compound IDs can be overwritten by
           new data. overwrite_old_data is False by default
    Output: The merged pd.DataFrame if parent_dataframe was successfully mutated to include new compound IDs, False
            otherwise
    """

    try:
        if "Compound ID" not in main_df.columns or "Compound ID" not in second_df.columns:
            raise ValueError("Both dataframes must have a 'Compound ID' column")

        if overwrite_old_data:

            matching_ids = second_df["Compound ID"].isin(main_df["Compound ID"])
            matching_rows = second_df[matching_ids]

            # this is what will be iterated over to add missing columns

            second_df = second_df[~matching_ids]
            main_df = main_df.set_index("Compound ID")
            matching_rows = matching_rows.set_index("Compound ID")
            main_df.update(matching_rows)
            main_df = main_df.reset_index()

        else:
            # filters out rows in second_df that have matching "Compound ID" with values in main_df
            second_df = second_df[~second_df["Compound ID"].isin(main_df["Compound ID"])]

        # Step 1: Add missing columns from main_df to second_df and set them to 0
        for col in main_df.columns:
            if col not in second_df.columns:
                second_df.loc[:, col] = 0

        # Step 2: Add missing columns from second_df to main_df and set them to 0
        for col in second_df.columns:
            if col not in main_df.columns:
                main_df.loc[:, col] = 0

        # Step 3: Ensure both dataframes have the same columns and order
        main_df = main_df[sorted(main_df.columns, key=str)]
        second_df = second_df[sorted(second_df.columns, key=str)]

        # Step 4: Concatenate the dataframes
        combined_df = pd.concat([main_df, second_df], ignore_index=True)

        if save_to_csv:
            combined_df.to_csv(csv_name, index=False)

        return combined_df

    except ValueError:
        print("Issue combining dataframes")
        return False



def filter_columns_by_sum_and_input(main_df: pd.DataFrame, second_df: pd.DataFrame) -> pd.DataFrame:
    """
    Input: main_df is a pd.DataFrame, second_df pd.DataFrame
    Output: main_df restricted to the columns that are present in second_df or that are numeric with a sum of
            at least 2 (i.e. more than one "1" for 0/1 indicator columns). Column order of main_df is preserved.
    """
    columns_to_keep = []
    for col in main_df.columns:
        # Membership is checked first so text columns shared with second_df are kept without ever being
        # summed; summing a string column concatenates it and comparing that to 2 raises TypeError.
        if col in second_df.columns:
            columns_to_keep.append(col)
            continue

        column = main_df[col]
        # Only numeric (including boolean) columns can meaningfully be counted
        if pd.api.types.is_numeric_dtype(column) and column.sum() >= 2:
            columns_to_keep.append(col)

    return main_df[columns_to_keep]



def add_hazard_and_hash_columns_from_csv(csv_name: str,
                                         drop_empty_hazard_rows: bool=True,
                                         save_to_csv: bool=True,
                                         ) -> pd.DataFrame|bool:
    """
    Input: csv_name is the path of the csv file to load. drop_empty_hazard_rows is a bool that determines whether
           rows without any entry in the "Hazards" column are removed, default is True. save_to_csv is a bool that
           determines whether the processed dataframe is written back to csv_name (overwriting the file that was
           read), default is True
    Output: pd.DataFrame loaded from csv_name after initialize_grouping_data has been applied to it, without the
            "smiles_group_hash_list" and "smiles_group_combination_hash_list" columns (the saved csv still
            contains them, because it is written before they are dropped). False if a ValueError occurs
    """

    try:
        df = pd.read_csv(csv_name)

        # Collect grouping data
        # NOTE(review): initialize_grouping_data is defined elsewhere in this file; it presumably adds the
        # grouping/hash columns to df in place - confirm.
        initialize_grouping_data(df)

        # Removes rows that don't have any hazards attached. Currently, there is no way to differentiate
        # between that chemicals that are safe, and chemicals that simply do not have any hazard data.
        if drop_empty_hazard_rows:
            hazards = df['Hazards']
            # pd.read_csv turns empty fields into NaN, and NaN != '' is True, so comparing against ''
            # alone would keep every row. Missing values and blank strings are both treated as empty.
            has_hazards = hazards.notna() & (hazards.astype(str).str.strip() != '')
            df = df[has_hazards]

        # Save for later use
        if save_to_csv:
            df.to_csv(csv_name, index=False)

        # The hash list columns are kept in the csv but not handed back to the caller
        columns_to_remove = ["smiles_group_hash_list", "smiles_group_combination_hash_list"]
        df = df.drop(columns=columns_to_remove, errors='ignore')

        return df

    except ValueError:
        return False



def consistent_hash(input_string):
    """
    Input: input_string is a string.
    Output: hexadecimal MD5 digest of input_string. The same string always yields the same digest
    """
    # Feed the encoded text into a fresh md5 object
    md5_digest = hashlib.md5()
    md5_digest.update(input_string.encode())

    # Hexadecimal text form of the digest
    return md5_digest.hexdigest()

ModuleNotFoundError: No module named 'sub_groups_from_smiles'

**Data_Collection_from_Pubchem.py**
File used to collect data from the PubChem PUG View API, which is then used to instantiate pd.DataFrames

In [None]:
import requests
import time



def _child_sections(node: dict, heading: str) -> list:
    """Return the entries of node['Section'] whose 'TOCHeading' equals heading (empty list if none)."""
    return [child for child in node.get('Section', []) if child.get('TOCHeading') == heading]


def _first_information_string(node: dict) -> str|None:
    """Return the first 'StringWithMarkup' string of node's first 'Information' entry, None if absent."""
    try:
        return node['Information'][0]['Value']['StringWithMarkup'][0]['String']
    except (KeyError, IndexError, TypeError):
        return None


def fetch_compound_data(compound_ID: int, wait_time: float=1.5) -> dict|None:
    """
    Input: compound_ID is an int representation of the compound ID. wait_time is a float value of the desired
    wait time between requests in seconds. wait_time is 1.5 seconds by default
    Output: dictionary with the keys "Compound ID", "IUPAC Name", "Canonical SMILES" and "Hazards" (the GHS
    pictogram names joined by ", "; empty string if none were found). "IUPAC Name" and "Canonical SMILES" are
    None when the record does not contain them. Returns None if the response is not valid JSON or does not
    contain a 'Record' entry. Errors raised by requests.get itself (e.g. timeouts) are not caught here.
    """

    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{compound_ID}/JSON/"
    response = requests.get(url, timeout=10)

    try:
        data = response.json()
    except ValueError:
        return None

    if not isinstance(data, dict) or 'Record' not in data:
        return None

    # cool down inbetween calls (only reached when a usable record was returned)
    time.sleep(wait_time)

    record = data['Record']

    iupac_name = None
    canonical_smiles = None
    hazards = []

    # Extracting IUPAC name and canonical SMILES
    for identifiers in _child_sections(record, 'Names and Identifiers'):
        for computed in _child_sections(identifiers, 'Computed Descriptors'):
            for descriptor in computed.get('Section', []):
                heading = descriptor.get('TOCHeading')
                if heading == 'IUPAC Name':
                    iupac_name = _first_information_string(descriptor)
                elif heading == 'Canonical SMILES':
                    canonical_smiles = _first_information_string(descriptor)

    # Extracting hazards
    for safety in _child_sections(record, 'Safety and Hazards'):
        for identification in _child_sections(safety, 'Hazards Identification'):
            for ghs in _child_sections(identification, 'GHS Classification'):
                for info in ghs.get('Information', []):
                    if info.get('Name') != 'Pictogram(s)':
                        continue
                    # Only the first StringWithMarkup entry is read; each of its markup items
                    # carries one pictogram name under 'Extra'
                    for markup_holder in info.get('Value', {}).get('StringWithMarkup', [])[:1]:
                        for pictogram in markup_holder.get('Markup', []):
                            if 'Extra' in pictogram:
                                hazards.append(pictogram['Extra'])

    return {
        "Compound ID": compound_ID,
        "IUPAC Name": iupac_name,
        "Canonical SMILES": canonical_smiles,
        "Hazards": ', '.join(hazards)
    }