In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem, Draw, rdchem, Descriptors, PandasTools, DataStructs, rdMolDescriptors, Lipinski
from rdkit.Chem.FilterCatalog import FilterCatalog, FilterCatalogParams
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.ML.Cluster import Butina
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker

from scipy.spatial.distance import pdist, squareform

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import matplotlib
matplotlib.rcParams['font.family'] = 'Arial'
matplotlib.rcParams['font.size'] = 10

In [None]:
smiles = ["c1ccccc1", "c1cnccc1", "Cc1ccncc1", "C1CCCCC1", "c1[nH]ccc1"]


In [4]:
smiles = ["c1ccccc1", "c1cnccc1", "Cc1ccncc1", "C1CCCCC1", "c1[nH]ccc1"]
molecules = []
for smiles in smiles:
    mol = Chem.MolFromSmiles(smiles)
    molecules.append(mol)

# Create a molecule object from the SMARTS pattern
substructure = Chem.MolFromSmarts('[c]')

# Find substructure matches in the target molecule
for mol in molecules:
    subs = mol.HasSubstructMatch(substructure)
    if subs:
        print()
#     submatch = mol.GetSubstructMatches(substructure)
#     print(submatch)

True
True
True
False
True


In [27]:
total_lines = 795384
one_percent_lines = total_lines / 100
processed_lines = (total_lines // 100)

print(processed_lines % one_percent_lines)

if processed_lines % one_percent_lines == 0:
    print(f"Processed {processed_lines / total_lines * 100:.0f}% of compounds.")

7953
7953.0


In [None]:
fiveMR = Chem.MolFromSmarts("c[NH]")

smiles = 'O=C(C1=CC=CC=C1)[C@@]([C@](C[N+]([O-])=O)([H])C2=CC=CC=C2)([H])C(N3CCOCC3)=O'
mol = Chem.MolFromSmiles(smiles)

            return any(mol.HasSubstructMatch(fg) for fg in functional_groups.values())
        except Exception as e:
            print(f"Error processing molecule {smiles}: {e}")
            return False


In [None]:
from rdkit import Chem
import pandas as pd

def filter_by_FGs(df, column_name='Smiles'):
    """
    Filters a DataFrame of molecules based on the presence of specific functional groups.

    Parameters:
    - df (pd.DataFrame): The DataFrame to filter, containing a column with SMILES strings.
    - column_name (str): The name of the column containing SMILES strings.

    Returns:
    - pd.DataFrame: The filtered DataFrame without the specified functional groups.
    - pd.DataFrame: DataFrame of molecules that were removed based on the filter.
    """
    # Ensure the specified column exists in the DataFrame
    if column_name not in df.columns:
        raise ValueError(f"DataFrame must have a '{column_name}' column")

    print(f"Initial molecule count: {len(df)}")

    # Define SMARTS patterns for various functional groups
    functional_groups = {
        'anilines': "c[NH]",
        'michael_acceptor': "C=C(C=O)",
        'sulfonyl_halide': "S(=O)(=O)[F,Cl,Br,I]",
        'nitro_group': "[N+](=O)[O-]",
        'aldehyde': "[HC]=O",
        'primary_alkyl_halide': "[Cl,Br,I][CH2]",
        'epoxide_or_aziridine': "C1[O,N]C1",
        'sulfonate_ester': "[#6]S(=O)(=O)O[#6]",
        'phosphonate_ester': "[#6]OP(=O)(O[#6])O[#6]",
        'long_aliphatic_chain': "[CH2][CH2][CH2][CH2][CH2][CH2][CH2][CH2]",
        'peroxide': "OO",
        'dicarbonyl': "C(=O)C(=O)",
        'acid_halide': "C(=O)[Cl,Br,I]"
    }

    # Convert SMARTS strings to RDKit Mol objects for substructure matching
    functional_groups = {key: Chem.MolFromSmarts(value) for key, value in functional_groups.items()}

    def contains_functional_group(smiles):
        """Checks if a molecule contains any of the specified functional groups."""
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                return False
            return any(mol.HasSubstructMatch(fg) for fg in functional_groups.values())
        except Exception as e:
            print(f"Error processing molecule {smiles}: {e}")
            return False

    # Apply the contains_functional_group function to each SMILES string in the DataFrame
    df['ContainsFG'] = df[column_name].apply(contains_functional_group)
    
    # Split the DataFrame into two: one with molecules not containing the specified functional groups and one with removed molecules
    filtered_df = df[~df['ContainsFG']].copy()
    removed_df = df[df['ContainsFG']].copy()
    
    print(f"After filtering functional groups: {len(filtered_df)}")
    
    # Drop the temporary 'ContainsFG' column from both DataFrames before returning
    filtered_df = filtered_df.drop(columns=['ContainsFG'])
    removed_df = removed_df.drop(columns=['ContainsFG'])
    
    print(f"Remove Unwanted Functional Groups Done!\n")

    # Return the filtered DataFrame and the DataFrame with removed molecules
    return filtered_df, removed_df

# Example usage:
# filtered_df, removed_df = filter_by_FGs(df, 'Smiles')
# display(filtered_df)
# display(removed_df)