In [None]:
# Import necessary libraries for chemical data processing, molecular representation, and data handling

# OS module for interacting with the operating system, including file and directory handling
import os                                   # Helpful for file path management and creating directories

# NumPy library for numerical arrays
import numpy as np                          # Used to build the dense 0/1 bit arrays from fingerprints

# Pandas library for data manipulation and analysis
import pandas as pd                         # Useful for handling datasets, particularly CSV or DataFrame formats

# RDKit library for cheminformatics tasks, such as molecule manipulation and fingerprint generation
from rdkit import Chem                      # Core RDKit module for molecular operations (e.g., loading structures)
from rdkit.Chem import AllChem              # Module for generating molecular fingerprints and performing cheminformatics tasks
from rdkit.Chem import Draw                 # Module for visualizing chemical structures as images


In [None]:
# Select a specific molecule from the positive dataset
# - `a` is the positional row index into `Data_Positive` of the molecule to analyze
a = 123  # Example index; change to the desired row

# Retrieve the SMILES string of the selected molecule
# - `iloc[a, 0]` reads the first column (column 0) of row `a`
smiles = Data_Positive.iloc[a, 0]

# Convert SMILES to an RDKit molecule object
# - MolFromSmiles returns None (it does not raise) when the SMILES cannot be parsed,
#   so fail here with a clear message rather than inside the fingerprint call
mol = Chem.MolFromSmiles(smiles)
if mol is None:
    raise ValueError(f"Could not parse SMILES at row {a}: {smiles!r}")

# Dictionary that RDKit fills while fingerprinting:
# bit index -> tuple of (central atom index, radius) pairs that set that bit
bitinfo = {}

# Generate the Morgan fingerprint for the molecule
# - radius=2 gives the diameter-4 variant
# - useFeatures=True switches to feature-based atom invariants, i.e. this is the
#   FCFP4-like fingerprint, not plain ECFP4
# - nBits=1024 defines the length of the folded bit vector
# - bitInfo receives the atom environments responsible for each bit
fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024, bitInfo=bitinfo, useFeatures=True)
ecfp = fp  # Alias kept because the following code refers to the fingerprint as `ecfp`

# Define a function to convert the ECFP fingerprint to a DataFrame
# - The function iterates over bits that are "on" (i.e., set to 1) in the fingerprint vector and sets them to 1 in the array
# - The resulting DataFrame has a column for each bit (0 to 1023)
def ecfp_to_dataframe(ecfp, n_bits=1024):
    """Convert a fingerprint bit vector into a one-row DataFrame of 0/1 values.

    Args:
        ecfp: Any object exposing ``GetOnBits()``, which yields the indices
            of the bits that are set (e.g. an RDKit bit vector).
        n_bits: Length of the fingerprint, i.e. the number of columns to
            create. Defaults to 1024.

    Returns:
        A DataFrame with a single row and columns ``Bit_0`` .. ``Bit_{n_bits-1}``
        holding 1.0 where the bit is set and 0.0 elsewhere.

    Raises:
        IndexError: If an on-bit index is outside ``range(n_bits)``.
    """
    arr = np.zeros((1, n_bits))
    # Explicit integer dtype so an empty on-bit list is still a valid index array.
    on_bits = np.array(list(ecfp.GetOnBits()), dtype=np.intp)
    arr[0, on_bits] = 1
    return pd.DataFrame(arr, columns=[f"Bit_{i}" for i in range(n_bits)])

# Convert the fingerprint to a DataFrame and print it
df = ecfp_to_dataframe(ecfp)
print(df)

# Indices of the bits that are "on" (equal to 1) in the DataFrame row;
# `ones[0]` is the array of bit indices, handy for picking a bit to draw
ones = np.where(df.iloc[0, :] == 1)

# Bit to visualize. It is defined once so the drawing and the output filename
# cannot drift apart (the file used to be labelled 331 while bit 4 was drawn).
bit_id = 4

# DrawMorganBit looks the bit up in `bitinfo`, so it only works for bits that
# are actually set in this molecule; give a readable error otherwise.
if bit_id not in bitinfo:
    raise KeyError(f"Bit {bit_id} is not set in the fingerprint of molecule {a}")

# Draw the atom environment that sets `bit_id`, highlighting the contributing atoms
img = Draw.DrawMorganBit(mol, bitId=bit_id, bitInfo=bitinfo)

# Save the image; the filename carries both the bit id and the molecule index `a`
img.save(f"morgan_bit_{bit_id}_{a}.png")

# Display the already-rendered image as the cell output (no second render needed)
img


In [None]:
# Define the bits of interest for tracking their presence in each molecule
bits = [122, 331]

# For every bit of interest, the row indices of the molecules that have it "on"
bit_members = {x: [] for x in bits}

# Walk the dataset once; each molecule is fingerprinted a single time and then
# checked against every bit of interest.
# - len(Data_Positive) is used so the loop follows the actual dataset size
#   (the count was previously hard-coded as 2130)
for i in range(len(Data_Positive)):
    smiles = Data_Positive.iloc[i, 0]  # SMILES string of the i-th molecule (column 0)

    # MolFromSmiles returns None for an unparsable SMILES; report and skip it
    # so one bad row does not abort the whole scan
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print(f"Skipping row {i}: could not parse SMILES {smiles!r}")
        continue

    # Generate the Morgan fingerprint for the molecule
    # - radius=3 gives the diameter-6 variant
    # - useFeatures=True makes it feature-based (FCFP6-like, not plain ECFP6)
    # - nBits=1024 defines the fingerprint length
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=1024, useFeatures=True)

    # Query the bit vector directly instead of converting it to a DataFrame first
    for x in bits:
        if fp.GetBit(x):
            bit_members[x].append(i)

# Publish each list under its original global name (Bit_122, Bit_331, ...),
# because later cells refer to these names
for x, members in bit_members.items():
    globals()[f'Bit_{x}'] = members


In [None]:
# Select a specific molecule from the positive dataset
a = 123  # Molecule index to analyze (replace as needed)

# Retrieve the SMILES string for the selected molecule and build its output directory
smiles = Data_Positive.iloc[a, 0]  # SMILES string for the molecule at row `a` (column 0)
directory = f"Molecule_{a}"  # Directory named after the molecule index
parent_dir = r"img"  # Parent directory that holds all molecule image folders
path = os.path.join(parent_dir, directory)  # Full path to the directory

# Create the directory (and any missing parents); no error if it already exists
os.makedirs(path, exist_ok=True)

# The molecule and its fingerprint do not depend on the bit being drawn,
# so build them once, before the loop.
mol = Chem.MolFromSmiles(smiles)
if mol is None:
    # MolFromSmiles returns None when the SMILES cannot be parsed
    raise ValueError(f"Could not parse SMILES at row {a}: {smiles!r}")

# Generate the Morgan fingerprint for the molecule
# - radius=3 gives the diameter-6 variant
# - useFeatures=True makes it feature-based (FCFP6-like, not plain ECFP6)
# - nBits=1024 defines the fingerprint length
# - bitInfo is filled with bit index -> ((atom index, radius), ...) for every set bit
bitinfo = {}
fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=1024, bitInfo=bitinfo, useFeatures=True)

# NOTE(review): in the cell that builds the Bit_<x> lists, `Bit_122` is filled
# with molecule row indices, yet here its entries are used as bit ids.
# Confirm which list of bit ids is intended here.
skipped = []
for bit in Bit_122:
    # DrawMorganBit looks the id up in `bitinfo` and raises KeyError for ids
    # that are not set in this molecule, so those are skipped
    if bit not in bitinfo:
        skipped.append(bit)
        continue

    # Draw the atom environment in `mol` that sets this bit
    img = Draw.DrawMorganBit(mol, bit, bitinfo)

    # Save as 'morgan_bit_{bit}_{a}.png' inside the molecule's folder
    img.save(f"{path}/morgan_bit_{bit}_{a}.png")

if skipped:
    print(f"Molecule {a}: skipped {len(skipped)} id(s) not set in its fingerprint: {skipped}")
