In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from rdkit import DataStructs

In [None]:
def compute_morgan_fingerprints(smiles, radius=2, n_bits=2048):
    """
    Compute Morgan bit-vector fingerprints from SMILES codes.

    Parameters:
    - smiles: a single SMILES string, or an iterable of SMILES strings
    - radius: Radius of the Morgan fingerprint (default=2, equivalent to ECFP4;
      radius=3 corresponds to ECFP6)
    - n_bits: Number of bits in the fingerprint vector (default=2048)

    Returns:
    - If `smiles` is a single string: one numpy array built from the bit
      vector, or None if the SMILES could not be parsed.
    - Otherwise: a list with one entry per input SMILES, in input order, where
      unparsable entries are None. The list form is returned regardless of
      length, so an empty input gives [] and a one-element input gives a
      one-element list.

    Side effects:
    - Prints a warning line for every SMILES that RDKit cannot parse.
    """
    # Decide the return shape from the *input type*, not from the number of
    # results: keying on len() made [] raise IndexError and made a one-element
    # list come back as a bare array.
    single_input = isinstance(smiles, str)
    smiles_batch = [smiles] if single_input else smiles

    fingerprints = []

    for smi in smiles_batch:
        # Convert SMILES to molecule object; RDKit returns None on failure.
        mol = Chem.MolFromSmiles(smi)

        if mol is None:
            print(f"Warning: Could not parse SMILES: {smi}")
            # Keep a placeholder so results stay aligned with the inputs.
            fingerprints.append(None)
            continue

        # Hashed Morgan fingerprint folded to n_bits positions.
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)

        # Convert the RDKit bit vector to a numpy array.
        fingerprints.append(np.array(fp))

    return fingerprints[0] if single_input else fingerprints

In [None]:
# Radius: use radius=2 (ECFP4) as the standard choice; it captures both local and global structural features, including functional groups, rings, and branching patterns (source: ResearchGate). Use radius=3 (ECFP6) to capture larger substructures when needed.

In [None]:
smiles_list = [
    "CCO",       # ethanol
    "CC(=O)O",   # acetic acid
    "c1ccccc1",  # benzene
    # NOTE: previously labelled "Ibuprofen", but this SMILES has no carboxylic
    # acid group -- it encodes 1-isobutyl-4-isopropylbenzene. Ibuprofen itself
    # would be CC(C)CC1=CC=C(C=C1)C(C)C(=O)O.
    "CC(C)CC1=CC=C(C=C1)C(C)C",
]

# Example 1: basic usage -- fingerprint the whole list in one call, then
# report the vector shape and the number of "on" bits for each molecule.
print("Example 1: Computing Morgan fingerprints")
print("=" * 50)
for smi, fp in zip(smiles_list, compute_morgan_fingerprints(smiles_list)):
    # The trailing empty string yields the blank separator line.
    print(
        f"SMILES: {smi}",
        f"Fingerprint shape: {fp.shape}",
        f"Number of bits set: {np.sum(fp)}",
        "",
        sep="\n",
    )


# Example 2: how the radius parameter changes the number of set bits
print("\nExample 2: Different radius values")
print("=" * 50)
# NOTE: previously labelled "Ibuprofen"; the SMILES lacks the carboxylic acid
# group and actually encodes 1-isobutyl-4-isopropylbenzene.
test_smiles = "CC(C)CC1=CC=C(C=C1)C(C)C"
for radius in range(1, 4):
    fp = compute_morgan_fingerprints(test_smiles, radius=radius)
    # The ECFP suffix is the diameter, i.e. twice the Morgan radius.
    print(f"Radius {radius} (ECFP{radius * 2}): {np.sum(fp)} bits set")


# Example 3: Getting count-based fingerprints (not just binary)
print("\n\nExample 3: Count-based fingerprints")
print("=" * 50)
# One size shared by the fingerprint and the dense array, so the two cannot
# drift apart when the value is edited.
count_fp_n_bits = 2048
mol = Chem.MolFromSmiles("CCO")
fp_count = AllChem.GetHashedMorganFingerprint(mol, radius=2, nBits=count_fp_n_bits)
# Densify the sparse count vector: start from zeros and copy over only the
# positions RDKit reports as non-zero, instead of probing every index.
fp_count_array = np.zeros(count_fp_n_bits, dtype=int)
for bit_index, feature_count in fp_count.GetNonzeroElements().items():
    fp_count_array[bit_index] = feature_count
# Positions with a count > 0 are the bits a binary fingerprint would set; the
# plain sum is the total count stored across all positions.
print(f"Binary fingerprint bits set: {np.sum(fp_count_array > 0)}")
print(f"Total feature count: {np.sum(fp_count_array)}")


# Example 4: Tanimoto similarity of ethanol against propanol and benzene
print("\n\nExample 4: Tanimoto similarity")
print("=" * 50)

# Parse the three molecules in order: ethanol, propanol, benzene.
mol1, mol2, mol3 = (
    Chem.MolFromSmiles(code) for code in ("CCO", "CCCO", "c1ccccc1")
)

# Radius-2 Morgan bit vectors at the RDKit default length.
fp1, fp2, fp3 = (
    AllChem.GetMorganFingerprintAsBitVect(molecule, 2)
    for molecule in (mol1, mol2, mol3)
)

# Compare ethanol (fp1) with each of the other two fingerprints.
similarity_1_2, similarity_1_3 = (
    DataStructs.TanimotoSimilarity(fp1, other) for other in (fp2, fp3)
)

print(f"Similarity (Ethanol vs Propanol): {similarity_1_2:.3f}")
print(f"Similarity (Ethanol vs Benzene): {similarity_1_3:.3f}")

Example 1: Computing Morgan fingerprints
SMILES: CCO
Fingerprint shape: (2048,)
Number of bits set: 6

SMILES: CC(=O)O
Fingerprint shape: (2048,)
Number of bits set: 7

SMILES: c1ccccc1
Fingerprint shape: (2048,)
Number of bits set: 3

SMILES: CC(C)CC1=CC=C(C=C1)C(C)C
Fingerprint shape: (2048,)
Number of bits set: 19


Example 2: Different radius values
Radius 1 (ECFP2): 12 bits set
Radius 2 (ECFP4): 19 bits set
Radius 3 (ECFP6): 24 bits set


Example 3: Count-based fingerprints
Binary fingerprint bits set: 6
Total feature count: 6


Example 4: Tanimoto similarity
Similarity (Ethanol vs Propanol): 0.556
Similarity (Ethanol vs Benzene): 0.000


