In [None]:
import pandas as pd
import pyarrow.parquet as pq
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
import numpy as np
from Bio.PDB import PDBParser
from Bio.PDB import PDBParser, NeighborSearch, PDBIO
from Bio.PDB.Polypeptide import is_aa
from Bio.PDB.Atom import Atom
from Bio.PDB.Residue import Residue
from Bio.PDB.Descriptor import Descriptor
from Bio.PDB.Descriptor import TPSA, MolecularWeight, MolecularVolume, MolecularSurfaceArea
from Bio.PDB.Descriptor import HydrogenBondDonors, HydrogenBondAcceptors
from Bio.PDB.Descriptor import AromaticRings, RotatableBonds
from Bio.PDB.Descriptor import Globularity, Asphericity
from Bio.PDB.Descriptor import DipoleMoment, HOMO_LUMO_Gap, Polarizability
from Bio.PDB.Descriptor import Fingerprints
from Bio.PDB.Descriptor import PharmacophoreFeatures
from Bio.PDB.Descriptor import ResidueInteractions

In [None]:
# Helper: fold one chunk of rows into the running sets of unique SMILES strings
def process_chunk(chunk, unique_building_blocks, unique_molecules):
    """Merge the distinct SMILES strings of one chunk into the running sets.

    Args:
        chunk: table indexable by column name whose columns provide
            ``.unique()``; must contain the three ``buildingblock*_smiles``
            columns and ``molecule_smiles``.
        unique_building_blocks: set updated in place with every distinct value
            from the three building-block columns.
        unique_molecules: set updated in place with every distinct value from
            the ``molecule_smiles`` column.

    Returns:
        None. Both sets are mutated in place.
    """
    building_block_columns = (
        'buildingblock1_smiles',
        'buildingblock2_smiles',
        'buildingblock3_smiles',
    )
    # All three building-block columns feed the same set.
    for column_name in building_block_columns:
        unique_building_blocks.update(chunk[column_name].unique())

    # Full molecules are tracked separately from their building blocks.
    unique_molecules.update(chunk['molecule_smiles'].unique())

In [None]:
# Open the training parquet file as a pq.ParquetFile handle; a later cell reads
# it one row group at a time through this handle.
file_path = './train.parquet'
# NOTE(review): batch_size is not referenced by any code visible in this file
# (the loop below reads whole row groups) -- confirm whether it is still needed.
batch_size = 100000
parquet_file = pq.ParquetFile(file_path)

In [None]:
# Two independent, initially empty sets: one collects distinct building-block
# SMILES, the other distinct full-molecule SMILES.
unique_building_blocks, unique_molecules = set(), set()

In [None]:
# Walk the parquet file one row group at a time so that only a single row
# group is converted to pandas per iteration.
num_row_groups = parquet_file.num_row_groups

for i in range(num_row_groups):
    # Load row group i and convert it to a pandas DataFrame
    row_group = parquet_file.read_row_group(i).to_pandas()

    # Print a preview once, for the first row group only
    if i == 0:
        print("First few rows from the first row group:")
        print(row_group.head())
    
    # Fold this row group's SMILES strings into the running sets
    process_chunk(row_group, unique_building_blocks, unique_molecules)

# Report the totals accumulated over all row groups
print(f"Total number of unique building blocks: {len(unique_building_blocks)}")
print(f"Total number of unique molecules: {len(unique_molecules)}")


### Descriptors, Fingerprints, and Pharmacophore Features

Descriptors, fingerprints, and pharmacophore features capture different aspects of molecular structure and interactions, and can help predict whether a molecule binds to a protein. The following properties are commonly used for this purpose:

#### Descriptors:
- Molecular weight
- LogP (partition coefficient)
- Topological polar surface area (TPSA)
- Number of hydrogen bond donors
- Number of hydrogen bond acceptors
- Number of rotatable bonds
- Number of aromatic rings
- Molecular volume
- Molecular surface area
- Molecular shape descriptors (globularity, asphericity, etc.)
- Electrostatic descriptors (dipole moment, charge distribution, etc.)
- Quantum chemical descriptors (HOMO-LUMO gap, polarizability, etc.)

#### Fingerprints:
- Structural fingerprints (presence/absence of specific substructures or fragments)
- Morgan/Circular fingerprints (capturing circular neighborhoods around atoms)
- Path-based fingerprints (encoding linear paths of atoms and bonds)
- Pharmacophore fingerprints (encoding spatial arrangement of pharmacophoric features)
- Interaction fingerprints (encoding protein-ligand interactions like hydrogen bonds, hydrophobic contacts, etc.)

#### Pharmacophore Features:
- Hydrogen bond donors
- Hydrogen bond acceptors
- Hydrophobic regions
- Aromatic rings
- Positively charged groups
- Negatively charged groups
- Excluded volumes (representing steric constraints of the binding pocket)

In [None]:
def smiles_to_features(smiles):
    """Convert a SMILES string into a flat numeric feature vector.

    The vector is the concatenation, in this order, of:

    1. nine RDKit 2D descriptors (molecular weight, LogP, TPSA, H-bond donor
       and acceptor counts, rotatable bonds, aromatic rings, molar
       refractivity, Labute approximate surface area);
    2. three bit fingerprints expanded to 0/1 arrays (Morgan with radius 2 and
       2048 bits, RDKit path-based, Gobbi 2D pharmacophore);
    3. seven pharmacophore feature-family counts.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        1-D numpy array holding all features.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``.
    """
    # Imported locally because the file's import cell does not provide them.
    from collections import Counter
    from rdkit.Chem.Pharm2D import Generate, Gobbi_Pharm2D

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Invalid SMILES string: {smiles}")

    # Molecular descriptors.
    # Globularity, asphericity, dipole moment, HOMO-LUMO gap and polarizability
    # were removed: rdkit.Chem.Descriptors has no such functions, and they
    # need 3D coordinates or a quantum-chemistry calculation that a SMILES
    # string alone does not provide.
    descriptors = {
        'MolecularWeight': Descriptors.MolWt(mol),
        'LogP': Descriptors.MolLogP(mol),
        'TPSA': Descriptors.TPSA(mol),
        'NumHDonors': Descriptors.NumHDonors(mol),
        'NumHAcceptors': Descriptors.NumHAcceptors(mol),
        'NumRotatableBonds': Descriptors.NumRotatableBonds(mol),
        'NumAromaticRings': Descriptors.NumAromaticRings(mol),
        # Molar refractivity, used as a 2D stand-in for molecular volume
        # (a real volume needs a 3D conformer).
        'MolecularVolume': Descriptors.MolMR(mol),
        # Labute's approximate surface area; SMR_VSA1 is only one bin of a
        # binned descriptor, not a whole-molecule surface area.
        'MolecularSurfaceArea': Descriptors.LabuteASA(mol),
    }
    descriptor_values = np.array(list(descriptors.values()))

    # Molecular fingerprints, each expanded to a dense 0/1 array.
    morgan_fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    path_fp = Chem.RDKFingerprint(mol)
    pharmacophore_fp = Generate.Gen2DFingerprint(mol, Gobbi_Pharm2D.factory)
    fingerprint_values = np.concatenate([
        _bitvect_to_array(morgan_fp),
        _bitvect_to_array(path_fp),
        _bitvect_to_array(pharmacophore_fp),
    ])

    # Pharmacophore features: count how often each feature family occurs.
    features = _base_feature_factory().GetFeaturesForMol(mol)
    family_counts = Counter(feat.GetFamily() for feat in features)
    pharmacophore_values = np.array(
        [family_counts[family] for family in _PHARMACOPHORE_FAMILIES]
    )

    # Combine descriptors, fingerprints and pharmacophore counts.
    return np.concatenate([descriptor_values, fingerprint_values, pharmacophore_values])


# Feature families counted by smiles_to_features, in output order.
# NOTE(review): 'ExcludedVolume' is kept so the pharmacophore part stays seven
# slots wide; it is presumably always 0 with BaseFeatures.fdef -- confirm.
_PHARMACOPHORE_FAMILIES = (
    'Donor',
    'Acceptor',
    'Hydrophobe',
    'Aromatic',
    'PosIonizable',
    'NegIonizable',
    'ExcludedVolume',
)

# Lazily built feature factory, shared by every smiles_to_features call.
_BASE_FEATURE_FACTORY = None


def _base_feature_factory():
    """Return the BaseFeatures.fdef feature factory, building it on first use."""
    global _BASE_FEATURE_FACTORY
    if _BASE_FEATURE_FACTORY is None:
        import os
        from rdkit import RDConfig
        from rdkit.Chem import ChemicalFeatures

        fdef_path = os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')
        _BASE_FEATURE_FACTORY = ChemicalFeatures.BuildFeatureFactory(fdef_path)
    return _BASE_FEATURE_FACTORY


def _bitvect_to_array(fingerprint):
    """Expand an RDKit bit vector (explicit or sparse) into a dense 0/1 array."""
    bits = np.zeros(fingerprint.GetNumBits(), dtype=np.uint8)
    on_bits = np.array(list(fingerprint.GetOnBits()), dtype=np.intp)
    bits[on_bits] = 1
    return bits

## Features Extracted Using Bio.PDB in Biopython

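The `Bio.PDB` module in Biopython provides tools for parsing and navigating PDB structure data (models, chains, residues, atoms, B-factors, and neighbour searches), which is the starting point for understanding how molecules bind to proteins. The list below covers the feature categories that are relevant to molecular binding. Note that the ligand-level descriptors, fingerprints, and pharmacophore features are not computed by `Bio.PDB` itself and need a cheminformatics toolkit such as RDKit: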

### Molecular Descriptors:
- Molecular Weight
- Topological Polar Surface Area (TPSA)
- Number of Hydrogen Bond Donors/Acceptors
- Number of Rotatable Bonds
- Number of Aromatic Rings
- Molecular Volume
- Molecular Surface Area
- Molecular Shape Descriptors (Globularity, Asphericity)
- Electrostatic Descriptors (Dipole Moment, Charge Distribution)
- Quantum Chemical Descriptors (HOMO-LUMO Gap, Polarizability)

These descriptors can help identify physicochemical properties that influence binding affinity, such as hydrophobicity, polarity, and electrostatic interactions.

### Molecular Fingerprints:
- Structural Fingerprints (Presence/Absence of Substructures)
- Morgan/Circular Fingerprints (Capturing Circular Neighborhoods)
- Path-based Fingerprints (Encoding Linear Paths)
- Pharmacophore Fingerprints (Spatial Arrangement of Pharmacophoric Features)

Fingerprints encode structural information and can be used for similarity searches, clustering, and as input features for machine learning models to predict binding affinity.

### Pharmacophore Features:
- Hydrogen Bond Donors/Acceptors
- Hydrophobic Regions
- Aromatic Rings
- Positively/Negatively Charged Groups
- Excluded Volumes (Representing Steric Constraints)

Pharmacophore features describe the spatial arrangement of features necessary for molecular recognition by the protein, which is crucial for understanding binding interactions.

### Protein-Ligand Interactions:
- Hydrogen Bonds
- Hydrophobic Contacts
- Salt Bridges
- Pi-Stacking Interactions
- Covalent Bonds

Analyzing the specific interactions between the ligand and the protein residues can provide insights into the binding mechanism and affinity.

### Binding Site Residues:
- Identification of Residues in the Binding Pocket
- Residue Properties (Charge, Hydrophobicity, Aromaticity)
- Residue Conservation
- Flexibility/B-factors

Understanding the properties of the binding site residues and their conservation can help explain the specificity and selectivity of the binding interaction.

### Structural Alignment and Comparison:
- Superposition of Ligand-Bound and Apo Structures
- Identification of Conformational Changes upon Binding
- Comparison of Binding Modes across Different Complexes

Structural alignments and comparisons can reveal conformational changes induced by ligand binding and provide insights into the binding mechanism.


In [None]:
def pdb_to_features(pdb_file, bfactor_threshold=40):
    """Collect per-residue features from a PDB file.

    Every residue for which ``is_aa`` is false is passed to the descriptor,
    fingerprint, pharmacophore and interaction callables. Every amino-acid
    residue with at least one atom whose B-factor exceeds
    ``bfactor_threshold`` is recorded under ``'BindingSiteResidues'``.

    NOTE(review): the callables used here (MolecularWeight, TPSA, ...,
    Fingerprints, PharmacophoreFeatures, ResidueInteractions) are imported
    from ``Bio.PDB.Descriptor`` at the top of the file. That module does not
    appear in Biopython's documented Bio.PDB API -- confirm where they are
    meant to come from. Their return types are not visible here; this code
    assumes descriptor results can be packed with ``np.array`` and that the
    other three return array-likes accepted by ``np.concatenate``.

    NOTE(review): ``is_aa`` is presumably also false for water molecules, in
    which case waters are processed like ligands -- verify that is intended.

    NOTE(review): a high B-factor indicates atomic mobility, not proximity to
    a ligand, so the 'BindingSiteResidues' label is a heuristic -- confirm.

    Args:
        pdb_file: path or handle accepted by ``PDBParser.get_structure``.
        bfactor_threshold: an amino-acid residue is kept when any of its
            atoms has a B-factor strictly greater than this value.
            Defaults to 40, the value that used to be hard-coded.

    Returns:
        dict with the keys ``'Descriptors'`` (dict of name -> numpy array, one
        entry per non-amino-acid residue), ``'Fingerprints'``,
        ``'PharmacophoreFeatures'``, ``'Interactions'`` (concatenated arrays,
        empty when there is no such residue), ``'BindingSiteResidues'`` (list
        of residues) and ``'NodeFeatures'`` (all of the above arrays plus the
        binding-site residue count, concatenated into one array).
    """
    parser = PDBParser()
    structure = parser.get_structure("protein", pdb_file)

    # One list per descriptor, keyed by the name used in the returned dict.
    descriptor_columns = {key: [] for key in _LIGAND_DESCRIPTOR_KEYS}
    fingerprints = []
    pharmacophore_features = []
    interactions = []
    binding_site_residues = []

    for model in structure:
        for chain in model:
            for residue in chain:
                if not is_aa(residue):
                    values = _ligand_descriptor_values(residue)
                    for key, value in zip(_LIGAND_DESCRIPTOR_KEYS, values):
                        descriptor_columns[key].append(value)

                    # Collect fingerprints, pharmacophore features, and interactions
                    fingerprints.append(Fingerprints(residue))
                    pharmacophore_features.append(PharmacophoreFeatures(residue))
                    interactions.append(ResidueInteractions(residue))
                elif any(
                    atom.get_bfactor() > bfactor_threshold
                    for atom in residue.get_atoms()
                ):
                    # One qualifying atom is enough to keep the residue.
                    binding_site_residues.append(residue)

    # Convert the collected lists to numpy arrays
    descriptors = {
        key: np.array(column) for key, column in descriptor_columns.items()
    }
    fingerprints = _concatenate_or_empty(fingerprints)
    pharmacophore_features = _concatenate_or_empty(pharmacophore_features)
    interactions = _concatenate_or_empty(interactions)

    # The number of retained amino-acid residues is itself a feature
    binding_site_values = np.array([len(binding_site_residues)])

    # Structural alignment and comparison are not implemented: they would
    # require additional input structures.

    # Flatten everything into one array; the descriptor arrays come first, in
    # the order of _LIGAND_DESCRIPTOR_KEYS.
    all_features = np.concatenate([
        *descriptors.values(),
        fingerprints,
        pharmacophore_features,
        interactions,
        binding_site_values,
    ])

    return {
        'Descriptors': descriptors,
        'Fingerprints': fingerprints,
        'PharmacophoreFeatures': pharmacophore_features,
        'Interactions': interactions,
        'BindingSiteResidues': binding_site_residues,
        'NodeFeatures': all_features  # This will be used as features for the node in the graph
    }


# Keys of the 'Descriptors' dict, in the order their arrays enter 'NodeFeatures'.
_LIGAND_DESCRIPTOR_KEYS = (
    'MolecularWeight',
    'TPSA',
    'NumHydrogenBondDonors',
    'NumHydrogenBondAcceptors',
    'NumRotatableBonds',
    'NumAromaticRings',
    'MolecularVolume',
    'MolecularSurfaceArea',
    'Globularity',
    'Asphericity',
    'DipoleMoment',
    'HOMO_LUMO_Gap',
    'Polarizability',
)


def _ligand_descriptor_values(residue):
    """Apply every descriptor callable to ``residue``, ordered like _LIGAND_DESCRIPTOR_KEYS."""
    # The callables are looked up only when this runs, i.e. only when a
    # non-amino-acid residue is actually encountered.
    return (
        MolecularWeight(residue),
        TPSA(residue),
        HydrogenBondDonors(residue),
        HydrogenBondAcceptors(residue),
        RotatableBonds(residue),
        AromaticRings(residue),
        MolecularVolume(residue),
        MolecularSurfaceArea(residue),
        Globularity(residue),
        Asphericity(residue),
        DipoleMoment(residue),
        HOMO_LUMO_Gap(residue),
        Polarizability(residue),
    )


def _concatenate_or_empty(arrays):
    """Concatenate ``arrays`` along axis 0, or return an empty array if there are none."""
    return np.concatenate(arrays, axis=0) if arrays else np.array([])
