In [1]:
import pandas as pd
import pyarrow.parquet as pq
import numpy as np

# Helper that collects the unique SMILES strings found in one chunk of the file
def process_chunk(chunk, unique_building_blocks, unique_molecules):
    """Fold the distinct SMILES strings of one chunk into the running sets.

    Both sets are updated in place and nothing is returned.

    Args:
        chunk: Table-like object indexable by column name whose columns
            expose ``.unique()``; it must provide the three
            ``buildingblock*_smiles`` columns and ``molecule_smiles``.
        unique_building_blocks: Set that receives every distinct value seen
            in any of the three building-block columns.
        unique_molecules: Set that receives every distinct value of the
            ``molecule_smiles`` column.
    """
    building_block_columns = (
        'buildingblock1_smiles',
        'buildingblock2_smiles',
        'buildingblock3_smiles',
    )

    # All three building-block positions share a single pool of unique values
    for column in building_block_columns:
        distinct_blocks = chunk[column].unique()
        unique_building_blocks.update(distinct_blocks)

    # Full molecules are tracked separately from their building blocks
    distinct_molecules = chunk['molecule_smiles'].unique()
    unique_molecules.update(distinct_molecules)

# Open the parquet file; its row groups are read one at a time further down
file_path = './train.parquet'
batch_size = 100000  # not referenced anywhere in this cell
parquet_file = pq.ParquetFile(file_path)

# Running sets of every distinct building block / molecule SMILES seen so far
unique_building_blocks, unique_molecules = set(), set()

# The row group is the unit of iteration for the scan
num_row_groups = parquet_file.num_row_groups

for i in range(num_row_groups):
    # Materialise only the current row group as a DataFrame
    row_group = parquet_file.read_row_group(i).to_pandas()

    # Show a preview once, for the very first row group
    is_first_group = (i == 0)
    if is_first_group:
        print("First few rows from the first row group:")
        print(row_group.head())

    # Merge this row group's unique values into the running sets
    process_chunk(row_group, unique_building_blocks, unique_molecules)

# Report the totals accumulated over all row groups
print(f"Total number of unique building blocks: {len(unique_building_blocks)}")
print(f"Total number of unique molecules: {len(unique_molecules)}")

First few rows from the first row group:
   id                            buildingblock1_smiles buildingblock2_smiles  \
0   0  C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21  C#CCOc1ccc(CN)cc1.Cl   
1   1  C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21  C#CCOc1ccc(CN)cc1.Cl   
2   2  C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21  C#CCOc1ccc(CN)cc1.Cl   
3   3  C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21  C#CCOc1ccc(CN)cc1.Cl   
4   4  C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21  C#CCOc1ccc(CN)cc1.Cl   

     buildingblock3_smiles                                    molecule_smiles  \
0  Br.Br.NCC1CCCN1c1cccnn1  C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...   
1  Br.Br.NCC1CCCN1c1cccnn1  C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...   
2  Br.Br.NCC1CCCN1c1cccnn1  C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...   
3        Br.NCc1cccc(Br)n1  C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](CC...   
4        Br.NCc1cccc(Br)n1  C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](C

# Exhaustive List of Features for Small Molecules

## Molecular Descriptors:
- Molecular weight
- Number of atoms
- Number of bonds
- Number of aromatic rings
- Number of rotatable bonds
- Topological polar surface area (TPSA)
- LogP (octanol-water partition coefficient)

## Atom-Level Features:
- Atom types (e.g., C, H, O, N, S)
- Hybridization states (sp, sp2, sp3)
- Formal charge
- Aromaticity
- Degree (number of bonds to the atom)
- Implicit and explicit hydrogen counts
- Chirality

## Bond-Level Features:
- Bond types (single, double, triple, aromatic)
- Conjugation
- Ring membership
- Stereo configuration (cis/trans)

## Graph-Based Features:
- Adjacency matrix
- Distance matrix
- Graph Laplacian

## Physicochemical Properties:
- Hydrogen bond donors and acceptors
- Molecular refractivity
- Molar volume
- Electronegativity
- Electron affinity

## Structural Fingerprints:
- MACCS keys
- Morgan fingerprints
- ECFP (Extended Connectivity Fingerprints)
- RDKit fingerprints


In [1]:
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors, AllChem, rdmolops
from rdkit.DataStructs import ConvertToNumpyArray
import numpy as np

# Category vocabularies for the one-hot encodings built by SmallMoleculeFeatureExtractor.
# A value is encoded by its position in the list, so reordering or inserting entries
# changes the encoding; a value that is not listed encodes as an all-zero vector.
# Element symbols, compared against atom.GetSymbol()
ATOM_TYPES = ['C', 'H', 'O', 'N', 'S', 'F', 'Cl', 'Br', 'I', 'P', 'B']
# Hybridization names, compared against str(atom.GetHybridization())
HYBRIDIZATION_STATES = ['SP', 'SP2', 'SP3', 'SP3D', 'SP3D2']
# Chirality tag names, compared against str(atom.GetChiralTag())
CHIRAL_TAGS = ['CHI_UNSPECIFIED', 'CHI_TETRAHEDRAL_CW', 'CHI_TETRAHEDRAL_CCW', 'CHI_OTHER']
# Bond type names, compared against str(bond.GetBondType())
BOND_TYPES = ['SINGLE', 'DOUBLE', 'TRIPLE', 'AROMATIC']
# Bond stereo names, compared against str(bond.GetStereo())
STEREO_CONFIGURATIONS = ['STEREONONE', 'STEREOZ', 'STEREOE', 'STEREOCIS', 'STEREOTRANS']

class SmallMoleculeFeatureExtractor:
    """Extract descriptor, atom, bond, graph and fingerprint features for one molecule.

    The molecule is parsed once from a SMILES string at construction time and
    every ``get_*`` method reads the resulting RDKit molecule in ``self.mol``.
    """

    def __init__(self, smiles):
        """Parse ``smiles`` into an RDKit molecule.

        Args:
            smiles: SMILES string describing the molecule.

        Raises:
            ValueError: If RDKit cannot parse ``smiles``. ``MolFromSmiles``
                signals a parse failure by returning ``None``; failing here
                gives a clear message instead of an opaque error inside the
                first feature method that touches ``self.mol``.
        """
        self.smiles = smiles
        self.mol = Chem.MolFromSmiles(smiles)
        if self.mol is None:
            raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    def get_molecular_descriptors(self):
        """Return whole-molecule descriptors as a dict keyed by descriptor name."""
        descriptors = {
            'molecular_weight': Descriptors.MolWt(self.mol),
            'num_atoms': self.mol.GetNumAtoms(),
            'num_bonds': self.mol.GetNumBonds(),
            'num_aromatic_rings': rdMolDescriptors.CalcNumAromaticRings(self.mol),
            'num_rotatable_bonds': Descriptors.NumRotatableBonds(self.mol),
            'tpsa': Descriptors.TPSA(self.mol),
            'logp': Descriptors.MolLogP(self.mol)
        }
        return descriptors

    def one_hot_encode(self, value, categories):
        """Return a 0/1 list the length of ``categories`` with a 1 at ``value``.

        A ``value`` that is not in ``categories`` yields an all-zero list.
        """
        encoding = [0] * len(categories)
        if value in categories:
            encoding[categories.index(value)] = 1
        return encoding

    def get_atom_level_features(self):
        """Return one feature list per atom, in the molecule's atom order.

        Each entry is ``[symbol one-hot, hybridization one-hot, formal charge,
        is aromatic, degree, implicit valence, total H count, chiral tag one-hot]``.
        """
        atom_features = []
        for atom in self.mol.GetAtoms():
            atom_features.append([
                self.one_hot_encode(atom.GetSymbol(), ATOM_TYPES),
                self.one_hot_encode(str(atom.GetHybridization()), HYBRIDIZATION_STATES),
                atom.GetFormalCharge(),
                atom.GetIsAromatic(),
                atom.GetDegree(),
                # NOTE(review): this is the implicit valence, not an implicit
                # hydrogen count; confirm that is the intended feature.
                atom.GetImplicitValence(),
                atom.GetTotalNumHs(),
                self.one_hot_encode(str(atom.GetChiralTag()), CHIRAL_TAGS)
            ])
        return atom_features

    def get_bond_level_features(self):
        """Return one feature list per bond, in the molecule's bond order.

        Each entry is ``[bond type one-hot, is conjugated, is in ring,
        stereo one-hot]``.
        """
        bond_features = []
        for bond in self.mol.GetBonds():
            bond_features.append([
                self.one_hot_encode(str(bond.GetBondType()), BOND_TYPES),
                bond.GetIsConjugated(),
                bond.IsInRing(),
                self.one_hot_encode(str(bond.GetStereo()), STEREO_CONFIGURATIONS)
            ])
        return bond_features

    def get_graph_based_features(self):
        """Return the adjacency and topological distance matrices of the molecule."""
        adj_matrix = rdmolops.GetAdjacencyMatrix(self.mol)
        dist_matrix = rdmolops.GetDistanceMatrix(self.mol)
        return {
            'adjacency_matrix': adj_matrix,
            'distance_matrix': dist_matrix,
        }

    def get_physicochemical_properties(self):
        """Return hydrogen-bonding counts, molar refractivity and ``molar_volume``."""
        properties = {
            'h_bond_donors': Descriptors.NumHDonors(self.mol),
            'h_bond_acceptors': Descriptors.NumHAcceptors(self.mol),
            'molecular_refractivity': Descriptors.MolMR(self.mol),
            # NOTE(review): logP divided by molecular weight is not a molar
            # volume. The value is kept unchanged for existing consumers;
            # confirm the intended formula before relying on this feature.
            'molar_volume': Descriptors.MolLogP(self.mol) / Descriptors.MolWt(self.mol)
        }
        return properties

    @staticmethod
    def _fingerprint_to_array(fingerprint):
        """Convert an RDKit bit-vector fingerprint into a numpy array."""
        # ConvertToNumpyArray resizes the destination to the fingerprint
        # length, so the placeholder only has to be a numpy array.
        array = np.zeros((1,))
        ConvertToNumpyArray(fingerprint, array)
        return array

    def get_structural_fingerprints(self):
        """Return MACCS, Morgan (radius 2) and RDKit fingerprints as numpy arrays."""
        maccs_keys = AllChem.GetMACCSKeysFingerprint(self.mol)
        morgan_fp = AllChem.GetMorganFingerprintAsBitVect(self.mol, 2)
        rdk_fp = Chem.RDKFingerprint(self.mol)

        return {
            'maccs_keys': self._fingerprint_to_array(maccs_keys),
            'morgan_fp': self._fingerprint_to_array(morgan_fp),
            'rdkit_fp': self._fingerprint_to_array(rdk_fp)
        }

    def extract_features(self):
        """Run every extractor and return the results grouped by feature family."""
        features = {
            'molecular_descriptors': self.get_molecular_descriptors(),
            'atom_level_features': self.get_atom_level_features(),
            'bond_level_features': self.get_bond_level_features(),
            'graph_based_features': self.get_graph_based_features(),
            'physicochemical_properties': self.get_physicochemical_properties(),
            'structural_fingerprints': self.get_structural_fingerprints()
        }
        return features


In [2]:
# Run the small-molecule extractor on a single SMILES string
smiles = "C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21"
extractor = SmallMoleculeFeatureExtractor(smiles)
features = extractor.extract_features()

# Print every feature group on its own line as "<group>: <value>"
for feature in features:
    value = features[feature]
    print(f"{feature}: {value}")

molecular_descriptors: {'molecular_weight': 349.3860000000001, 'num_atoms': 26, 'num_bonds': 28, 'num_aromatic_rings': 2, 'num_rotatable_bonds': 6, 'tpsa': 75.63000000000001, 'logp': 3.391700000000002}
atom_level_features: [[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0], 0, False, 1, 1, 1, [1, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0], 0, False, 2, 0, 0, [1, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0], 0, False, 2, 2, 2, [1, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0], 0, False, 3, 0, 1, [0, 1, 0, 0]], [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0], 0, False, 2, 2, 2, [1, 0, 0, 0]], [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0], 0, False, 3, 0, 0, [1, 0, 0, 0]], [[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0], 0, False, 1, 0, 0, [1, 0, 0, 0]], [[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0], 0, False, 1, 1, 1, [1, 0, 0, 0]], [[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0], 0, False, 2, 1, 1, [1, 0, 0

# Feature Extraction from Protein Structure PDB File

## Structural Features

### Amino Acid Composition:
- Frequency of each amino acid type in the binding site.
- Frequency of amino acid types in the entire protein.

### Secondary Structure:
- Percentage of alpha-helices, beta-sheets, and random coils in the binding site.
- Secondary structure elements around the binding site.

### Tertiary Structure:
- 3D coordinates of the binding site.
- Distance between key residues in the binding site.

### Binding Site Characteristics:
- Volume and surface area of the binding site.
- Shape descriptors (e.g., sphericity, elongation).

## Physicochemical Properties

### Hydrophobicity:
- Hydrophobic and hydrophilic residue distribution in the binding site.
- Hydrophobic surface area.

### Charge Distribution:
- Number and type of charged residues (positive and negative).
- Electrostatic potential distribution.

### Polarity:
- Number of polar residues.
- Polar surface area.

### Solvent Accessibility:
- Solvent-accessible surface area (SASA) of residues in the binding site.

### Hydrogen Bonding:
- Number of potential hydrogen bond donors and acceptors.
- Hydrogen bond network in the binding site.

### Van der Waals Interactions:
- Van der Waals interaction potential of the binding site.

## Geometric Features

### Distance Metrics:
- Pairwise distances between all residues in the binding site.
- Distance to the nearest surface residue.

### Angles and Dihedrals:
- Angles and dihedral angles between residues in the binding site.

## Chemical Environment

### Residue Environment:
- Local chemical environment of each residue (e.g., neighboring residues within a certain radius).

### Ligand Interaction Sites:
- Specific interaction sites for known ligands (if available).

## Dynamic Properties

### Flexibility:
- B-factors (temperature factors) indicating residue flexibility.

### Molecular Dynamics Simulations:
- Root mean square fluctuation (RMSF) of residues in the binding site.
- Conformational changes over time.

## Topological Features

### Graph-based Features:
- Protein structure represented as a graph with nodes (residues) and edges (interactions).
- Degree centrality, betweenness centrality, and clustering coefficient of residues in the binding site.

## Energy-based Features

### Binding Energy:
- Estimated binding free energy of known ligands.
- Energy components (van der Waals, electrostatic, solvation) from docking simulations.

## Protein-Ligand Interaction Features

### Docking Scores:
- Scores from molecular docking simulations with various ligands.

### Interaction Profiles:
- Interaction fingerprints summarizing the types and strengths of interactions with ligands.

## Evolutionary Features

### Conservation:
- Sequence conservation of residues in the binding site (e.g., from multiple sequence alignment).

### Mutational Impact:
- Predicted impact of mutations on binding site residues.

## Experimental Data

### Experimental Binding Data:
- Known binding affinities (e.g., Kd, Ki, IC50) for small molecules.

## Contextual Features

### Functional Annotations:
- Biological function and pathway involvement of the protein.
- Known protein-protein interactions.

## Integration and Representation

### Feature Scaling and Normalization:
- Standardize and normalize features for input into the deep learning model.


In [29]:
from collections import Counter
from itertools import combinations

import networkx as nx
import numpy as np
from Bio.PDB import NeighborSearch, PDBParser, is_aa

class ProteinFeatureExtractor:
    """Extract composition, flexibility, distance and contact-graph features from a PDB file.

    The structure is parsed once at construction time. The residue contact
    graph is also built once (with the default cutoff) and stored in
    ``self.graph``.
    """

    # Fixed order of the 20 standard amino acids. It is used both to tell
    # ligands apart from protein residues and to give the aggregated
    # composition vector a stable layout.
    STANDARD_AMINO_ACIDS = (
        'ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS', 'ILE',
        'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 'TYR', 'VAL',
    )
    # Residue names treated as water and therefore never reported as ligands
    WATER_RESNAMES = frozenset({'HOH'})

    def __init__(self, pdb_file):
        """Load ``pdb_file`` and precompute ligand names and the contact graph.

        Args:
            pdb_file: Path of the PDB file to parse.
        """
        self.pdb_file = pdb_file
        self.structure = self.load_structure()
        self.ligand_resnames = self.detect_ligands()
        self.graph = self.construct_graph()

    def load_structure(self):
        """Parse ``self.pdb_file`` and return the resulting Bio.PDB structure."""
        parser = PDBParser()
        structure = parser.get_structure('protein', self.pdb_file)
        return structure

    def detect_ligands(self):
        """Return the sorted residue names that are neither standard amino acids nor water."""
        excluded = set(self.STANDARD_AMINO_ACIDS) | self.WATER_RESNAMES
        ligands = {
            residue.resname
            for residue in self.structure.get_residues()
            if residue.resname not in excluded
        }
        # Sorted so the result does not depend on set iteration order
        return sorted(ligands)

    def get_amino_acid_composition(self):
        """Return ``{residue name: count}`` over all non-hetero residues.

        Residues whose hetero flag (``residue.id[0]``) is not a blank are
        ignored, which leaves out waters and HETATM records.
        """
        counts = Counter(
            residue.resname
            for residue in self.structure.get_residues()
            if residue.id[0] == ' '
        )
        return dict(counts)

    def get_flexibility(self):
        """Return the mean B-factor over every atom of the structure."""
        b_factors = [atom.bfactor for atom in self.structure.get_atoms()]
        return np.mean(b_factors)

    def get_distance_metrics(self):
        """Return all pairwise CA-CA distances, computed chain by chain.

        Only residues that have a ``CA`` atom take part, and pairs are formed
        within a chain only. Within a chain the distances are ordered as
        (0,1), (0,2), ..., (1,2), ... over those residues. A progress line is
        printed for every chain.
        """
        distances = []
        for chain in self.structure.get_chains():
            print(f"Processing chain: {chain.id}")
            ca_atoms = [res.child_dict['CA'] for res in chain if 'CA' in res.child_dict]
            # Subtracting two Bio.PDB atoms yields their distance
            distances.extend(first - second for first, second in combinations(ca_atoms, 2))
        return distances

    def construct_graph(self, cutoff=4.0):
        """Build an undirected residue contact graph.

        Nodes are amino-acid residues keyed by their full id (structure,
        model, chain, residue id). Keying by ``residue.id`` alone would make
        residues with the same number in different chains collapse onto a
        single node. Two residues are connected when any pair of their
        non-hydrogen atoms lies within ``cutoff``; the edge weight is the
        smallest such inter-atomic distance.

        Args:
            cutoff: Maximum heavy-atom distance for two residues to be
                considered in contact.

        Returns:
            networkx.Graph whose nodes carry the Bio.PDB residue under the
            ``residue`` attribute and whose edges carry ``weight``.
        """
        graph = nx.Graph()

        # Nodes: amino-acid residues only
        for residue in self.structure.get_residues():
            if is_aa(residue):
                graph.add_node(residue.get_full_id(), residue=residue)

        # Candidate atoms: heavy atoms that belong to a residue in the graph,
        # so waters and ligands never introduce attribute-less nodes.
        heavy_atoms = [
            atom
            for atom in self.structure.get_atoms()
            if atom.element != 'H' and atom.get_parent().get_full_id() in graph
        ]
        if len(heavy_atoms) < 2:
            return graph

        # search_all yields each atom pair within the cutoff once
        neighbor_search = NeighborSearch(heavy_atoms)
        for atom1, atom2 in neighbor_search.search_all(cutoff, level='A'):
            node1 = atom1.get_parent().get_full_id()
            node2 = atom2.get_parent().get_full_id()
            if node1 == node2:
                continue
            distance = atom1 - atom2
            # Keep the closest contact so the weight does not depend on the
            # order in which atom pairs are visited.
            if not graph.has_edge(node1, node2) or distance < graph[node1][node2]['weight']:
                graph.add_edge(node1, node2, weight=distance)

        return graph

    def extract_graph_features(self):
        """Return matrix and centrality features of ``self.graph``.

        All matrices and per-node arrays follow the same node order,
        ``list(self.graph.nodes)``.
        """
        # A single explicit node order shared by every feature
        nodes = list(self.graph.nodes)

        # Adjacency matrix
        adjacency_matrix = nx.adjacency_matrix(self.graph, nodelist=nodes).todense()

        # Distance matrix (Floyd-Warshall algorithm)
        distance_matrix = nx.floyd_warshall_numpy(self.graph, nodelist=nodes)

        # Per-node centrality measures, returned by networkx as dicts
        degree_by_node = nx.degree_centrality(self.graph)
        betweenness_by_node = nx.betweenness_centrality(self.graph)
        clustering_by_node = nx.clustering(self.graph)

        degree_centrality = np.array([degree_by_node[node] for node in nodes])
        betweenness_centrality = np.array([betweenness_by_node[node] for node in nodes])
        clustering_coefficient = np.array([clustering_by_node[node] for node in nodes])

        features = {
            'adjacency_matrix': adjacency_matrix,
            'distance_matrix': distance_matrix,
            'degree_centrality': degree_centrality,
            'betweenness_centrality': betweenness_centrality,
            'clustering_coefficient': clustering_coefficient
        }

        return features

    def extract_features(self):
        """Run every extractor and return the results keyed by feature family."""
        amino_acid_composition = self.get_amino_acid_composition()
        flexibility = self.get_flexibility()
        distance_metrics = self.get_distance_metrics()
        graph_features = self.extract_graph_features()

        features = {
            "amino_acid_composition": amino_acid_composition,
            "flexibility": flexibility,
            "distance_metrics": distance_metrics,
            "graph_features": graph_features
        }
        return features

    def aggregate_features(self, features):
        """Flatten a feature dict into a single numpy array.

        Layout, in order:
          1. amino-acid counts in ``STANDARD_AMINO_ACIDS`` order (0 when a
             residue type is absent; names outside that list are not included),
          2. ``secondary_structure`` entries (second item of each), if present,
          3. ``binding_site_volume``, if present,
          4. ``physicochemical_properties`` values, if present,
          5. ``flexibility``,
          6. mean of ``distance_metrics`` (0 when there are none).

        Items 2-4 are optional because ``extract_features`` does not produce
        them; they are appended only when the caller supplies them.

        Args:
            features: Dict shaped like the result of ``extract_features``,
                optionally with the extra keys listed above.

        Returns:
            numpy.ndarray holding the concatenated values.
        """
        aggregated_features = []

        # Fixed-order composition keeps the vector layout stable across proteins
        composition = features["amino_acid_composition"]
        aggregated_features.extend(composition.get(aa, 0) for aa in self.STANDARD_AMINO_ACIDS)

        if "secondary_structure" in features:
            aggregated_features.extend(res[1] for res in features["secondary_structure"])

        if "binding_site_volume" in features:
            aggregated_features.append(features["binding_site_volume"])

        if "physicochemical_properties" in features:
            aggregated_features.extend(features["physicochemical_properties"].values())

        aggregated_features.append(features["flexibility"])

        # Distance metrics are summarised by their mean
        distances = features["distance_metrics"]
        avg_distance = np.mean(distances) if len(distances) > 0 else 0
        aggregated_features.append(avg_distance)

        return np.array(aggregated_features)


In [30]:
# Run the protein extractor on a single PDB file
pdb_file = "./ALB.pdb"
extractor = ProteinFeatureExtractor(pdb_file)
features = extractor.extract_features()

# Print every feature group on its own line as "<group>: <value>"
for feature in features:
    value = features[feature]
    print(f"{feature}: {value}")



Processing chain: A
Processing chain: B
amino_acid_composition: {'PHE': 62, 'TRP': 2, 'TYR': 36, 'ILE': 16, 'GLN': 40, 'ALA': 122, 'ASP': 70, 'ASN': 34, 'HIS': 30, 'GLY': 22, 'LYS': 116, 'PRO': 48, 'LEU': 118, 'SER': 48, 'THR': 56, 'MET': 12, 'CYS': 70, 'GLU': 124, 'VAL': 82, 'ARG': 48}
flexibility: 43.24658229223249
distance_metrics: [3.799853, 5.625652, 5.2366357, 6.313811, 8.753273, 9.90739, 10.631102, 12.30577, 14.361686, 15.183271, 15.693063, 18.748978, 18.05107, 14.625893, 16.633743, 19.458057, 17.908018, 16.458813, 20.050129, 21.238708, 19.161816, 20.314484, 23.822437, 24.020021, 23.142714, 25.360504, 28.221441, 30.74149, 28.444807, 29.520653, 28.009275, 27.860603, 25.76604, 23.159004, 23.027431, 22.12102, 19.377497, 17.754814, 18.348131, 16.635723, 13.546834, 13.605101, 14.518148, 11.495396, 9.05958, 10.979823, 10.605165, 6.898326, 8.197482, 10.829777, 8.981563, 5.597059, 7.9343214, 9.263097, 10.455489, 8.521203, 4.9676795, 4.6003084, 6.8788652, 7.999854, 9.307276, 12.7483225, 