In [3]:
import pyarrow.parquet as pq

# Open test.parquet and print the column names recorded in its schema.
parquet_file = pq.ParquetFile('test.parquet')
column_names = parquet_file.schema.names
print(column_names)


['id', 'buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles', 'molecule_smiles', 'protein_name']


In [None]:
import dask.dataframe as dd

# Open the raw train and test Parquet datasets as Dask dataframes
train_df = dd.read_parquet('train.parquet')
test_df = dd.read_parquet('test.parquet')

# Train row count: take the length of every partition, then add them up
partition_sizes = train_df.map_partitions(len).compute()
total_rows = partition_sizes.sum()
print(f"Total number of rows: {total_rows}")

# Positive bindings: the sum of the 'binds' column
# (this equals a row count only if 'binds' is a 0/1 flag - presumably so; verify)
binds_column = train_df['binds']
num_positive_bindings = binds_column.sum().compute()
print(f"Number of positive bindings: {num_positive_bindings}")

# Negative bindings: whatever is left of the row count
num_negative_bindings = total_rows - num_positive_bindings
print(f"Number of negative bindings: {num_negative_bindings}")

# Class balance, expressed as percentages of the train row count
positive_fraction = num_positive_bindings / total_rows
negative_fraction = num_negative_bindings / total_rows
percent_positive = positive_fraction * 100
percent_negative = negative_fraction * 100
print(f"Percentage of positive bindings: {percent_positive:.2f}%")
print(f"Percentage of negative bindings: {percent_negative:.2f}%")

# Distinct non-null protein names, first per dataset, then combined
train_proteins = train_df['protein_name']
test_proteins = test_df['protein_name']

unique_proteins_train = train_proteins.dropna().unique().compute()
total_unique_proteins_train = len(unique_proteins_train)
print(f"Total unique proteins in train dataset: {total_unique_proteins_train}")

unique_proteins_test = test_proteins.dropna().unique().compute()
total_unique_proteins_test = len(unique_proteins_test)
print(f"Total unique proteins in test dataset: {total_unique_proteins_test}")

combined_proteins = dd.concat([train_proteins, test_proteins])
unique_proteins_all = combined_proteins.dropna().unique().compute()
total_unique_proteins_all = len(unique_proteins_all)
print(f"Total unique proteins in both datasets: {total_unique_proteins_all}")

# Stack the three building-block columns of each dataset into one long column
building_block_columns = (
    'buildingblock1_smiles',
    'buildingblock2_smiles',
    'buildingblock3_smiles',
)
train_building_blocks = dd.concat([train_df[column] for column in building_block_columns])
test_building_blocks = dd.concat([test_df[column] for column in building_block_columns])
all_building_blocks = dd.concat([train_building_blocks, test_building_blocks])

# Distinct non-null building blocks across both datasets
unique_building_blocks = all_building_blocks.dropna().unique().compute()
total_unique_building_blocks = len(unique_building_blocks)
print(f"Total unique building blocks (train and test): {total_unique_building_blocks}")

# Distinct full-molecule SMILES across both datasets (nulls dropped per dataset)
train_small_molecules = train_df['molecule_smiles'].dropna()
test_small_molecules = test_df['molecule_smiles'].dropna()
all_small_molecules = dd.concat([train_small_molecules, test_small_molecules])

unique_small_molecules = all_small_molecules.unique().compute()
total_unique_small_molecules = len(unique_small_molecules)
print(f"Total unique small molecules (train and test): {total_unique_small_molecules}")

In [None]:
import pyarrow.parquet as pq
import pyarrow as pa
import pandas as pd
import os
from tqdm import tqdm
from collections import defaultdict

# Paths to input and output Parquet files
input_file = 'train.parquet'
output_file = 'filtered_train.parquet'

# Remove output file if it exists, so a stale result from an earlier run
# cannot be mistaken for the output of this one
if os.path.exists(output_file):
    os.remove(output_file)

# Open the Parquet file
pf = pq.ParquetFile(input_file)

# Get total number of row groups (batches); the file is processed one row
# group at a time so that it never has to be loaded in full
total_row_groups = pf.num_row_groups

# First Pass: Build mapping of molecule_smiles to the set of proteins it binds to
print("First Pass: Building molecule to proteins mapping...")

# molecule_binds: molecule_smiles -> set of protein_name values seen with binds == 1
# all_proteins:   every protein_name value seen in the file
molecule_binds = defaultdict(set)
all_proteins = set()

for rg in tqdm(range(total_row_groups), desc="Processing Batches"):
    # Read a row group with only the columns this pass needs
    batch = pf.read_row_group(rg, columns=['molecule_smiles', 'protein_name', 'binds'])
    df = batch.to_pandas()

    # Update all_proteins set
    all_proteins.update(df['protein_name'].unique())

    # Filter rows where 'binds' == 1
    df_binds_1 = df[df['binds'] == 1]

    # Update molecule_binds mapping. Walking the two columns in lockstep
    # yields the same (molecule, protein) pairs as DataFrame.iterrows(),
    # without building a Series object for every row.
    for molecule, protein in zip(df_binds_1['molecule_smiles'], df_binds_1['protein_name']):
        molecule_binds[molecule].add(protein)

    # Clear variables to free memory before the next row group is read
    del df, df_binds_1, batch

# Convert all_proteins to a list
all_proteins = list(all_proteins)

# Second Pass: Process data and write to output
#
# Selection rule, per molecule that has at least one positive (binds == 1) row:
#   * bound to more than one protein -> keep every row of the molecule;
#   * bound to exactly one protein   -> keep its positive row(s) plus exactly
#     one negative row (binds == 0) for a protein it does not bind.
print("Second Pass: Filtering dataset and writing to new Parquet file...")

# Molecules with exactly one bound protein whose single negative row has
# already been written. The rows of one molecule may sit in different row
# groups, so the "one negative per molecule" rule has to be tracked across
# batches instead of being decided from the current batch alone.
negative_written = set()

# The writer is created lazily: its schema is taken from the first
# non-empty filtered table.
writer = None

try:
    for rg in tqdm(range(total_row_groups), desc="Processing Batches"):
        # Read the row group
        batch = pf.read_row_group(rg)
        df = batch.to_pandas()

        # Keep only molecules that have at least one binds == 1 somewhere in the file
        df = df[df['molecule_smiles'].isin(molecule_binds.keys())]

        # Pieces of this batch that go to the output
        rows_to_include = []

        # Grouping an empty frame yields no groups, so no emptiness check is needed
        for molecule, group in df.groupby('molecule_smiles'):
            binds_1_proteins = molecule_binds[molecule]

            if len(binds_1_proteins) > 1:
                # Include all rows for this molecule
                rows_to_include.append(group)
                continue

            # Exactly one bound protein: always keep the positive row(s) ...
            positive_rows = group[group['binds'] == 1]
            if not positive_rows.empty:
                rows_to_include.append(positive_rows)

            # ... and keep one negative row, once per molecule over the whole file
            if molecule not in negative_written:
                negative_rows = group[
                    (group['binds'] == 0)
                    & ~group['protein_name'].isin(binds_1_proteins)
                ]
                if not negative_rows.empty:
                    rows_to_include.append(negative_rows.iloc[:1])
                    negative_written.add(molecule)

        if rows_to_include:
            filtered_df = pd.concat(rows_to_include)

            # Convert to PyArrow Table
            table = pa.Table.from_pandas(filtered_df)

            # Initialize the Parquet writer if not already done
            if writer is None:
                writer = pq.ParquetWriter(output_file, table.schema)

            # Write the table to the Parquet file
            writer.write_table(table)

            # Clear variables to free memory
            del table, filtered_df

        # Clear variables to free memory
        del df, batch
finally:
    # Close the Parquet writer even if a batch failed, so the file handle is
    # released and the rows written so far are finalised
    if writer is not None:
        writer.close()

print(f"Filtering completed. Filtered dataset saved to '{output_file}'.")

In [None]:
import dask.dataframe as dd

# Open the filtered train set and the cleaned test set as Dask dataframes
train_df = dd.read_parquet('filtered_train.parquet')
test_df = dd.read_parquet('cleaned_test.parquet')

# Train row count: take the length of every partition, then add them up
train_partition_sizes = train_df.map_partitions(len).compute()
total_rows = train_partition_sizes.sum()
print(f"Total number of training rows: {total_rows}")

# Test row count, computed the same way
test_partition_sizes = test_df.map_partitions(len).compute()
test_total_rows = test_partition_sizes.sum()
print(f"Total number of test rows: {test_total_rows}")

# Positive bindings: the sum of the 'binds' column
# (this equals a row count only if 'binds' is a 0/1 flag - presumably so; verify)
binds_column = train_df['binds']
num_positive_bindings = binds_column.sum().compute()
print(f"Number of positive bindings: {num_positive_bindings}")

# Negative bindings: whatever is left of the train row count
num_negative_bindings = total_rows - num_positive_bindings
print(f"Number of negative bindings: {num_negative_bindings}")

# Class balance, expressed as percentages of the train row count
positive_fraction = num_positive_bindings / total_rows
negative_fraction = num_negative_bindings / total_rows
percent_positive = positive_fraction * 100
percent_negative = negative_fraction * 100
print(f"Percentage of positive bindings: {percent_positive:.2f}%")
print(f"Percentage of negative bindings: {percent_negative:.2f}%")

# Distinct non-null protein names, first per dataset, then combined
train_proteins = train_df['protein_name']
test_proteins = test_df['protein_name']

unique_proteins_train = train_proteins.dropna().unique().compute()
total_unique_proteins_train = len(unique_proteins_train)
print(f"Total unique proteins in train dataset: {total_unique_proteins_train}")

unique_proteins_test = test_proteins.dropna().unique().compute()
total_unique_proteins_test = len(unique_proteins_test)
print(f"Total unique proteins in test dataset: {total_unique_proteins_test}")

combined_proteins = dd.concat([train_proteins, test_proteins])
unique_proteins_all = combined_proteins.dropna().unique().compute()
total_unique_proteins_all = len(unique_proteins_all)
print(f"Total unique proteins in both datasets: {total_unique_proteins_all}")
print(f"Unique proteins in both datasets: {unique_proteins_all.values}")

# Stack the three building-block columns of each dataset into one long column
building_block_columns = (
    'buildingblock1_smiles',
    'buildingblock2_smiles',
    'buildingblock3_smiles',
)
train_building_blocks = dd.concat([train_df[column] for column in building_block_columns])
test_building_blocks = dd.concat([test_df[column] for column in building_block_columns])
all_building_blocks = dd.concat([train_building_blocks, test_building_blocks])

# Distinct non-null building blocks across both datasets
unique_building_blocks = all_building_blocks.dropna().unique().compute()
total_unique_building_blocks = len(unique_building_blocks)
print(f"Total unique building blocks (train and test): {total_unique_building_blocks}")

# Distinct full-molecule SMILES across both datasets (nulls dropped per dataset)
train_small_molecules = train_df['molecule_smiles'].dropna()
test_small_molecules = test_df['molecule_smiles'].dropna()
all_small_molecules = dd.concat([train_small_molecules, test_small_molecules])

unique_small_molecules = all_small_molecules.unique().compute()
total_unique_small_molecules = len(unique_small_molecules)
print(f"Total unique small molecules (train and test): {total_unique_small_molecules}")

In [1]:
import pandas as pd
import json
from rdkit import Chem
from rdkit.Chem import AllChem
from joblib import Parallel, delayed
import pyarrow.parquet as pq
import pyarrow as pa
from tqdm import tqdm

# Read the filtered train set and the raw test set into pandas DataFrames
train_path = 'filtered_train.parquet'
test_path = 'test.parquet'
train_df = pd.read_parquet(train_path)
test_df = pd.read_parquet(test_path)

# Define a function to process each row
def process_row(row):
    """Return ``row`` unchanged if its molecule can be given 3D coordinates.

    The SMILES string in ``row['molecule_smiles']`` is parsed, every atom
    whose symbol is ``'Dy'`` is removed, explicit hydrogens are added and a
    3D embedding is attempted with a fixed random seed. Only the
    ``'molecule_smiles'`` entry of ``row`` is read.

    Args:
        row: Mapping-like record (for example a pandas Series) with a
            ``'molecule_smiles'`` entry.

    Returns:
        The same ``row`` object when parsing and embedding both succeed,
        otherwise ``None``. Any exception raised along the way also results
        in ``None``, so one bad record cannot abort a whole batch.
    """
    try:
        # Convert SMILES to molecular graph
        mol = Chem.MolFromSmiles(row['molecule_smiles'])
        if mol is None:
            return None  # Skip invalid SMILES

        # Remove the 'Dy' atoms. Indices are deleted from highest to lowest
        # so that removing one atom does not shift the indices still pending.
        # NOTE(review): 'Dy' presumably marks a placeholder/attachment atom
        # in this dataset - confirm against the data description.
        dy_indices = [atom.GetIdx() for atom in mol.GetAtoms() if atom.GetSymbol() == 'Dy']
        editable = Chem.EditableMol(mol)
        for idx in sorted(dy_indices, reverse=True):
            editable.RemoveAtom(idx)
        mol = Chem.AddHs(editable.GetMol())

        # EmbedMolecule returns the id of the new conformer, or -1 when no
        # conformer could be generated; check that directly instead of
        # waiting for GetConformer() to raise.
        if AllChem.EmbedMolecule(mol, randomSeed=42) < 0:
            return None

        # The row passed all checks
        return row

    except Exception:
        # Deliberate best-effort boundary: this runs once per row inside a
        # parallel job, and a failing molecule is simply filtered out.
        return None

# Function to filter and save cleaned data
def filter_and_save(df, output_file):
    """Write the rows of ``df`` accepted by ``process_row`` to a Parquet file.

    ``process_row`` is run on every row in parallel and is used purely as a
    keep/drop decision (``None`` means drop). The kept rows are then selected
    from ``df`` itself by position, so the output keeps the original column
    order, column dtypes and index labels, and still carries the full set of
    columns when no row is accepted.

    Args:
        df: pandas DataFrame whose rows are checked.
        output_file: Path of the Parquet file to write.
    """
    # Process the dataframe rows in parallel with progress tracking
    results = Parallel(n_jobs=-1)(
        delayed(process_row)(row)
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows in parallel")
    )

    # results is in the same order as the rows of df, so the position of each
    # non-None result identifies the row to keep
    valid_positions = [position for position, result in enumerate(results) if result is not None]
    valid_df = df.iloc[valid_positions]

    # Save the cleaned data to a new parquet file
    table = pa.Table.from_pandas(valid_df)
    pq.write_table(table, output_file)

# Filter and save the datasets. Only the test set is processed here; the train
# call below is left commented out.
# NOTE(review): a later cell reads 'cleaned_train.parquet', which only the
# commented-out call produces - re-enable it if that file does not exist yet.
# filter_and_save(train_df, 'cleaned_train.parquet')
filter_and_save(test_df, 'cleaned_test.parquet')

Processing rows in parallel: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1674896/1674896 [1:59:01<00:00, 234.52it/s]


In [2]:
import dask.dataframe as dd

# Open the cleaned train and test Parquet datasets as Dask dataframes
train_df = dd.read_parquet('cleaned_train.parquet')
test_df = dd.read_parquet('cleaned_test.parquet')

# Train row count: take the length of every partition, then add them up
partition_sizes = train_df.map_partitions(len).compute()
total_rows = partition_sizes.sum()
print(f"Total number of rows: {total_rows}")

# Positive bindings: the sum of the 'binds' column
# (this equals a row count only if 'binds' is a 0/1 flag - presumably so; verify)
binds_column = train_df['binds']
num_positive_bindings = binds_column.sum().compute()
print(f"Number of positive bindings: {num_positive_bindings}")

# Negative bindings: whatever is left of the row count
num_negative_bindings = total_rows - num_positive_bindings
print(f"Number of negative bindings: {num_negative_bindings}")

# Class balance, expressed as percentages of the train row count
positive_fraction = num_positive_bindings / total_rows
negative_fraction = num_negative_bindings / total_rows
percent_positive = positive_fraction * 100
percent_negative = negative_fraction * 100
print(f"Percentage of positive bindings: {percent_positive:.2f}%")
print(f"Percentage of negative bindings: {percent_negative:.2f}%")

# Distinct non-null protein names, first per dataset, then combined
train_proteins = train_df['protein_name']
test_proteins = test_df['protein_name']

unique_proteins_train = train_proteins.dropna().unique().compute()
total_unique_proteins_train = len(unique_proteins_train)
print(f"Total unique proteins in train dataset: {total_unique_proteins_train}")

unique_proteins_test = test_proteins.dropna().unique().compute()
total_unique_proteins_test = len(unique_proteins_test)
print(f"Total unique proteins in test dataset: {total_unique_proteins_test}")

combined_proteins = dd.concat([train_proteins, test_proteins])
unique_proteins_all = combined_proteins.dropna().unique().compute()
total_unique_proteins_all = len(unique_proteins_all)
print(f"Total unique proteins in both datasets: {total_unique_proteins_all}")
print(f"Unique proteins in both datasets: {unique_proteins_all.values}")

# Stack the three building-block columns of each dataset into one long column
building_block_columns = (
    'buildingblock1_smiles',
    'buildingblock2_smiles',
    'buildingblock3_smiles',
)
train_building_blocks = dd.concat([train_df[column] for column in building_block_columns])
test_building_blocks = dd.concat([test_df[column] for column in building_block_columns])
all_building_blocks = dd.concat([train_building_blocks, test_building_blocks])

# Distinct non-null building blocks across both datasets
unique_building_blocks = all_building_blocks.dropna().unique().compute()
total_unique_building_blocks = len(unique_building_blocks)
print(f"Total unique building blocks (train and test): {total_unique_building_blocks}")

# Distinct full-molecule SMILES across both datasets (nulls dropped per dataset)
train_small_molecules = train_df['molecule_smiles'].dropna()
test_small_molecules = test_df['molecule_smiles'].dropna()
all_small_molecules = dd.concat([train_small_molecules, test_small_molecules])

unique_small_molecules = all_small_molecules.unique().compute()
total_unique_small_molecules = len(unique_small_molecules)
print(f"Total unique small molecules (train and test): {total_unique_small_molecules}")

Total number of rows: 3099349
Number of positive bindings: 1589767
Number of negative bindings: 1509582
Percentage of positive bindings: 51.29%
Percentage of negative bindings: 48.71%
Total unique proteins in train dataset: 3
Total unique proteins in test dataset: 3
Total unique proteins in both datasets: 3
Unique proteins in both datasets: <ArrowStringArray>
['BRD4', 'sEH', 'HSA']
Length: 3, dtype: string
Total unique building blocks (train and test): 2110
Total unique small molecules (train and test): 2387581


In [None]:
# Define a function to calculate unique atom and edge types from a dataframe
def collect_unique_atom_and_edge_types(df):
    """Gather every atom type and edge type found in the rows of ``df``.

    Each row is handed to ``process_row_for_types`` in a parallel job; the
    per-row sets that come back are merged into two overall sets.

    Args:
        df: pandas DataFrame whose rows are passed to ``process_row_for_types``.

    Returns:
        A ``(molecule_node_types, molecule_edge_types)`` tuple of sets.
    """
    # Fan the rows out over all available workers, with a progress bar
    pool = Parallel(n_jobs=-1)
    rows_with_progress = tqdm(df.iterrows(), total=len(df), desc="Collecting atom and edge types")
    per_row_types = pool(
        delayed(process_row_for_types)(row) for _, row in rows_with_progress
    )

    # Merge the per-row results into the overall sets
    molecule_node_types, molecule_edge_types = set(), set()
    for row_node_types, row_edge_types in per_row_types:
        molecule_node_types.update(row_node_types)
        molecule_edge_types.update(row_edge_types)

    return molecule_node_types, molecule_edge_types

# Define a function to process each row for atom and edge type collection
def process_row_for_types(row):
    """Return the atom symbols and bonded symbol pairs of one row's molecule.

    The SMILES in ``row['molecule_smiles']`` is parsed, atoms with symbol
    ``'Dy'`` are removed and explicit hydrogens are added before the types
    are read off.

    Args:
        row: Mapping-like record with a ``'molecule_smiles'`` entry.

    Returns:
        A ``(atom_types, edge_types)`` tuple. ``atom_types`` is the set of
        atom symbols; ``edge_types`` is a set of
        ``(symbol, 'bond', symbol)`` triples, stored in both directions for
        every bond. Two empty sets are returned when the SMILES cannot be
        parsed or any exception is raised.
    """
    try:
        parsed = Chem.MolFromSmiles(row['molecule_smiles'])
        if parsed is None:
            # Invalid SMILES: contribute nothing
            return set(), set()

        # Delete the 'Dy' atoms from the highest index down, so the indices
        # that are still pending stay valid
        dy_indices = [atom.GetIdx() for atom in parsed.GetAtoms() if atom.GetSymbol() == 'Dy']
        editor = Chem.EditableMol(parsed)
        for atom_index in sorted(dy_indices, reverse=True):
            editor.RemoveAtom(atom_index)

        with_hydrogens = Chem.AddHs(editor.GetMol())

        # Symbol of every atom, addressable by atom index
        symbols = [atom.GetSymbol() for atom in with_hydrogens.GetAtoms()]

        # Record each bond once per direction
        directed_edges = set()
        for bond in with_hydrogens.GetBonds():
            begin_symbol = symbols[bond.GetBeginAtomIdx()]
            end_symbol = symbols[bond.GetEndAtomIdx()]
            directed_edges.update((
                (begin_symbol, 'bond', end_symbol),
                (end_symbol, 'bond', begin_symbol),
            ))

        return set(symbols), directed_edges

    except Exception:
        return set(), set()

# Load the cleaned data. The test set is read from 'cleaned_test.parquet' so
# that both frames really are the cleaned files, matching the train path and
# the variable names.
cleaned_train_df = pd.read_parquet('cleaned_train.parquet')
cleaned_test_df = pd.read_parquet('cleaned_test.parquet')

# Collect unique atom and edge types from both train and test datasets
print("Collecting unique atom and edge types from train dataset...")
train_node_types, train_edge_types = collect_unique_atom_and_edge_types(cleaned_train_df)

print("Collecting unique atom and edge types from test dataset...")
test_node_types, test_edge_types = collect_unique_atom_and_edge_types(cleaned_test_df)

# Combine the unique types from both datasets
combined_node_types = train_node_types.union(test_node_types)
combined_edge_types = train_edge_types.union(test_edge_types)

# Save the combined unique atom and edge types to a JSON file. Edge-type
# tuples are turned into lists because JSON has no tuple type; both
# collections are sorted so the file content does not depend on set order.
unique_types = {
    'molecule_node_types': sorted(combined_node_types),
    'molecule_edge_types': sorted(list(edge_type) for edge_type in combined_edge_types),
}
with open('unique_atom_and_edge_types.json', 'w') as f:
    json.dump(unique_types, f, indent=4)
