In [None]:
import pyarrow.parquet as pq

# Open the training Parquet file; the handle is kept in `parquet_file`
# so its schema can be inspected.
parquet_file = pq.ParquetFile('train.parquet')
# Show the column names recorded in the file's schema.
print(parquet_file.schema.names)

In [None]:
import dask.dataframe as dd

# Open both splits as dask dataframes
train_df = dd.read_parquet('train.parquet')
test_df = dd.read_parquet('test.parquet')

# Training row count: length of every partition, then summed
train_partition_lengths = train_df.map_partitions(len).compute()
total_rows = train_partition_lengths.sum()
print(f"Total number of rows: {total_rows}")

# Positives: sum of the 'binds' column (assumes a 0/1 label -- TODO confirm)
binds_column = train_df['binds']
num_positive_bindings = binds_column.sum().compute()
print(f"Number of positive bindings: {num_positive_bindings}")

# Negatives: every remaining row
num_negative_bindings = total_rows - num_positive_bindings
print(f"Number of negative bindings: {num_negative_bindings}")

# Class balance expressed as percentages of the training rows
positive_fraction = num_positive_bindings / total_rows
negative_fraction = num_negative_bindings / total_rows
percent_positive = positive_fraction * 100
percent_negative = negative_fraction * 100
print(f"Percentage of positive bindings: {percent_positive:.2f}%")
print(f"Percentage of negative bindings: {percent_negative:.2f}%")

# Distinct, non-null protein names in the training split
train_protein_names = train_df['protein_name']
unique_proteins_train = train_protein_names.dropna().unique().compute()
total_unique_proteins_train = len(unique_proteins_train)
print(f"Total unique proteins in train dataset: {total_unique_proteins_train}")

# Distinct, non-null protein names in the test split
test_protein_names = test_df['protein_name']
unique_proteins_test = test_protein_names.dropna().unique().compute()
total_unique_proteins_test = len(unique_proteins_test)
print(f"Total unique proteins in test dataset: {total_unique_proteins_test}")

# Distinct, non-null protein names across train and test together
all_protein_names = dd.concat([train_df['protein_name'], test_df['protein_name']])
unique_proteins_all = all_protein_names.dropna().unique().compute()
total_unique_proteins_all = len(unique_proteins_all)
print(f"Total unique proteins in both datasets: {total_unique_proteins_all}")

# Stack the three building-block columns of each split into one series
building_block_columns = [
    'buildingblock1_smiles',
    'buildingblock2_smiles',
    'buildingblock3_smiles',
]
train_building_blocks = dd.concat([train_df[name] for name in building_block_columns])
test_building_blocks = dd.concat([test_df[name] for name in building_block_columns])
all_building_blocks = dd.concat([train_building_blocks, test_building_blocks])

# Distinct, non-null building blocks over both splits
unique_building_blocks = all_building_blocks.dropna().unique().compute()
total_unique_building_blocks = len(unique_building_blocks)
print(f"Total unique building blocks (train and test): {total_unique_building_blocks}")

# Distinct full-molecule SMILES over both splits (nulls dropped per split first)
train_small_molecules = train_df['molecule_smiles'].dropna()
test_small_molecules = test_df['molecule_smiles'].dropna()
all_small_molecules = dd.concat([train_small_molecules, test_small_molecules])

unique_small_molecules = all_small_molecules.unique().compute()
total_unique_small_molecules = len(unique_small_molecules)
print(f"Total unique small molecules (train and test): {total_unique_small_molecules}")

In [None]:
import dask.dataframe as dd

# Open both splits as dask dataframes
train_df = dd.read_parquet('train.parquet')
test_df = dd.read_parquet('test.parquet')

# Training row count: length of every partition, then summed
train_partition_lengths = train_df.map_partitions(len).compute()
total_rows = train_partition_lengths.sum()
print(f"Total number of training rows: {total_rows}")

# Test row count, computed the same way
test_partition_lengths = test_df.map_partitions(len).compute()
test_total_rows = test_partition_lengths.sum()
print(f"Total number of test rows: {test_total_rows}")

# Positives: sum of the 'binds' column (assumes a 0/1 label -- TODO confirm)
binds_column = train_df['binds']
num_positive_bindings = binds_column.sum().compute()
print(f"Number of positive bindings: {num_positive_bindings}")

# Negatives: every remaining training row
num_negative_bindings = total_rows - num_positive_bindings
print(f"Number of negative bindings: {num_negative_bindings}")

# Class balance expressed as percentages of the training rows
positive_fraction = num_positive_bindings / total_rows
negative_fraction = num_negative_bindings / total_rows
percent_positive = positive_fraction * 100
percent_negative = negative_fraction * 100
print(f"Percentage of positive bindings: {percent_positive:.2f}%")
print(f"Percentage of negative bindings: {percent_negative:.2f}%")

# Distinct, non-null protein names in the training split
train_protein_names = train_df['protein_name']
unique_proteins_train = train_protein_names.dropna().unique().compute()
total_unique_proteins_train = len(unique_proteins_train)
print(f"Total unique proteins in train dataset: {total_unique_proteins_train}")

# Distinct, non-null protein names in the test split
test_protein_names = test_df['protein_name']
unique_proteins_test = test_protein_names.dropna().unique().compute()
total_unique_proteins_test = len(unique_proteins_test)
print(f"Total unique proteins in test dataset: {total_unique_proteins_test}")

# Distinct, non-null protein names across train and test together,
# followed by the names themselves
all_protein_names = dd.concat([train_df['protein_name'], test_df['protein_name']])
unique_proteins_all = all_protein_names.dropna().unique().compute()
total_unique_proteins_all = len(unique_proteins_all)
print(f"Total unique proteins in both datasets: {total_unique_proteins_all}")
print(f"Unique proteins in both datasets: {unique_proteins_all.values}")

# Stack the three building-block columns of each split into one series
building_block_columns = [
    'buildingblock1_smiles',
    'buildingblock2_smiles',
    'buildingblock3_smiles',
]
train_building_blocks = dd.concat([train_df[name] for name in building_block_columns])
test_building_blocks = dd.concat([test_df[name] for name in building_block_columns])
all_building_blocks = dd.concat([train_building_blocks, test_building_blocks])

# Distinct, non-null building blocks over both splits
unique_building_blocks = all_building_blocks.dropna().unique().compute()
total_unique_building_blocks = len(unique_building_blocks)
print(f"Total unique building blocks (train and test): {total_unique_building_blocks}")

# Distinct full-molecule SMILES over both splits (nulls dropped per split first)
train_small_molecules = train_df['molecule_smiles'].dropna()
test_small_molecules = test_df['molecule_smiles'].dropna()
all_small_molecules = dd.concat([train_small_molecules, test_small_molecules])

unique_small_molecules = all_small_molecules.unique().compute()
total_unique_small_molecules = len(unique_small_molecules)
print(f"Total unique small molecules (train and test): {total_unique_small_molecules}")

In [None]:
import os
from itertools import cycle
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
import pyarrow.parquet as pq
import pyarrow as pa
from rdkit import Chem
from rdkit.Chem import AllChem

def check_smiles(smiles):
    """Return True if a 3D conformer can be embedded for ``smiles``.

    The SMILES string is parsed with RDKit, every atom whose symbol is 'Dy'
    is removed, explicit hydrogens are added, and one conformer is embedded
    with a fixed random seed (42) so repeated calls give the same answer.

    Args:
        smiles: SMILES string to validate.

    Returns:
        bool: True when parsing and embedding both succeed. False when the
        SMILES cannot be parsed, the embedding fails, or any step raises.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return False  # unparsable SMILES

        # Strip 'Dy' atoms. Indices are removed from highest to lowest so that
        # earlier removals do not shift the indices still waiting to be removed.
        # NOTE(review): 'Dy' is presumably a placeholder atom in this dataset -- confirm.
        dy_indices = [atom.GetIdx() for atom in mol.GetAtoms() if atom.GetSymbol() == 'Dy']
        editable = Chem.EditableMol(mol)
        for idx in sorted(dy_indices, reverse=True):
            editable.RemoveAtom(idx)
        mol = Chem.AddHs(editable.GetMol())

        # EmbedMolecule returns the id of the new conformer, or -1 on failure;
        # checking it directly replaces the former "GetConformer() raised" probe
        # and the unreachable `conformer is None` branch.
        conformer_id = AllChem.EmbedMolecule(mol, randomSeed=42)
        return conformer_id >= 0

    except Exception:
        # Deliberate best-effort: any failure in the RDKit calls above simply
        # marks the molecule as unusable instead of aborting the pipeline.
        return False

# Protein targets handled by the pipeline
proteins = ['HSA', 'sEH', 'BRD4']


def process_chunk(chunk, output_dir, chunk_id):
    """Split one dataframe chunk by protein and by label, one Parquet file per non-empty part.

    For every name in the module-level ``proteins`` list, the rows of ``chunk``
    for that protein are divided into rows with ``binds == 1`` and rows with
    ``binds == 0``. Each non-empty part is written to
    ``{output_dir}/{protein}_binds_chunk_{chunk_id}.parquet`` or
    ``{output_dir}/{protein}_non_binds_chunk_{chunk_id}.parquet`` respectively.
    Empty parts produce no file.
    """
    # (label used in the file name, value of the 'binds' column)
    partitions = (('binds', 1), ('non_binds', 0))

    for target in proteins:
        target_rows = chunk.loc[chunk['protein_name'] == target]

        for label, flag in partitions:
            part = target_rows[target_rows['binds'] == flag]
            if part.empty:
                continue

            # chunk_id keeps file names unique across chunks
            part_path = f"{output_dir}/{target}_{label}_chunk_{chunk_id}.parquet"
            part.to_parquet(part_path, engine='pyarrow', compression='snappy')

def _iter_valid_non_bind_smiles(non_binds_files):
    """Yield, one file at a time, each non-bind SMILES that passes check_smiles."""
    for path in non_binds_files:
        # Only the SMILES column is used, so only that column is loaded.
        frame = pd.read_parquet(path, engine='pyarrow', columns=['molecule_smiles'])
        for smiles in frame['molecule_smiles']:
            if check_smiles(smiles):
                yield smiles


def _cycle_valid_binds(binds_data):
    """Yield (id, molecule_smiles, protein_name) of valid binds, repeating endlessly.

    Each row is checked with check_smiles once, on the first pass; later
    passes replay the rows that passed. Yields nothing when no row is valid,
    so callers can never spin forever on an all-invalid input.
    """
    valid_rows = []
    wanted = binds_data[['id', 'molecule_smiles', 'protein_name']]
    for record in wanted.itertuples(index=False, name=None):
        if check_smiles(record[1]):
            valid_rows.append(record)
            yield record
    if valid_rows:
        yield from cycle(valid_rows)


def merge_and_sample_non_binds(output_dir, protein):
    """Merge a protein's temporary chunk files into ``{protein}_cleaned.parquet``.

    Every output row pairs one bind with the next two non-binds; only
    molecules accepted by ``check_smiles`` are used. Binds are reused
    cyclically until ``3 * len(binds)`` rows have been produced or the
    non-binds run out, whichever comes first. Rows are written in batches of
    10000 with the columns ``id``, ``smiles_binds``, ``smiles_non_binds_1``,
    ``smiles_non_binds_2`` and ``protein_name``. The temporary chunk files are
    deleted once the merge has finished.

    Args:
        output_dir: Directory holding the ``{protein}_binds_chunk_*`` and
            ``{protein}_non_binds_chunk_*`` files; also receives the output.
        protein: Protein name used as the file-name prefix.

    Returns:
        None. Nothing is read, written or deleted when either kind of chunk
        file is missing. No output file is created when no row is produced.
    """
    batch_size = 10000
    entries = os.listdir(output_dir)
    # Sorted so the result does not depend on the directory listing order.
    binds_files = sorted(
        os.path.join(output_dir, name)
        for name in entries
        if name.startswith(f"{protein}_binds_chunk_")
    )
    non_binds_files = sorted(
        os.path.join(output_dir, name)
        for name in entries
        if name.startswith(f"{protein}_non_binds_chunk_")
    )

    if not binds_files or not non_binds_files:
        return None

    binds_data = pd.concat(pd.read_parquet(path, engine='pyarrow') for path in binds_files)
    max_binds = len(binds_data) * 3  # each bind is used three times on average
    output_file = f"{output_dir}/{protein}_cleaned.parquet"

    non_bind_smiles = _iter_valid_non_bind_smiles(non_binds_files)
    writer = None
    result_rows = []
    bind_count = 0

    def flush(rows, open_writer):
        # The writer is created lazily so its schema comes from real data.
        table = pa.Table.from_pandas(pd.DataFrame(rows))
        if open_writer is None:
            open_writer = pq.ParquetWriter(output_file, schema=table.schema)
        open_writer.write_table(table)
        return open_writer

    try:
        for bind_id, bind_smiles, bind_protein in _cycle_valid_binds(binds_data):
            # check_smiles never accepts None, so None safely marks exhaustion.
            first = next(non_bind_smiles, None)
            second = None if first is None else next(non_bind_smiles, None)
            if second is None:
                print(f"Ran out of non-binds after processing {bind_count} binds")
                break

            result_rows.append({
                'id': bind_id,
                'smiles_binds': bind_smiles,
                'smiles_non_binds_1': first,
                'smiles_non_binds_2': second,
                'protein_name': bind_protein,
            })
            bind_count += 1

            if len(result_rows) >= batch_size:
                writer = flush(result_rows, writer)
                result_rows = []

            if bind_count >= max_binds:
                break

        if result_rows:
            writer = flush(result_rows, writer)
    finally:
        # Close the writer even if a read or write fails part-way through.
        if writer is not None:
            writer.close()

    # Clean up the intermediate files
    for path in binds_files + non_binds_files:
        os.remove(path)

def process_large_parquet(input_file, output_dir):
    """Split, sample and merge a large Parquet dataset into ``cleaned_train.parquet``.

    Steps:
      1. Each row group of ``input_file`` is converted to a pandas DataFrame
         and handed to ``process_chunk`` in parallel.
      2. ``merge_and_sample_non_binds`` is run for every entry of the
         module-level ``proteins`` list.
      3. The ``{protein}_cleaned.parquet`` files that exist are concatenated
         into ``{output_dir}/cleaned_train.parquet`` and then deleted.

    Args:
        input_file: Path of the Parquet file to process.
        output_dir: Directory for intermediate and final files; created if
            it does not exist.

    Raises:
        FileNotFoundError: If no protein produced a cleaned file.
    """
    os.makedirs(output_dir, exist_ok=True)

    parquet_file = pq.ParquetFile(input_file)
    num_row_groups = parquet_file.num_row_groups

    def chunk_tasks():
        # Row groups are read lazily, as joblib asks for more work, so the
        # whole file is never held in memory at once.
        with tqdm(total=num_row_groups, dynamic_ncols=True) as pbar:
            for i in range(num_row_groups):
                # read_row_group returns a pyarrow Table; process_chunk filters
                # with pandas boolean masks, so convert first.
                chunk = parquet_file.read_row_group(i).to_pandas()
                yield delayed(process_chunk)(chunk, output_dir, i)
                pbar.update(1)

    # Execute the tasks in parallel
    Parallel(n_jobs=-1)(chunk_tasks())

    # Merge and sample non-binds for each protein
    for protein in proteins:
        merge_and_sample_non_binds(output_dir, protein)

    # A protein with no binds, no non-binds or no valid pairs leaves no
    # cleaned file behind; skip it instead of failing the final merge.
    cleaned_files = [
        path
        for path in (f"{output_dir}/{protein}_cleaned.parquet" for protein in proteins)
        if os.path.exists(path)
    ]
    if not cleaned_files:
        raise FileNotFoundError(
            f"No cleaned per-protein files were produced in {output_dir!r}"
        )

    # Merge the final cleaned data for all proteins into one file
    df_final = pd.concat([pd.read_parquet(path) for path in cleaned_files])
    df_final.to_parquet(f"{output_dir}/cleaned_train.parquet", engine='pyarrow', compression='snappy')

    # Clean up intermediate files
    for path in cleaned_files:
        os.remove(path)

# Define the input file and output directory.
# "." is passed as the output directory, so the intermediate chunk files and
# the final result are written relative to the current working directory.
input_parquet = "train.parquet"
output_directory = "."

# Run the parallel processing and merging; process_large_parquet writes the
# merged result to f"{output_directory}/cleaned_train.parquet".
process_large_parquet(input_parquet, output_directory)