In [1]:
import pyarrow.parquet as pq

parquet_file = pq.ParquetFile('train.parquet')
print(parquet_file.schema.names)

['id', 'buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles', 'molecule_smiles', 'protein_name', 'binds']


In [2]:
import dask.dataframe as dd

# Read train and test datasets
train_df = dd.read_parquet('train.parquet')
test_df = dd.read_parquet('test.parquet')

# Total number of rows in train dataset
total_rows = train_df.map_partitions(len).compute().sum()
print(f"Total number of training rows: {total_rows}")

# Total number of rows in test dataset
test_total_rows = test_df.map_partitions(len).compute().sum()
print(f"Total number of test rows: {test_total_rows}")

# Number of positive bindings
num_positive_bindings = train_df['binds'].sum().compute()
print(f"Number of positive bindings: {num_positive_bindings}")

# Number of negative bindings
num_negative_bindings = total_rows - num_positive_bindings
print(f"Number of negative bindings: {num_negative_bindings}")

# Percentage calculations
percent_positive = (num_positive_bindings / total_rows) * 100
percent_negative = (num_negative_bindings / total_rows) * 100
print(f"Percentage of positive bindings: {percent_positive:.2f}%")
print(f"Percentage of negative bindings: {percent_negative:.2f}%")

# Total unique proteins in train dataset
unique_proteins_train = train_df['protein_name'].dropna().unique().compute()
total_unique_proteins_train = len(unique_proteins_train)
print(f"Total unique proteins in train dataset: {total_unique_proteins_train}")

# Total unique proteins in test dataset
unique_proteins_test = test_df['protein_name'].dropna().unique().compute()
total_unique_proteins_test = len(unique_proteins_test)
print(f"Total unique proteins in test dataset: {total_unique_proteins_test}")

# Total unique proteins in both datasets
unique_proteins_all = dd.concat([
    train_df['protein_name'],
    test_df['protein_name']
]).dropna().unique().compute()
total_unique_proteins_all = len(unique_proteins_all)
print(f"Total unique proteins in both datasets: {total_unique_proteins_all}")
print(f"Unique proteins in both datasets: {unique_proteins_all.values}")

# Concatenate building block columns from both datasets
train_building_blocks = dd.concat([
    train_df['buildingblock1_smiles'],
    train_df['buildingblock2_smiles'],
    train_df['buildingblock3_smiles']
])

test_building_blocks = dd.concat([
    test_df['buildingblock1_smiles'],
    test_df['buildingblock2_smiles'],
    test_df['buildingblock3_smiles']
])

all_building_blocks = dd.concat([train_building_blocks, test_building_blocks])

# Compute unique building blocks
unique_building_blocks = all_building_blocks.dropna().unique().compute()
total_unique_building_blocks = len(unique_building_blocks)
print(f"Total unique building blocks (train and test): {total_unique_building_blocks}")

# Compute unique small molecules from train and test
train_small_molecules = train_df['molecule_smiles'].dropna()
test_small_molecules = test_df['molecule_smiles'].dropna()
all_small_molecules = dd.concat([train_small_molecules, test_small_molecules])

unique_small_molecules = all_small_molecules.unique().compute()
total_unique_small_molecules = len(unique_small_molecules)
print(f"Total unique small molecules (train and test): {total_unique_small_molecules}")

Total number of training rows: 295246830
Total number of test rows: 1674896
Number of positive bindings: 1589906
Number of negative bindings: 293656924
Percentage of positive bindings: 0.54%
Percentage of negative bindings: 99.46%
Total unique proteins in train dataset: 3
Total unique proteins in test dataset: 3
Total unique proteins in both datasets: 3
Unique proteins in both datasets: <ArrowStringArray>
['HSA', 'sEH', 'BRD4']
Length: 3, dtype: string
Total unique building blocks (train and test): 2110
Total unique small molecules (train and test): 99293632


In [4]:
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm
import json

# Define a function to calculate unique atom and edge types from a dataframe
def collect_unique_atom_and_edge_types(df):
    # Process the dataframe rows in parallel with progress tracking
    results = Parallel(n_jobs=-1)(
        delayed(process_row_for_types)(row) for _, row in tqdm(df.iterrows(), total=len(df), desc="Collecting atom and edge types")
    )

    # Collect unique atom and edge types
    molecule_node_types = set()
    molecule_edge_types = set()
    for node_types, edge_types in results:
        molecule_node_types.update(node_types)
        molecule_edge_types.update(edge_types)

    return molecule_node_types, molecule_edge_types

# Define a function to process each row for atom and edge type collection
def process_row_for_types(row):
    try:
        smiles = row['molecule_smiles']
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return set(), set()  # Skip invalid SMILES

        # Remove atoms and process the molecule
        atoms_to_remove = [atom.GetIdx() for atom in mol.GetAtoms() if atom.GetSymbol() == 'Dy']
        mol = Chem.EditableMol(mol)
        for idx in sorted(atoms_to_remove, reverse=True):
            mol.RemoveAtom(idx)
        mol = mol.GetMol()

        mol = Chem.AddHs(mol)

        # Collect atom and edge types
        atom_types = [atom.GetSymbol() for atom in mol.GetAtoms()]
        unique_atom_types = set(atom_types)
        edge_types = set()

        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            atype_i = atom_types[i]
            atype_j = atom_types[j]
            edge_types.add((atype_i, 'bond', atype_j))
            edge_types.add((atype_j, 'bond', atype_i))

        return unique_atom_types, edge_types

    except Exception as e:
        return set(), set()

# Load the cleaned data
cleaned_train_df = pd.read_parquet('cleaned_train_unique.parquet')
cleaned_test_df = pd.read_parquet('test.parquet')

# Collect unique atom and edge types from both train and test datasets
print("Collecting unique atom and edge types from train dataset...")
train_node_types, train_edge_types = collect_unique_atom_and_edge_types(cleaned_train_df)

print("Collecting unique atom and edge types from test dataset...")
test_node_types, test_edge_types = collect_unique_atom_and_edge_types(cleaned_test_df)

# Combine the unique types from both datasets
combined_node_types = train_node_types.union(test_node_types)
combined_edge_types = train_edge_types.union(test_edge_types)

# Save the combined unique atom and edge types to a JSON file
unique_types = {
    'molecule_node_types': sorted(list(combined_node_types)),
    'molecule_edge_types': sorted([list(edge_type) for edge_type in combined_edge_types])
}
with open('unique_atom_and_edge_types.json', 'w') as f:
    json.dump(unique_types, f, indent=4)


Collecting unique atom and edge types from train dataset...


Collecting atom and edge types: 100%|██████████| 1589770/1589770 [01:29<00:00, 17704.15it/s]


Collecting unique atom and edge types from test dataset...


Collecting atom and edge types: 100%|██████████| 1674896/1674896 [01:32<00:00, 18067.89it/s]


NameError: name 'json' is not defined