In [1]:
import sys
from pathlib import Path

# Make the repository root importable so `src.*` modules resolve from this
# notebook. The root is taken to be two levels above the current working
# directory, so this depends on where the kernel was started.
project_root = Path().resolve().parents[1]

# Guard the append so re-running this cell does not pile up duplicate
# entries on sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [24]:
import re

from Bio.PDB import PDBParser
import pandas as pd
from tqdm import tqdm

from src.constants import HACKATHON_DATA, ALPHAFOLD_STRUCTURES_DIR, DATA_DIR

In [18]:
# Load the hackathon table and collect the distinct UniProt accessions
# found in its `uniprot_id` column.
data = pd.read_csv(HACKATHON_DATA)
uniprot_id_column = data['uniprot_id']
unique_uniprot_ids = uniprot_id_column.unique()

# Show the resulting array as the cell output.
unique_uniprot_ids

array(['Q8IL11', 'Q9NAV8', 'B3TMQ9', ..., 'Q96AZ6', 'Q96X16', 'P69834'],
      shape=(8130,), dtype=object)

In [5]:
def parse_pocket_selection(
    selection: str,
    default_chain: str = "A",
) -> tuple[str, list[int]]:
    """
    Parse a PyMOL selection string to extract chain ID and residue indices.

    The chain is taken from the first ``chain <letter>`` clause; when no such
    clause is present, ``default_chain`` is used. Residues are taken from the
    first ``resi`` clause, which may list single residues and inclusive
    ranges joined by ``+`` (for example ``resi 10-12+15``).

    Args:
        selection: Selection string, e.g. ``"chain A and resi 10-12+15"``.
        default_chain: Chain ID returned when ``selection`` has no chain clause.

    Returns:
        A ``(chain_id, residue_indices)`` tuple. Ranges are expanded, so
        ``10-12`` contributes ``10, 11, 12``. Tokens that are neither a plain
        number nor a ``start-end`` range are skipped.

    Raises:
        ValueError: If the selection contains no ``resi`` clause.
    """
    chain_match = re.search(r'chain\s+([A-Za-z])', selection)
    chain_id = chain_match.group(1) if chain_match else default_chain

    # A hyphen is accepted only when it sits directly between two digits, so
    # ranges such as "10-20" are captured in full while any other hyphen
    # still terminates the residue clause.
    resi_match = re.search(
        r'resi\s+((?:[0-9+\s]|(?<=[0-9])-(?=[0-9]))+)',
        selection,
    )
    if not resi_match:
        raise ValueError("No residue information found in selection string.")

    resi_list: list[int] = []
    for raw_token in resi_match.group(1).split('+'):
        token = raw_token.strip()
        if token.isdigit():
            resi_list.append(int(token))
            continue

        range_match = re.fullmatch(r'([0-9]+)-([0-9]+)', token)
        if range_match:
            # Sort the bounds so a reversed range ("12-10") still expands.
            low, high = sorted(int(bound) for bound in range_match.groups())
            resi_list.extend(range(low, high + 1))

    return chain_id, resi_list

In [9]:
def extract_sequence(pdb_file: Path) -> str:
    """
    Extract the one-letter amino-acid sequence from a PDB file.

    Only the first model of the structure is read, so multi-model files do
    not repeat the sequence once per model. Within that model the chains are
    concatenated in file order with no separator. Hetero residues and waters
    (residues whose hetero flag is not blank) are skipped.

    Args:
        pdb_file: Path to the structure file, parsed with ``PDBParser``.

    Returns:
        The sequence as one-letter codes, with ``'X'`` for residue names that
        cannot be mapped. An empty string is returned when the structure
        contains no models.
    """
    # Local import keeps this cell self-contained; Biopython is already a
    # dependency of this notebook through Bio.PDB.
    from Bio.SeqUtils import seq1

    parser = PDBParser(QUIET=True)
    structure = parser.get_structure('PDB', pdb_file)

    first_model = next(iter(structure), None)
    if first_model is None:
        return ''

    chain_sequences = []
    for chain in first_model:
        # `resname` is a three-letter code such as "ALA"; convert each one to
        # its one-letter form instead of joining the raw names. The `or 'X'`
        # covers names for which seq1 yields an empty string.
        one_letter_codes = (
            seq1(residue.resname) or 'X'
            for residue in chain
            if residue.id[0] == ' '
        )
        chain_sequences.append(''.join(one_letter_codes))

    return ''.join(chain_sequences)

In [22]:
# NOTE(review): `pdb_parser` is not used in this cell (extract_sequence builds
# its own parser); it is kept in case other cells rely on it.
pdb_parser = PDBParser(QUIET=True)

# One entry per ID in `unique_uniprot_ids[:100]`, in the same order; None
# marks an ID whose structure file is missing or could not be processed.
sequences = []

for uniprot_id in tqdm(unique_uniprot_ids[:100]):
    # TODO: the files carry a .cif extension but are read with a PDB parser;
    # this format mismatch is known and still to be resolved.
    pdb_file = ALPHAFOLD_STRUCTURES_DIR / f"{uniprot_id}.cif"
    if not pdb_file.exists():
        # Record a placeholder so `sequences` stays the same length as the
        # ID slice; skipping the append would break the DataFrame built from
        # both later on.
        sequences.append(None)
        print(f"File {pdb_file} does not exist.")
        continue

    try:
        sequence = extract_sequence(pdb_file)
        sequences.append(sequence)
    except Exception as e:
        # Best-effort: report the failure and keep going with a placeholder.
        sequences.append(None)
        print(f"Error processing {uniprot_id}: {e}")

100%|██████████| 100/100 [00:07<00:00, 13.93it/s]


In [25]:
# Pair the first 100 UniProt IDs with the sequences collected for them and
# write the table to `sequences.csv` under DATA_DIR, without the index column.
sequences_df = pd.DataFrame(
    data={
        'uniprot_id': unique_uniprot_ids[:100],
        'sequence': sequences,
    }
)
sequences_df_path = DATA_DIR / 'sequences.csv'
sequences_df.to_csv(sequences_df_path, index=False)