In [1]:
import sys
from pathlib import Path

# Make the repository root importable so that `src.*` modules can be imported.
# The root is taken as two directory levels above the current working directory,
# so this assumes the kernel's working directory is the notebook's folder — TODO confirm.
project_root = Path().resolve().parents[1]

# Only register the path once: re-running this cell must not pile up
# duplicate entries in sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [31]:
import re

from Bio.PDB import PDBParser
from Bio.SeqUtils import IUPACData
import pandas as pd
from tqdm import tqdm

from src.constants import HACKATHON_DATA, ALPHAFOLD_STRUCTURES_DIR, DATA_DIR

In [18]:
# Read the hackathon table and collect every distinct value of its
# 'uniprot_id' column, in order of first appearance.
data = pd.read_csv(HACKATHON_DATA)
unique_uniprot_ids = pd.unique(data['uniprot_id'])

# Bare expression so the notebook displays the resulting array.
unique_uniprot_ids

array(['Q8IL11', 'Q9NAV8', 'B3TMQ9', ..., 'Q96AZ6', 'Q96X16', 'P69834'],
      shape=(8130,), dtype=object)

In [44]:
def parse_pocket_selection(
    selection: str,
    default_chain: str = "A",
) -> tuple[str, list[int]]:
    """
    Parse a PyMOL selection string to extract chain ID and residue indices.

    The chain is the single letter following the first ``chain`` keyword.
    Residues are read from the first ``resi`` clause: entries are separated by
    ``+`` and each entry is either a single number (``12``) or an inclusive
    range (``10-15``), which is expanded to every index it covers. A reversed
    range (``15-10``) is treated the same as ``10-15``.

    Args:
        selection: Selection text, e.g. ``"chain A and resi 5+8-10"``.
        default_chain: Chain ID returned when no ``chain`` clause is present.

    Returns:
        A ``(chain_id, residue_indices)`` tuple. Indices appear in the order
        they were written; entries that are neither a number nor a range are
        skipped.

    Raises:
        ValueError: If the selection contains no ``resi`` clause.
    """
    chain_match = re.search(r'chain\s+([A-Za-z])', selection)
    chain_id = chain_match.group(1) if chain_match else default_chain

    # The first captured character is restricted to digits, '+' and whitespace
    # so that selections which were rejected before (e.g. a leading '-') are
    # still rejected; '-' is accepted only afterwards, as a range separator.
    resi_match = re.search(r'resi\s+([0-9+\s][0-9+\-\s]*)', selection)
    if not resi_match:
        raise ValueError("No residue information found in selection string.")

    resi_list: list[int] = []
    for raw_token in resi_match.group(1).split('+'):
        token = raw_token.strip()

        if token.isdigit():
            resi_list.append(int(token))
            continue

        # Inclusive range such as "10-15".
        range_match = re.fullmatch(r'(\d+)\s*-\s*(\d+)', token)
        if range_match:
            first, second = int(range_match.group(1)), int(range_match.group(2))
            low, high = min(first, second), max(first, second)
            resi_list.extend(range(low, high + 1))
        # Anything else (empty token from a stray '+', malformed entry) is
        # skipped, matching the function's best-effort parsing.

    return chain_id, resi_list


def aa_3_to_1(three_letter_code: str) -> str:
    """
    Translate a three-letter amino acid code into its one-letter symbol.

    The input is case-insensitive: it is normalised to "Xxx" form before the
    lookup in ``IUPACData.protein_letters_3to1``.

    Raises:
        ValueError: If the normalised code is not a key of that table.
    """
    # Normalise to the table's key format, e.g. "ALA" / "ala" -> "Ala".
    three_letter_code = three_letter_code.capitalize()

    one_letter = IUPACData.protein_letters_3to1.get(three_letter_code)
    if one_letter is None:
        raise ValueError(f"Invalid three-letter amino acid code: {three_letter_code}")
    return one_letter


def extract_sequence(pdb_file: Path) -> str:
    """
    Read a structure file and return its one-letter amino acid sequence.

    Only the first chain of the first model is used (the original author's
    assumption for AlphaFold structures). Each residue name in that chain is
    converted with ``aa_3_to_1``, so any error raised by that conversion or by
    the parser propagates to the caller.
    """
    structure = PDBParser(QUIET=True).get_structure('PDB', pdb_file)

    # Take the first model, then the first chain within it.
    first_model = next(structure.get_models())
    first_chain = next(first_model.get_chains())

    one_letter_codes = []
    for residue in first_chain:
        one_letter_codes.append(aa_3_to_1(residue.resname))
    return ''.join(one_letter_codes)

In [45]:
# NOTE: this parser instance is not used by the loop below — extract_sequence()
# builds its own parser. It is kept in case other cells refer to `pdb_parser`.
pdb_parser = PDBParser(QUIET=True)

# One entry per element of `unique_uniprot_ids`, in the same order;
# None marks an ID whose structure could not be processed.
sequences = []

for uniprot_id in tqdm(unique_uniprot_ids, desc="Processing UniProt IDs"):
    # NOTE(review): the files are named *.cif but extract_sequence() reads them
    # with PDBParser. The original author flagged this extension/format
    # mismatch as known and deferred — TODO resolve.
    pdb_file = ALPHAFOLD_STRUCTURES_DIR / f"{uniprot_id}.cif"
    try:
        sequence = extract_sequence(pdb_file)
    except Exception as e:
        # Best effort: record the failure and keep going so that `sequences`
        # stays aligned with `unique_uniprot_ids`.
        sequence = None
        # tqdm.write() prints the message without breaking up the progress bar,
        # which a plain print() inside the loop does.
        tqdm.write(f"Error processing {uniprot_id}: {e}")
    sequences.append(sequence)

Processing UniProt IDs:  18%|█▊        | 1445/8130 [00:27<02:08, 52.06it/s]

Error processing A0A7U3TBV6: [Errno 2] No such file or directory: '/Users/pawel/deeplife-binding-sites/data/structures/alphafold/A0A7U3TBV6.cif'


Processing UniProt IDs:  71%|███████▏  | 5807/8130 [01:43<00:34, 66.42it/s]

Error processing A1X808: Empty file.


Processing UniProt IDs: 100%|██████████| 8130/8130 [02:22<00:00, 57.22it/s]


In [47]:
# Pair each UniProt ID with its extracted sequence and write the table to
# 'sequences.csv' under DATA_DIR, without the DataFrame index column.
sequences_df_path = DATA_DIR / 'sequences.csv'
sequences_df = pd.DataFrame(
    data=dict(uniprot_id=unique_uniprot_ids, sequence=sequences),
)
sequences_df.to_csv(path_or_buf=sequences_df_path, index=False)