In [1]:
! pip install biopython

Collecting biopython
  Downloading biopython-1.84-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84


In [15]:
# Install Biopython if you haven't already
# !pip install biopython

import requests
from Bio import PDB

def fetch_pdb_file(pdb_id):
    """Fetches a PDB file from the RCSB database."""
    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    response = requests.get(url)
    if response.status_code == 200:
        with open(f"{pdb_id}.pdb", "wb") as file:
            file.write(response.content)
        print(f"Successfully downloaded {pdb_id}.pdb")
    else:
        print(f"Failed to download PDB file. Status code: {response.status_code}")

def extract_dna_sequences(pdb_file):
    """Extracts DNA sequences from the given PDB file, excluding non-DNA residues and water molecules."""
    # Load the PDB structure
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure('structure', pdb_file)

    dna_sequences = {}

    # Define a map for standard residue name to base
    residue_to_base = {
        'DA': 'A',  # Adenine
        'DC': 'C',  # Cytosine
        'DG': 'G',  # Guanine
        'DT': 'T'   # Thymine
    }

    # Iterate over all models
    for model in structure:
        # Iterate over all chains
        for chain in model:
            sequence = []
            for residue in chain:
                res_name = residue.get_resname()
                # Include standard DNA bases and ignore non-DNA residues and water molecules
                if res_name in residue_to_base:
                    sequence.append(residue_to_base[res_name])
                elif res_name not in ['HOH', 'H2O'] and not res_name.startswith('BRU'):  # Exclude water and heteroatoms
                    sequence.append(res_name)  # Append non-standard residues as-is
            if sequence:
                dna_sequences[chain.get_id()] = ''.join(sequence)

    return dna_sequences

# Specify the PDB ID
pdb_id = '1EBM'

# Fetch the PDB file
fetch_pdb_file(pdb_id)

# Extract DNA sequences
dna_sequences = extract_dna_sequences(f"{pdb_id}.pdb")

# Display the results
for chain, sequence in dna_sequences.items():
    print(f"Chain {chain}: {sequence}")



Successfully downloaded 1EBM.pdb
Chain C: GCGTCCA8OGGTCTACCCA
Chain D: GGTAGACCTGGACGC
Chain A: GLYSERGLUGLYHISARGTHRLEUALASERTHRPROALALEUTRPALASERILEPROCYSPROARGSERGLULEUARGLEUASPLEUVALLEUPROSERGLYGLNSERPHEARGTRPARGGLUGLNSERPROALAHISTRPSERGLYVALLEUALAASPGLNVALTRPTHRLEUTHRGLNTHRGLUGLUGLNLEUHISCYSTHRVALTYRARGSERGLNALASERARGPROTHRPROASPGLULEUGLUALAVALARGLYSTYRPHEGLNLEUASPVALTHRLEUALAGLNLEUTYRHISHISTRPGLYSERVALASPSERHISPHEGLNGLUVALALAGLNLYSPHEGLNGLYVALARGLEULEUARGGLNASPPROILEGLUCYSLEUPHESERPHEILECYSSERSERASNASNASNILEALAARGILETHRGLYMETVALGLUARGLEUCYSGLNALAPHEGLYPROARGLEUILEGLNLEUASPASPVALTHRTYRHISGLYPHEPROSERLEUGLNALALEUALAGLYPROGLUVALGLUALAHISLEUARGLYSLEUGLYLEUGLYTYRARGALAARGTYRVALSERALASERALAARGALAILELEUGLUGLUGLNGLYGLYLEUALATRPLEUGLNGLNLEUARGGLUSERSERTYRGLUGLUALAHISLYSALALEUCYSILELEUPROGLYVALGLYTHRGLNVALALAASPCYSILECYSLEUMETALALEUASPLYSPROGLNALAVALPROVALASPVALHISMETTRPHISILEALAGLNARGASPTYRSERTRPHISPROTHRTHRSERGLNALALYSGLYPROSERPROGLNTHRASNLYSGLULEUGLYASNPHEPHEARGSERLEUTRPGLYPROTYRALAGLYT