In [None]:
! pip install biopython
import requests
from Bio import PDB
import pandas as pd

def fetch_pdb_file(pdb_id, timeout=30):
    """Download a PDB file from the RCSB file server into the working directory.

    Args:
        pdb_id: PDB identifier; it is interpolated into both the download
            URL and the local file name.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The local file name (``"<pdb_id>.pdb"``) on success, or ``None`` when
        the request fails or the server answers with a non-200 status code.
    """
    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    try:
        # Without a timeout a single stalled connection would block the
        # whole batch indefinitely.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures follow the same "report and return None"
        # path as HTTP errors so callers only need one check.
        print(f"Failed to download PDB file {pdb_id}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to download PDB file {pdb_id}. Status code: {response.status_code}")
        return None

    file_name = f"{pdb_id}.pdb"
    with open(file_name, "wb") as file:
        file.write(response.content)
    return file_name

def extract_chain_info(pdb_file):
    """Extract chain IDs from a PDB file, split into protein and DNA chains.

    A chain is reported as DNA when it contains at least one deoxynucleotide
    residue (DA/DC/DG/DT) and as protein when it contains at least one amino
    acid residue. Waters, ligands, ions and other non-amino-acid residues do
    not influence the classification. A chain containing both kinds of
    residue appears in both lists.

    Args:
        pdb_file: Path to the PDB file to parse.

    Returns:
        A tuple ``(protein_chains, dna_chains)`` of chain-ID lists. Each ID
        appears at most once per list, in order of first appearance, even
        when the structure holds several models.
    """
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure('structure', pdb_file)

    dna_residue_names = {'DA', 'DC', 'DG', 'DT'}

    protein_chains = []
    dna_chains = []

    for model in structure:
        for chain in model:
            is_protein = False
            is_dna = False
            for residue in chain:
                # Strip in case the parser keeps the padded three-column
                # residue name field (e.g. " DA").
                res_name = residue.get_resname().strip()
                if res_name in dna_residue_names:
                    is_dna = True
                elif PDB.is_aa(residue):
                    # Only genuine amino acids mark a chain as protein;
                    # hetero groups such as ligands and ions are ignored.
                    is_protein = True
                if is_protein and is_dna:
                    break  # Nothing left to learn about this chain.

            chain_id = chain.get_id()
            # Multi-model files repeat the same chains in every model, so
            # record each chain ID only once.
            if is_protein and chain_id not in protein_chains:
                protein_chains.append(chain_id)
            if is_dna and chain_id not in dna_chains:
                dna_chains.append(chain_id)

    return protein_chains, dna_chains

# PDB entries to analyse, written as whitespace-separated IDs and split
# into a list of four-character strings.
pdb_ids = """
    6ki6 6kbs 5zvb 5zva 5xs0 5ke6 5ed4 5co8 4yg4 4x23
    4wcg 4qtk 4qju 4nm6 4m9e 4lll 4ljr 4l0z 4kmf 4k4g
    4hqb 4hn5 4gzn 4ch1 4bnc 3wpd 3wpc 3qi5 3ode 3od8
    3mx9 3gqc 3bs1 2vs7 2vla 2qby 2moe 2i05 1w7a 1qzg
    1qbj 1pvi 1puf 1par 1mse 1lmb 1k82 1j5n 1io4 1g9z
    1az0
""".split()

# One summary row is collected per structure that downloads successfully
results = []

for pdb_id in pdb_ids:
    pdb_file = fetch_pdb_file(pdb_id)
    if not pdb_file:
        # Download failed (already reported by fetch_pdb_file); skip the entry
        continue

    protein_chains, dna_chains = extract_chain_info(pdb_file)
    row = {
        "PDB ID": pdb_id,
        "Protein Chains": len(protein_chains),
        "DNA Chains": len(dna_chains),
        "Protein Chain Names": ','.join(protein_chains),
        "DNA Chain Names": ','.join(dna_chains),
    }
    results.append(row)

# Tabulate the collected rows
df = pd.DataFrame(results)

# Set pandas options so the full table is shown without truncation
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.colheader_justify', 'center')  # Center-align headers
pd.set_option('display.width', 1000)  # Set display width to avoid cutting off columns

# Render the DataFrame with centered cells and headers
from pathlib import Path
from IPython.display import display

centered_table = df.style.set_properties(**{'text-align': 'center'}).set_table_styles([{
    'selector': 'th',
    'props': [('text-align', 'center')]
}])
display(centered_table)

# Save the DataFrame to an Excel file. "/content" is kept as the target when
# it exists (presumably the Colab workspace); elsewhere that directory is
# normally missing and writing to it would fail, so fall back to the current
# working directory.
output_dir = Path("/content")
if not output_dir.is_dir():
    output_dir = Path.cwd()
output_path = output_dir / "pdb_chain_info.xlsx"
df.to_excel(output_path, index=False)

# Report the path that was actually written
print(f"PDB chain information has been saved to '{output_path}'")
