In [23]:
import pandas as pd
import re

### TODO List
- Load dataset
- make a function "parse_pdb_column" which will do the following things
    - it will create a new dataset with 4 extra column pdb_id, chain, start_residue & end_residue
    - save the data in a new .csv file

1. There are 3 types of data in the PDB column, based on this I will create new 4 columns
2. The conditions are like
    1. If the data is like this format, "1L8W (Chain B: 29–335)", then pdb_id will contain value "1ARR", chain will be "B", start_residue will be 29 & end_residue 335.
    2. If the data is like this format, "1CUN (113–219)", then pdb_id will contain value "1CUN", chain will be "A" as there is no chain string in the data set default value "A", start_residue will be 113 & end_residue 219.
    3. If the data is like this format, "1RYK", then pdb_id will contain value "1RYK", chain will be "A" as there is no "chain" string in the data set default value "A", start_residue & end_residue will be empty as there is nothing in the data.

In [45]:
# Load the dataset
df = pd.read_excel('./data/Final_2Sm_modified_work_base.xlsx')

# Function to parse the PDB column
def parse_pdb_column(pdb_str):
    # Match patterns like "1L8W (Chain B: 29–335)"
    match = re.match(r"(\w+)\s*\(Chain\s+(\w):\s*(\d+)–(\d+)\)", pdb_str)
    if match:
        return match.group(1), match.group(2), match.group(3), match.group(4)

    # Match patterns like "1CUN (113–219)"
    match = re.match(r"(\w+)\s*\((\d+)–(\d+)\)", pdb_str)
    if match:
        return match.group(1), 'A', match.group(2), match.group(3)  # Default chain to 'A'

    # Match patterns like "1RYK"
    match = re.match(r"(\w+)", pdb_str)
    if match:
        return match.group(1), 'A', '', ''  # Default chain to 'A', start_residue and end_residue to empty

    # If none of the patterns match, return empty values
    return pdb_str, 'A', '', ''  # Return the original pdb_str if no pattern matches

# Apply the function to parse the 'PDB' column
df[['pdb_id', 'chain', 'start_residue', 'end_residue']] = df.apply(
    lambda row: pd.Series(parse_pdb_column(row['PDB'])), axis=1
)

# Convert 'start_residue' and 'end_residue' to numeric types (integers if possible)
df['start_residue'] = pd.to_numeric(df['start_residue'], errors='coerce').fillna(0).astype(int)
df['end_residue'] = pd.to_numeric(df['end_residue'], errors='coerce').fillna(0).astype(int)

# Replace 0 back to NaN (or empty string) for start_residue and end_residue where appropriate
df['start_residue'].replace(0, '', inplace=True)
df['end_residue'].replace(0, '', inplace=True)

# When saving to Excel, explicitly setting the dtype for 'pdb_id' and 'chain' as text is not directly supported by to_excel.
# However, ensuring 'start_residue' and 'end_residue' are numeric helps maintain clarity of data types.
df.to_excel('./data/Final_2Sm_modified_work_generated.xlsx', index=False)

In [46]:
# Show the modified DataFrame
print(df.head())

   no             PDB pdb_id chain start_residue end_residue
0   1    1ARR (1–106)   1ARR     A             1         106
1   2            1BA5   1BA5     A                          
2   3    1CUN (7–112)   1CUN     A             7         112
3   4  1CUN (113–219)   1CUN     A           113         219
4   5   1E41 (93–192)   1E41     A            93         192


In [2]:
import os
from Bio.PDB import PDBParser, PPBuilder
import requests

In [50]:
# function to fetch the protein sequence based on PDB_id, chain_id, start_residue & end_residue

def get_pdb_sequence(pdb_id, chain_id=None, start=None, end=None):
    # Ensure temp_pdb directory exists
    temp_dir = "./temp_pdb"
    os.makedirs(temp_dir, exist_ok=True)

    # Define the path for the temporary PDB file
    pdb_filename = os.path.join(temp_dir, f"{pdb_id}.pdb")

    # Fetch the PDB file from RCSB PDB
    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to fetch PDB file for {pdb_id}.")
        return None

    # Save the PDB content to the temporary file
    with open(pdb_filename, 'w') as file:
        file.write(response.text)

    # Parse the PDB file
    pdb_parser = PDBParser()
    structure = pdb_parser.get_structure(pdb_id, pdb_filename)

    # Extract sequence for the specified chain using PPBuilder
    for model in structure:
        for chain in model:
            if chain_id is None or chain.id == chain_id:
                for pp in PPBuilder().build_peptides(chain):
                    sequence = pp.get_sequence()
                    # If start and end are specified, slice the sequence
                    if start is not None and end is not None:
                        sequence = sequence[start-1:end]
                    return str(sequence)
    return None

In [None]:
# Example usage
pdb_id = "1TP3"
chain_id = "A"  # Adjust based on your needs or keep as None to fetch for all chains
start_residue = None
end_residue = None
sequence = get_pdb_sequence(pdb_id, chain_id, start_residue, end_residue)
print(f"Sequence for {pdb_id}, Chain {chain_id}, Residues {start_residue}-{end_residue}:\n{sequence}")

In [6]:
def fetch_sequence_from_url(url):
    """
    Fetch a sequence from a given URL.

    :param url: The URL to fetch the sequence from.
    :return: The fetched sequence as a string, or None if the fetch fails.
    """
    response = requests.get(url)
    if response.status_code == 200:
        # Extract the sequence from the response content
        content = response.text
        # Assuming the FASTA format, the first line is the header (which we skip),
        # and the rest is the sequence.
        sequence_lines = content.split('\n')[1:]  # Skip the header line
        sequence = ''.join(sequence_lines)  # Combine the sequence lines
        return sequence
    else:
        print(f"Failed to fetch the sequence. HTTP Status Code: {response.status_code}")
        return None

# Example usage


In [14]:
pdb_id = "2KDI"
url = f"https://www.rcsb.org/fasta/entry/{pdb_id}/display"
sequence = fetch_sequence_from_url(url)
print(f"pdb_id: {pdb_id}")
print(f"Length: {len(sequence)}")
print(f"Sequence:\n{sequence}")

pdb_id: 2KDI
Length: 114
Sequence:
MHHHHHHGEFQIFAKTLTGKTITLEVESSDTIDNVKSKIQDKEGIPPDQQRLIWAGKQLEDGRTLSDYNIQRESTLHLVLRLRGGSMGGAADEEELIRKAIELSLKESRNSGGY
