### EXTRACT PDB IDS

In [None]:
import requests
import json
from pathlib import Path
import pandas as pd

In [None]:
OUT_PATH = Path("../data/raw/all_current_pdb_ids.txt")

In [None]:
# url = "https://data.rcsb.org/rest/v1/holdings/current/entry_ids"
# resp = requests.get(url, timeout=30)
# resp.raise_for_status()          # ensure the request succeeded

# pdb_ids = resp.json()            # ← resp.json() is a Python list
# print(f"Retrieved {len(pdb_ids):,} current PDB IDs")

In [None]:
# OUT_PATH.write_text("\n".join(pdb_ids))

In [None]:
biolip_df = pd.read_csv('../data/raw/BioLiP_nr.txt', sep='\t', header=None)

In [None]:
columns = [
    "pdb_id",                      # 01: PDB ID
    "receptor_chain",              # 02: Receptor chain
    "resolution",                  # 03: Resolution (-1.00 indicates missing data)
    "binding_site_id",             # 04: Binding site number code
    "ligand_id",                   # 05: Ligand ID (CCD code)
    "ligand_chain",                # 06: Ligand chain
    "ligand_serial_number",        # 07: Ligand serial number
    "binding_residues_pdb",        # 08: Binding site residues (PDB numbering)
    "binding_residues_renum",      # 09: Binding site residues (renumbered from 1)
    "catalytic_residues_pdb",      # 10: Catalytic site residues (PDB numbering)
    "catalytic_residues_renum",    # 11: Catalytic site residues (renumbered from 1)
    "ec_number",                   # 12: EC number
    "go_terms",                    # 13: GO terms
    "binding_affinity_literature", # 14: Binding affinity (manual survey)
    "binding_affinity_moad",       # 15: Binding affinity (Binding MOAD)
    "binding_affinity_pdbbind",    # 16: Binding affinity (PDBbind-CN)
    "binding_affinity_bindingdb",  # 17: Binding affinity (BindingDB)
    "uniprot_id",                  # 18: UniProt ID
    "pubmed_id",                   # 19: PubMed ID
    "ligand_residue_seq_number",   # 20: Ligand residue sequence number
    "receptor_sequence"            # 21: Receptor sequence
]

In [None]:
biolip_df.columns = columns

In [None]:
pdb_ids = biolip_df['pdb_id']

import numpy as np

pdb_ids = pdb_ids.str.upper().unique()

In [None]:
with open("../data/raw/all_current_pdb_ids.txt") as f:
    all_pdb_ids = [line.strip().upper() for line in f if line.strip()]

# ✅ Compare: how many match?
biolip_ids_set = set(pdb_ids)
all_pdb_ids_set = set(all_pdb_ids)

shared = biolip_ids_set.intersection(all_pdb_ids_set)

In [None]:
len(shared)

In [None]:
non_biolip_ids = all_pdb_ids_set - biolip_ids_set

In [None]:
len(non_biolip_ids)

### Extract 5000 random proteins

In [None]:
import random

non_biolip_ids = list(non_biolip_ids)

random.seed(42)

sampled_non_biolip_ids = random.sample(non_biolip_ids, k=20000)

Non_OUT_PATH = Path("../data/raw/sample_non_biolip_pdb_ids.txt")

Non_OUT_PATH.write_text("\n".join(sampled_non_biolip_ids))

### Download cif file for each sampled protein

In [None]:
# import requests
# from pathlib import Path

# output_dir = Path("../data/raw/pdb_cifs")
# output_dir.mkdir(parents=True, exist_ok=True)

# for pid in sampled_non_biolip_ids:
#     url = f"https://files.rcsb.org/download/{pid.upper()}.cif.gz"
#     out_path = output_dir / f"pdb{pid.lower()}.ent.cif.gz"
    
#     if out_path.exists():
#         print(f"Already downloaded: {pid}")
#         continue

#     response = requests.get(url)
#     if response.status_code == 200:
#         with open(out_path, "wb") as f:
#             f.write(response.content)
#         print(f"Downloaded: {pid}")
#     else:
#         print(f"Failed: {pid} (HTTP {response.status_code})")


### Extract all residues of sample proteins

In [None]:
from pathlib import Path
from Bio.PDB import MMCIFParser
from Bio.PDB.Polypeptide import is_aa
import pandas as pd
import gzip

# ✅ 3-letter to 1-letter mapping
three_to_one_dict = {
    "ALA": "A", "ARG": "R", "ASN": "N", "ASP": "D",
    "CYS": "C", "GLN": "Q", "GLU": "E", "GLY": "G",
    "HIS": "H", "ILE": "I", "LEU": "L", "LYS": "K",
    "MET": "M", "PHE": "F", "PRO": "P", "SER": "S",
    "THR": "T", "TRP": "W", "TYR": "Y", "VAL": "V",
    "SEC": "U", "PYL": "O",
}

parser = MMCIFParser(QUIET=True)
records = []

cif_dir = Path("../data/raw/pdb_cifs")

for pid in sampled_non_biolip_ids:
    cif_file = next(cif_dir.glob(f"pdb{pid.lower()}.ent.cif.gz"), None)
    if cif_file is None:
        print(f"Missing file: {pid}")
        continue

    try:
        with gzip.open(cif_file, mode='rt') as handle:
            structure = parser.get_structure(pid, handle)

        model = structure[0]

        for chain in model:
            chain_residues = []
            seq_list = []

            for res in chain:
                if not is_aa(res, standard=True):
                    continue

                hetflag, seq_num, icode = res.get_id()
                pdb_residue_number = seq_num
                resname = res.get_resname().upper()

                one_letter = three_to_one_dict.get(resname, 'X')
                seq_list.append(one_letter)

                chain_residues.append({
                    "pdb_id": pid,
                    "chain_id": chain.id,
                    "pdb_residue_number": pdb_residue_number,
                    "residue_name": resname,
                    "receptor_sequence": None  # placeholder
                })

            if not chain_residues:
                continue

            receptor_sequence = "".join(seq_list)

            for renum_idx, r in enumerate(chain_residues, start=1):
                r["renum_residue_number"] = renum_idx
                r["receptor_sequence"] = receptor_sequence

            records.extend(chain_residues)

    except KeyError as e:
        print(f"KeyError for {pid}: {e}. Skipping.")
    except Exception as e:
        print(f"Other error for {pid}: {e}. Skipping.")

print(f"Total residues extracted: {len(records):,}")

df = pd.DataFrame(records)
print(df.head())


In [None]:
df.to_csv("../data/raw/sample_non_biolip_residues.csv", index=False)

In [None]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
df['receptor_sequence'].unique()

In [None]:
df[["pdb_id", "chain_id"]].drop_duplicates().shape[0]

In [None]:
df['receptor_sequence'].nunique()

In [None]:
df[["pdb_id", "chain_id", "receptor_sequence"]].drop_duplicates().groupby("receptor_sequence").size().value_counts()

### Construct FASTA files

In [None]:
import pandas as pd

df = pd.read_csv("../data/raw/sample_non_biolip_residues.csv")

In [None]:
import os

# Create output FASTA folder
fasta_dir = "../data/raw/iupred_fasta_non_biolip"
os.makedirs(fasta_dir, exist_ok=True)

# Drop duplicates to ensure unique PDB-chain combinations
unique_entries = df[['pdb_id', 'chain_id', 'receptor_sequence']].drop_duplicates()

# Write FASTA file for each entry
fasta_paths = []
for _, row in unique_entries.iterrows():
    pdb_id = row['pdb_id']
    chain = row['chain_id']
    sequence = row['receptor_sequence']
    fasta_filename = f"{pdb_id}_{chain}.fasta"
    fasta_path = os.path.join(fasta_dir, fasta_filename)
    fasta_paths.append(fasta_path)
    if chain == "SX" or chain == "SX0":
        print(f"Skipping {pdb_id} chain {chain} due to invalid chain identifier.")
    
    with open(fasta_path, "w") as f:
        f.write(f">{pdb_id}_{chain}\n{sequence}\n")

In [None]:
import subprocess
import os
import glob

def run_psiblast(fasta_path, db_path, output_dir, num_iterations=3):
    pdb_id = os.path.splitext(os.path.basename(fasta_path))[0]
    pssm_ascii = os.path.join(output_dir, f"{pdb_id}.pssm")

    if os.path.exists(pssm_ascii):
        print(f"Skipping {pdb_id}, PSSM already exists.")
        return

    cmd = [
        "psiblast",
        "-query", fasta_path,
        "-db", db_path,
        "-num_iterations", str(num_iterations),
        "-out_ascii_pssm", pssm_ascii,
        "-evalue", "0.001"
    ]
    print(f"Running PSI-BLAST for {pdb_id}...")
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"PSI-BLAST failed for {pdb_id}: {e}")

if __name__ == "__main__":
    fasta_dir = "/home/mpradhan007/Academic/Research_Projects/Intern_Research/data/raw/iupred_fasta_non_biolip"
    blast_db = "/home/mpradhan007/Academic/Research_Projects/Intern_Research/data/raw/blast_db/uniprot_sprot_db"
    output_dir = "/home/mpradhan007/Academic/Research_Projects/Intern_Research/data/processed/non_biolip_pssm_1"
    os.makedirs(output_dir, exist_ok=True)

    fasta_files = glob.glob(os.path.join(fasta_dir, "*.fasta"))

    for fasta_path in fasta_files:
        run_psiblast(fasta_path, blast_db, output_dir)

In [None]:
import pandas as pd
from pathlib import Path
from Bio.PDB import MMCIFParser, DSSP
import warnings
from Bio.PDB.PDBExceptions import PDBConstructionWarning
# Add these new imports for handling gzipped files
import gzip
import tempfile

# ----------------------- settings -----------------------
# split_id_file  = "/home/mpradhan007/Academic/Research_Projects/Intern_Research/data/raw/pdb_ids_part3.csv"
structure_dir  = Path("/home/mpradhan007/Academic/Research_Projects/Intern_Research/data/raw/pdb_cifs")
dssp_exe       = "mkdssp"
output_path    = "/home/mpradhan007/Academic/Research_Projects/Intern_Research/data/processed/non_biolip_residue_features_1.csv"
# --------------------------------------------------------

warnings.simplefilter("ignore", PDBConstructionWarning)

# This assumes `non_biolip_ids` is a list of PDB IDs like ['1a2t', '1a2w', ...]
pdb_ids = (list(id.lower() for id in non_biolip_ids))
parser  = MMCIFParser(QUIET=True)
rows    = []

for pdb_id in pdb_ids:
    # 1. Correct the filename to match the format in your directory
    cif_gz_file = structure_dir / f"pdb{pdb_id}.ent.cif.gz"

    if not cif_gz_file.exists():
        print(f"[!] Missing structure: {pdb_id}")
        continue

    # 2. Decompress the .gz file into a temporary file
    try:
        with gzip.open(cif_gz_file, 'rt') as gz_f:
            cif_content = gz_f.read()
        
        # Create a temporary file to store the uncompressed content
        with tempfile.NamedTemporaryFile(mode='w', delete=True, suffix='.cif') as temp_file:
            temp_file.write(cif_content)
            temp_file.flush() # Ensure content is written to disk

            # Use the temporary file path for Biopython and DSSP
            structure = parser.get_structure(pdb_id, temp_file.name)
            dssp      = DSSP(structure[0], temp_file.name, dssp=dssp_exe)

    except Exception as e:
        print(f"[!] Skipping {pdb_id}: DSSP failed – {e}")
        continue

    for (chain_id, res_id), data in dssp.property_dict.items():
        hetflag, resnum, icode = res_id
        residue_id = f"{resnum}{icode.strip()}" if icode.strip() else str(resnum)
        aa         = data[1]

        rows.append({
            "pdb_id": pdb_id,
            "chain_id": chain_id,
            "residue_id": residue_id,
            "residue_name": aa,
            "secondary_structure": data[2],
            "absolute_sasa": data[3],
            "relative_asa": data[4],
            "phi": data[5],
            "psi": data[6],
            "hbond_NH_O1_energy": data[7],
            "hbond_NH_O2_energy": data[8],
            "hbond_O_NH1_energy": data[9],
            "hbond_O_NH2_energy": data[10],
            "is_aromatic": int(aa in {"F","Y","W","H"}),
            "is_polar":      int(aa in {"S","T","N","Q","Y","C"}),
            "is_charged":    int(aa in {"R","K","D","E","H"}),
            "is_hydrophobic": int(aa in {"A","V","I","L","M","F","W","Y"}),
        })

df = pd.DataFrame(rows)
df.to_csv(output_path, index=False)
print(f"[✓] Saved {len(df)} residue rows → {output_path}")

In [None]:
import subprocess
import pandas as pd
import os
import glob

def run_iupred2a(fasta_path, mode='long', use_anchor=False):
    if use_anchor and mode != 'long':
        raise ValueError("Anchor mode must use 'long' as the prediction type.")

    if use_anchor:
        cmd = [
            'python3',
            '/home/mpradhan007/Academic/Research_Projects/Intern_Research/iupred2a/iupred2a.py',
            '-a',
            fasta_path,
            'long'
        ]
    else:
        cmd = [
            'python3',
            '/home/mpradhan007/Academic/Research_Projects/Intern_Research/iupred2a/iupred2a.py',
            fasta_path,
            mode
        ]

    result = subprocess.run(cmd, capture_output=True, text=True)
    output = result.stdout

    if not output or "Usage:" in output or "not found" in output:
        raise RuntimeError(f"IUPred2A failed for {os.path.basename(fasta_path)} in mode '{mode}':\n{result.stderr or output}")

    lines = output.strip().split('\n')
    data = []

    for line in lines:
        if line.startswith('#') or not line.strip():
            continue
        fields = line.strip().split()
        if len(fields) < 3:
            continue
        pos, aa, score = fields[0], fields[1], fields[2]
        label = 'anchor' if use_anchor else mode
        data.append({
            'position': int(pos),
            'amino_acid': aa,
            f'iupred2a_{label}_score': float(score)
        })

    return pd.DataFrame(data)


def extract_ids_from_filename(filename):
    basename = os.path.basename(filename)
    pdb_chain = basename.replace('.fasta', '')
    if '_' not in pdb_chain:
        raise ValueError(f"Filename {basename} does not follow the expected 'pdb_chain.fasta' format.")
    pdb_id, chain_id = pdb_chain.split('_')
    if chain_id == 'SX':
        print("incorrect chain_id, skipping")
    return pdb_id.lower(), chain_id.upper()


if __name__ == "__main__":
    input_dir = "/home/mpradhan007/Academic/Research_Projects/Intern_Research/data/raw/iupred_fasta_non_biolip"
    output_csv = "/home/mpradhan007/Academic/Research_Projects/Intern_Research/data/processed/non_biolip_iupred2a_scores_all.csv"

    all_files = glob.glob(os.path.join(input_dir, "*.fasta"))
    all_dfs = []

    for fasta_file in all_files:
        pdb_id, chain_id = extract_ids_from_filename(fasta_file)

        try:
            df_long = run_iupred2a(fasta_file, mode='long')
            df_short = run_iupred2a(fasta_file, mode='short')
            df_anchor = run_iupred2a(fasta_file, mode='long', use_anchor=True)

            df = df_long.merge(df_short, on=['position', 'amino_acid']) \
                        .merge(df_anchor, on=['position', 'amino_acid'])

            df['pdb_id'] = pdb_id
            df['chain_id'] = chain_id

            all_dfs.append(df)
        except Exception as e:
            print(f"Failed on {fasta_file}: {e}")

    if all_dfs:
        final_df = pd.concat(all_dfs, ignore_index=True)
        final_df.to_csv(output_csv, index=False)
        print(f"Saved combined IUPred2A scores to {output_csv}")
    else:
        print("No data processed.")


In [None]:
import pandas as pd

In [None]:
df_residue_1 = pd.read_csv("../data/processed/sample_non_biolip_residue_structural_features_biopdb.csv")

In [None]:
df_residue_1.head()

In [None]:
df_residue_1.shape

In [None]:
df_iupred = pd.read_csv("../data/processed/non_biolip_iupred2a_scores_all.csv", nrows=1000)

In [None]:
df_iupred.head()

In [None]:
df_iupred.shape

## Merge all the extracted features

In [1]:
import pandas as pd

In [2]:
df_master  = pd.read_csv("../data/raw/sample_non_biolip_residues.csv")

In [3]:
df_master.head()

Unnamed: 0,pdb_id,chain_id,pdb_residue_number,residue_name,receptor_sequence,renum_residue_number
0,7K86,A,1,MET,MTCRTRFAPSPTGYLHIGGARTALYCWLEARHRGGEFVLRIEDTDR...,1
1,7K86,A,2,THR,MTCRTRFAPSPTGYLHIGGARTALYCWLEARHRGGEFVLRIEDTDR...,2
2,7K86,A,3,CYS,MTCRTRFAPSPTGYLHIGGARTALYCWLEARHRGGEFVLRIEDTDR...,3
3,7K86,A,4,ARG,MTCRTRFAPSPTGYLHIGGARTALYCWLEARHRGGEFVLRIEDTDR...,4
4,7K86,A,5,THR,MTCRTRFAPSPTGYLHIGGARTALYCWLEARHRGGEFVLRIEDTDR...,5


In [4]:
df_pssm = pd.read_csv("../data/processed/non_biolip_pssm_features.csv")

In [5]:
for df in (df_master, df_pssm):
    df["pdb_id"]  = df["pdb_id"].astype(str).str.strip().str.lower()

In [6]:
# --- 2.  Bring the identifier columns to the same spelling & dtype -----------
df_pssm = (
    df_pssm
      .rename(columns={"chain": "chain_id",          # make the names match
                       "position": "renum_residue_number"})
)

In [7]:
key_cols = ["pdb_id", "chain_id", "renum_residue_number"]

# 1 Keep the first row for each key triple, drop the rest
df_pssm = df_pssm.drop_duplicates(subset=key_cols, keep="first")

In [8]:
# --- 3.  Merge ---------------------------------------------------------------
df_master = (
    df_master
      .merge(
          df_pssm,
          on=["pdb_id", "chain_id", "renum_residue_number"],
          how="left",             # keep every residue in your master table
          validate="one_to_one",   # raises if duplicates slip through
          indicator=True           # adds a column telling whether it matched
      )
)

In [12]:
df_residue = pd.read_csv("../data/processed/non_biolip_iupred2a_scores_all.csv")

In [11]:
df_residue.head()

Unnamed: 0,pdb_id,chain_id,residue_id,residue_name,secondary_structure,absolute_sasa,relative_asa,phi,psi,hbond_NH_O1_energy,hbond_NH_O2_energy,hbond_O_NH1_energy,hbond_O_NH2_energy,is_aromatic,is_polar,is_charged,is_hydrophobic
0,2brh,A,6,V,-,0.922535,360.0,-34.6,0,0.0,2,-0.2,0,0,0,0,1
1,2brh,A,7,E,-,0.731959,-84.2,162.0,21,-0.1,23,-0.1,22,0,0,1,0
2,2brh,A,8,D,-,0.533742,96.2,114.3,-2,-0.2,21,-1.9,21,0,0,1,0
3,2brh,A,9,W,E,0.400881,-125.4,131.0,19,-0.2,2,-0.4,20,1,0,0,1
4,2brh,A,10,D,E,0.527607,-90.5,133.3,17,-2.3,17,-2.3,-2,0,0,1,0


In [None]:
df_residue.head()

In [None]:
# --- 2.  Bring the identifier columns to the same spelling & dtype -----------
df_residue = (
    df_residue
      .rename(columns={"residue_id": "pdb_residue_number"})
)

In [None]:
# columns that define a “match”
key_cols = ["pdb_id", "chain_id", "pdb_residue_number"]

# (1) make sure both tables have only one row per key so the count is honest
left_keys  = df_master[key_cols].drop_duplicates()
right_keys = df_residue[key_cols].drop_duplicates()

# (2) inner-join just those key columns
matching_keys = left_keys.merge(right_keys, on=key_cols, how="inner")

n_match   = len(matching_keys)          # keys shared by both tables
n_master  = len(left_keys)              # unique keys in df_master
n_residue = len(right_keys)             # unique keys in df_residue

print(f"{n_match:,} of {n_master:,} keys in df_master "
      f"({n_match/n_master:.1%}) are present in df_residue.")
print(f"{n_match:,} of {n_residue:,} keys in df_residue "
      f"({n_match/n_residue:.1%}) are present in df_master.")


In [None]:
key_cols = ["pdb_id", "chain_id", "pdb_residue_number"]

# 1 Keep the first row for each key triple, drop the rest
df_residue = df_residue.drop_duplicates(subset=key_cols, keep="first")
df_master = df_master.drop_duplicates(subset=key_cols, keep="first")

In [None]:
df_master.drop(columns="_merge", errors="ignore", inplace=True)

In [None]:
# --- 3.  Merge ---------------------------------------------------------------
df_master = (
    df_master
      .merge(
          df_residue,
          on=["pdb_id", "chain_id", "pdb_residue_number"],
          how="left",             # keep every residue in your master table
          validate="one_to_one",   # raises if duplicates slip through
          indicator=True           # adds a column telling whether it matched
      )
)

In [None]:
df_residue.head()

In [None]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [None]:
df_master.head()

In [None]:
df_residue.head()

In [None]:
df_master.shape, df_residue.shape

In [None]:
# 1) Grand total of all NaNs in the whole table
total_missing = df_master.isna().sum().sum()
print(f"Total missing cells: {total_missing:,}")

# 2) Per-column breakdown (sorted, largest first)
missing_by_col = (
    df_master.isna()
             .sum()
             .sort_values(ascending=False)
)
print(missing_by_col)
