In [1]:
#!/usr/bin/env python
"""
Attach IUPred2A and PSSM features to the master BioPDB feature table.
Outputs one CSV with every residue row plus the new feature columns.
"""

import pandas as pd
from pathlib import Path

In [2]:
# ─────────────────────────── paths ────────────────────────────
master_in   = Path("../data/processed/BioPDB_features_with_labels_renumonly.csv")
iupred_in   = Path("../data/processed/iupred2a_scores_all.csv")
pssm_in     = Path("../data/processed/pssm_features_all.csv")

master_out  = Path("../data/processed/BioPDB_master_w_iupred_pssm.csv")

chunksize   = 1_000_000            # tweak to fit your RAM
cat_dtype   = {"pdb_id":"category", "chain_id":"category"}


In [3]:
# ────────────────── 1) load IUPred2A & PSSM once ──────────────────
print("Loading IUPred2A feature table …")
iupred = pd.read_csv(
    iupred_in,
    dtype={**cat_dtype,
           "position":"int32",
           "iupred2a_long_score":"float32",
           "iupred2a_short_score":"float32",
           "iupred2a_anchor_score":"float32"}
).rename(columns={"position":"renum_residue_number"})           # key harmonised
iupred.set_index(["pdb_id","chain_id","renum_residue_number"], inplace=True)


Loading IUPred2A feature table …


In [4]:
print("Loading PSSM feature table …")
pssm = pd.read_csv(
    pssm_in,
    dtype={**cat_dtype,
           "position":"int32",
           "chain":"category"}       # chain → category
).rename(columns={"position":"renum_residue_number",
                  "chain":"chain_id"})
pssm.set_index(["pdb_id","chain_id","renum_residue_number"], inplace=True)


Loading PSSM feature table …


In [5]:
# # drop columns we don’t need to avoid bloat (keep position/AA to the left)
# pssm_cols_to_keep = [col for col in pssm.columns if col.startswith("PSSM_")]
# pssm = pssm[pssm_cols_to_keep]
# ── drop duplicate ID column to avoid name collision
pssm = pssm.drop(columns=["amino_acid"])

In [6]:
# ────────────────── 2) stream-merge master chunks ─────────────────
if master_out.exists():
    master_out.unlink()              # start fresh

reader = pd.read_csv(
    master_in,
    dtype={**cat_dtype, "renum_residue_number":"int32"},
    chunksize=chunksize
)


In [7]:
for i, chunk in enumerate(reader, 1):
    # set same index for join
    chunk.set_index(["pdb_id","chain_id","renum_residue_number"], inplace=True)

    # merge (aligns on index)
    chunk = chunk.join(iupred, how="left")
    chunk = chunk.join(pssm,   how="left")

    # reset index for output
    chunk.reset_index(inplace=True)

    # write
    chunk.to_csv(
        master_out,
        mode="a",
        header=(i == 1),
        index=False
    )
    print(f"Chunk {i}: wrote {len(chunk):,} rows")

print("\n✓ All chunks processed.")
print("Augmented master file saved to:", master_out)

Chunk 1: wrote 1,000,000 rows
Chunk 2: wrote 1,000,000 rows
Chunk 3: wrote 1,003,423 rows
Chunk 4: wrote 1,000,183 rows
Chunk 5: wrote 1,019,624 rows
Chunk 6: wrote 1,010,293 rows
Chunk 7: wrote 1,017,832 rows
Chunk 8: wrote 1,004,786 rows
Chunk 9: wrote 1,026,902 rows
Chunk 10: wrote 1,028,253 rows
Chunk 11: wrote 1,009,718 rows
Chunk 12: wrote 1,017,516 rows
Chunk 13: wrote 1,012,947 rows
Chunk 14: wrote 32,634 rows

✓ All chunks processed.
Augmented master file saved to: ../data/processed/BioPDB_master_w_iupred_pssm.csv


In [26]:
# ─────────────── paths ───────────────
master_in       = Path("../data/processed/BioPDB_master_w_iupred_pssm.csv")
dssp_in         = Path("../data/processed/dssp_residue_features_ALL.csv")       # adjust if needed
master_out      = Path("../data/processedmaster_residue_file.csv")

In [27]:
chunksize       = 1_000_000        # rows per chunk; tweak for your RAM
cat_dtype       = {"pdb_id": "category", "chain_id": "category"}

In [28]:
import re
# ────────────────────────────────────────────────────────────────────────
# 1) Load DSSP once, convert residue_id → numeric pdb_residue_number
# ────────────────────────────────────────────────────────────────────────
print("Loading DSSP features …")

def numeric_part(label: str) -> int:
    """Return integer part of author residue label (e.g. '150A' → 150)."""
    return int(re.match(r"-?\d+", str(label)).group())

Loading DSSP features …


In [29]:
dssp = pd.read_csv(
    dssp_in,
    dtype={**cat_dtype, "residue_id": "string"}
)

In [30]:
dssp["pdb_residue_number"] = dssp["residue_id"].map(numeric_part).astype("int32")
dssp = dssp.drop(columns=["residue_id"])          # no longer needed


In [31]:
# avoid column collision with master (rename if already present)
if "residue_name" in dssp.columns:
    dssp = dssp.rename(columns={"residue_name": "dssp_residue_name"})

In [32]:
# set index for fast join
dssp.set_index(["pdb_id", "chain_id", "pdb_residue_number"], inplace=True)
print("  → DSSP rows:", len(dssp))

  → DSSP rows: 31234239


In [33]:
# ────────────────────────────────────────────────────────────────────────
# 2) Stream-merge master chunks with DSSP and write output
# ────────────────────────────────────────────────────────────────────────
if master_out.exists():
    master_out.unlink()             # start fresh

In [34]:
reader = pd.read_csv(
    master_in,
    dtype={**cat_dtype, "pdb_residue_number": "int32"},
    chunksize=chunksize
)

In [35]:
for i, chunk in enumerate(reader, 1):
    # set same index as DSSP
    chunk.set_index(["pdb_id", "chain_id", "pdb_residue_number"], inplace=True)

    # left-join DSSP features
    chunk = chunk.join(dssp, how="left")

    # reset index so identifiers are normal columns again
    chunk.reset_index(inplace=True)

    # append to output CSV
    chunk.to_csv(
        master_out,
        mode="a",
        header=(i == 1),
        index=False
    )
    print(f"Chunk {i}: wrote {len(chunk):,} rows")

print("\n✓ All chunks processed.")
print("Final augmented file saved to:", master_out.resolve())

Chunk 1: wrote 1,006,545 rows
Chunk 2: wrote 1,003,060 rows
Chunk 3: wrote 1,003,112 rows
Chunk 4: wrote 1,002,506 rows
Chunk 5: wrote 1,002,408 rows
Chunk 6: wrote 1,003,018 rows
Chunk 7: wrote 1,003,338 rows
Chunk 8: wrote 1,003,142 rows
Chunk 9: wrote 1,002,954 rows
Chunk 10: wrote 1,001,160 rows
Chunk 11: wrote 1,002,079 rows
Chunk 12: wrote 1,001,912 rows
Chunk 13: wrote 1,000,262 rows
Chunk 14: wrote 184,175 rows

✓ All chunks processed.
Final augmented file saved to: /home/mpradhan007/Academic/Research_Projects/Intern_Research/data/processedmaster_residue_file.csv
