In [None]:
#!/usr/bin/env python
"""
Add BioLiP renum_residue_number to BioPDB_features_with_labels.csv
   • Keys used: pdb_id, chain_id, pdb_residue_number
   • Insertion codes are ignored.
"""

import pandas as pd
from pathlib import Path

In [None]:

# ──────────────────────────── paths & parameters ────────────────────────────
# Inputs: the per-residue feature table and the PDB → BioLiP renumbering map.
master_path = Path("../data/processed/BioPDB_features_with_labels.csv")
mapping_path = Path("../data/processed/map_pdb_biolip_renum_all.csv")
# Output: the feature table with a renum_residue_number column appended.
output_path = Path("../data/processed/BioPDB_features_with_labels_w_renum.csv")

# Number of master-file rows read per chunk; tweak for your RAM.
chunksize = 500_000

# The two string key columns are read as pandas categoricals in both files.
cat_dtype = dict.fromkeys(("pdb_id", "chain_id"), "category")

In [None]:
# ─────────────────────────── 1) load mapping into RAM ───────────────────────
# Only the three join-key columns and the renumbered residue number are read;
# any other column in the mapping file is skipped.
print("Loading mapping …")

mapping_columns = [
    "pdb_id",
    "chain_id",
    "pdb_residue_number",
    "renum_residue_number",
]
# Categorical keys plus 32-bit integers for both residue-number columns.
mapping_dtypes = dict(
    cat_dtype,
    pdb_residue_number="int32",
    renum_residue_number="int32",
)

mapping = pd.read_csv(mapping_path, usecols=mapping_columns, dtype=mapping_dtypes)

In [None]:
# ── ensure there is at most one renum per (pdb_id, chain_id, pdb_residue_number)
# The join key deliberately leaves out the insertion code, so several mapping
# rows can share a key (presumably residues that differ only by insertion
# code — TODO confirm against the mapping file).
key_cols = ["pdb_id", "chain_id", "pdb_residue_number"]

# Make the cell safe to re-run: once set_index has run, the key columns live in
# the index and drop_duplicates(key_cols) would fail with a KeyError.
if list(mapping.index.names) == key_cols:
    mapping = mapping.reset_index()

# Exact repeats (same key AND same renum) are harmless; remove them first so
# that whatever still shares a key is a genuine conflict.
mapping = mapping.drop_duplicates(key_cols + ["renum_residue_number"])

# Look for conflicts BEFORE collapsing to one row per key. Checking afterwards
# can never find anything, because drop_duplicates has already removed the
# evidence.
conflicts = mapping.duplicated(key_cols, keep=False)
if conflicts.any():
    conflicting = mapping.loc[conflicts]
    n_conflicting_keys = int((~conflicting.duplicated(key_cols)).sum())
    # Reported rather than raised: the first row per key is kept below, which is
    # what the pipeline has always done. NOTE(review): switch to `raise` if
    # conflicting keys should block the merge instead.
    print(
        f"WARNING: {n_conflicting_keys:,} (pdb_id, chain_id, pdb_residue_number) "
        "key(s) map to more than one renum_residue_number; "
        "keeping the first occurrence of each. Examples:\n"
        f"{conflicting.sort_values(key_cols).head(10)}"
    )

# One row per key (first occurrence wins), then index by the key so the
# chunk-wise join can look rows up via the index.
mapping = mapping.drop_duplicates(key_cols, keep="first")
mapping = mapping.set_index(key_cols)


In [None]:
# ─────────────────────────── 2) stream-merge master ─────────────────────────
# Start fresh: the merge loop appends to output_path (mode="a"), so a file left
# over from an earlier run must be removed first.
try:
    output_path.unlink()
except FileNotFoundError:
    pass                                     # nothing to remove

# Same categorical key dtypes as the mapping; the residue number column is
# still called "residue_number" in the master file.
master_dtypes = dict(cat_dtype, residue_number="int32")

# With chunksize set, read_csv returns an iterator of DataFrames rather than
# one DataFrame, so the master file is never loaded whole.
reader = pd.read_csv(master_path, dtype=master_dtypes, chunksize=chunksize)

In [None]:
# Stream the master file chunk by chunk, attach renum_residue_number from
# `mapping` (indexed by pdb_id, chain_id, pdb_residue_number) and append each
# merged chunk to output_path. Unmatched rows are kept with a missing renum.
total_rows, matched_rows = 0, 0

for i, chunk in enumerate(reader, 1):
    total_rows += len(chunk)

    # rename to align with mapping keys
    chunk = chunk.rename(columns={"residue_number": "pdb_residue_number"})

    # join via index for speed (left join: rows without a mapping get NaN)
    chunk = chunk.join(
        mapping, on=["pdb_id", "chain_id", "pdb_residue_number"]
    )

    # The left join upcasts the int32 renum column to float64 in any chunk that
    # has unmatched rows, so the CSV would hold "12.0" in some chunks and "12"
    # in others. The nullable Int32 dtype keeps integers and writes missing
    # values as empty fields.
    chunk["renum_residue_number"] = chunk["renum_residue_number"].astype("Int32")

    is_matched = chunk["renum_residue_number"].notna()
    matched_rows += int(is_matched.sum())

    chunk.to_csv(
        output_path,
        mode="a",
        header=(i == 1),                     # header only once, on the first chunk
        index=False
    )
    print(f"Chunk {i}: processed {len(chunk):,} rows "
          f"({is_matched.mean():.1%} matched)")

print("──────────────────────── summary ────────────────────────")
print(f"Total master rows   : {total_rows:,}")
if total_rows:
    print(f"Rows with renum     : {matched_rows:,} "
          f"({matched_rows/total_rows:.1%})")
else:
    # `reader` is a one-shot iterator: re-running only this cell (or reading an
    # empty master file) yields no chunks, and the ratio would divide by zero.
    print("Rows with renum     : 0 (no rows were read; re-create `reader` "
          "before re-running this cell)")
print("Output written to   :", output_path)

In [2]:
from pathlib import Path
import duckdb

# Input: the merged table written by the stream-merge step.
# Output: the same columns, restricted to rows that received a renum.
in_path = Path("../data/processed/BioPDB_features_with_labels_w_renum.csv")
out_path = Path("../data/processed/BioPDB_features_with_labels_renumonly.csv")

# DuckDB is handed absolute paths, so the query does not depend on its own
# notion of the working directory.
source_csv = in_path.resolve()
target_csv = out_path.resolve()

# One COPY statement reads the CSV, drops rows whose renum is NULL and writes
# the remainder to a comma-separated file with a header row.
filter_query = f"""
    COPY (
        SELECT *
        FROM read_csv_auto('{source_csv}', header=True)
        WHERE renum_residue_number IS NOT NULL      -- keep only mapped rows
    )
    TO '{target_csv}'
    (HEADER, DELIMITER ',');
"""
duckdb.sql(filter_query)

print("✓ Filtered file written to:", out_path)

✓ Filtered file written to: ../data/processed/BioPDB_features_with_labels_renumonly.csv


In [3]:
import pandas as pd
from pathlib import Path

# ───────────── paths & parameters ─────────────
in_path = Path("../data/processed/BioPDB_features_with_labels_renumonly.csv")
chunksize = 1_000_000          # rows per chunk; adjust to your RAM

# ───────────── initialise running totals ───────
total_rows = 0
na_counts = None               # per-column NA tally; a pandas Series once the first chunk is seen

# ───────────── stream & accumulate ─────────────
for chunk in pd.read_csv(in_path, chunksize=chunksize):
    # per-column number of missing values in this chunk
    missing_in_chunk = chunk.isna().sum()
    total_rows += len(chunk)

    if na_counts is None:
        # first chunk: its counts become the running tally
        na_counts = missing_in_chunk
    else:
        # later chunks: add column-wise; a column absent on one side counts as 0
        na_counts = na_counts.add(missing_in_chunk, fill_value=0)

# ───────────── report ─────────────
print(f"Total rows processed : {total_rows:,}\n")
print("Missing-value count per column")
print("--------------------------------")
# Columns with the most missing values are listed first.
na_ranked = na_counts.astype(int).sort_values(ascending=False)
print(na_ranked)

Total rows processed : 13,032,634

Missing-value count per column
--------------------------------
prev_res                 40577
next_res                 35988
closest_neighbor_dist        3
avg_neighbor_distance        3
pdb_id                       0
chain_id                     0
pdb_residue_number           0
centroid_z                   0
mean_bfactor                 0
std_bfactor                  0
mean_occupancy               0
insertion_code               0
residue_name                 0
centroid_x                   0
centroid_y                   0
mean_intra_atom_dist         0
num_sidechain_atoms          0
num_heavy_atoms              0
num_atoms                    0
bounding_box_volume          0
residue_radius               0
radius_of_gyration           0
std_intra_atom_dist          0
position_in_chain            0
is_small                     0
contact_number_4A            0
contact_number_6A            0
contact_number_8A            0
contact_number_10A           0
la