In [1]:
from pathlib import Path
import re, requests, pandas as pd
from Bio.PDB import PDBParser, PDBIO, Select
from Bio.SeqUtils import seq1
from Bio import pairwise2

# UniProt accession whose sequence is mapped onto the structure.
UNIPROT_ID = "Q6PSU2"
# Pre-processed PDB prepared earlier (chain E, protein only).
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
PDB_FILE = Path(r"D:/final/8DB4/8DB4_mono/8DB4_AraH2_chainE_proteinOnly.pdb")
# Chain to map; the parsing cell falls back to the only chain present when this ID is absent.
PREFERRED_CHAIN_ID = "E"

# === 0) Output folder ===
OUT_DIR = PDB_FILE.parent / "mapping_out"
OUT_DIR.mkdir(parents=True, exist_ok=True)
# The chain label in the file name is derived from PREFERRED_CHAIN_ID so the two cannot drift apart.
CSV_PATH = OUT_DIR / f"{UNIPROT_ID}_to_8DB4_chain{PREFERRED_CHAIN_ID}_mapping.csv"



# PDB 8DB4 chain E에서 모델링된 폴리펩타이드/CA 개수 확인

In [6]:
from Bio.PDB import PDBParser
from pathlib import Path

# Count how many residues of chain E carry coordinates in the chain-E file
# (note: this is a different file from the protein-only one used for the mapping).
pdb_path = Path(r"D:/final/8DB4/8DB4_mono/8DB4_AraH2_chainE.pdb")
s = PDBParser(QUIET=True).get_structure("s", str(pdb_path))
first_model = next(s.get_models())
E = first_model["E"]

# Keep only residues whose hetero flag (first field of the residue id) is blank,
# then count them and the subset that has a CA atom.
polymer_residues = [residue for residue in E if residue.id[0] == " "]
n_poly = len(polymer_residues)
n_ca = sum(1 for residue in polymer_residues if residue.has_id("CA"))
print("polymer residues with coords:", n_poly, "| with CA:", n_ca)

polymer residues with coords: 95 | with CA: 95


132 aa = 엔티티/서열 길이(SEQRES 기준, “있어야 할 잔기” 수)

95 aa = ATOM 좌표가 실제로 모델링된 잔기 수

HET 제거나 altloc 처리 때문에 132 aa에서 95 aa로 줄어든 것이 아니라, 해당 잔기들은 원래부터 좌표가 없어서(무질서한(disordered) 루프 등) 파일에 존재하지 않음

In [2]:
# === 1) UniProt sequence ===
# Download the FASTA record and fail loudly on HTTP errors, so an error page is
# never parsed as if it were sequence letters.
resp = requests.get(f"https://www.uniprot.org/uniprotkb/{UNIPROT_ID}.fasta", timeout=60)
resp.raise_for_status()
fa = resp.text

# A FASTA record is one '>' header line followed by the sequence lines.
fasta_header, _sep, fasta_body = fa.partition("\n")
if not fasta_header.startswith(">") or not fasta_body.strip():
    raise ValueError(f"Unexpected FASTA response for {UNIPROT_ID}: {fa[:80]!r}")

# Join the letters of all sequence lines (drops newlines and any other non-letters).
seq_uniprot = "".join(re.findall(r"[A-Za-z]+", fasta_body))
print(f"[UniProt] {UNIPROT_ID} length = {len(seq_uniprot)} aa")

[UniProt] Q6PSU2 length = 172 aa


In [3]:
# === 2) Parse the PDB (uses the already pre-processed file) ===
assert PDB_FILE.exists() and PDB_FILE.stat().st_size > 0, f"파일이 없거나 비어있음: {PDB_FILE}"
s = PDBParser(QUIET=True).get_structure("s", str(PDB_FILE))

models = list(s.get_models())
if not models:
    raise RuntimeError("ATOM 레코드가 없습니다. 전처리 파일을 확인하세요.")
model = models[0]  # only the first model is used

# Chain selection: use the preferred chain if present; otherwise accept the file's
# single chain with a warning; otherwise refuse to guess between several chains.
chain_ids = [c.id for c in model]
if PREFERRED_CHAIN_ID in chain_ids:
    CHAIN_ID = PREFERRED_CHAIN_ID
elif len(chain_ids) == 1:
    CHAIN_ID = chain_ids[0]
    print(f"[warn] '{PREFERRED_CHAIN_ID}' 체인이 없어 '{CHAIN_ID}' 체인을 사용합니다.")
else:
    raise KeyError(f"원하는 체인('{PREFERRED_CHAIN_ID}')이 없고, 체인이 여러 개입니다: {chain_ids}")

chain = model[CHAIN_ID]

# Collect polymer residues only -> list of (one-letter code, resseq, icode).
# NOTE(review): residues with a non-blank hetero flag are skipped, so the MSE/HYP
# entries of custom_map only apply if such residues appear with a blank flag —
# confirm against the pre-processing step.
res_records = []
for res in chain:
    hetflag, resseq, icode = res.id
    if hetflag != " ":
        continue
    name3 = res.get_resname()
    # seq1 does not raise on unknown names (it yields its undefined code), and it
    # returns "" for names shorter than three characters. Force exactly one letter
    # per residue so seq_pdb stays index-aligned with res_records.
    aa = seq1(name3, custom_map={"MSE": "M", "HYP": "P"})
    if len(aa) != 1:
        aa = "X"
    # A blank insertion code is the single space " "; strip() normalises it to "".
    res_records.append((aa, resseq, icode.strip()))

if not res_records:
    raise RuntimeError(f"체인 '{CHAIN_ID}'에 폴리펩타이드 잔기가 없습니다. 전처리 파일을 확인하세요.")

seq_pdb = "".join(r[0] for r in res_records)
print(f"[PDB] Chain {CHAIN_ID} modeled length = {len(seq_pdb)} aa")

[PDB] Chain E modeled length = 95 aa


In [7]:
# === 3) Global alignment (UniProt vs PDB chain) ===
# Scoring passed to globalms: match +2, mismatch -1, gap open -10, gap extend -0.5.
# NOTE(review): with a high gap-open cost the aligner may prefer a mismatch over
# opening a short gap next to an unmodelled loop — inspect reported mismatches.
if not seq_uniprot or not seq_pdb:
    raise ValueError("Cannot align: the UniProt or the PDB sequence is empty.")
alignments = pairwise2.align.globalms(seq_uniprot, seq_pdb, 2, -1, -10, -0.5, one_alignment_only=True)
if not alignments:
    # Without this check an empty result would surface as a bare IndexError.
    raise RuntimeError("pairwise2 returned no alignment for the given sequences.")
aln = alignments[0]
A, B = aln.seqA, aln.seqB

# The mapping step walks both gapped strings in lockstep and indexes by residue count,
# so removing the gaps must give back the original sequences exactly.
assert len(A) == len(B), "aligned sequences differ in length"
assert A.replace("-", "") == seq_uniprot, "gapped UniProt sequence does not match the input"
assert B.replace("-", "") == seq_pdb, "gapped PDB sequence does not match the input"

In [8]:
# === 4) Build the mapping table ===
# Walk the two gapped strings column by column. Each column yields one row:
#   residue / residue -> mapped pair (match flag 1 or 0)
#   residue / gap     -> UniProt-only position (no modelled coordinates)
#   gap / residue     -> PDB-only residue (no UniProt counterpart)
rows = []
ui = pi = 0  # running counters; incremented before use, so recorded positions are 1-based
mismatch = 0
for a, b in zip(A, B):
    if a != "-": ui += 1
    if b != "-": pi += 1
    if a != "-" and b != "-":
        aa_u = a
        aa_p, resseq, icode = res_records[pi-1]
        rows.append({
            "uniprot_pos": ui,
            "uniprot_aa": aa_u,
            "pdb_chain": CHAIN_ID,
            "pdb_modeled_index": pi,   # 1..N within the modelled chain
            "pdb_resseq": resseq,      # author residue number
            "pdb_icode": icode,
            "pdb_aa": aa_p,
            "match": int(aa_u == aa_p),
        })
        if aa_u != aa_p:
            mismatch += 1
    elif a != "-" and b == "-":
        rows.append({
            "uniprot_pos": ui,
            "uniprot_aa": a,
            "pdb_chain": None,
            "pdb_modeled_index": None,
            "pdb_resseq": None,
            "pdb_icode": None,
            "pdb_aa": None,
            "match": None,
        })
    elif a == "-" and b != "-":
        aa_p, resseq, icode = res_records[pi-1]
        rows.append({
            "uniprot_pos": None,
            "uniprot_aa": None,
            "pdb_chain": CHAIN_ID,
            "pdb_modeled_index": pi,
            "pdb_resseq": resseq,
            "pdb_icode": icode,
            "pdb_aa": aa_p,
            "match": None,
        })

# Passing the columns explicitly keeps the schema stable even when `rows` is empty.
df = pd.DataFrame(
    rows,
    columns=["uniprot_pos", "uniprot_aa", "pdb_chain", "pdb_modeled_index",
             "pdb_resseq", "pdb_icode", "pdb_aa", "match"],
)
# Mixing ints with None makes pandas store these columns as floats (28 -> 28.0).
# Nullable Int64 keeps them integral while still representing the missing entries.
for int_col in ("uniprot_pos", "pdb_modeled_index", "pdb_resseq", "match"):
    df[int_col] = pd.to_numeric(df[int_col]).astype("Int64")

In [9]:
# === 5) Summary + save ===
# Row-presence masks: which side(s) of the alignment each row has.
has_uniprot = df["uniprot_pos"].notna()
has_pdb = df["pdb_modeled_index"].notna()

mapped = df[has_uniprot & has_pdb]
uniprot_only = df[has_uniprot & ~has_pdb]
pdb_only = df[has_pdb & ~has_uniprot]

summary_lines = [
    "\n=== Summary ===",
    f"UniProt length: {len(seq_uniprot)}",
    f"PDB modeled length (chain {CHAIN_ID}): {len(seq_pdb)}",
    f"Mapped positions: {len(mapped)}",
    f"Uniprot-only (no structure): {len(uniprot_only)}",
    f"PDB-only (no UniProt counterpart): {len(pdb_only)}",
    f"Mismatches in mapped positions: {mismatch}",
]
print("\n".join(summary_lines))

# Fixed column order for the CSV; rows are ordered by UniProt position, then by
# modelled index, with rows missing those values placed last.
cols = ["uniprot_pos","uniprot_aa","pdb_chain","pdb_modeled_index","pdb_resseq","pdb_icode","pdb_aa","match"]
df_sorted = df.sort_values(by=["uniprot_pos","pdb_modeled_index"], na_position="last").loc[:, cols]
df_sorted.to_csv(CSV_PATH, index=False, encoding="utf-8")
print(f"\nSaved mapping CSV -> {CSV_PATH}")


=== Summary ===
UniProt length: 172
PDB modeled length (chain E): 95
Mapped positions: 95
Uniprot-only (no structure): 77
PDB-only (no UniProt counterpart): 0
Mismatches in mapped positions: 2

Saved mapping CSV -> D:\final\8DB4\8DB4_mono\mapping_out\Q6PSU2_to_8DB4_chainE_mapping.csv
