In [7]:
import pandas as pd
import numpy as np
import glob
from IPython.display import display

from Bio.PDB import PDBParser,PDBIO
from Bio.PDB.DSSP import DSSP

In [3]:
DATA_DIR = "../data/backup"

In [6]:
def parse_pdb_to_df(pdb_path):
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("structure", pdb_path)

    atom_data = []
    for model in structure:
        for chain in model:
            for residue in chain:
                for atom in residue:
                    atom_info = {
                        "model_id": model.id,
                        "chain_id": chain.id,
                        "residue_name": residue.resname,
                        "residue_id": residue.id[1],
                        "atom_name": atom.name,
                        "atom_serial_number": atom.serial_number,
                        "x": atom.coord[0],
                        "y": atom.coord[1],
                        "z": atom.coord[2],
                        "occupancy": atom.occupancy,
                        "b_factor": atom.bfactor,
                        "element": atom.element
                    }
                    atom_data.append(atom_info)

    df = pd.DataFrame(atom_data)
    return df

In [5]:
pdb_paths = glob.glob(f"{DATA_DIR}/*.pdb")
print(f"{pdb_paths=}")

pdb_paths=['../data/backup/CGG_mean.pdb', '../data/backup/CGG_max.pdb', '../data/backup/CGG.pdb', '../data/backup/CGG_75th.pdb', '../data/backup/expression_mean.pdb', '../data/backup/IgY-CH2_EH2_final_real_space_refined_021.pdb']


In [None]:
# load pdbs
pdb_dfs = {}
for pdb_path in pdb_paths:
    pdb_dfs[pdb_path] = parse_pdb_to_df(pdb_path)

for path1,df1 in pdb_dfs.items():
    print(f"{df1.columns=}")
    display(df1)
    break


df1.columns=Index(['model_id', 'chain_id', 'residue_name', 'residue_id', 'atom_name',
       'atom_serial_number', 'x', 'y', 'z', 'occupancy', 'b_factor',
       'element'],
      dtype='object')


Unnamed: 0,model_id,chain_id,residue_name,residue_id,atom_name,atom_serial_number,x,y,z,occupancy,b_factor,element
0,0,A,VAL,233,N,1,139.557007,162.477997,153.742004,1.0,0.0,N
1,0,A,VAL,233,CA,2,140.057007,161.889008,152.462997,1.0,0.0,C
2,0,A,VAL,233,C,3,140.324005,160.399002,152.707993,1.0,0.0,C
3,0,A,VAL,233,O,4,139.399002,159.662003,152.332001,1.0,0.0,O
4,0,A,VAL,233,CB,5,141.207001,162.744003,151.912003,1.0,0.0,C
...,...,...,...,...,...,...,...,...,...,...,...,...
4865,0,D,NAG,1336,O3,4866,132.774002,145.983002,140.544006,1.0,0.0,O
4866,0,D,NAG,1336,O4,4867,132.421997,144.298004,142.850006,1.0,0.0,O
4867,0,D,NAG,1336,O5,4868,133.466995,141.807007,140.242996,1.0,0.0,O
4868,0,D,NAG,1336,O6,4869,133.041000,140.011002,142.354996,1.0,0.0,O


In [None]:
match_df = []
for path1,df1 in pdb_dfs.items():
    for path2,df2 in pdb_dfs.items():
        if (path1 == path2):
            continue
        match_ratio = (df1 == df2).mean()
        match_ratio = match_ratio.to_frame().T
        cols = list(match_ratio.columns)
        match_ratio['path1'] = path1
        match_ratio['path2'] = path2
        match_ratio = match_ratio[['path1','path2'] + cols]
        match_df.append(match_ratio)

match_df = pd.concat(match_df, ignore_index=True)
display(match_df)

Unnamed: 0,path1,path2,model_id,chain_id,residue_name,residue_id,atom_name,atom_serial_number,x,y,z,occupancy,b_factor,element
0,../data/backup/CGG_mean.pdb,../data/backup/CGG_max.pdb,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.301848,1.0
1,../data/backup/CGG_mean.pdb,../data/backup/CGG.pdb,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,../data/backup/CGG_mean.pdb,../data/backup/CGG_75th.pdb,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.307598,1.0
3,../data/backup/CGG_mean.pdb,../data/backup/expression_mean.pdb,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.309651,1.0
4,../data/backup/CGG_mean.pdb,../data/backup/IgY-CH2_EH2_final_real_space_re...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
5,../data/backup/CGG_max.pdb,../data/backup/CGG_mean.pdb,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.301848,1.0
6,../data/backup/CGG_max.pdb,../data/backup/CGG.pdb,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.301848,1.0
7,../data/backup/CGG_max.pdb,../data/backup/CGG_75th.pdb,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.301848,1.0
8,../data/backup/CGG_max.pdb,../data/backup/expression_mean.pdb,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.301848,1.0
9,../data/backup/CGG_max.pdb,../data/backup/IgY-CH2_EH2_final_real_space_re...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
