In [12]:
import os
from dotenv import load_dotenv
# подгружаем .env
load_dotenv()

# Считываем все креды
src_dir = os.environ.get('DB_SOURCE_DIR')

In [13]:
# Считываем родительскую директорию
from pathlib import Path
current_dir = Path().resolve()
project_root = current_dir.parent

Cчитаем данные о топологии аминокислотных остатков

In [16]:
import json
import pandas as pd

with open(f"../data/raw/topology_complete.json", 'r', encoding='utf-8') as f:
    data_topology = json.load(f)
print(len(data_topology.keys()))
print(data_topology['ALA'])

107
{'name': 'ALA', 'type': 'RESI', 'charge': 0.0, 'atoms': {'N': {'name': 'N', 'type': 'N', 'charge': -0.4157}, 'HN': {'name': 'HN', 'type': 'H', 'charge': 0.2719}, 'CA': {'name': 'CA', 'type': 'CX', 'charge': 0.0337}, 'HA': {'name': 'HA', 'type': 'H1', 'charge': 0.0823}, 'CB': {'name': 'CB', 'type': 'CT', 'charge': -0.1825}, 'HB1': {'name': 'HB1', 'type': 'HC', 'charge': 0.0603}, 'HB2': {'name': 'HB2', 'type': 'HC', 'charge': 0.0603}, 'HB3': {'name': 'HB3', 'type': 'HC', 'charge': 0.0603}, 'C': {'name': 'C', 'type': 'C', 'charge': 0.5973}, 'O': {'name': 'O', 'type': 'O', 'charge': -0.5679}}, 'bonds': [['CB', 'CA'], ['N', 'HN'], ['N', 'CA'], ['C', 'CA'], ['C', '+N'], ['CA', 'HA'], ['CB', 'HB1'], ['CB', 'HB2'], ['CB', 'HB3'], ['C', 'O']], 'donors': [['HN', 'N']], 'acceptors': [['O', 'C']], 'impropers': [['-C', 'CA', 'N', 'HN'], ['CA', '+N', 'C', 'O']]}


Теперь прочитает файл со структурой в формате ".pdb"

In [27]:
atoms_df = pd.read_csv(f"{project_root}/{src_dir}/pdbs/analysis_qm/analysis4/mut1_qm.pdb",
                       sep="\\s+",
                       header=None,
                       names=['type', 'atom_id', 'name', 'residue', 'chain', 'residue_id', 'x', 'y', 'z', 'par1', 'par2', 'element'])

atoms_df = atoms_df.drop(columns=['type', 'chain', 'par1', 'par2', 'element'])
atoms_df.head(10)

Unnamed: 0,atom_id,name,residue,residue_id,x,y,z
0,1,N,ALA,1,4.749,-1.304,-9.444
1,2,HT1,ALA,1,4.608,-0.331,-9.192
2,3,HT2,ALA,1,3.902,-1.801,-9.21
3,4,HT3,ALA,1,4.905,-1.324,-10.447
4,5,CA,ALA,1,5.928,-1.884,-8.774
5,6,HA,ALA,1,6.814,-1.524,-9.296
6,7,CB,ALA,1,6.003,-1.429,-7.311
7,8,HB1,ALA,1,6.032,-0.34,-7.265
8,9,HB2,ALA,1,6.908,-1.828,-6.85
9,10,HB3,ALA,1,5.132,-1.787,-6.76


Также заметим, что используются стандартные обозначения для всех аминокислот, тем не менее начальная аминокислота обозначается буквой "N" в начале, а конечная аминокислота буквой "C"

Инициализируем граф для атомов

In [28]:
import networkx as nx
G_atoms = nx.Graph()
for _, row in atoms_df.iterrows():
    node_id = int(row['atom_id'])
    props = row.drop('atom_id').to_dict()
    G_atoms.add_node(node_id, **props)

Инициализируем граф для аминокислотных остатков

In [29]:
G_residues = nx.Graph()
for res_id in atoms_df['residue_id'].unique().tolist():
    G_residues.add_node(res_id)

Необходимо составить список связей в белке, согласно файлу тополгии и датафрейму белка.

Создадим новый объект словарь смежности для более удобного взаимодействия со связями из файла топологии. При этом помним, что если решим переводить снова в json файл, то необходимо будет преобразовать в обычный словарь.

In [30]:
from collections import defaultdict
for residues in data_topology.keys():
    adjacency = defaultdict(set)
    for u, v in data_topology[residues]['bonds']:
        adjacency[u].add(v)
        adjacency[v].add(u)

    #data_topology[residues]['adjancey_bonds'] = dict(adjacency)
    data_topology[residues]['adjancey_bonds'] = adjacency

In [31]:
data_topology['ALA']['adjancey_bonds']

defaultdict(set,
            {'CB': {'CA', 'HB1', 'HB2', 'HB3'},
             'CA': {'C', 'CB', 'HA', 'N'},
             'N': {'CA', 'HN'},
             'HN': {'N'},
             'C': {'+N', 'CA', 'O'},
             '+N': {'C'},
             'HA': {'CA'},
             'HB1': {'CB'},
             'HB2': {'CB'},
             'HB3': {'CB'},
             'O': {'C'}})

Отображаем все id атомов, приндлжещих аминокислоте с индексом res_id

In [33]:
res_id = 232
result = atoms_df.loc[atoms_df['residue_id'] == res_id, 'atom_id']
result.tolist()

[3644,
 3645,
 3646,
 3647,
 3648,
 3649,
 3650,
 3651,
 3652,
 3653,
 3654,
 3655,
 3656,
 3657,
 3658,
 3659,
 3660,
 3661,
 3662]

In [34]:
atom_id = 3644
idx = atom_id - 1
atoms_df['residue'][idx]

'ILE'

In [35]:
res_topology = data_topology[atoms_df['residue'][idx]]
res_topology

{'name': 'ILE',
 'type': 'RESI',
 'charge': 0.0,
 'atoms': {'N': {'name': 'N', 'type': 'N', 'charge': -0.4157},
  'HN': {'name': 'HN', 'type': 'H', 'charge': 0.2719},
  'CA': {'name': 'CA', 'type': 'CX', 'charge': -0.0597},
  'HA': {'name': 'HA', 'type': 'H1', 'charge': 0.0869},
  'CB': {'name': 'CB', 'type': 'JC', 'charge': 0.1303},
  'HB': {'name': 'HB', 'type': 'HC', 'charge': 0.0187},
  'CG2': {'name': 'CG2', 'type': 'CT', 'charge': -0.3204},
  'HG21': {'name': 'HG21', 'type': 'HC', 'charge': 0.0882},
  'HG22': {'name': 'HG22', 'type': 'HC', 'charge': 0.0882},
  'HG23': {'name': 'HG23', 'type': 'HC', 'charge': 0.0882},
  'CG1': {'name': 'CG1', 'type': 'GC', 'charge': -0.043},
  'HG11': {'name': 'HG11', 'type': 'HC', 'charge': 0.0236},
  'HG12': {'name': 'HG12', 'type': 'HC', 'charge': 0.0236},
  'CD': {'name': 'CD', 'type': 'CT', 'charge': -0.066},
  'HD1': {'name': 'HD1', 'type': 'HC', 'charge': 0.0186},
  'HD2': {'name': 'HD2', 'type': 'HC', 'charge': 0.0186},
  'HD3': {'name': '

In [36]:
# отображаем возможные связи
res_topology['adjancey_bonds']

defaultdict(set,
            {'CB': {'CA', 'CG1', 'CG2', 'HB'},
             'CA': {'C', 'CB', 'HA', 'N'},
             'CG1': {'CB', 'CD', 'HG11', 'HG12'},
             'CG2': {'CB', 'HG21', 'HG22', 'HG23'},
             'CD': {'CG1', 'HD1', 'HD2', 'HD3'},
             'N': {'CA', 'HN'},
             'HN': {'N'},
             'C': {'+N', 'CA', 'O'},
             '+N': {'C'},
             'HA': {'CA'},
             'HB': {'CB'},
             'HG11': {'CG1'},
             'HG12': {'CG1'},
             'HG21': {'CG2'},
             'HG22': {'CG2'},
             'HG23': {'CG2'},
             'HD1': {'CD'},
             'HD2': {'CD'},
             'HD3': {'CD'},
             'O': {'C'}})

Связь "+N"-"C" Обозначает связь углерода C из нынешнего остатка, с азотом N из следующего остатка(при его существовании)

Добавим связи между атомами согласно топологии сперва для второй аминокислоты, так как с первой аминокислотой NALA необходимо работать отдельно.

In [37]:
res_id = 2
res_name = atoms_df.loc[atoms_df['residue_id'] == res_id, 'residue'].unique()[0]
res_topology = data_topology[res_name]
existing_bonds = res_topology['adjancey_bonds']

result = atoms_df.loc[atoms_df['residue_id'] == res_id, 'atom_id']
for id_in_res, atom_id in enumerate(result):
    for second_atom_id in result[id_in_res+1::]:
        if atoms_df['name'][second_atom_id-1] in existing_bonds.get(atoms_df['name'][atom_id-1], []):
            G_atoms.add_edge(second_atom_id, atom_id, kind='covalent')

In [38]:
G_atoms.edges

EdgeView([(13, 14), (13, 15), (15, 16), (15, 17), (15, 18), (18, 19)])

Проверим, что верно находит атомы, которые имеют только одну связь

In [39]:
end_one_nodes = [n for n in G_atoms.nodes() if G_atoms.degree(n) == 1]
end_one_nodes

[14, 16, 17, 19]

Теперь напишем код, который полностью обходит весь белок

В этот же блок можно объеденить создание нод и присвивать ноде атом свойство донор/акцептор/не принадлежит

In [40]:
res_topology = data_topology['RET']
res_topology['donors']

[['HN', 'N'], ['HZ', 'NZ']]

In [41]:
res_topology['acceptors']

[]

In [42]:
# represent bonds in protein as edges, using topology file as instruction. Also add nodes as atoms in atoms graph and amino acids in residue graph
def create_edge(res_id):
    # add new node for atom grapha and add to it property: donor/acceptor/None
    def add_node(node_id):
        name = atoms_df.at[node_id-1, 'name']
        if name in donor_role:
            props = {'donor-acceptor': donor_role[name]}
        elif name in acceptor_role:
            props = {'donor-acceptor': acceptor_role[name]}
        else:
            props = {'donor-acceptor': 'nan'}
        G_atoms.add_node(node_id, **props)

    # DataFrame of atoms for target residue
    res_df = atoms_df[atoms_df['residue_id'] == res_id]
    # get residue name according to its residue id
    res_name = res_df['residue'].iat[0]
    # get topology informatiin for picked residue
    res_topology = data_topology[res_name]
    # get information of existing bonds in aminoacid/protein
    existing_bonds = res_topology['adjancey_bonds']
    # get info about atoms belongs to residue
    res_atoms = atoms_df.loc[atoms_df['residue_id'] == res_id, 'atom_id']
    
    donor_role = {}
    for pair in res_topology['donors']:
        donor_role[pair[0]] = 'donor'
        #donor_role[pair[1]] = 'pre-donor'

    acceptor_role = {}
    for pair in res_topology['acceptors']:
        acceptor_role[pair[0]] = 'acceptor'
        #acceptor_role[pair[1]] = 'pre-acceptor'
    
    # run arround all atoms in residue and check if it has bond with another one. Complexity is O(N(N-1)/2) ~ O(N^2). But N is near 10-15, so actuall runtime is quate fast
    # and all atoms near each other, so methods like kd-trees don; improve it as mush as needed
    for id_in_res, atom_id in enumerate(res_atoms):
        add_node(int(atom_id))
        for second_atom_id in res_atoms[id_in_res+1::]:
            if atoms_df['name'][second_atom_id-1] in existing_bonds.get(atoms_df['name'][atom_id-1], []):
                G_atoms.add_edge(int(second_atom_id), int(atom_id), kind='covalent')

    if '+N' in res_topology['adjancey_bonds'].get('C', []):
        # find this residue’s 'C' atom(s)
        c_atom = int(res_df.loc[res_df['name'] == 'C', 'atom_id'].iloc[0])
        # find next residue’s 'N' atom(s)
        next_res = atoms_df[atoms_df['residue_id'] == res_id+1]
        n_atom = next_res.loc[next_res['name'] == 'N', 'atom_id']
        if not n_atom.empty:
            n_atom = int(n_atom.iloc[0])
            # add nodes and edges
            add_node(c_atom)
            add_node(n_atom)
            G_atoms.add_edge(c_atom, n_atom, kind='covalent')
            G_residues.add_edge(res_id, res_id+1, kind='covalent')

In [43]:
# nodes is ids of atoms from pdb file
G_atoms = nx.Graph()
# nodes is ids of residues from pdb file
G_residues = nx.Graph()

# loop arround all residues in protein
for res_id in atoms_df['residue_id'].unique().tolist():
    create_edge(res_id)
    G_residues.add_node(res_id, kind='covalent')

In [44]:
edges_with_data = list(G_atoms.edges(data=True))

# Выводим первые 50 штук
for u, v, attrs in edges_with_data[:50]:
    kind = attrs.get('kind')
    print(f"{u} — {v}: kind={kind}")

1 — 5: kind=covalent
5 — 6: kind=covalent
5 — 7: kind=covalent
5 — 11: kind=covalent
7 — 8: kind=covalent
7 — 9: kind=covalent
7 — 10: kind=covalent
11 — 12: kind=covalent
11 — 13: kind=covalent
13 — 14: kind=covalent
13 — 15: kind=covalent
15 — 16: kind=covalent
15 — 17: kind=covalent
15 — 18: kind=covalent
18 — 19: kind=covalent
18 — 20: kind=covalent
20 — 21: kind=covalent
20 — 22: kind=covalent
22 — 23: kind=covalent
22 — 24: kind=covalent
22 — 39: kind=covalent
24 — 25: kind=covalent
24 — 26: kind=covalent
24 — 27: kind=covalent
39 — 40: kind=covalent
39 — 41: kind=covalent
27 — 28: kind=covalent
27 — 37: kind=covalent
28 — 29: kind=covalent
28 — 30: kind=covalent
37 — 35: kind=covalent
37 — 38: kind=covalent
30 — 31: kind=covalent
30 — 32: kind=covalent
32 — 33: kind=covalent
32 — 35: kind=covalent
33 — 34: kind=covalent
35 — 36: kind=covalent
41 — 42: kind=covalent
41 — 43: kind=covalent
43 — 44: kind=covalent
43 — 45: kind=covalent
43 — 51: kind=covalent
45 — 46: kind=covalent


Проверим, что правильно находим доноров и акцепторов

In [45]:
end_one_nodes = [n for n in G_atoms.nodes() if G_atoms.degree(n) == 1]
s = 0
donor_acceptor = []
for node_idx in end_one_nodes:
    if G_atoms.nodes()[node_idx]['donor-acceptor'] in ['donor', 'acceptor']:
        donor_acceptor.append(node_idx)
print(donor_acceptor)

[12, 14, 19, 21, 34, 40, 42, 49, 50, 52, 54, 71, 73, 90, 92, 97, 99, 106, 107, 109, 111, 116, 118, 131, 134, 135, 137, 138, 140, 154, 156, 166, 167, 169, 171, 181, 183, 185, 202, 204, 214, 226, 228, 245, 247, 252, 254, 271, 273, 278, 280, 290, 292, 294, 311, 313, 320, 322, 324, 339, 341, 358, 360, 377, 379, 384, 386, 396, 398, 400, 418, 420, 433, 439, 441, 459, 461, 478, 480, 494, 496, 509, 512, 513, 515, 516, 518, 520, 525, 527, 537, 549, 551, 556, 558, 572, 574, 584, 586, 588, 595, 596, 598, 600, 616, 617, 618, 620, 622, 629, 630, 632, 634, 642, 644, 657, 660, 661, 663, 664, 666, 668, 678, 679, 681, 683, 696, 702, 704, 717, 723, 725, 733, 735, 749, 751, 761, 763, 765, 782, 784, 801, 803, 817, 831, 833, 838, 840, 857, 859, 867, 869, 876, 878, 880, 888, 890, 905, 907, 920, 926, 928, 945, 947, 954, 956, 958, 973, 975, 993, 995, 1013, 1015, 1020, 1022, 1039, 1041, 1046, 1048, 1065, 1067, 1077, 1079, 1081, 1091, 1092, 1094, 1096, 1110, 1112, 1122, 1124, 1126, 1140, 1142, 1147, 1149, 1154,

То есть можно теперь не проверять ноду на количество связей, а просто смотреть на ее состояние. Единственное когда может потребоваться проверка связей, если накладывать жесткое ограничение на количество водородных связей.

Проверим также что созданный граф аминокислот имеет цепочечную структуру, при этом вода ни с кем не соеденена

In [46]:
start_node = 1

bfs_order = list(nx.bfs_tree(G_residues, source=start_node).nodes())
print("BFS order:", bfs_order)

BFS order: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 2

Добавим водородные связи

In [47]:
import numpy as np
from scipy.spatial import cKDTree
import math

coords = atoms_df[["x", "y", "z"]].to_numpy()
kd = cKDTree(coords)

# 3) Prepare donor & acceptor lists
# Map atom_id → index in DataFrame (0-based)
id_to_idx = {atom_id: idx for idx, atom_id in enumerate(atoms_df["atom_id"])}
# Identify donor & acceptor atom_ids
donors = [
    atom_id
    for atom_id, props in G_atoms.nodes(data=True)
    if props.get("donor-acceptor") == "donor"
]
acceptors = [
    atom_id
    for atom_id, props in G_atoms.nodes(data=True)
    if props.get("donor-acceptor") == "acceptor"
]

# 4) For each donor, find acceptors within 3.5Å and check angle >= 120°
for donor_id in donors:
    di = id_to_idx[donor_id]
    d_coord = coords[di]
    # query points within 2Å (radius)
    idxs = kd.query_ball_point(d_coord, r=3.5)
        
    for j in idxs:
        acceptor_id = atoms_df.iloc[j]["atom_id"]
        if acceptor_id not in acceptors:
            continue
        a_coord = coords[j]

        # Compute angle at donor between (donor→acceptor) and donor’s bond direction.
        # We take donor’s first bonded neighbor in G_atoms as reference:
        neighbors = list(G_atoms.neighbors(donor_id))
        if not neighbors:
            continue
        # Use the first neighbor to define bond direction vector
        ref_id = neighbors[0]
        ri = id_to_idx[ref_id]
        r_coord = coords[ri]

        # Vectors
        v1 = a_coord - d_coord   # donor→acceptor
        v2 = r_coord - d_coord   # donor→reference neighbor

        # Angle in degrees
        cos_theta = np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        theta = math.degrees(math.acos(np.clip(cos_theta, -1.0, 1.0)))

        if theta >= 120.0:
            # Add hydrogen‐bond edge
            G_atoms.add_edge(int(donor_id), int(acceptor_id), kind="hydrogen")
            G_residues.add_edge(int(atoms_df['residue_id'][donor_id-1]), int(atoms_df['residue_id'][acceptor_id-1]), kind="hydrogen")


Добавим функцию бфс обхода по заданному типу связей

In [50]:
from collections import deque, defaultdict

# Precompute adjacency_by_kind[node][kind] → list of neighbors
adj_by_kind = {node: defaultdict(list) for node in G_residues}
for u, v, d in G_residues.edges(data=True):
    kind = d['kind']
    adj_by_kind[u][kind].append(v)
    adj_by_kind[v][kind].append(u)

# BFS that directly uses these lists
def bfs_kinds(start, allowed_kinds):
    visited = {start}
    queue = deque([start])
    order = []
    while queue:
        u = queue.popleft()
        order.append(u)
        for k in allowed_kinds:
            for v in adj_by_kind[u].get(k, ()):
                if v not in visited:
                    visited.add(v)
                    queue.append(v)
    return order

print("Covalent only:  ", bfs_kinds(232, {'covalent'}))
print("Hydrogen only:", bfs_kinds(232, {'hydrogen'}))
print("Covalent AND Hydrogen:    ", bfs_kinds(1, {'primary','hydrogen'}))
both_kinds = {'covalent'} | {'hydrogen'}
print("Covalent OR Hydrogen:", bfs_kinds(1, both_kinds))


Covalent only:   [232, 231, 233, 230, 234, 229, 235, 228, 236, 227, 237, 226, 238, 225, 239, 224, 240, 223, 222, 221, 220, 219, 218, 217, 216, 215, 214, 213, 212, 211, 210, 209, 208, 207, 206, 205, 204, 203, 202, 201, 200, 199, 198, 197]
Hydrogen only: [232, 229, 169, 226, 173, 228, 230, 222, 177, 224, 225, 233, 217, 181, 220, 221, 213, 185, 216, 209, 547, 519, 188, 22, 212, 526, 219, 205, 218, 549, 184, 215, 192, 18, 26, 208, 501, 214, 96, 180, 187, 60, 89, 211, 504, 195, 14, 30, 204, 555, 88, 210, 92, 100, 176, 183, 191, 56, 64, 85, 93, 207, 198, 199, 200, 3, 34, 15, 84, 206, 104, 172, 179, 52, 10, 66, 82, 97, 196, 197, 203, 138, 201, 11, 43, 562, 19, 68, 168, 175, 48, 8, 63, 515, 12, 508, 86, 101, 137, 509, 532, 134, 142, 2, 35, 37, 40, 47, 23, 160, 171, 44, 4, 9, 59, 80, 16, 90, 105, 106, 518, 537, 129, 131, 133, 141, 536, 77, 146, 31, 32, 39, 41, 235, 236, 50, 51, 27, 155, 156, 164, 165, 167, 5, 6, 7, 13, 520, 55, 523, 20, 557, 237, 516, 125, 127, 74, 132, 136, 189, 145, 75, 76, 1

Выведем водородные связи между атомами

In [52]:
from collections import defaultdict

# Precompute adjacency_by_kind[node][kind] → list of neighbors
adj_by_kind_atoms = {node: defaultdict(list) for node in G_atoms}
for u, v, d in G_atoms.edges(data=True):
    kind = d['kind']
    adj_by_kind_atoms[u][kind].append(v)
    adj_by_kind_atoms[v][kind].append(u)

for node, kinds in list(adj_by_kind_atoms.items())[:100:]:
    hydrogen_neighbors = kinds.get('hydrogen', [])
    if hydrogen_neighbors:
        print(f"Node {node} → Hydrogen neighbors: {hydrogen_neighbors}")

Node 14 → Hydrogen neighbors: [154]
Node 21 → Hydrogen neighbors: [154]
Node 40 → Hydrogen neighbors: [185]
Node 49 → Hydrogen neighbors: [54, 73, 92, 181, 3795]
Node 50 → Hydrogen neighbors: [99, 3795]
Node 54 → Hydrogen neighbors: [49]
Node 73 → Hydrogen neighbors: [49]
Node 92 → Hydrogen neighbors: [49, 3794]
Node 97 → Hydrogen neighbors: [3796]
Node 99 → Hydrogen neighbors: [50]


Выведем водородные связи между аминокислотными остатками

In [53]:
from collections import defaultdict

# Precompute adjacency_by_kind[node][kind] → list of neighbors
adj_by_kind_atoms = {node: defaultdict(list) for node in G_residues}
for u, v, d in G_residues.edges(data=True):
    kind = d['kind']
    adj_by_kind_atoms[u][kind].append(v)
    adj_by_kind_atoms[v][kind].append(u)

for node, kinds in list(adj_by_kind_atoms.items())[:25:]:
    hydrogen_neighbors = kinds.get('hydrogen', [])
    if hydrogen_neighbors:
        print(f"Node {node} → Hydrogen neighbors: {hydrogen_neighbors}")

Node 2 → Hydrogen neighbors: [11]
Node 3 → Hydrogen neighbors: [11, 14]
Node 4 → Hydrogen neighbors: [5, 6, 7, 8, 13, 520]
Node 5 → Hydrogen neighbors: [4]
Node 6 → Hydrogen neighbors: [4]
Node 7 → Hydrogen neighbors: [4, 520]
Node 8 → Hydrogen neighbors: [4, 9, 10]
Node 9 → Hydrogen neighbors: [8]
Node 10 → Hydrogen neighbors: [8, 63, 64, 515]
Node 11 → Hydrogen neighbors: [2, 3]
Node 12 → Hydrogen neighbors: [16, 82, 562]
Node 13 → Hydrogen neighbors: [4, 520, 17]
Node 14 → Hydrogen neighbors: [3, 18]
Node 15 → Hydrogen neighbors: [562, 19, 555]
Node 16 → Hydrogen neighbors: [12, 20]
Node 17 → Hydrogen neighbors: [13, 21]
Node 18 → Hydrogen neighbors: [14, 22]
Node 19 → Hydrogen neighbors: [15, 23]
Node 20 → Hydrogen neighbors: [16, 24]
Node 21 → Hydrogen neighbors: [17, 25]
Node 22 → Hydrogen neighbors: [18, 216, 26]
Node 23 → Hydrogen neighbors: [19, 27]
Node 24 → Hydrogen neighbors: [20, 28]
Node 25 → Hydrogen neighbors: [21, 29]


Сохраним полученные графы

In [54]:
# Сохранение
nx.write_graphml(G_atoms,    f"{project_root}/{src_dir}/test_structure_atoms.graphml")
nx.write_graphml(G_residues, f"{project_root}/{src_dir}/test_structure_residues.graphml")

In [61]:
# Загрузка
G1_atoms    = nx.read_graphml(f"{project_root}/{src_dir}/test_structure_atoms.graphml")
G1_residues = nx.read_graphml(f"{project_root}/{src_dir}/test_structure_residues.graphml")

In [63]:
def hydrogen_edges_for_atom_id(
    G_res: nx.Graph,
    residue: str):
    """
    Return all hydrogen-bond edges adjacent to a given residue.
    Each edge returned as (residue, neighbor).
    """
    edges = []
    for nbr in G_res.neighbors(residue):
        data = G_res[residue][nbr]
        if data.get('kind') == 'hydrogen':
            edges.append((residue, nbr))
    return edges
hydrogen_edges_for_atom_id(G1_residues, '232')

[('232', '229')]

Также сохраним в виде csv файла

In [56]:
# Сохранение
def save_edge_list_pd(G, filename):
    # Собираем список кортежей (u, v, kind)
    rows = [
        {"u": u, "v": v, "kind": attrs.get("kind")}
        for u, v, attrs in G.edges(data=True)
    ]
    df = pd.DataFrame(rows)
    df.to_csv(filename, index=False)

save_edge_list_pd(G_atoms,    f"{project_root}/{src_dir}/test_structure_atoms_edges.csv")
save_edge_list_pd(G_residues, f"{project_root}/{src_dir}/test_structure_residues_edges.csv")

In [57]:
# Загрузка
def load_edge_list_pd(filename):
    df = pd.read_csv(filename)
    G = nx.Graph()
    for _, row in df.iterrows():
        u, v = int(row["u"]), int(row["v"])
        kind = row["kind"] if pd.notna(row["kind"]) else None
        G.add_edge(u, v, kind=kind)
    return G

G1_atoms    = load_edge_list_pd(f"{project_root}/{src_dir}/test_structure_atoms_edges.csv")
G1_residues = load_edge_list_pd(f"{project_root}/{src_dir}/test_structure_residues_edges.csv")

Compare bonds

In [58]:
G2_atoms = G1_atoms

In [59]:
def hydrogen_edges(G):
    return {
        tuple(sorted((u, v)))
        for u, v, attrs in G.edges(data=True)
        if attrs.get("kind") == "hydrogen"
    }

h1 = hydrogen_edges(G1_atoms)
h2 = hydrogen_edges(G2_atoms)

print("Lost:",   h1 - h2)
print("Gained:", h2 - h1)
print("Same:",   h1 & h2)


Lost: set()
Gained: set()
Same: {(2775, 2831), (107, 131), (3739, 3755), (2845, 2904), (1934, 3807), (1878, 1929), (3791, 3820), (1346, 1421), (2638, 3630), (3643, 3823), (2110, 3083), (1657, 1664), (629, 3702), (1092, 3841), (3358, 3803), (2864, 2904), (183, 247), (2577, 2582), (2276, 2325), (1707, 1759), (3207, 3261), (478, 527), (1096, 1205), (1358, 3340), (3340, 3772), (14, 154), (888, 947), (2169, 2234), (1607, 2598), (3495, 3538), (3123, 3141), (2257, 2306), (3726, 3802), (2638, 3611), (50, 99), (3360, 3434), (3240, 3298), (2829, 2900), (3360, 3470), (140, 3784), (1707, 1749), (617, 3714), (817, 859), (1740, 1782), (663, 666), (2512, 2545), (2711, 2763), (3056, 3203), (2323, 2376), (2232, 2285), (138, 993), (3107, 3141), (320, 3355), (733, 784), (1747, 1798), (1140, 2038), (1215, 3798), (2966, 3030), (1389, 3834), (2640, 2689), (3081, 3780), (1972, 2010), (2128, 3090), (1323, 1372), (271, 313), (2829, 2890), (1989, 2108), (3080, 3811), (2611, 2658), (2113, 2171), (596, 634), (121