In [1]:
## import for plotting
%matplotlib inline
from os import listdir
from os.path import isfile, join
import numpy as np
from glob import glob
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
from adjustText import adjust_text

## import for stats
import statsmodels.stats.multitest as smt
from scipy import stats
from collections import defaultdict

In [None]:
## import for PDB manipulation
from Bio import PDB

### Compute Cα–Cα distances between residues of the MT-encoded protein (chain M) and the nuclear-encoded protein (chain N)

In [22]:
# Input structure: file name first, then the folder that holds it.
# The folder string ends with '/' and is an absolute path on the author's
# machine.
# NOTE(review): point dir_path at your own copy before re-running elsewhere.
pdb_file = 'COXI_anoImb.pdb'
dir_path = '/Users/osipova/Documents/LabDocs/Brood_parasites_analysis/NAD_complex_structure/'

# Function to calculate distance between two points
def calculate_distance(coord1, coord2):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    coord1, coord2 : array-like
        Coordinates of the two points, with matching (or broadcastable)
        shapes. NumPy arrays are used as they are; lists and tuples are
        converted first, so plain Python sequences are accepted too.

    Returns
    -------
    numpy floating scalar
        The L2 norm of ``coord1 - coord2``.
    """
    # np.asarray leaves ndarrays untouched (same dtype, no copy) and only
    # converts sequences, for which the `-` operator is not defined.
    delta = np.asarray(coord1) - np.asarray(coord2)
    return np.linalg.norm(delta)

# Load the PDB file.
# join() yields the same path as plain concatenation when dir_path ends with
# a separator, and still produces a valid path when it does not.
pdb_parser = PDB.PDBParser(QUIET=True)
structure = pdb_parser.get_structure('protein', join(dir_path, pdb_file))

# Select chains M and N from the first model of the structure
chain_M = structure[0]['M']
chain_N = structure[0]['N']

# Extract CA atoms and a "RESNAME_resseq" label for every residue that has a
# CA atom.
# NOTE(review): the label drops the insertion code, and `'CA' in residue`
# would also match hetero residues that carry an atom named CA — confirm that
# neither occurs in this structure.
ca_atoms_M = [(residue['CA'], f"{residue.resname}_{residue.id[1]}")
              for residue in chain_M if 'CA' in residue]
ca_atoms_N = [(residue['CA'], f"{residue.resname}_{residue.id[1]}")
              for residue in chain_N if 'CA' in residue]

# Collect residue IDs for easy reference (used as row/column labels below)
residues_M = [res_id for _, res_id in ca_atoms_M]
residues_N = [res_id for _, res_id in ca_atoms_N]

# Stack the CA coordinates into (n_residues, 3) arrays.
# reshape(-1, 3) keeps the shape valid even if a chain yields no CA atoms.
coords_M = np.array([atom.coord for atom, _ in ca_atoms_M]).reshape(-1, 3)
coords_N = np.array([atom.coord for atom, _ in ca_atoms_N]).reshape(-1, 3)

# All pairwise CA-CA distances in one broadcast operation instead of a nested
# Python loop: rows follow chain N, columns follow chain M. The result is
# cast to float64, the dtype of the matrix the loop version filled in.
distance_matrix = np.linalg.norm(
    coords_N[:, np.newaxis, :] - coords_M[np.newaxis, :, :], axis=-1
).astype(np.float64)

# Convert the distance matrix to a pandas DataFrame with residue IDs as row/column labels
distance_df = pd.DataFrame(distance_matrix, index=residues_N, columns=residues_M)
distance_df.head()

Unnamed: 0,MET_1,THR_2,ASN_3,HIS_4,PRO_5,MET_6,LEU_7,ILE_8,ASN_9,LEU_10,...,SER_596,ILE_597,THR_598,ILE_599,ILE_600,LEU_601,ILE_602,SER_603,THR_604,TYR_605
MET_1,55.015274,55.009197,52.996067,51.425522,49.828991,48.470726,46.577038,45.151386,44.115421,42.570705,...,86.368889,84.627998,85.775063,89.358223,89.294891,88.083916,90.831619,93.450592,92.621605,94.201324
TRP_2,53.039963,53.042133,50.843788,49.352093,47.541759,46.303802,44.590931,42.9734,41.779293,40.47044,...,83.606407,81.779984,82.989319,86.575142,86.398453,85.168655,87.985397,90.550194,89.623459,91.222847
TYR_3,50.068611,50.226425,48.150162,46.791748,45.124886,44.026363,42.113728,40.424622,39.491623,38.216492,...,83.546997,81.615669,82.684021,86.332222,86.17778,84.788071,87.547981,90.202751,89.228249,90.682487
GLU_4,49.026562,48.977764,47.018196,45.42358,43.920033,42.553505,40.569477,39.192017,38.259724,36.64946,...,82.326721,80.500244,81.463814,85.11351,85.124199,83.74826,86.402885,89.141243,88.312645,89.731323
ILE_5,46.746826,46.599575,44.443596,42.827477,41.10844,39.758141,37.992359,36.513149,35.347801,33.884232,...,78.777939,76.90406,77.905334,81.555893,81.504662,80.123787,82.818848,85.524879,84.6492,86.087006


### Now, get the minimum distance from each nuclear-encoded residue (chain N) to any MT-encoded residue (chain M)

In [23]:
# For every row of distance_df, find the column holding the smallest distance
# and that distance itself. idxmin/min along axis=1 do this for all rows at
# once, replacing a row-by-row iterrows() loop.
nearest_col = distance_df.idxmin(axis=1)
nearest_dist = distance_df.min(axis=1)

# One record per row: (row residue, closest column residue, minimum distance).
# Values are passed as plain arrays so the result gets a fresh 0..n-1 index
# rather than inheriting the residue labels.
min_values_df = pd.DataFrame({
    'Row_Residue': distance_df.index.to_numpy(),
    'Min_Column_Residue': nearest_col.to_numpy(),
    'Min_Distance': nearest_dist.to_numpy(),
})

# Keep the list-of-tuples form under its original name for any later cell
# that still reads it.
min_values_info = list(min_values_df.itertuples(index=False, name=None))

min_values_df.head()

Unnamed: 0,Row_Residue,Min_Column_Residue,Min_Distance
0,MET_1,TYR_37,6.547333
1,TRP_2,TYR_37,5.771532
2,TYR_3,TYR_37,8.859904
3,GLU_4,LYS_33,6.651138
4,ILE_5,LYS_33,5.466868


In [None]:
# distance_df.to_csv('distances_anoImb.tsv', sep='\t')