In [1]:
## import for plotting
%matplotlib inline
from os import listdir
from os.path import isfile, join
import numpy as np
from glob import glob
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
from adjustText import adjust_text

## import for stats
import statsmodels.stats.multitest as smt
from scipy import stats
from collections import defaultdict

In [None]:
## import for PDB manipulation
from Bio import PDB

### Compute pairwise Cα distances between MT-encoded proteins (chain M) and nuclear-encoded proteins (chain N)

In [8]:
# Directory that holds the structure file. This is an absolute path on the
# author's machine; adjust it when running the notebook elsewhere.
dir_path = '/Users/osipova/Documents/LabDocs/Brood_parasites_analysis/NAD_complex_structure/'
# Name of the PDB structure file (inside dir_path) to analyse.
pdb_file = 'COXI_anoImb.pdb'

# Function to calculate distance between two points
def calculate_distance(coord1, coord2):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    coord1, coord2 : array-like
        Coordinates of the two points (NumPy arrays, lists or tuples) with
        matching shapes.

    Returns
    -------
    numpy floating scalar
        The norm of ``coord1 - coord2``.
    """
    # np.asarray is a no-op for ndarrays (dtype is preserved) and lets plain
    # Python sequences be subtracted element-wise as well.
    return np.linalg.norm(np.asarray(coord1) - np.asarray(coord2))

# Load the PDB file (QUIET=True suppresses the parser's construction warnings)
pdb_parser = PDB.PDBParser(QUIET=True)
# os.path.join gives the same path whether or not dir_path ends with a separator
structure = pdb_parser.get_structure('protein', join(dir_path, pdb_file))

# Select chains M and N from the first model of the structure
model = structure[0]
chain_M = model['M']
chain_N = model['N']

# Extract CA atoms and a "<RESNAME>_<residue number>" label for every residue
# in each chain that has a CA atom.
# NOTE(review): the test is by atom name only, so a hetero residue that
# contains an atom called 'CA' would also be picked up -- confirm the input
# structure has none.
ca_atoms_M = [(residue['CA'], f"{residue.resname}_{residue.id[1]}")
              for residue in chain_M if 'CA' in residue]
ca_atoms_N = [(residue['CA'], f"{residue.resname}_{residue.id[1]}")
              for residue in chain_N if 'CA' in residue]

# Stack the coordinates into (n_residues, 3) arrays. The reshape keeps the
# arrays two-dimensional even when a chain contributes no CA atoms.
coords_M = np.array([atom.coord for atom, _ in ca_atoms_M], dtype=np.float64).reshape(-1, 3)
coords_N = np.array([atom.coord for atom, _ in ca_atoms_N], dtype=np.float64).reshape(-1, 3)

# All pairwise M-vs-N distances in one vectorised step:
# (n_M, 1, 3) - (1, n_N, 3) broadcasts to (n_M, n_N, 3); the norm over the
# last axis gives the (n_M, n_N) distance matrix. This replaces a nested
# Python loop that called the norm once per residue pair.
distance_matrix = np.linalg.norm(coords_M[:, None, :] - coords_N[None, :, :], axis=-1)

# Collect residue labels for easy reference
residues_M = [label for _, label in ca_atoms_M]
residues_N = [label for _, label in ca_atoms_N]

# Convert the distance matrix to a pandas DataFrame with chain M residues as
# row labels and chain N residues as column labels
distance_df = pd.DataFrame(distance_matrix, index=residues_M, columns=residues_N)

In [9]:
# Display the distance matrix (rows: chain M residues, columns: chain N residues)
distance_df

Unnamed: 0,MET_1,TRP_2,TYR_3,GLU_4,ILE_5,LEU_6,PRO_7,GLY_8,MET_9,ALA_10,...,ALA_719,HIS_720,ALA_721,VAL_722,GLU_723,GLU_724,PRO_725,SER_726,ILE_727,CYS_728
MET_1,55.015274,53.039963,50.068611,49.026562,46.746826,44.717251,42.482414,41.536713,40.809879,38.761604,...,120.595116,118.910904,118.158623,115.391571,113.890152,110.153618,109.050606,108.196861,104.411400,103.955521
THR_2,55.009197,53.042133,50.226425,48.977764,46.599575,44.848904,42.561554,41.311039,40.731358,38.958118,...,119.277336,117.674980,116.847656,114.033638,112.634155,108.896523,107.913261,107.119568,103.335136,103.009918
ASN_3,52.996067,50.843788,48.150162,47.018196,44.443596,42.756943,40.692173,39.269566,38.463417,36.886650,...,118.774040,117.251953,116.502792,113.614716,112.188644,108.473106,107.537498,106.640549,102.843895,102.508797
HIS_4,51.425522,49.352093,46.791748,45.423580,42.827477,41.415733,39.248894,37.580429,37.003830,35.663334,...,115.825829,114.363930,113.567780,110.646011,109.289307,105.576729,104.728081,103.870880,100.078003,99.839157
PRO_5,49.828991,47.541759,45.124886,43.920033,41.108440,39.759850,37.879242,36.038116,35.175888,34.067284,...,115.887749,114.506882,113.799263,110.806084,109.416100,105.731911,104.926338,103.956612,100.156174,99.897247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LEU_601,88.083916,85.168655,84.788071,83.748260,80.123787,80.643875,80.294693,76.948112,75.670387,76.949760,...,144.220840,144.100555,143.056366,139.539597,139.141220,135.864822,136.432983,135.650955,132.152603,132.992538
ILE_602,90.831619,87.985397,87.547981,86.402885,82.818848,83.338669,82.858208,79.511101,78.342339,79.558334,...,145.731537,145.611923,144.477188,140.985321,140.664993,137.382065,137.988464,137.302032,133.817703,134.729355
SER_603,93.450592,90.550194,90.202751,89.141243,85.524879,86.078346,85.719780,82.358276,81.108284,82.400131,...,148.425568,148.370590,147.266464,143.747543,143.440063,140.191513,140.843658,140.120926,136.655106,137.578644
THR_604,92.621605,89.623459,89.228249,88.312645,84.649200,85.078102,84.812737,81.518944,80.109940,81.341438,...,149.522156,149.417023,148.389175,144.864380,144.459656,141.190536,141.758850,140.954071,137.454758,138.276855


In [10]:
# Save the full distance matrix as a tab-separated file; the relative file name
# means it is written to the current working directory.
distance_df.to_csv('distances_anoImb.tsv', sep='\t')

In [14]:
# Filter the matrix to keep only rows and columns with any distance value below
# the cutoff.
# NOTE(review): the cutoff is in the units of the PDB coordinates (presumably
# angstroms) -- confirm.
CONTACT_CUTOFF = 10

# Compute the boolean "closer than cutoff" mask once and reuse it for both axes
close_mask = distance_df.lt(CONTACT_CUTOFF)
row_has_contact = close_mask.any(axis=1).to_numpy()  # Rows with any value < cutoff
col_has_contact = close_mask.any(axis=0).to_numpy()  # Columns with any value < cutoff

filtered_rows = distance_df.index[row_has_contact]
filtered_cols = distance_df.columns[col_has_contact]

# Subset with the positional boolean masks rather than with the labels:
# label-based selection would repeat rows/columns if two residues ever shared
# the same "<RESNAME>_<number>" label.
filtered_distance_df = distance_df.loc[row_has_contact, col_has_contact]
filtered_distance_df

Unnamed: 0,MET_1,TRP_2,TYR_3,GLU_4,ILE_5,LEU_6,PRO_7,GLY_8,MET_9,ALA_10,...,ILE_24,PHE_25,MET_26,HIS_27,GLU_35,LYS_36,ILE_38,ALA_39,ARG_40,TYR_41
MET_12,39.340168,37.021801,34.615852,33.465416,30.611977,29.260687,27.487503,25.604204,24.652086,23.627249,...,11.737617,12.106692,9.232236,10.821548,16.175486,14.140402,16.663254,18.963800,20.114710,23.524994
ALA_13,38.428543,36.110786,34.002636,32.629673,29.641434,28.756754,27.045982,24.707705,23.939699,23.468378,...,14.642351,15.487639,12.908010,14.241089,19.174263,16.943972,18.366482,20.287127,20.822046,24.255852
LEU_14,36.481045,34.460381,32.224777,30.554924,27.832430,26.983921,24.866343,22.628839,22.366285,21.707285,...,15.673966,15.906991,13.705523,15.896236,21.250212,19.478949,21.640718,23.819052,24.457682,27.949183
SER_15,34.703415,32.598061,30.111483,28.742649,26.066668,24.761023,22.714014,20.871960,20.280638,19.164984,...,12.390733,12.557989,10.938760,13.403924,18.709969,17.507681,20.377228,23.047256,23.855232,27.475864
TYR_16,33.438553,31.043388,28.847424,27.660515,24.641054,23.572544,22.015705,19.787310,18.773668,18.238819,...,12.525262,13.789906,12.470367,14.065465,18.606070,17.283613,19.130674,21.696732,21.939093,25.656973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
THR_270,22.772083,19.269306,18.148182,18.712614,15.021782,14.311793,15.788361,13.731269,10.373037,12.122069,...,19.766657,22.549442,23.814543,24.052231,25.170948,25.517935,25.673346,28.345264,26.971846,30.478477
LEU_271,19.053442,15.647624,14.441401,14.888520,11.199365,10.620801,12.295108,10.292995,6.883449,9.233138,...,21.612745,23.949745,25.409670,26.131901,27.668957,28.191622,28.897455,31.711550,30.521639,34.089939
LEU_272,18.387293,14.661091,14.029243,15.306784,11.849113,11.427586,14.066672,12.620235,9.199774,11.599439,...,24.091894,26.613239,28.307724,28.722799,29.614138,30.354387,30.707708,33.440262,32.015244,35.420555
ALA_275,16.096481,12.583381,12.873594,13.330388,9.768597,11.041677,13.553877,11.343130,9.086976,12.590768,...,27.530521,29.869419,31.206793,31.945127,33.448769,33.833111,34.043175,36.593502,35.022305,38.462841
