In [1]:
from protein_configuration import distance_residue, distance_cutoff
import MDAnalysis
from MDAnalysis.analysis import distances
import itertools
from numpy.lib.function_base import average
import pandas as pd
import dask
import dask.multiprocessing
from dask.distributed import Client
#dask.config.set(scheduler='processes')
#client = Client()
#client

In [2]:
#directory = '/home/emanuele/TTR/greta_cutoff_55_ex_2/epsilon_0275_latestdihedrals_newljr_harp0/monomer_test/plain_MD'
directory = '/home/emanuele/ABeta'
#reference_structure = f'{directory}/monomer_analysis.gro'
#reference_trajectory = f'{directory}/monomer_analysis.xtc'
reference_structure = f'{directory}/reduced-noh.gro'
reference_trajectory = f'{directory}/reduced-noh.xtc'

In [3]:
def make_pairs(atomgroup, frame_index, pairs_ai, pairs_aj):

    framed_peptide = atomgroup.universe.trajectory[frame_index]
    self_distance = distances.self_distance_array(framed_peptide.positions)
    monomer_pairs_df = pd.DataFrame(columns=['ai', 'aj','ai_name', 'aj_name', 'ai_resnum', 'aj_resnum', 'distances'])
    monomer_pairs_df['ai'] = pairs_ai
    monomer_pairs_df['aj'] = pairs_aj
    monomer_pairs_df['distances'] = self_distance
    monomer_pairs_df = monomer_pairs_df[monomer_pairs_df['distances'] < distance_cutoff]
    monomer_pairs_df[['ai_name','ai_resnum']] = monomer_pairs_df.ai.str.split("_", expand=True)
    monomer_pairs_df[['aj_name','aj_resnum']] = monomer_pairs_df.aj.str.split("_", expand=True)
    monomer_pairs_df = monomer_pairs_df.astype({"ai_resnum": int, "aj_resnum": int})
    monomer_pairs_df.drop(monomer_pairs_df[abs(monomer_pairs_df['aj_resnum'] - monomer_pairs_df['ai_resnum']) < distance_residue].index, inplace=True)

    return monomer_pairs_df    

In [4]:
u = MDAnalysis.Universe(reference_structure, reference_trajectory)
peptides = u.select_atoms('all')
print('Residues: ', u.residues)
print('Atoms: ', len(peptides))

atomtypes = []
for atom in peptides:
    atp = str(atom.name) + '_' + str(atom.resnum)
    atomtypes.append(atp)

pairs_list = list(itertools.combinations(atomtypes, 2))
pairs_ai, pairs_aj = [], []
for n in range(0, len(pairs_list)):
    i = pairs_list[n][0]
    pairs_ai.append(i)
    j = pairs_list[n][1]
    pairs_aj.append(j)

print('Pairs list: ',len(pairs_list))
print('Number of frames: ', len(u.trajectory))
total_frames = len(u.trajectory)    

Residues:  <ResidueGroup [<Residue ASP, 1>, <Residue ALA, 2>, <Residue GLU, 3>, ..., <Residue VAL, 40>, <Residue ILE, 41>, <Residue ALA, 42>]>
Atoms:  319
Pairs list:  50721
Number of frames:  7921


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  residx = np.zeros_like(criteria[0], dtype=np.int)


In [5]:
job_list = []
for frame_index in range(u.trajectory.n_frames):
    job_list.append(dask.delayed(make_pairs(atomgroup=peptides, frame_index=frame_index, pairs_ai=pairs_ai, pairs_aj=pairs_aj)))

monomer_pairs_df = pd.concat(dask.compute(job_list)[0])
monomer_pairs_df

Unnamed: 0,ai,aj,ai_name,aj_name,ai_resnum,aj_resnum,distances
343,CA_1,CE1_4,CA,CE1,1,4,5.385586
658,CB_1,CD1_4,CB,CD1,1,4,5.350300
659,CB_1,CE1_4,CB,CE1,1,4,4.225743
660,CB_1,CZ_4,CB,CZ,1,4,4.633304
974,CG_1,CE1_4,CG,CE1,1,4,5.045256
...,...,...,...,...,...,...,...
50611,C_40,CA_42,C,CA,40,42,4.963446
50612,C_40,CB_42,C,CB,40,42,4.883984
50624,O_40,N_42,O,N,40,42,3.438983
50625,O_40,CA_42,O,CA,40,42,4.350930


In [6]:
count_ai, count_aj, count_distance, count_ratio, average_distance = [], [], [], [], []

total_pairs = len(pairs_list)
for n_counter, pair in enumerate(pairs_list, 1):
    print(f'Processing {n_counter} out of {total_pairs}: {pair}')
    # filtering the data frame based on the pairs values
    count_ai.append(pair[0])
    count_aj.append(pair[1])
    # salvati il df che serve per la media delle distanze e del sigma
    counts_df = monomer_pairs_df[(monomer_pairs_df['ai'] == pair[0]) & (monomer_pairs_df['aj'] == pair[1])]
    average_distance.append(counts_df['distances'].mean())
    count_distance.append(len(counts_df))
    count_ratio.append(len(counts_df)/len(u.trajectory))


Processing 1 out of 50721: ('N_1', 'CA_1')
Processing 2 out of 50721: ('N_1', 'CB_1')
Processing 3 out of 50721: ('N_1', 'CG_1')
Processing 4 out of 50721: ('N_1', 'OD1_1')
Processing 5 out of 50721: ('N_1', 'OD2_1')
Processing 6 out of 50721: ('N_1', 'C_1')
Processing 7 out of 50721: ('N_1', 'O_1')
Processing 8 out of 50721: ('N_1', 'N_2')
Processing 9 out of 50721: ('N_1', 'CA_2')
Processing 10 out of 50721: ('N_1', 'CB_2')
Processing 11 out of 50721: ('N_1', 'C_2')
Processing 12 out of 50721: ('N_1', 'O_2')
Processing 13 out of 50721: ('N_1', 'N_3')
Processing 14 out of 50721: ('N_1', 'CA_3')
Processing 15 out of 50721: ('N_1', 'CB_3')
Processing 16 out of 50721: ('N_1', 'CG_3')
Processing 17 out of 50721: ('N_1', 'CD_3')
Processing 18 out of 50721: ('N_1', 'OE1_3')
Processing 19 out of 50721: ('N_1', 'OE2_3')
Processing 20 out of 50721: ('N_1', 'C_3')
Processing 21 out of 50721: ('N_1', 'O_3')
Processing 22 out of 50721: ('N_1', 'N_4')
Processing 23 out of 50721: ('N_1', 'CA_4')
Pr

In [7]:
pairs_count = pd.DataFrame(columns=['ai', 'aj', 'count', 'ratio', 'average_distance'])
pairs_count['ai'] = count_ai
pairs_count['aj'] = count_aj
pairs_count['count'] = count_distance
pairs_count['ratio'] = count_ratio
pairs_count['average_distance'] = average_distance
pairs_count.sort_values(by = ['ratio'], inplace = True, ascending=False)
pairs_count.dropna()
pairs_count


Unnamed: 0,ai,aj,count,ratio,average_distance
1899,C_1,N_3,7921,1.0,4.007297
26422,C_12,N_14,7921,1.0,3.578960
43225,O_23,N_25,7921,1.0,3.601102
6182,C_3,N_5,7921,1.0,3.874453
30631,C_14,N_16,7921,1.0,3.970003
...,...,...,...,...,...
36190,CB_18,N_19,0,0.0,
36189,CB_18,O_18,0,0.0,
36188,CB_18,C_18,0,0.0,
36187,CB_18,CG2_18,0,0.0,


In [8]:
file = open('monomer_pairs_parallel_check.txt', 'w')
file.write(pairs_count.to_string(index=False, header=False))
file.close()

In [None]:
def pairs_counter(pair, monomer_pairs_df, total_frames):
    count_distance, count_ratio, average_distance = [], [], []
    #print(f'Processing {n_counter} out of {total_pairs}: {pair}')
    # filtering the data frame based on the pairs values
    # salvati il df che serve per la media delle distanze e del sigma
    counts_df = monomer_pairs_df[(monomer_pairs_df['ai'] == pair[0]) & (monomer_pairs_df['aj'] == pair[1])]
    average_distance.append(counts_df['distances'].mean())
    count_distance.append(len(counts_df))
    count_ratio.append(len(counts_df)/total_frames)

    return (pair[0], pair[1], average_distance, count_distance, count_ratio)


In [None]:
import ipyparallel as ipp
ippc = ipp.Client()
ippc.ids

In [None]:
from ipyparallel import Client

rc = Client()
view = rc.load_balanced_view()


count_ai, count_aj, async_results = [], [], []
for pair in pairs_list[:100]:
    count_ai.append(pair[0])
    count_aj.append(pair[1])
    async_result = view.apply_async(pairs_counter, pair, monomer_pairs_df, total_frames)
    async_results.append(async_result)

rc.wait_interactive(async_results)

results = [ar.get() for ar in async_results]
print(results)

In [None]:
job_list = []
count_ai, count_aj, count_distance, count_ratio, average_distance = [], [], [], [], []

for pair in pairs_list:
    n_counter = 1
    print(f'Processing {n_counter} out of {total_pairs}: {pair}')
    job_list.append(dask.delayed(pairs_counter(pair=pair, monomer_pairs_df=monomer_pairs_df)))
    n_counter += 1
    
pairs_count = pd.concat(dask.compute(job_list)[0])
pairs_count