## Jupyter notebook for processing AlphaFold outputs and collecting PDB files for PyMOL

In [2]:
import os
import pandas as pd

### Data step 1: grab PDB file names from the local directory
Produces `complex_id_pdb_dict`, mapping [seq1_id]\_[seq2_id] to pdb\_file\_name

In [75]:
# --- PDB files ---
# Directory (relative to the notebook) that holds the raw PDB outputs.
garbage_pdb_path = 'garbage_out_pdb/' # relative
# os.listdir returns entries in arbitrary order; sort so the listing (and anything
# derived from iterating over it) is identical on every run and every machine.
pdb_files_list = sorted(os.listdir(garbage_pdb_path))


# --- config file ---
# Tab-separated config table; the cells below read its seq1_id / seq2_id columns.
config_file_path = 'config_files/30dim_leq200totlen_config.txt' # relative
config_file_df = pd.read_csv(config_file_path, sep='\t')

In [76]:
# Number of config rows per (seq1_id, seq2_id) pair.
# Should match output results; already checked in retry_failed_jobs.ipynb.
config_file_counts_df = (
    config_file_df
    .groupby(['seq1_id', 'seq2_id'])
    .size()
    .reset_index(name='count')
)
config_file_counts_df.head()

Unnamed: 0,seq1_id,seq2_id,count
0,5oxz_A,5oxz_B,139
1,5ytq_A,5ytq_B,155
2,6dm9_A,6dm9_B,157
3,6dvu_A,6dvu_B,167
4,6eby_A,6eby_B,159


#### Create dictionary matching metadata to PDB file name
[seq1_id]\_[seq2_id] : pdb\_file\_name

Note: only the unperturbed (baseline) multimer runs are mapped.

In [87]:
# Map each complex ("<seq1_id>_<seq2_id>") to the name of its baseline PDB file.
complex_id_pdb_dict = {}

seq_id_pairs = zip(config_file_counts_df['seq1_id'], config_file_counts_df['seq2_id'])
for seq1_id, seq2_id in seq_id_pairs:
    complex_id = seq1_id + '_' + seq2_id
    # the trailing '_0' identifies the run where nothing is deleted
    baseline_complex_id = complex_id + '_0'

    # Substring match against every file name; when several files match,
    # the last one in pdb_files_list is the one that is kept.
    for candidate_file in pdb_files_list:
        if baseline_complex_id in candidate_file:
            complex_id_pdb_dict[complex_id] = candidate_file

# number of complexes that have a baseline PDB file
len(complex_id_pdb_dict)

30

#### Copy and save useful PDB files into new directory

In [84]:
# for copying files
import shutil

# Set to True to actually copy the baseline PDB files. Off by default so that
# re-running the notebook does not touch the filesystem.
COPY_PDB_FILES = False

dst_path = 'pdb_files/multimer_pdb/'

if COPY_PDB_FILES:
    # shutil.copyfile does not create missing parent directories
    os.makedirs(dst_path, exist_ok=True)

for pdb_file in complex_id_pdb_dict.values():
    src_file = os.path.join(garbage_pdb_path, pdb_file) # local path
    dst_file = os.path.join(dst_path, pdb_file)

    # copy file
    if COPY_PDB_FILES:
        shutil.copyfile(src_file, dst_file)



### Data step 2: process outputs from AlphaFold

#### Multimer

In [89]:
# (1) grab results
multimer_output_df = pd.read_csv('deletion_perturb_out/multimer_output.csv')

# (2) want dataframe with metadata_id full_seq which_seq delete_index all_scores
# NOTE: this is an alias, not a copy -- the columns added below also appear on multimer_output_df.
multimer_results_df = multimer_output_df
multimer_results_df['complex_id'] = multimer_results_df['seq1_id'] + '_' + multimer_results_df['seq2_id']

# new baseline columns
# Rows with which_seq == 0 are the baseline run of their complex. Collect one row of
# baseline scores per complex_id (if a complex has several baseline rows, the last one
# wins) and broadcast it to every row of that complex with a vectorized map.
score_columns = ['mean_pLDDT_score', 'pTM_score', 'ipTM_score']
is_baseline_row = multimer_results_df['which_seq'] == 0
baseline_scores_df = (
    multimer_results_df.loc[is_baseline_row, ['complex_id'] + score_columns]
    .dropna(subset=['complex_id'])  # a missing complex_id must never act as a lookup key
    .drop_duplicates(subset='complex_id', keep='last')
    .set_index('complex_id')
)

# Complexes without a baseline row get NaN baselines (and therefore NaN deltas)
# instead of the baseline columns being left undefined.
for score_column in score_columns:
    multimer_results_df['baseline_' + score_column] = multimer_results_df['complex_id'].map(baseline_scores_df[score_column])

# compute delta scores --> high delta = high score loss upon deletion = important.
for score_column in score_columns:
    multimer_results_df['delta_' + score_column] = multimer_results_df['baseline_' + score_column] - multimer_results_df[score_column]

# add pdb file names
multimer_results_df['pdb_file'] = multimer_results_df['complex_id'].map(complex_id_pdb_dict)

# check results
multimer_results_df.head()


Unnamed: 0,seq1_id,seq2_id,seq1,seq2,which_seq,delete_index,mean_pLDDT_score,pTM_score,ipTM_score,complex_id,baseline_mean_pLDDT_score,baseline_pTM_score,baseline_ipTM_score,delta_mean_pLDDT_score,delta_pTM_score,delta_ipTM_score,pdb_file
0,6owd_A,6owd_B,XQIARLQRQIRALQRQNARLQRQIRALQWX,XQIARLQRQIRALQRQNARLQRQIRALQWX,1,2,95.625,0.782227,0.75293,6owd_A_6owd_B,95.25,0.783691,0.754395,-0.375,0.001465,0.001465,6owd_A_6owd_B_0_1_9c291_unrelaxed_rank_001_alp...
1,6owd_A,6owd_B,XQIARLQRQIRALQRQNARLQRQIRALQWX,XQIARLQRQIRALQRQNARLQRQIRALQWX,1,5,93.125,0.730469,0.675293,6owd_A_6owd_B,95.25,0.783691,0.754395,2.125,0.053223,0.079102,6owd_A_6owd_B_0_1_9c291_unrelaxed_rank_001_alp...
2,6owd_A,6owd_B,XQIARLQRQIRALQRQNARLQRQIRALQWX,XQIARLQRQIRALQRQNARLQRQIRALQWX,1,3,95.5625,0.775391,0.743164,6owd_A_6owd_B,95.25,0.783691,0.754395,-0.3125,0.008301,0.01123,6owd_A_6owd_B_0_1_9c291_unrelaxed_rank_001_alp...
3,6owd_A,6owd_B,XQIARLQRQIRALQRQNARLQRQIRALQWX,XQIARLQRQIRALQRQNARLQRQIRALQWX,1,4,95.25,0.770996,0.739746,6owd_A_6owd_B,95.25,0.783691,0.754395,0.0,0.012695,0.014648,6owd_A_6owd_B_0_1_9c291_unrelaxed_rank_001_alp...
4,6owd_A,6owd_B,XQIARLQRQIRALQRQNARLQRQIRALQWX,XQIARLQRQIRALQRQNARLQRQIRALQWX,2,3,95.1875,0.771973,0.742188,6owd_A_6owd_B,95.25,0.783691,0.754395,0.0625,0.011719,0.012207,6owd_A_6owd_B_0_1_9c291_unrelaxed_rank_001_alp...


In [91]:
# save to csv
# multimer_results_df.to_csv('deletion_perturb_out/processed_multimer_output.csv', index=False)

#### Monomer (partial code; for Luke to run)

In [None]:
# # --- skeleton code for monomer --- #
# # CHANGE ALL VARIABLES WITH CAPS NAMES (PLACEHOLDERS) TO YOUR ACTUAL VARIABLES

# # assume we have these variables (CHANGE NAMES HERE AND DOWN THE LINE):
# SEQ_TO_PDB_FILE_DICT = {} # maps full sequences to names of pdb files
# MATCHED_CSV_DF = pd.read_csv('path/to/csv') # contains ArrayTaskID, seq, delete_index, mean_pLDDT_score, pTM_score

# # goal: master csv that contains column with (baseline) pdb file name and delta scores
# processed_monomer_df = MATCHED_CSV_DF

# for index, row in processed_monomer_df.iterrows():
#     # find baseline rows; use its scores as baseline scores
#     if row['delete_index'] == -1:
#         baseline_mean_pLDDT_score = row['mean_pLDDT_score']
#         baseline_pTM_score = row['pTM_score']

#         # find rows with same full seq
#         curr_full_seq = row['seq']
#         is_curr_full_seq = processed_monomer_df['seq'] == curr_full_seq 

#         # write in baseline scores
#         processed_monomer_df.loc[is_curr_full_seq, 'baseline_mean_pLDDT_score'] = baseline_mean_pLDDT_score
#         processed_monomer_df.loc[is_curr_full_seq, 'baseline_pTM_score'] = baseline_pTM_score

# # compute delta scores
# processed_monomer_df['delta_mean_pLDDT_score'] = processed_monomer_df['baseline_mean_pLDDT_score'] - processed_monomer_df['mean_pLDDT_score']
# processed_monomer_df['delta_pTM_score'] = processed_monomer_df['baseline_pTM_score'] - processed_monomer_df['pTM_score']

# # add pdb file names
# processed_monomer_df['pdb_file'] = processed_monomer_df['seq'].map(SEQ_TO_PDB_FILE_DICT) # first variable

# # check results
# processed_monomer_df.head()

# # save file when good
# # processed_monomer_df.to_csv('deletion_perturb_out/processed_monomer_output.csv', index=False)


