## Jupyter notebook for data processing and PyMOL (Antibody-Antigen dataset)

In [3]:
import os
import pandas as pd

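### Data step 1: grab local PDB file paths
Builds `seq_pdb_dict`, mapping each full monomer sequence to its PDB file name, and `complex_id_pdb_dict`, mapping [seq1\_id]\_[seq2\_id] to the PDB file name of the corresponding multimer run.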

#### Directory Set Up

In [20]:
# Directory / config set-up for the AA (antibody-antigen) monomer and multimer runs.
# All paths are relative to the notebook's working directory and keep their trailing
# slash because later cells build file paths by plain string concatenation.
#
# os.listdir() returns names in arbitrary order, so every listing is sorted here:
# later cells pick the "first" (or "last") matching file, and sorting makes that
# choice reproducible across runs and machines.

# AA monomer
AA_monomer_pdb_directory = 'AA_garbage_out_pdb_mono/' # TODO(review): original author flagged this path as most likely wrong -- verify
AA_monomer_pdb_files = sorted(os.listdir(AA_monomer_pdb_directory)) # local PDB files

AA_monomer_csv_directory = 'AA_garbage_out_csv/'
AA_monomer_csv_files = sorted(os.listdir(AA_monomer_csv_directory))

AA_monomer_config_file = 'config_files/AA_monomer_config.txt' # relative path to config files
AA_monomer_config_df = pd.read_csv(AA_monomer_config_file, sep='\t') # tab-separated

# AA multimer
AA_multimer_pdb_directory = 'AA_garbage_out_pdb/' # relative
AA_multimer_pdb_files = sorted(os.listdir(AA_multimer_pdb_directory))

AA_multimer_config_file = 'config_files/AA_multimer_config.txt' # relative path to config files
AA_multimer_config_df = pd.read_csv(AA_multimer_config_file, sep='\t') # tab-separated

In [23]:
# Sanity check: (rows, columns) of the monomer config after keeping only the first
# row for each distinct seq_id (keep="first" is the drop_duplicates default).
# Multimer equivalent would de-duplicate on ['seq1_id', 'seq2_id'] instead.
AA_monomer_config_df.drop_duplicates(subset='seq_id').shape

(30, 4)

#### Get unique seq_id to look for in PDB file names

In [24]:
# multimer: one row per distinct (seq1_id, seq2_id) pair, first occurrence kept.
# The multimer dictionary cell further down iterates over this frame, so it has to be
# defined here for the notebook to survive Restart Kernel -> Run All.
AA_multimer_config_unique_df = AA_multimer_config_df.drop_duplicates(subset=['seq1_id', 'seq2_id'], keep="first")

# monomer: one row per distinct seq_id, first occurrence kept
AA_monomer_config_unique_df = AA_monomer_config_df.drop_duplicates(subset='seq_id', keep="first")

# displayed output of this cell: preview of the unique monomer rows
AA_monomer_config_unique_df.head()

Unnamed: 0,ArrayTaskID,seq_id,seq,delete_index
0,1,7a0w_A,AVTLDESGGGLQTPGGGLSLVCKASGFTFSDYGMGWVRQAPDKGLE...,-1
123,124,7a0w_B,ALTQPSSVSANPGGTVKITCSGGTYSYGWFQQKSPGSAPVTVIYWN...,-1
227,228,7fcq_A,QVQLVQSGAEVKKPGASVKVSCKASGYIFTSYSMHWVRQAPGQGLE...,-1
356,357,7fcq_B,QSVLTQPASVSGSPGQSITISCTGTSSDVGGYNFVSWYQQHPGKAP...,-1
467,468,7mdp_A,EVQLQESGPGLVKPPGTLSLTCAVSGGSISSSNWWSWVRQPPGKGL...,-1


#### Create dictionary matching metadata to PDB file name
For monomer: {seq : pdb\_file\_name}

For multimer: {complex\_id : pdb\_file\_name} where complex\_id concatenates seq1\_id and seq2\_id

Note: for multimers, only the unperturbed (baseline) runs are kept.

In [43]:
# --- monomer --- #
# Build seq_pdb_dict: {full sequence -> PDB file name}.
#
# For every unique seq_id in the monomer config:
#   1. collect the csv files whose *name* contains the seq_id,
#   2. read each csv and look for a data line whose second field equals the full
#      (unperturbed) sequence; the first field of that line is the run id,
#   3. look for PDB files whose *name* contains that run id and record one of them.
seq_pdb_dict = {}

for _, config_row in AA_monomer_config_unique_df.iterrows():

    seq_keyword = config_row['seq_id']  # substring expected inside the csv file names
    full_seq = config_row['seq']        # full sequence to look for

    # Substring match on the file name, so this picks up every csv for this seq_id;
    # the sequence comparison below is what singles out the matching run.
    candidate_csvs = [csv_name for csv_name in AA_monomer_csv_files if seq_keyword in csv_name]

    for cand_csv in candidate_csvs:
        cur_cand_path = AA_monomer_csv_directory + cand_csv

        with open(cur_cand_path) as cur_csv:
            for line in cur_csv:
                # Skip header line (generated by colabfold)
                if line[:2] == 'id':
                    continue

                # Drop the line terminator before splitting: otherwise the last field
                # keeps its '\n' and can never compare equal to full_seq.
                line_split = line.rstrip('\r\n').split(',')
                if len(line_split) < 2:
                    continue  # blank or malformed line: nothing to compare

                cur_id = line_split[0].strip()
                cur_seq = line_split[1].strip()

                # An empty id would be a substring of every PDB file name.
                if not cur_id or cur_seq != full_seq:
                    continue

                # PDB files whose name contains the run id; sorted so that the pick
                # below does not depend on directory listing order.
                matching_pdbs = sorted(pdb for pdb in AA_monomer_pdb_files if cur_id in pdb)
                if not matching_pdbs:
                    continue

                # More than one hit is unexpected: report it, then fall back to the first.
                if len(matching_pdbs) != 1:
                    print("uh oh, check code and results")
                    print(len(matching_pdbs))
                    print(matching_pdbs)

                # A later matching csv/line overwrites an earlier entry for the same sequence.
                seq_pdb_dict[full_seq] = matching_pdbs[0]

print(len(seq_pdb_dict))

30


In [16]:
# --- multimer --- #
# Build complex_id_pdb_dict: {seq1_id + '_' + seq2_id -> PDB file name of the baseline run}.
complex_id_pdb_dict = {}

for _, config_row in AA_multimer_config_unique_df.iterrows():
    complex_id = config_row['seq1_id'] + '_' + config_row['seq2_id']
    # trailing '_0' selects the runs where nothing is deleted (per the original author's note)
    baseline_complex_id = complex_id + '_0'

    # first listed PDB file whose name contains the baseline id; None when there is none
    baseline_pdb = next(
        (pdb_name for pdb_name in AA_multimer_pdb_files if baseline_complex_id in pdb_name),
        None,
    )
    if baseline_pdb is not None:
        complex_id_pdb_dict[complex_id] = baseline_pdb

print(len(complex_id_pdb_dict))
print(complex_id_pdb_dict)

15
{'7a0w_A_7a0w_B': 'AA_7a0w_A_7a0w_B_0_1_60767_unrelaxed_rank_001_alphafold2_multimer_v3_model_1_seed_000.pdb', '7fcq_A_7fcq_B': 'AA_7fcq_A_7fcq_B_0_1_b6d5d_unrelaxed_rank_001_alphafold2_multimer_v3_model_1_seed_000.pdb', '7mdp_A_7mdp_B': 'AA_7mdp_A_7mdp_B_0_1_6a8cf_unrelaxed_rank_001_alphafold2_multimer_v3_model_1_seed_000.pdb', '7phu_A_7phu_B': 'AA_7phu_A_7phu_B_0_1_d68ee_unrelaxed_rank_001_alphafold2_multimer_v3_model_1_seed_000.pdb', '7qez_A_7qez_B': 'AA_7qez_A_7qez_B_0_1_0f122_unrelaxed_rank_001_alphafold2_multimer_v3_model_1_seed_000.pdb', '7rew_A_7rew_B': 'AA_7rew_A_7rew_B_0_1_44146_unrelaxed_rank_001_alphafold2_multimer_v3_model_1_seed_000.pdb', '7s0x_A_7s0x_B': 'AA_7s0x_A_7s0x_B_0_1_3ca5f_unrelaxed_rank_001_alphafold2_multimer_v3_model_3_seed_000.pdb', '7sd5_A_7sd5_B': 'AA_7sd5_A_7sd5_B_0_1_3f9fd_unrelaxed_rank_001_alphafold2_multimer_v3_model_1_seed_000.pdb', '7t0l_A_7t0l_B': 'AA_7t0l_A_7t0l_B_0_1_04df1_unrelaxed_rank_001_alphafold2_multimer_v3_model_1_seed_000.pdb', '7tp4_

#### Copy and save useful PDB files into new directory

In [41]:
# Copy the selected PDB files into dedicated folders.
import shutil

AA_monomer_dst_path = 'pdb_files/AA_monomer_pdb/'
AA_multimer_dst_path = 'pdb_files/AA_multimer_pdb/'

# The multimer copy was commented out in the original cell, so it stays off by
# default; set to True to copy the multimer baseline PDB files as well.
COPY_MULTIMER_PDBS = False

# shutil.copyfile does not create missing parent directories, so make sure the
# destination exists before copying (no-op when it is already there).
os.makedirs(AA_monomer_dst_path, exist_ok=True)

for monomer_pdb_file in seq_pdb_dict.values():
    src_file = AA_monomer_pdb_directory + monomer_pdb_file # local path
    dst_file = AA_monomer_dst_path + monomer_pdb_file

    # copy file (overwrites an existing file of the same name)
    shutil.copyfile(src_file, dst_file)

if COPY_MULTIMER_PDBS:
    os.makedirs(AA_multimer_dst_path, exist_ok=True)

    for multimer_pdb_file in complex_id_pdb_dict.values():
        src_file = AA_multimer_pdb_directory + multimer_pdb_file # local path
        dst_file = AA_multimer_dst_path + multimer_pdb_file

        # copy file
        shutil.copyfile(src_file, dst_file)



### Data step 2: process AA outputs from AlphaFold

#### AA Monomer

In [46]:
# --- monomer: baseline and delta scores --- #
# Goal: master table with, for every row, the baseline scores of its sequence, the
# score loss relative to that baseline, and the (baseline) pdb file name.
#
# The CAPS names are placeholders kept from the skeleton this cell started as;
# point them at your own variables / files if they differ.
SEQ_TO_PDB_FILE_DICT = seq_pdb_dict # maps full sequences to names of pdb files
MATCHED_CSV_DF = pd.read_csv('deletion_perturb_out/matching_AA_monomer_output.csv') # contains ArrayTaskID, seq, delete_index, mean_pLDDT_score, pTM_score

# Work on a copy so the frame loaded from disk stays untouched.
processed_AA_monomer_df = MATCHED_CSV_DF.copy()

monomer_score_cols = ['mean_pLDDT_score', 'pTM_score']

# Baseline rows are the ones with delete_index == -1. Index them by sequence so they
# can be looked up for every row at once; keep='last' reproduces a row-by-row pass in
# which a later baseline row for the same sequence overwrites an earlier one.
monomer_baseline_df = (
    processed_AA_monomer_df[processed_AA_monomer_df['delete_index'] == -1]
    .dropna(subset=['seq'])
    .drop_duplicates(subset='seq', keep='last')
    .set_index('seq')
)

# write in baseline scores (NaN for sequences that have no baseline row)
for score_col in monomer_score_cols:
    processed_AA_monomer_df['baseline_' + score_col] = processed_AA_monomer_df['seq'].map(monomer_baseline_df[score_col])

# compute delta scores: baseline minus row score, so positive = score lost relative to baseline
for score_col in monomer_score_cols:
    processed_AA_monomer_df['delta_' + score_col] = processed_AA_monomer_df['baseline_' + score_col] - processed_AA_monomer_df[score_col]

# add pdb file names (NaN when the sequence is not a key of the dictionary)
processed_AA_monomer_df['pdb_file'] = processed_AA_monomer_df['seq'].map(SEQ_TO_PDB_FILE_DICT)

# check results
processed_AA_monomer_df.head()


Unnamed: 0,ArrayTaskID,seq_id,seq,delete_index,mean_pLDDT_score,pTM_score,baseline_mean_pLDDT_score,baseline_pTM_score,delta_mean_pLDDT_score,delta_pTM_score,pdb_file
0,1,7a0w_A,AVTLDESGGGLQTPGGGLSLVCKASGFTFSDYGMGWVRQAPDKGLE...,-1,95.0625,0.89209,95.0625,0.89209,0.0,0.0,AA_7a0w_A_ea32f_unrelaxed_rank_001_alphafold2_...
1,2,7a0w_A,AVTLDESGGGLQTPGGGLSLVCKASGFTFSDYGMGWVRQAPDKGLE...,0,94.5,0.886719,95.0625,0.89209,0.5625,0.005371,AA_7a0w_A_ea32f_unrelaxed_rank_001_alphafold2_...
2,3,7a0w_A,AVTLDESGGGLQTPGGGLSLVCKASGFTFSDYGMGWVRQAPDKGLE...,1,94.1875,0.885742,95.0625,0.89209,0.875,0.006348,AA_7a0w_A_ea32f_unrelaxed_rank_001_alphafold2_...
3,4,7a0w_A,AVTLDESGGGLQTPGGGLSLVCKASGFTFSDYGMGWVRQAPDKGLE...,2,94.0,0.885254,95.0625,0.89209,1.0625,0.006836,AA_7a0w_A_ea32f_unrelaxed_rank_001_alphafold2_...
4,5,7a0w_A,AVTLDESGGGLQTPGGGLSLVCKASGFTFSDYGMGWVRQAPDKGLE...,3,91.5625,0.864746,95.0625,0.89209,3.5,0.027344,AA_7a0w_A_ea32f_unrelaxed_rank_001_alphafold2_...


In [47]:
# save file when good
# processed_AA_monomer_df.to_csv('deletion_perturb_out/processed_AA_monomer_output.csv', index=False)

#### AA Multimer

In [18]:
# --- multimer: baseline and delta scores --- #

# (1) grab results
AA_multimer_output_df = pd.read_csv('deletion_perturb_out/matching_AA_multimer_output.csv')

# (2) build the results table on a copy, so the frame loaded from disk stays untouched
AA_multimer_results_df = AA_multimer_output_df.copy()
AA_multimer_results_df['complex_id'] = AA_multimer_results_df['seq1_id'] + '_' + AA_multimer_results_df['seq2_id']

multimer_score_cols = ['mean_pLDDT_score', 'pTM_score', 'ipTM_score']

# Baseline rows are the ones with which_seq == 0. Index them by complex_id so they can
# be looked up for every row at once; keep='last' reproduces a row-by-row pass in which
# a later baseline row for the same complex overwrites an earlier one.
multimer_baseline_df = (
    AA_multimer_results_df[AA_multimer_results_df['which_seq'] == 0]
    .dropna(subset=['complex_id'])
    .drop_duplicates(subset='complex_id', keep='last')
    .set_index('complex_id')
)

# new baseline columns (NaN for complexes that have no baseline row)
for score_col in multimer_score_cols:
    AA_multimer_results_df['baseline_' + score_col] = AA_multimer_results_df['complex_id'].map(multimer_baseline_df[score_col])

# compute delta scores --> high delta = high score loss upon deletion = important.
for score_col in multimer_score_cols:
    AA_multimer_results_df['delta_' + score_col] = AA_multimer_results_df['baseline_' + score_col] - AA_multimer_results_df[score_col]

# add pdb file names (NaN when the complex_id is not a key of the dictionary)
AA_multimer_results_df['pdb_file'] = AA_multimer_results_df['complex_id'].map(complex_id_pdb_dict)

# check results
AA_multimer_results_df.head()


Unnamed: 0,ArrayTaskID,seq1_id,seq2_id,seq1,seq2,which_seq,delete_index,complex_id,mean_pLDDT_score,pTM_score,ipTM_score,baseline_mean_pLDDT_score,baseline_pTM_score,baseline_ipTM_score,delta_mean_pLDDT_score,delta_pTM_score,delta_ipTM_score,pdb_file
0,1,7a0w_A,7a0w_B,AVTLDESGGGLQTPGGGLSLVCKASGFTFSDYGMGWVRQAPDKGLE...,ALTQPSSVSANPGGTVKITCSGGTYSYGWFQQKSPGSAPVTVIYWN...,0,-1,7a0w_A_7a0w_B,95.875,0.92627,0.901855,95.875,0.92627,0.901855,0.0,0.0,0.0,AA_7a0w_A_7a0w_B_0_1_60767_unrelaxed_rank_001_...
1,2,7a0w_A,7a0w_B,AVTLDESGGGLQTPGGGLSLVCKASGFTFSDYGMGWVRQAPDKGLE...,ALTQPSSVSANPGGTVKITCSGGTYSYGWFQQKSPGSAPVTVIYWN...,1,0,7a0w_A_7a0w_B,95.875,0.926758,0.901855,95.875,0.92627,0.901855,0.0,-0.000488,0.0,AA_7a0w_A_7a0w_B_0_1_60767_unrelaxed_rank_001_...
2,3,7a0w_A,7a0w_B,AVTLDESGGGLQTPGGGLSLVCKASGFTFSDYGMGWVRQAPDKGLE...,ALTQPSSVSANPGGTVKITCSGGTYSYGWFQQKSPGSAPVTVIYWN...,1,1,7a0w_A_7a0w_B,95.5625,0.924805,0.898926,95.875,0.92627,0.901855,0.3125,0.001465,0.00293,AA_7a0w_A_7a0w_B_0_1_60767_unrelaxed_rank_001_...
3,4,7a0w_A,7a0w_B,AVTLDESGGGLQTPGGGLSLVCKASGFTFSDYGMGWVRQAPDKGLE...,ALTQPSSVSANPGGTVKITCSGGTYSYGWFQQKSPGSAPVTVIYWN...,1,2,7a0w_A_7a0w_B,95.625,0.924316,0.898438,95.875,0.92627,0.901855,0.25,0.001953,0.003418,AA_7a0w_A_7a0w_B_0_1_60767_unrelaxed_rank_001_...
4,5,7a0w_A,7a0w_B,AVTLDESGGGLQTPGGGLSLVCKASGFTFSDYGMGWVRQAPDKGLE...,ALTQPSSVSANPGGTVKITCSGGTYSYGWFQQKSPGSAPVTVIYWN...,1,3,7a0w_A_7a0w_B,95.5,0.924316,0.899902,95.875,0.92627,0.901855,0.375,0.001953,0.001953,AA_7a0w_A_7a0w_B_0_1_60767_unrelaxed_rank_001_...


In [19]:
# save to csv
# AA_multimer_results_df.to_csv('deletion_perturb_out/processed_AA_multimer_output.csv', index=False)