## Jupyter notebook to check completeness (Antibody-Antigen) and fill in missing data (if any)
Certain jobs (from a batch) on Grace can occasionally fail due to HTTP timeouts when using the mmseqs2 API. This Jupyter notebook checks whether all jobs were successful. If not, it identifies the missing jobs and creates a new config file to fill in the missing data.

In [13]:
import pandas as pd

### Step 1: check if data is complete

#### Grab outputs from Grace (scp to local first, then read csv)

In [42]:
# --- check paths: both files are the csv results copied over from Grace --- #
AA_monomer_outputs_df = pd.read_csv('../deletion_perturb_out/AA_monomer_output.csv')

# The multimer table additionally gets a complex_id of the form "<seq1_id>_<seq2_id>",
# which later cells use as the grouping / join key.
AA_multimer_outputs_df = (
    pd.read_csv('../deletion_perturb_out/AA_multimer_output.csv')
    .assign(complex_id=lambda multimer: multimer['seq1_id'] + '_' + multimer['seq2_id'])
)

# Quick size check against the number of submitted jobs.
print(AA_monomer_outputs_df.shape)  # expect 3481 rows
print(AA_multimer_outputs_df.shape)  # expect 3466 rows

(3483, 5)
(3469, 10)


Manual inspection of data: list the monomer output rows that contain any missing values

In [33]:
# Monomer output rows in which at least one column is missing.
temp = AA_monomer_outputs_df.loc[AA_monomer_outputs_df.isna().any(axis='columns')]
temp

Unnamed: 0,seq_id,seq,delete_index,mean_pLDDT_score,pTM_score
446,609375,,,,
647,5,,,,


#### [Test 1]: compare counts with expected outputs

In [44]:
# Observed number of output rows per baseline input sequence / complex.
AA_monomer_counts = AA_monomer_outputs_df.groupby(['seq_id']).size().reset_index(name='count')
AA_multimer_counts = AA_multimer_outputs_df.groupby(['complex_id']).size().reset_index(name='count')

# Expected number of rows, taken from the config files that were submitted.
AA_monomer_config_df = pd.read_csv('../config_files/AA_monomer_config.txt', sep='\t') # relative
AA_monomer_expected_counts = AA_monomer_config_df.groupby(['seq_id']).size().reset_index(name='count')

AA_multimer_config_df = pd.read_csv('../config_files/AA_multimer_config.txt', sep='\t') # relative
AA_multimer_config_df['complex_id'] = AA_multimer_config_df['seq1_id'] + '_' + AA_multimer_config_df['seq2_id']
AA_multimer_expected_counts = AA_multimer_config_df.groupby(['complex_id']).size().reset_index(name='count')

# Outer merge keeps ids that appear on only one side.
# count_x = observed (outputs), count_y = expected (config).
AA_monomer_merged_counts = pd.merge(AA_monomer_counts, AA_monomer_expected_counts, on=['seq_id'], how='outer')
AA_multimer_merged_counts = pd.merge(AA_multimer_counts, AA_multimer_expected_counts, on=['complex_id'], how='outer')

# Make the comparison explicit instead of relying on eyeballing the merged tables.
# `!=` is True when either side is NaN, so ids present on only one side are flagged too.
AA_monomer_count_mismatches = AA_monomer_merged_counts[
    AA_monomer_merged_counts['count_x'] != AA_monomer_merged_counts['count_y']
]
AA_multimer_count_mismatches = AA_multimer_merged_counts[
    AA_multimer_merged_counts['count_x'] != AA_multimer_merged_counts['count_y']
]

print(f'monomer ids with unexpected counts: {len(AA_monomer_count_mismatches)}')
print(f'multimer ids with unexpected counts: {len(AA_multimer_count_mismatches)}')

# manual inspection
# print(AA_monomer_merged_counts)
# print(AA_multimer_merged_counts)


       complex_id  count_x  count_y
0   7a0w_A_7a0w_B      226      226
1   7fcq_A_7fcq_B      239      239
2   7mdp_A_7mdp_B      232      232
3   7phu_A_7phu_B      231      231
4   7qez_A_7qez_B      233      233
5   7rew_A_7rew_B      229      229
6   7s0x_A_7s0x_B      227      227
7   7sd5_A_7sd5_B      241      241
8   7t0l_A_7t0l_B      228      228
9   7tp4_A_7tp4_B      233      233
10  7wbz_A_7wbz_B      226      226
11  7wvg_A_7wvg_B      232      232
12  7xy8_A_7xy8_B      225      225
13  7z0x_A_7z0x_B      238      238
14  8be1_A_8be1_B      226      226


#### [Test 2]: left join outputs onto expected to find matching/missing rows

In [45]:
# Columns that identify one job in both the config and the output tables.
monomer_join_keys = ['seq_id', 'seq', 'delete_index']
multimer_join_keys = ['complex_id', 'seq1_id', 'seq2_id', 'seq1', 'seq2', 'which_seq', 'delete_index']

# Left join: every expected (config) row is kept; output columns stay empty where no result exists.
AA_monomer_left_join_df = AA_monomer_config_df.merge(AA_monomer_outputs_df, how='left', on=monomer_join_keys)
AA_multimer_left_join_df = AA_multimer_config_df.merge(AA_multimer_outputs_df, how='left', on=multimer_join_keys)

# A row counts as "missing" as soon as any of its columns is null.
monomer_row_has_null = AA_monomer_left_join_df.isna().any(axis=1)
multimer_row_has_null = AA_multimer_left_join_df.isna().any(axis=1)

# complete rows
AA_monomer_matching_df = AA_monomer_left_join_df[~monomer_row_has_null]
AA_multimer_matching_df = AA_multimer_left_join_df[~multimer_row_has_null]

print(AA_monomer_matching_df.shape)
print(AA_multimer_matching_df.shape)

# rows that still need to be (re)computed
AA_monomer_missing_df = AA_monomer_left_join_df[monomer_row_has_null]
print(AA_monomer_missing_df.shape)
AA_multimer_missing_df = AA_multimer_left_join_df[multimer_row_has_null]
print(AA_multimer_missing_df.shape)

# An empty frame here means every expected job has a result.
AA_monomer_missing_df.head()
# AA_multimer_missing_df.head()

# optional: dump the missing rows to a local csv for closer inspection
# AA_monomer_missing_df.to_csv('find_AA_round1_missing.csv')

(3481, 6)
(3466, 11)
(0, 6)
(0, 11)


Unnamed: 0,ArrayTaskID,seq_id,seq,delete_index,mean_pLDDT_score,pTM_score


### Step 2: if missing data, create new config file for Grace

#### Save missing data as a "to fill in" config file

In [39]:
n_copies = 1 # run each missing job once


def build_fill_config(missing_df, result_columns, n_copies=1):
    """Build a "fill in" job config from the rows that are still missing results.

    Parameters
    ----------
    missing_df : pd.DataFrame
        Rows of the left-joined table that have no complete output.
    result_columns : list of str
        Columns that do not belong in a config file and are dropped.
    n_copies : int, default 1
        How many times each missing job is repeated in the new config.

    Returns
    -------
    pd.DataFrame
        The remaining columns of ``missing_df``, repeated ``n_copies`` times,
        with ``ArrayTaskID`` renumbered 1..N.

    Raises
    ------
    ValueError
        If ``n_copies`` is smaller than 1.
    KeyError
        If one of ``result_columns`` is not present in ``missing_df``.
    """
    if n_copies < 1:
        raise ValueError(f'n_copies must be >= 1, got {n_copies}')

    # Concatenate all copies in one call instead of growing from an empty frame.
    fill_df = pd.concat([missing_df] * n_copies, ignore_index=True)
    fill_df = fill_df.drop(columns=result_columns)
    # ignore_index=True already produced a 0..N-1 index, so task ids are index + 1.
    fill_df['ArrayTaskID'] = fill_df.index + 1 # overwrites ArrayTaskID
    return fill_df


# --- fill monomer --- #
fill_AA_monomer_df = build_fill_config(
    AA_monomer_missing_df,
    ['mean_pLDDT_score', 'pTM_score'],
    n_copies=n_copies,
)

# save to local
# NOTE(review): the original configs are read from '../config_files/' -- confirm the relative path before uncommenting
# fill_AA_monomer_df.to_csv('config_files/fill2_AA_monomer_config.txt', sep='\t', index=False) # CAN CHANGE FILE NAME/PATH


# --- fill multimer --- #
# complex_id is a helper column added in this notebook, so it is dropped as well.
fill_AA_multimer_df = build_fill_config(
    AA_multimer_missing_df,
    ['mean_pLDDT_score', 'pTM_score', 'ipTM_score', 'complex_id'],
    n_copies=n_copies,
)

# save to local
# fill_AA_multimer_df.to_csv('config_files/fill1_AA_multimer_config.txt', sep='\t', index=False) # CAN CHANGE FILE NAME/PATH




In [40]:
# Preview the monomer fill-in config built in the previous cell
# (it has no rows when no monomer jobs are missing).
fill_AA_monomer_df
# fill_AA_multimer_df

Unnamed: 0,ArrayTaskID,seq_id,seq,delete_index


#### When done, save completed data 

In [46]:
# --- save matching data (only expected config rows with complete outputs; stray or incomplete output rows are left out) --- #
# AA_monomer_matching_df.to_csv('../matching_perturb_out/matching_AA_monomer_output.csv', index=False) 
# AA_multimer_matching_df.to_csv('../matching_perturb_out/matching_AA_multimer_output.csv', index=False) 