## Jupyter notebook to check completeness (Multimer) and fill in missing data (if any)
Certain jobs (from a batch) on Grace can occasionally fail due to HTTP timeouts when using the mmseqs2 API. This Jupyter notebook checks whether all jobs were successful. If not, it identifies the missing jobs and creates a new config file to fill in the missing data.

In [2]:
import pandas as pd

### Step 1: check if data is complete

#### Grab outputs from Grace (scp to local first, then read csv)

In [11]:
# Load the multimer results that were copied back from Grace.
# NOTE: verify this path points at the csv you scp'd to your local machine.
multimer_output_csv = '../deletion_perturb_out/multimer_output.csv'
outputs_df = pd.read_csv(multimer_output_csv)

# (rows, columns) of the results table; uncomment the next line to peek at it
print(outputs_df.shape)
# outputs_df.head()

(3897, 9)


#### [Test 1]: compare counts with expected outputs

In [12]:
# Test 1: compare how many jobs finished per sequence pair with how many
# the config file asked for.
# (Optional de-duplication of the outputs, left disabled:)
# outputs_unique_df = outputs_df.drop_duplicates(subset=['seq1_id', 'seq2_id', 'which_seq', 'delete_index'])
# outputs_unique_df.shape

pair_keys = ['seq1_id', 'seq2_id']

# Observed: number of output rows per (seq1_id, seq2_id) pair.
grouped_counts = outputs_df.groupby(pair_keys).size().reset_index(name='count')
print(grouped_counts.shape)

# Expected: number of config rows per pair (the config is tab-separated).
expected_df = pd.read_csv('../config_files/multimer_config.txt', sep='\t')
expected_counts = expected_df.groupby(pair_keys).size().reset_index(name='count')
print(expected_counts.shape)

# Outer merge keeps pairs present on either side. With pandas' default
# suffixes, count_x is the observed count (left) and count_y the expected
# count (right); a NaN in count_x means the pair has no output rows at all.
merged_counts = grouped_counts.merge(expected_counts, on=pair_keys, how='outer')

# Displayed for manual inspection.
merged_counts


(29, 3)
(30, 3)


Unnamed: 0,seq1_id,seq2_id,count_x,count_y
0,5oxz_A,5oxz_B,137.0,139
1,5ytq_A,5ytq_B,155.0,155
2,6dm9_A,6dm9_B,157.0,157
3,6dvu_A,6dvu_B,167.0,167
4,6eby_A,6eby_B,110.0,159
5,6fan_A,6fan_B,90.0,127
6,6gto_A,6gto_B,,177
7,6h7a_A,6h7a_B,168.0,185
8,6j08_A,6j08_B,146.0,146
9,6jfv_A,6jfv_B,178.0,194


#### [Test 2]: left join outputs onto expected to find matching/missing rows

In [14]:
# Test 2: attach the outputs to every expected job. With a left join, every
# expected row is kept; rows without a matching output get NaN in the
# output-only columns.
join_keys = ['seq1_id', 'seq2_id', 'seq1', 'seq2', 'which_seq', 'delete_index'] # multimer
left_join_df = expected_df.merge(outputs_df, on=join_keys, how='left')

# True for every joined row that contains at least one null value.
row_has_null = left_join_df.isnull().any(axis=1)

# Rows with no nulls: expected jobs that have a complete output.
matching_df = left_join_df[~row_has_null]
print(matching_df.shape)

# Rows with any null: expected jobs that still need to be (re)run.
missing_df = left_join_df[row_has_null]
print(missing_df.shape)
#missing_df.head() # if empty, then all data matches!

(3897, 10)
(776, 10)


Unnamed: 0,ArrayTaskID,seq1_id,seq2_id,seq1,seq2,which_seq,delete_index,mean_pLDDT_score,pTM_score,ipTM_score
24,25,6eby_A,6eby_B,AMADIGSMTNPFDDDEGVFLVLVNDEDQYSLWPEFAEVPQGWRTVF...,NTELYVLDSSLRPLPTGAVGELYLGGVQLARGYVGRPGMTASRFVA...,1,23,,,
25,26,6eby_A,6eby_B,AMADIGSMTNPFDDDEGVFLVLVNDEDQYSLWPEFAEVPQGWRTVF...,NTELYVLDSSLRPLPTGAVGELYLGGVQLARGYVGRPGMTASRFVA...,1,24,,,
27,28,6eby_A,6eby_B,AMADIGSMTNPFDDDEGVFLVLVNDEDQYSLWPEFAEVPQGWRTVF...,NTELYVLDSSLRPLPTGAVGELYLGGVQLARGYVGRPGMTASRFVA...,1,26,,,
30,31,6eby_A,6eby_B,AMADIGSMTNPFDDDEGVFLVLVNDEDQYSLWPEFAEVPQGWRTVF...,NTELYVLDSSLRPLPTGAVGELYLGGVQLARGYVGRPGMTASRFVA...,1,29,,,
40,41,6eby_A,6eby_B,AMADIGSMTNPFDDDEGVFLVLVNDEDQYSLWPEFAEVPQGWRTVF...,NTELYVLDSSLRPLPTGAVGELYLGGVQLARGYVGRPGMTASRFVA...,1,39,,,


In [None]:
# Once missing_df has zero rows (shape (0, 10), i.e. no expected job is missing), save matching_df to csv
# matching_df.to_csv('../deletion_perturb_out/matching_multimer_output.csv', index=False) # check path

### Step 2: if missing data, create new config file for Grace

#### Save missing data as a "to fill in" config file

In [15]:
# Build the "to fill in" config: one row per job that has to be resubmitted.
n_copies = 1 # run each missing job once
if n_copies < 1:
    # Fail with a clear message instead of an obscure error further down.
    raise ValueError(f"n_copies must be >= 1, got {n_copies}")

# Output-only columns introduced by the join; a config file must not carry them.
score_cols = ['mean_pLDDT_score', 'pTM_score', 'ipTM_score']

# Stack all copies in a single concat call rather than growing a frame inside
# a loop that starts from an empty DataFrame. ignore_index=True already gives
# a fresh 0..N-1 index, so no separate reset_index is needed.
fill_df = (
    pd.concat([missing_df] * n_copies, ignore_index=True)
    .drop(columns=score_cols)
)

# Renumber the jobs 1..N (replaces the ArrayTaskID values carried over from
# the original config).
fill_df['ArrayTaskID'] = fill_df.index + 1 # overwrites ArrayTaskID

# save to local
# fill_df.to_csv('config_files/fill_30dim_leq200totlen_config.txt', sep='\t', index=False) # check path


In [17]:
# Display the fill-in config for a final visual check before saving it:
# score columns removed, ArrayTaskID renumbered starting at 1.
fill_df

Unnamed: 0,ArrayTaskID,seq1_id,seq2_id,seq1,seq2,which_seq,delete_index
0,1,6eby_A,6eby_B,AMADIGSMTNPFDDDEGVFLVLVNDEDQYSLWPEFAEVPQGWRTVF...,NTELYVLDSSLRPLPTGAVGELYLGGVQLARGYVGRPGMTASRFVA...,1,23
1,2,6eby_A,6eby_B,AMADIGSMTNPFDDDEGVFLVLVNDEDQYSLWPEFAEVPQGWRTVF...,NTELYVLDSSLRPLPTGAVGELYLGGVQLARGYVGRPGMTASRFVA...,1,24
2,3,6eby_A,6eby_B,AMADIGSMTNPFDDDEGVFLVLVNDEDQYSLWPEFAEVPQGWRTVF...,NTELYVLDSSLRPLPTGAVGELYLGGVQLARGYVGRPGMTASRFVA...,1,26
3,4,6eby_A,6eby_B,AMADIGSMTNPFDDDEGVFLVLVNDEDQYSLWPEFAEVPQGWRTVF...,NTELYVLDSSLRPLPTGAVGELYLGGVQLARGYVGRPGMTASRFVA...,1,29
4,5,6eby_A,6eby_B,AMADIGSMTNPFDDDEGVFLVLVNDEDQYSLWPEFAEVPQGWRTVF...,NTELYVLDSSLRPLPTGAVGELYLGGVQLARGYVGRPGMTASRFVA...,1,39
...,...,...,...,...,...,...,...
771,772,6jfv_A,6jfv_B,GKSEISELRRTMQNLEIELQSQLSMKASLENSLEETKGRYAMQLAQ...,LRNTKHEISEMNRMIQRLRAEIDNVKKQCANLQNAIADAEQRGELA...,2,31
772,773,6jfv_A,6jfv_B,GKSEISELRRTMQNLEIELQSQLSMKASLENSLEETKGRYAMQLAQ...,LRNTKHEISEMNRMIQRLRAEIDNVKKQCANLQNAIADAEQRGELA...,2,32
773,774,6jfv_A,6jfv_B,GKSEISELRRTMQNLEIELQSQLSMKASLENSLEETKGRYAMQLAQ...,LRNTKHEISEMNRMIQRLRAEIDNVKKQCANLQNAIADAEQRGELA...,2,33
774,775,6jfv_A,6jfv_B,GKSEISELRRTMQNLEIELQSQLSMKASLENSLEETKGRYAMQLAQ...,LRNTKHEISEMNRMIQRLRAEIDNVKKQCANLQNAIADAEQRGELA...,2,35


#### When successful, save processed