## Jupyter notebook file to fill in missing data from failed jobs on Grace

In [1]:
import pandas as pd

### Grab outputs from Grace (with scp)

In [4]:
outputs_df = pd.read_csv('../deletion_perturb_out/monomer_output.csv') # csv outputs from grace
new_df = outputs_df.drop_duplicates(subset=['seq', 'delete_index'])
print(new_df.shape)
new_df.head()

(3579, 4)


Unnamed: 0,seq,delete_index,mean_pLDDT_score,pTM_score
0,MTNTDLKPLLDNLRNATEFWNLVAAASATDESTVHNRSYRDALDWL...,19.0,82.375,0.694824
1,MTNTDLKPLLDNLRNATEFWNLVAAASATDESTVHNRSYRDALDWL...,3.0,83.9375,0.629395
2,MTNTDLKPLLDNLRNATEFWNLVAAASATDESTVHNRSYRDALDWL...,11.0,82.0625,0.674805
3,MTNTDLKPLLDNLRNATEFWNLVAAASATDESTVHNRSYRDALDWL...,10.0,81.875,0.666504
4,MTNTDLKPLLDNLRNATEFWNLVAAASATDESTVHNRSYRDALDWL...,12.0,75.6875,0.498291


### Convert config file to df
Contains expected outputs

In [5]:
expected_df = pd.read_csv('../config_files/monomer_config.txt', sep='\t') # check path
print(expected_df.shape)

(3576, 3)


### Left join outputs onto expected to find matching/missing rows

In [6]:
left_join_df = pd.merge(expected_df, new_df, on=['seq', 'delete_index'], how='right')
print(left_join_df.shape)

(3579, 5)


In [7]:
# sub dataframe of matching data
matching_df = left_join_df[~left_join_df.isnull().any(axis=1)]
print(matching_df.shape)

# sub dataframe of missing data
missing_df = left_join_df[left_join_df.isnull().any(axis=1)]
print(missing_df.shape)

(3576, 5)
(3, 5)


In [18]:
# Once missing_df has shape (0, 5) (i.e. it is missing no rows), save matching df to csv
# matching_df.to_csv('../deletion_perturb_out/matching_monomer_output.csv', index=False) # check path

#### Save missing data as a "to fill in" config file

In [8]:
n_copies = 1 # run each missing job twice

fill_df = pd.DataFrame({})
for _ in range(n_copies):
    fill_df = pd.concat([fill_df, missing_df], ignore_index=True)

fill_df = fill_df.drop(['mean_pLDDT_score', 'pTM_score'], axis=1)
fill_df = fill_df.reset_index(drop=True)
fill_df['ArrayTaskID'] = fill_df.index + 1 # overwrites ArrayTaskID

# save to local
# fill_df.to_csv('../job_array_config_files/fill_monomer_config.txt', sep='\t', index=False)
