In [1]:
#This script sets up the information needed to run Arpeggio directly on the server. In addition, it reads the Arpeggio output and stores it as a summary per complex.
#@Author: Henriette Capel
#@Date: 11-04-2022

In [2]:
#Import modules
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import ast
from ABDB import database as db
from sklearn import metrics
from pandas.errors import EmptyDataError 
import subprocess



In [3]:
#Location files
#/data/icarus/capel/ABDB/entries/7jmo/structure/imgt/7jmo.pdb

In [4]:
# Command prefix used to invoke the Arpeggio script.
start_command = "python /data/icarus/capel/arpeggio/arpeggio.py"
# Folder the generated .sh/.py run scripts are written to; the (commented-out) calls further down
# also pass it as the folder from which the Arpeggio ".contacts" files are read.
arpeggio_output_path = "/data/icarus/capel/data_arpeggio/"
# Template of the per-PDB structure folder: the path is split on "_" and the PDB id is inserted at
# that position, so the template must contain exactly one underscore.
folder_input = "/data/icarus/capel/ABDB/entries/_/structure/imgt/"
# Options appended to every Arpeggio call.
# NOTE(review): presumably "-i" is a distance cutoff and "-he" writes column headers (the reader
# below accesses the output columns by name) - confirm against the Arpeggio help text.
options_command = "-i 4.5 -he "
# Script that is run on a PDB file before Arpeggio is retried for structures that failed.
file_cleaning = "/data/icarus/capel/pdbtools/clean_pdb.py"

# Contact-type columns of the Arpeggio output that are summed per residue pair and counted per complex.
columns_types_interactions = ['clash', 'covalent', 'vdw_clash', 'vdw', 'proximal', 'hbond', 'weak_hbond', 'xbond', 'ionic', 'metal_complex', 'aromatic', 'hydrophobic', 'carbonyl', 'polar', 'weak_polar']
# Subset of the columns above that is counted as an "interaction" between two residues.
interaction_types = ['covalent', 'vdw', 'hbond', 'weak_hbond', 'xbond', 'ionic', 'metal_complex', 'aromatic', 'hydrophobic', 'carbonyl', 'polar', 'weak_polar'] #Check (TODO from the author: this selection still needs to be verified)
# Columns that are counted as a "contact" between two residues.
contact_types = ['proximal']

In [5]:
#Functions
def read_dataset(filename_ds):
    """Read a dataset CSV and turn columns that hold Python literals back into objects.

    Every column is first read as plain strings. A column is then replaced by its
    parsed values only if *all* of its entries can be parsed by ``ast.literal_eval``
    (e.g. dictionaries, lists, numbers); otherwise it is left untouched as strings.

    Parameters
    ----------
    filename_ds : str or file-like
        Path (or open buffer) handed to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The dataset with literal-valued columns converted.
    """
    # The converter forces the first 100 columns to be read as raw strings, so no
    # value is coerced (or turned into NaN) before it is parsed below.
    df = pd.read_csv(filename_ds, converters={i: str for i in range(100)})

    for colname in df.columns.values.tolist():
        try:
            parsed_values = [ast.literal_eval(d) for d in df[colname]]
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # These are the errors literal_eval raises for text that is not a
            # Python literal: keep the column as strings. Anything else (for
            # example a KeyboardInterrupt) is no longer silently swallowed.
            continue
        df[colname] = parsed_values

    return df

#Functions creating running commands/files
def create_sh_arpeggio_commands(df, start_command, folder_input, options_command, arpeggio_output_path, name_output_file):
    """Write a bash script that runs Arpeggio once for every unique PDB in ``df``.

    The script is written to ``{arpeggio_output_path}{name_output_file}.sh``. For each
    PDB it echoes the PDB id and then runs the command built by
    ``create_arpeggio_command``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset with a "pdb" column.
    start_command, folder_input, options_command : str
        Passed on to ``create_arpeggio_command``.
    arpeggio_output_path : str
        Folder prefix of the script file (concatenated as-is, so it should end with "/").
    name_output_file : str
        File name of the script without extension.
    """
    script_lines = ["#! /bin/bash \n", 'echo "Running Arpeggio for the whole dataset"\n']

    # Sorted so that the generated script is identical between runs; plain set
    # iteration order of strings changes from one Python process to the next.
    for pdb in sorted(set(df["pdb"].tolist())):
        ap_command = create_arpeggio_command(df, pdb, start_command, folder_input, options_command)
        script_lines.append(f'echo "{pdb}" \n{ap_command}\n')

    script_lines.append('echo "Done"\n')

    #write to sh file
    with open(f'{arpeggio_output_path}{name_output_file}.sh', 'w') as rsh:
        rsh.writelines(script_lines)
    return None

def create_py_arpeggio_commands(df, start_command, folder_input, options_command, arpeggio_output_path, name_output_file):
    """Write a Python script that runs Arpeggio once for every unique PDB in ``df``.

    The script is written to ``{arpeggio_output_path}{name_output_file}.py`` and
    contains one ``subprocess.call(<command>, shell=True)`` line per PDB, where the
    command is built by ``create_arpeggio_command``.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset with a "pdb" column.
    start_command, folder_input, options_command : str
        Passed on to ``create_arpeggio_command``.
    arpeggio_output_path : str
        Folder prefix of the script file (concatenated as-is, so it should end with "/").
    name_output_file : str
        File name of the script without extension.
    """
    script_lines = ['import subprocess \n']

    # Sorted so that the generated script is identical between runs.
    for pdb in sorted(set(df["pdb"].tolist())):
        ap_command = create_arpeggio_command(df, pdb, start_command, folder_input, options_command)
        # repr() yields a valid, correctly escaped Python string literal, so a
        # command that contains quotes or backslashes cannot break the script.
        script_lines.append(f'subprocess.call({ap_command!r}, shell=True)\n')

    #write to py file
    with open(f'{arpeggio_output_path}{name_output_file}.py', 'w') as f:
        f.writelines(script_lines)
    return None

def create_arpeggio_command(df, pdb, start_command, folder_input, options_command, cleaned=False):
    """Build the command line that runs Arpeggio on the structure file of one PDB.

    Parameters
    ----------
    df : pandas.DataFrame
        Not used; kept so that existing calls keep working.
        NOTE(review): the previous version built "-s" residue selections from this
        dataframe but never added them to the command, so Arpeggio has always been
        run on the whole structure. The unused computation was removed.
    pdb : str
        PDB identifier.
    start_command : str
        Command prefix that invokes Arpeggio.
    folder_input : str
        Folder template containing exactly one "_", which marks where the PDB id
        is inserted (the template is split on "_").
    options_command : str
        Options appended after the input file.
    cleaned : bool, default False
        If True, point to "<pdb>.clean.pdb" instead of "<pdb>.pdb".

    Returns
    -------
    str
        ``"{start_command} {input file} {options_command}"``.
    """
    list_path_input_parts = folder_input.split("_")
    structure_folder = f"{list_path_input_parts[0]}{pdb}{list_path_input_parts[1]}"

    file_extension = ".clean.pdb" if cleaned else ".pdb"
    correct_path_input = f"{structure_folder}{pdb}{file_extension}"

    return f"{start_command} {correct_path_input} {options_command}"

def create_sh_arpeggio_failed_pdbs(df, list_failed_pdbs, start_command, folder_input, options_command, file_cleaning, name_output_file, output_path=None):
    """Write a bash script that cleans the failed PDBs and then reruns Arpeggio on them.

    For every PDB the script first runs ``python {file_cleaning} <pdb file>`` and then
    the Arpeggio command for the cleaned file (``create_arpeggio_command`` with
    ``cleaned=True``).

    Parameters
    ----------
    df : pandas.DataFrame
        Passed on to ``create_arpeggio_command``.
    list_failed_pdbs : iterable of str
        PDB ids for which Arpeggio failed. A set is processed in sorted order so the
        generated script is reproducible; any other iterable keeps its own order.
    start_command, folder_input, options_command : str
        See ``create_arpeggio_command``; ``folder_input`` must contain exactly one "_".
    file_cleaning : str
        Path of the cleaning script.
    name_output_file : str
        File name of the script without extension.
    output_path : str, optional
        Folder prefix of the script file. Defaults to the notebook-level
        ``arpeggio_output_path`` setting, which is what this function always used.
    """
    if output_path is None:
        output_path = arpeggio_output_path

    if isinstance(list_failed_pdbs, (set, frozenset)):
        pdbs_to_rerun = sorted(list_failed_pdbs)
    else:
        pdbs_to_rerun = list(list_failed_pdbs)

    list_path_input_parts = folder_input.split("_")
    script_lines = ["#! /bin/bash \n", 'echo "Running Arpeggio for the failed dataset"\n']
    for pdb in pdbs_to_rerun:
        #First run pdbtools/clean_pdb.py, then try running Arpeggio again
        cleaning_command = f"python {file_cleaning} {list_path_input_parts[0]}{pdb}{list_path_input_parts[1]}{pdb}.pdb"
        ap_command = create_arpeggio_command(df, pdb, start_command, folder_input, options_command, cleaned=True)

        script_lines.append(f'echo "{pdb}" \n{cleaning_command} \n{ap_command}\n')
    script_lines.append('echo "Done"\n')

    with open(f'{output_path}{name_output_file}.sh', 'w') as rsh:
        rsh.writelines(script_lines)
    return None


def create_py_arpeggio_failed_pdbs(df, list_failed_pdbs, start_command, folder_input, options_command, file_cleaning, name_output_file, output_path=None):
    """Write a Python script that cleans the failed PDBs and then reruns Arpeggio on them.

    For every PDB the script contains two ``subprocess.call(..., shell=True)`` lines:
    first ``python {file_cleaning} <pdb file>``, then the Arpeggio command for the
    cleaned file (``create_arpeggio_command`` with ``cleaned=True``).

    Parameters
    ----------
    df : pandas.DataFrame
        Passed on to ``create_arpeggio_command``.
    list_failed_pdbs : iterable of str
        PDB ids for which Arpeggio failed. A set is processed in sorted order so the
        generated script is reproducible; any other iterable keeps its own order.
    start_command, folder_input, options_command : str
        See ``create_arpeggio_command``; ``folder_input`` must contain exactly one "_".
    file_cleaning : str
        Path of the cleaning script.
    name_output_file : str
        File name of the script without extension.
    output_path : str, optional
        Folder prefix of the script file. Defaults to the notebook-level
        ``arpeggio_output_path`` setting, which is what this function always used.
    """
    if output_path is None:
        output_path = arpeggio_output_path

    if isinstance(list_failed_pdbs, (set, frozenset)):
        pdbs_to_rerun = sorted(list_failed_pdbs)
    else:
        pdbs_to_rerun = list(list_failed_pdbs)

    list_path_input_parts = folder_input.split("_")
    script_lines = ['import subprocess \n']
    for pdb in pdbs_to_rerun:
        #First run pdbtools/clean_pdb.py, then try running Arpeggio again
        cleaning_command = f"python {file_cleaning} {list_path_input_parts[0]}{pdb}{list_path_input_parts[1]}{pdb}.pdb"
        ap_command = create_arpeggio_command(df, pdb, start_command, folder_input, options_command, cleaned=True)

        # repr() yields correctly escaped Python string literals for the commands.
        script_lines.append(f'subprocess.call({cleaning_command!r}, shell=True) \nsubprocess.call({ap_command!r}, shell=True)\n')

    with open(f'{output_path}{name_output_file}.py', 'w') as f:
        f.writelines(script_lines)
    return None
    

def transform_dict_epitope(epitope_interaction_dict, ag_chain):
    """Turn the epitope residues of one antigen chain into Arpeggio "-s" selections.

    The keys of ``epitope_interaction_dict`` are residue ids of the form
    "<amino acid>_<number>" or "<amino acid>_<number>_<insertion code>". Each key
    yields "-s /<chain>/<number>/ " or "-s /<chain>/<number>[<insertion code>]/ ".
    The selections are concatenated in the iteration order of the dictionary.
    A key with any other number of "_"-separated parts raises ValueError.
    """
    chain = str(ag_chain)
    selections = []
    for residue_key in epitope_interaction_dict:
        fields = residue_key.split("_")
        if len(fields) == 3:
            _aa_name, number, insertion_code = fields
            selections.append(f"-s /{chain}/{number}[{insertion_code}]/ ")
        else:
            # No insertion code: exactly two fields are expected here
            _aa_name, number = fields
            selections.append(f"-s /{chain}/{number}/ ")
    return "".join(selections)



### Functions to read output arpeggio
def _select_antigen_antibody_contacts(df_output, ag_chain, ab_chain):
    """Return a copy of the atom contacts that start on the antigen chain and end on an antibody chain."""
    #Do not include the interactions between or with water molecules. Note, no further selection was made when running Arpeggio (all atoms are looked at), so INTER and INTRA_NON_SELECTION do not exist.
    is_intra_selection = df_output["interacting_entities"] == "INTRA_SELECTION"
    # The first character of an atom identifier is compared with the chain ids.
    starts_on_antigen = df_output["atom_bgn"].str[0] == ag_chain
    # ab_chain holds one character per antibody chain, so this also works for full length antibodies
    ends_on_antibody = df_output["atom_end"].str[0].isin(list(ab_chain))
    return df_output.loc[is_intra_selection & starts_on_antigen & ends_on_antibody].copy()


def read_arpeggio_result(main_path, df_input, columns_types_interactions, interaction_types_list, contact_types_list, folder, failed_try_pdb_list=False, writing_file=False):
    """Read the Arpeggio ".contacts" output of every PDB and summarise it per complex.

    For every complex (row of ``df_input``) the atom contacts that start on the antigen
    chain and end on the antibody chain(s) are selected, summarised per residue pair
    (``store_interactions_per_residue``) and then condensed into one summary row.

    Notes
    -----
    * Run this function for single domain and full length antibodies separately.
    * The first run reports the PDBs for which no usable output exists; these need to
      be cleaned before Arpeggio is run again. To read the results of such a rerun,
      pass those PDBs as ``failed_try_pdb_list``.

    Parameters
    ----------
    main_path : str
        Folder prefix of the ".contacts" files (concatenated as-is).
    df_input : pandas.DataFrame
        Dataset with the columns "pdb", "antigen_chain" and "antibody_chain".
    columns_types_interactions : list of str
        Contact-type columns of the Arpeggio output.
    interaction_types_list, contact_types_list : list of str
        Columns counted as interactions / contacts.
    folder : str
        Sub folder of ``{main_path}/output/`` used when ``writing_file`` is True.
    failed_try_pdb_list : collection of str or False, default False
        False (or None): read "<pdb>.contacts" for every PDB in ``df_input``.
        A collection: read "<pdb>.clean.contacts" for exactly these PDBs. An empty
        collection now means "nothing to read" and gives an empty summary; it used to
        fall back to reading the whole dataset again, which duplicated every row when
        the result was concatenated with the first run.
    writing_file : bool, default False
        If True, also write the atom and residue level tables of each complex to CSV.

    Returns
    -------
    (pandas.DataFrame, set)
        One summary row per complex, and the PDBs whose output file is missing or empty.
    """
    #Set up empty summary dataframe
    columns_ds_df = ['pdb', 'antigen_chain', 'antibody_chain', 'epitope_interactions', 'paratope_interactions', 'number_interactions', 'number_contacts'] + columns_types_interactions
    # The rows are collected and concatenated once at the end instead of growing the dataframe inside the loop.
    summary_frames = [pd.DataFrame(columns=columns_ds_df)]

    failed_arpeggio_pdbs = set()

    #Check if function is run over all pdbs or only over the failed (and now cleaned) pdbs.
    reading_cleaned_rerun = failed_try_pdb_list is not False and failed_try_pdb_list is not None
    if reading_cleaned_rerun:
        pdbs_to_study = failed_try_pdb_list
        contacts_extension = ".clean.contacts"
    else:
        pdbs_to_study = df_input['pdb'].tolist()
        contacts_extension = ".contacts"

    #Do for every pdb in the dataset (sorted, so the row order of the summary is reproducible)
    for pdb in sorted(set(pdbs_to_study)):

        #Check if arpeggio has created the output file. Only the read itself is guarded,
        #so a problem while writing the optional CSV files is not mistaken for a failed Arpeggio run.
        try:
            df_output = pd.read_csv(f"{main_path}{pdb}{contacts_extension}", sep="\t")
        except (FileNotFoundError, EmptyDataError):
            #Arpeggio has not created the output file, or the file is empty. Try cleaning the pdb.
            failed_arpeggio_pdbs.add(pdb)
            continue

        #one pdb can contain multiple complexes. Look into each complex individually.
        df_selected = df_input[df_input["pdb"] == pdb]
        for _, row in df_selected.iterrows():
            ag_chain = row["antigen_chain"]
            ab_chain = row["antibody_chain"]

            #Store only interactions between our antibody and antigen
            df_filtered = _select_antigen_antibody_contacts(df_output, ag_chain, ab_chain)

            ##Write all atom interacting information to file
            if writing_file:
                print("WARNING: you are writing to a file")
                df_filtered.to_csv(f"{main_path}/output/{folder}/{pdb}_{ag_chain}_{ab_chain}_atom_interactions.csv", index=False)

            #Store interaction information per residue
            df_residue_interaction = store_interactions_per_residue(df_filtered, columns_types_interactions)

            ##write all residue information to a file
            if writing_file:
                print("WARNING: you are writing to a file")
                df_residue_interaction.to_csv(f"{main_path}/output/{folder}/{pdb}_{ag_chain}_{ab_chain}_residue_interactions.csv", index=False)

            #Summarise result in one line
            if len(ab_chain) == 2: #full length antibody
                # residue_2 has the form "<chain>/<position>"; split the table by antibody chain
                chain_of_residue_2 = df_residue_interaction['residue_2'].str.split('/').str[0]
                df_residue_interaction_heavy = df_residue_interaction[chain_of_residue_2 == ab_chain[0]].copy()
                df_residue_interaction_light = df_residue_interaction[chain_of_residue_2 == ab_chain[1]].copy()

                dict_summary_per_complex_heavy = store_interactions_residue_per_complex_dict(df_residue_interaction_heavy, columns_types_interactions, interaction_types_list, contact_types_list)
                dict_summary_per_complex_light = store_interactions_residue_per_complex_dict(df_residue_interaction_light, columns_types_interactions, interaction_types_list, contact_types_list)
                #Combine
                dict_summary_per_complex = combine_full_length_output(dict_summary_per_complex_heavy, dict_summary_per_complex_light, ab_chain[0], ab_chain[1])
            else: #Single domain antibody
                dict_summary_per_complex = store_interactions_residue_per_complex_dict(df_residue_interaction, columns_types_interactions, interaction_types_list, contact_types_list)

            dict_summary_per_complex['pdb'] = pdb
            dict_summary_per_complex['antigen_chain'] = ag_chain
            dict_summary_per_complex['antibody_chain'] = ab_chain
            summary_frames.append(pd.DataFrame(dict_summary_per_complex, index=[0]))

    df_store_interaction_per_dataset = pd.concat(summary_frames, ignore_index = True, axis = 0)

    #Statement about the failed arpeggios
    print(f"Arpeggio could not run for {len(failed_arpeggio_pdbs)} PDBs")
    print(failed_arpeggio_pdbs)

    return df_store_interaction_per_dataset, failed_arpeggio_pdbs

def store_interactions_per_residue(df_contacts, columns_types_interactions):
    """Summarise atom-level contacts into one row per interacting residue pair.

    The atom identifiers in "atom_bgn" and "atom_end" have the form
    "<chain>/<position>/<atom>". All atom contacts between the same two residues are
    combined: every contact-type column is summed, so a value of 2 means that two
    different atom combinations of the two residues make this type of bond.

    Parameters
    ----------
    df_contacts : pandas.DataFrame
        Atom contacts with the columns "atom_bgn", "atom_end" and every column in
        ``columns_types_interactions``. The dataframe is not modified.
    columns_types_interactions : list of str
        Contact-type columns to sum.

    Returns
    -------
    pandas.DataFrame
        Columns "residue_1" and "residue_2" ("<chain>/<position>" of the begin / end
        atom), "number_interactions" (number of atom contacts of the pair) and the
        summed contact-type columns. Rows are sorted by residue pair; an empty input
        gives an empty dataframe with these columns.
    """
    interaction_columns = list(columns_types_interactions)
    columns_df = ['residue_1', 'residue_2', 'number_interactions'] + interaction_columns

    if df_contacts.empty:
        return pd.DataFrame(columns=columns_df)

    #Split atom from chain and position; the chain is kept in the residue id (needed for full length)
    bgn_parts = df_contacts['atom_bgn'].str.split('/', expand=True)
    end_parts = df_contacts['atom_end'].str.split('/', expand=True)

    # Work on a separate table so the caller's dataframe does not get extra columns.
    df_atoms = df_contacts[interaction_columns].copy()
    df_atoms['residue_1'] = bgn_parts[0] + "/" + bgn_parts[1]
    df_atoms['residue_2'] = end_parts[0] + "/" + end_parts[1]

    #sum the types of interactions per residue pair and count the atom contacts of the pair
    per_residue_pair = df_atoms.groupby(['residue_1', 'residue_2'], sort=True)
    df_residue_info = per_residue_pair[interaction_columns].sum()
    df_residue_info['number_interactions'] = per_residue_pair.size()

    return df_residue_info.reset_index()[columns_df]
            
def store_interactions_residue_per_complex_dict(df_residue_interaction, columns_types_interactions, interaction_types_list, contact_types_list):
    """Condense the residue-pair table of one antigen-antibody complex into a summary dictionary.

    For every column in ``columns_types_interactions`` the dictionary holds the number
    of residue pairs that show this type at least once. It does not matter how often
    the type occurs within a pair: if residues A-B make a bond of one type three times
    (three different atom combinations) it counts as 1. A pair that also makes another
    type of bond is counted once for that type as well, so one residue pair can
    contribute to several types.

    The epitope and paratope dictionaries (each wrapped in a one-element list) and the
    number of interactions and contacts, as determined by
    ``store_epitope_paratope_dict``, are added under "epitope_interactions",
    "paratope_interactions", "number_interactions" and "number_contacts".
    """
    # Number of residue pairs in which each type is seen (non-zero entries per column)
    summary = {
        type_name: df_residue_interaction[type_name].astype(bool).sum(axis=0)
        for type_name in columns_types_interactions
    }

    epitope, paratope, n_interactions, n_contacts = store_epitope_paratope_dict(df_residue_interaction, interaction_types_list, contact_types_list)
    summary.update({
        "epitope_interactions": [epitope],
        "paratope_interactions": [paratope],
        "number_interactions": n_interactions,
        "number_contacts": n_contacts,
    })

    return summary

def store_epitope_paratope_dict(df_residue_interaction, interaction_types_list, contact_types_list):
    """Determine the epitope and the paratope of one complex from its residue-pair table.

    The columns "chain_bgn"/"position_bgn" and "chain_end"/"position_end" are added to
    ``df_residue_interaction`` by splitting "residue_1" and "residue_2" on "/".
    ``determine_interaction_dict`` is then called once on "position_bgn" (epitope) and
    once on "position_end" (paratope); this relies on the antigen chain always being
    in chain_bgn and the antibody chain always being in chain_end.

    Returns the epitope dictionary, the paratope dictionary and the number of
    interactions and contacts found from the epitope side. A message is printed when
    the paratope side gives different totals.
    """
    for residue_column, side in (('residue_1', 'bgn'), ('residue_2', 'end')):
        df_residue_interaction[[f'chain_{side}', f'position_{side}']] = df_residue_interaction[residue_column].str.split('/', expand=True)

    epitope_result = determine_interaction_dict(df_residue_interaction, interaction_types_list, contact_types_list, "position_bgn")
    paratope_result = determine_interaction_dict(df_residue_interaction, interaction_types_list, contact_types_list, "position_end")

    epitope_positions, interactions_epitope, contacts_epitope = epitope_result
    paratope_positions, interactions_paratope, contacts_paratope = paratope_result

    #Sanity check: both sides look at the same residue pairs, so the totals should agree
    if interactions_epitope != interactions_paratope:
        print("not the same interactions")
    if contacts_epitope != contacts_paratope:
        print("not the same contacts")

    return epitope_positions, paratope_positions, interactions_epitope, contacts_epitope
    
def determine_interaction_dict(df_residue_interaction, interaction_types_list, contact_types_list, column_name):
    """Count, per residue position, with how many partner residues it interacts.

    Every row of ``df_residue_interaction`` is one residue pair. A pair is an
    interaction when the sum over ``interaction_types_list`` is above zero and a
    contact when the sum over ``contact_types_list`` is above zero.
    Note: a pair can be counted both as an interaction and as a contact.

    Returns a dictionary {position in ``column_name``: number of interacting pairs}
    (positions without any interaction are left out), the total number of interacting
    pairs and the total number of contact pairs.
    """
    partners_per_position = {}
    total_interactions = 0
    total_contacts = 0

    positions = df_residue_interaction[column_name]
    for position in set(positions.tolist()):
        pairs_at_position = df_residue_interaction.loc[positions == position]

        interacting_pairs = 0
        for _, residue_pair in pairs_at_position.iterrows():
            #If one of the interaction types is seen, count it
            if residue_pair[interaction_types_list].sum() > 0:
                interacting_pairs += 1
            #If one of the contact types is seen, count it.
            if residue_pair[contact_types_list].sum() > 0:
                total_contacts += 1
        total_interactions += interacting_pairs

        #Save with how many residues this position is interacting.
        if interacting_pairs > 0:
            partners_per_position[position] = interacting_pairs

    return partners_per_position, total_interactions, total_contacts
       
def combine_full_length_output(dict_heavy, dict_light, chain_heavy, chain_light):
    """Combine the summary dictionaries of the heavy and the light chain.

    For every key of ``dict_heavy`` the result holds a one-element list containing
    ``{chain_heavy: heavy value, chain_light: light value}``, so the information of
    both chains is stored separately and together. Values that are wrapped in a list
    in ``dict_heavy`` (the epitope/paratope dictionaries) are unwrapped first; the
    matching value of ``dict_light`` is then expected to be wrapped in the same way.

    The input dictionaries are left unchanged.
    """
    dict_combined = {}
    for key, value_heavy in dict_heavy.items():
        value_light = dict_light[key]
        if isinstance(value_heavy, list): #get dictionary out of the list
            value_heavy, value_light = value_heavy[0], value_light[0]
        #Brackets are needed to save it as one entry in the dataframe.
        dict_combined[key] = [{chain_heavy: value_heavy, chain_light: value_light}]
    return dict_combined
        


In [6]:
#Single domain antibodies
# Load the filtered single domain dataset; the file name is relative to the working directory of the notebook.
ds_name_nb = "Dataset_nb_filtered_num_interactions.csv"
df_interactions_nb = read_dataset(ds_name_nb)

In [7]:
#### create_sh_arpeggio_commands(df_interactions_nb, start_command, folder_input, options_command, arpeggio_output_path, "single_domain_arpeggio_commands")

In [8]:
#df_summary_arpeggio_nb, failed_arpeggio_pdbs_nb = read_arpeggio_result(arpeggio_output_path, df_interactions_nb, columns_types_interactions, interaction_types, contact_types, "single_domain")


In [9]:
# df_summary_arpeggio_nb

In [10]:
#Make a script that first cleans the failed ones and then runs Arpeggio on them again
##create_sh_arpeggio_failed_pdbs(df_interactions_nb, failed_arpeggio_pdbs_nb, start_command, folder_input, options_command, file_cleaning, "sd_arpeggio_failed_pdbs")
##create_py_arpeggio_failed_pdbs(df_interactions_nb, failed_arpeggio_pdbs_nb, start_command, folder_input, options_command, file_cleaning, "sd_arpeggio_failed_pdbs")


In [11]:
# # #Do the same for the failed ones and concatenate the results
# df_summary_arpeggio_failed_nb, failed_twice_arpeggio_pdbs_nb = read_arpeggio_result(arpeggio_output_path, df_interactions_nb, columns_types_interactions, interaction_types, contact_types, "single_domain", failed_arpeggio_pdbs_nb)
# print(f"{len(failed_twice_arpeggio_pdbs_nb)} failed twice")

# # ##Combine dataframes
# df_summary_arpeggio_total_nb = pd.concat([df_summary_arpeggio_nb, df_summary_arpeggio_failed_nb])


In [12]:
# df_summary_arpeggio_total_nb

In [13]:
## #SAVE dataframe
# df_summary_arpeggio_total_nb.to_csv("Dataset_nb_filtered_arpeggio.csv", index=False)

In [14]:
# #DO THIS FOR 7d2z, 6ui1, 5mhr
# # #check specific pdb
# df_input_test = df_interactions_nb[df_interactions_nb["pdb"]=="6ui1"]
# # failed_list = ["6ui1"]
# failed_list = False
# test_nb, failed_test = read_arpeggio_result(arpeggio_output_path, df_input_test, columns_types_interactions, interaction_types, contact_types, "single_domain", failed_list, True)


Arpeggio could not run for 0 PDBs
set()


In [15]:
#Load created file
# Reads the single domain Arpeggio summary back in; this is the file name used by the (commented-out) save cell above.
ds_name_arp_nb = "Dataset_nb_filtered_arpeggio.csv"
df_arp_nb = read_dataset(ds_name_arp_nb)

In [16]:
########

In [17]:
#Full length antibodies
# Load the filtered full length dataset; the file name is relative to the working directory of the notebook.
ds_name_fv = "Dataset_fv_filtered_num_interactions.csv"
df_interactions_fv = read_dataset(ds_name_fv)

In [18]:
## Create sh or py script that runs arpeggio automatically for all pdbs
## create_sh_arpeggio_commands(df_interactions_fv, start_command, folder_input, options_command, arpeggio_output_path, "full_length_arpeggio_unique_commands")
##create_py_arpeggio_commands(df_interactions_fv, start_command, folder_input, options_command, arpeggio_output_path, "full_length_arpeggio_unique_commands")


In [19]:
# df_summary_arpeggio_fv, failed_arpeggio_pdbs_fv = read_arpeggio_result(arpeggio_output_path, df_interactions_fv, columns_types_interactions, interaction_types, contact_types, "full_length", False)


Arpeggio could not run for 284 PDBs
{'7a3o', '3kr3', '3bn9', '5x2o', '2xqy', '4qhu', '6o39', '3zkn', '2aep', '7lfa', '3u7y', '3p0y', '5o14', '5wb9', '5o1r', '5bjz', '6glw', '5kvd', '6yla', '5vag', '6ml8', '4i77', '7bej', '6azz', '6erx', '6h2y', '4xwo', '4uta', '3l5w', '7neg', '7dm2', '6nmt', '5tfw', '6d2p', '7dr4', '6vmj', '4xvu', '6osv', '5vjq', '7djz', '3wkm', '6al0', '3wih', '4jlr', '6was', '7jmp', '6bit', '2vxt', '3lh2', '3se8', '4zff', '5ugy', '5b3j', '2ypv', '7lsf', '3liz', '5usi', '7msq', '4ogy', '5bk1', '7vux', '7mzj', '6ohg', '5th9', '7dc8', '7kmi', '5d70', '7ps4', '4lsu', '6ddv', '6j14', '4lvh', '7mzh', '6o3a', '2w9e', '4cni', '7nx7', '4zfg', '3tje', '6jep', '6cxy', '7kf0', '4hc1', '6wzl', '7n3d', '3d85', '5vkd', '7ps2', '4uu9', '5mes', '4bz1', '6osh', '7jx3', '7s4s', '7bz5', '6wzm', '5tzt', '6ddm', '7or9', '4ot1', '7np1', '4xmp', '7kmg', '6hf1', '7lm9', '6lyn', '1yqv', '5f96', '6tyb', '5tzu', '4jkp', '6cbv', '6ddr', '7coe', '7ps1', '6a67', '4d9r', '6b0h', '6iea', '4i3r', '4o

In [20]:
# df_summary_arpeggio_fv

In [21]:
## Clean the failed ones first and run again
#create_py_arpeggio_failed_pdbs(df_interactions_fv, failed_arpeggio_pdbs_fv, start_command, folder_input, options_command, file_cleaning, "fv_arpeggio_failed_pdbs")


In [22]:
# # # #Do the same for the failed ones and concatenate the results
# df_summary_arpeggio_failed_fv, failed_twice_arpeggio_pdbs_fv = read_arpeggio_result(arpeggio_output_path, df_interactions_fv, columns_types_interactions, interaction_types, contact_types, "full_length", failed_arpeggio_pdbs_fv)
# print(f"{len(failed_twice_arpeggio_pdbs_fv)} failed twice")

# #Combine dataframes
# df_summary_arpeggio_total_fv = pd.concat([df_summary_arpeggio_fv, df_summary_arpeggio_failed_fv])


Arpeggio could not run for 2 PDBs
{'4xwo', '5k9q'}
2 failed twice


In [23]:
# df_summary_arpeggio_total_fv

In [24]:
##Save information to files
# df_summary_arpeggio_total_fv.to_csv("Dataset_fv_filtered_arpeggio.csv", index=False)