In [1]:
#Script to combine datasets used in project: "Binding site comparison of single domain and full length antibodies"
#@Author: Henriette Capel
#@Date: 12-06-2022

In [2]:
##### STRUCTURE OF RESULTING DATASET #####
#The dataset will have the following columns:
#pdb: the pdb entry which stores this structure
#antigen_chain: the corresponding antigen chain
#antibody_chain: the corresponding antibody chain
#antigen_sequence: the sequence of the antigen
#antibody_sequence: the sequence of the antibody
#cdr_sequence: sequences of the CDR loops
#epitope_dis: epitope as determined based solely on the distance. Format sdAbs: {number: aminoacid}, flAbs: {VH: {number: aminoacid}, VL: {number_aminoacid}, VHVL: {number_aminoacid}}
#epitope_dis_interactions: the number of interactions every epitope residue makes as determined based solely on the distance. Format sdAbs: {aminoacid_number: count}, flAbs: {aminoacid: count}
#paratope_dis: paratope as determined based on solely the distance. Format sdAbs: {number: aminoacid}, flAbs: {VH: {number: aminoacid}, VL: {number: aminoacid}}.
#paratope_dis_interactions: the number of interactions every paratope residue makes as determined based on solely the distance. Format sdAbs: {aminoacid_number: count}, flAbs: {VH: {aminoacid_number: count}, VL: {aminoacid_number: count}}.
#epitope_arp: epitope as determined based on arpeggio annotations. Format: see epitope_dis
#epitope_arp_interactions: the number of interactions every epitope residue makes as determined based on arpeggio annotations. Format: see epitope_dis_interactions
#paratope_arp: paratope as determined based on arpeggio annotations. Format: see paratope_dis
#paratope_arp_interactions: the number of interactions every paratope residue makes as determined based on arpeggio annotations. Format: see paratope_dis_interactions
#canonical_form_CDRH1: predicted canonical form of CDR-H1 loop by SCALOP
#canonical_form_CDRH2: predicted canonical form of CDR-H2 loop by SCALOP

In [3]:
#Import modules
import ast

import numpy as np
import pandas as pd

In [4]:
def read_dataset(filename_ds):
    """Read a CSV file and turn columns of Python literals back into objects.

    Every cell is first read as a plain string. Each column is then passed
    through ``ast.literal_eval``; when *all* cells of a column parse as Python
    literals (dicts, lists, numbers, ...), the column is replaced by the parsed
    values. When any cell of a column fails to parse, the whole column is left
    untouched as strings (all-or-nothing per column).

    Parameters
    ----------
    filename_ds : str or path-like
        Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded table with literal-valued columns converted.

    Notes
    -----
    Requires ``ast`` to be imported at module level. Previously the missing
    import raised ``NameError`` inside a bare ``except``, so no column was
    ever converted.
    """
    # Force the first 100 columns to be read as raw strings so pandas does not
    # guess dtypes; columns beyond index 99 fall back to pandas' own inference.
    df = pd.read_csv(filename_ds, converters={i: str for i in range(100)})

    for colname in df.columns.values.tolist():
        try:
            # Build the full list first so a failure on any cell leaves the
            # original column completely unchanged.
            parsed_values = [ast.literal_eval(cell) for cell in df[colname]]
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # These are the errors literal_eval raises for non-literal text
            # (e.g. PDB ids, chain names, sequences): keep the column as str.
            continue
        df[colname] = parsed_values

    return df

In [28]:
def combine_sets(df_binding_dis, df_binding_dis_int, df_binding_arp_int, df_cf):
    """Join the four per-analysis dataframes of one dataset into a summary table.

    Rows are matched on the (pdb, antigen_chain, antibody_chain) triple.

    Parameters
    ----------
    df_binding_dis : pandas.DataFrame
        Supplies the sequence columns plus "epitope_MWV_dict" /
        "paratope_MWV_dict", which become "epitope_dis" / "paratope_dis".
    df_binding_dis_int : pandas.DataFrame
        Supplies "epitope_num_interactions" / "paratope_num_interactions",
        which become "epitope_dis_interactions" / "paratope_dis_interactions".
    df_binding_arp_int : pandas.DataFrame
        Supplies the same four column names as above, which become
        "epitope_arp", "epitope_arp_interactions", "paratope_arp" and
        "paratope_arp_interactions".
    df_cf : pandas.DataFrame
        Supplies "canonical_form_CDRH1" and "canonical_form_CDRH2".

    Returns
    -------
    pandas.DataFrame
        Columns, in order: pdb, antigen_chain, antibody_chain,
        antigen_sequence, antibody_sequence, cdr_sequence, epitope_dis,
        epitope_dis_interactions, paratope_dis, paratope_dis_interactions,
        epitope_arp, epitope_arp_interactions, paratope_arp,
        paratope_arp_interactions, canonical_form_CDRH1, canonical_form_CDRH2.
    """
    key_cols = ["pdb", "antigen_chain", "antibody_chain"]

    # Keep only the needed columns of each input and give them their final names.
    dis_part = df_binding_dis[
        key_cols + ["antigen_sequence", "antibody_sequence", "cdr_sequence", "epitope_MWV_dict", "paratope_MWV_dict"]
    ].rename(columns={"epitope_MWV_dict": "epitope_dis", "paratope_MWV_dict": "paratope_dis"})

    dis_int_part = df_binding_dis_int[
        key_cols + ["epitope_num_interactions", "paratope_num_interactions"]
    ].rename(
        columns={
            "epitope_num_interactions": "epitope_dis_interactions",
            "paratope_num_interactions": "paratope_dis_interactions",
        }
    )

    arp_part = df_binding_arp_int[
        key_cols + ["epitope_MWV_dict", "epitope_num_interactions", "paratope_MWV_dict", "paratope_num_interactions"]
    ].rename(
        columns={
            "epitope_MWV_dict": "epitope_arp",
            "paratope_MWV_dict": "paratope_arp",
            "epitope_num_interactions": "epitope_arp_interactions",
            "paratope_num_interactions": "paratope_arp_interactions",
        }
    )

    cf_part = df_cf[key_cols + ["canonical_form_CDRH1", "canonical_form_CDRH2"]]

    # NOTE(review): this first join uses the pandas default (inner), unlike the
    # two outer joins below -- confirm that dropping unmatched rows is intended.
    distance_cols = key_cols + [
        "antigen_sequence",
        "antibody_sequence",
        "cdr_sequence",
        "epitope_dis",
        "epitope_dis_interactions",
        "paratope_dis",
        "paratope_dis_interactions",
    ]
    distance_table = pd.merge(dis_part, dis_int_part, on=key_cols)[distance_cols]

    # Outer joins keep complexes that are missing from one of the tables.
    with_arpeggio = pd.merge(distance_table, arp_part, how="outer", on=key_cols)
    return pd.merge(with_arpeggio, cf_part, how="outer", on=key_cols)


In [26]:
#Load the separate input tables. Files with "nb" in the name feed the sdAb frames, files with "fv" the flAb frames.

#Filtered datasets (sequences plus distance-based epitope/paratope)
df_sdab_filtered = read_dataset(filename_ds="Dataset_nb_filtered.csv")
df_flab_filtered = read_dataset(filename_ds="Dataset_fv_filtered.csv")

#Interaction counts belonging to the filtered (distance-based) datasets
df_sdab_dis_int = read_dataset(filename_ds="Dataset_nb_filtered_num_interactions.csv")
df_flab_dis_int = read_dataset(filename_ds="Dataset_fv_filtered_num_interactions.csv")

#Arpeggio-based epitope/paratope and interaction counts
df_sdab_arp_int = read_dataset(filename_ds="Dataset_nb_arpeggio_interactions.csv")
df_flab_arp_int = read_dataset(filename_ds="Dataset_fv_arpeggio_interactions.csv")

#Canonical form annotations
df_sdab_cf = read_dataset(filename_ds="Dataset_nb_filtered_canonical_form.csv")
df_flab_cf = read_dataset(filename_ds="Dataset_fv_filtered_canonical_form.csv")

In [29]:
#Build one summary table per antibody type by joining its four input tables
df_sdab_summary = combine_sets(
    df_binding_dis=df_sdab_filtered,
    df_binding_dis_int=df_sdab_dis_int,
    df_binding_arp_int=df_sdab_arp_int,
    df_cf=df_sdab_cf,
)
df_flab_summary = combine_sets(
    df_binding_dis=df_flab_filtered,
    df_binding_dis_int=df_flab_dis_int,
    df_binding_arp_int=df_flab_arp_int,
    df_cf=df_flab_cf,
)

In [30]:
#Store datasets in csv file
# df_sdab_summary.to_csv("Dataset_sdAbs_summary.csv", index=False)
# df_flab_summary.to_csv("Dataset_flAbs_summary.csv", index=False)