In [1]:
##### CREATE single-domain antibody dataset based on SAbDab #####
#@author: Henriette Capel
#@date: 23 Feb 2022

In [2]:
#Import modules
import numpy as np
from collections import defaultdict
import csv

In [3]:
#Import ABDB
from ABDB import database as db



In [4]:
#Constants
TYPE_ANTIGEN = 'protein' 
TYPE_METHOD = 'X-RAY DIFFRACTION'
RESOLUTION_CUTOFF = 3.0

In [5]:
#Functions
def get_abtype(p):
    #Save types of antibodies. 
    types = []
    for fab in p.get_fabs():
        if fab.VL == "NA":
            types.append("VHH")
        elif fab.VH == "NA":
            types.append("VL-only")
        elif fab.VL == fab.VH:
            types.append("scFv")
        else:
            types.append("Fv")        
    return types

def apply_filters(p, type_antigen, type_method, resolution_cutoff):
    if type_antigen in list(set([ag.agtype for ag in p.antigens])):
        if p.method == type_method:
            if float(p.get_resolution()) <= resolution_cutoff:
                return True
            
def get_general_stats(list_pdbs):
    #Function tells how many paired VHVL and total Fvs are present in the dataset 
    PAIRED_VHVL = 0
    TOTAL_FVS = 0

    for pdbid in list_pdbs:
        p = db.fetch(pdbid)
        if p.has_completefab():
            PAIRED_VHVL += 1
        for fab in p.get_fabs():
            TOTAL_FVS += 1

    SUMMARY_STATS = {'PDBs'     : len(list_pdbs),
                     'vhvl'     : PAIRED_VHVL,
                     'fvs'      : TOTAL_FVS}   
    
    return SUMMARY_STATS

def species_stats(list_pdbs):
    species = []
    species_vhh = set([])
    
    for pdbid in list_pdbs:
        p = db.fetch(pdbid)
        species.append(p.get_species())
        if p.fabs[0].VL == "NA":
            species_vhh.add(p.get_species())
            
    species_names_all, species_counts_all = np.unique(species, return_counts = True)
    species_names = []
    species_counts = []
    other_count = 0
    for i in range(len(species_names_all)):
        if species_counts_all[i] < 10:
            other_count += 1
        else:
            species_names.append(species_names_all[i])
            species_counts.append(species_counts_all[i])
    species_names = [x for _,x in sorted(zip(species_counts, species_names), reverse=True)]
    species_counts = sorted(species_counts, reverse=True)
    species_names.append('OTHER')
    species_counts.append(other_count)

    SPECIES_DATA = dict(zip(species_names, species_counts))
    
    return SPECIES_DATA

In [6]:
#Store selected pdbs in lists
selected_pdbs_nb = []
selected_pdbs_vlonly = []
selected_pdbs_fv = []

for pdbid in db:
    p = db.fetch(pdbid)
    #Note: PDB entry can have multiple types of antibodies
    if 'VHH' in get_abtype(p):
        if apply_filters(p, TYPE_ANTIGEN, TYPE_METHOD, RESOLUTION_CUTOFF) is True:
            selected_pdbs_nb.append(pdbid)
    if 'VL-only' in get_abtype(p):
        if apply_filters(p, TYPE_ANTIGEN, TYPE_METHOD, RESOLUTION_CUTOFF) is True:
            selected_pdbs_vlonly.append(pdbid)
    if 'Fv' in get_abtype(p):
        if apply_filters(p, TYPE_ANTIGEN, TYPE_METHOD, RESOLUTION_CUTOFF) is True:
            selected_pdbs_fv.append(pdbid)

In [7]:
print(f'The number of Nanobodies: {len(selected_pdbs_nb)}')
print(f'The number of VL-only antibodies: {len(selected_pdbs_vlonly)}')
print(f'The number of Fv antibodies: {len(selected_pdbs_fv)}')

The number of Nanobodies: 411
The number of VL-only antibodies: 12
The number of Fv antibodies: 1260


In [8]:
intersection_Nb_Fv = set(selected_pdbs_nb).intersection(selected_pdbs_fv)
intersection_Nb_vl = set(selected_pdbs_nb).intersection(selected_pdbs_vlonly)
intersection_vl_Fv = set(selected_pdbs_vlonly).intersection(selected_pdbs_fv)
print(f'Number of entries in both Nb and Fv dataset: {len(intersection_Nb_Fv)}')
print(f'Number of entries in both Nb and VL-only dataset: {len(intersection_Nb_vl)}')
print(f'Number of entries in both VL-only and Fv dataset: {len(intersection_vl_Fv)}')

Number of entries in both Nb and Fv dataset: 6
Number of entries in both Nb and VL-only dataset: 2
Number of entries in both VL-only and Fv dataset: 0


In [10]:
general_nb = get_general_stats(selected_pdbs_nb)
general_vl = get_general_stats(selected_pdbs_vlonly)
general_fv = get_general_stats(selected_pdbs_fv)
print(f'General statistics Nb dataset: {general_nb}')
print(f'General statistics VL-only dataset: {general_vl}')
print(f'General statistics Fv dataset: {general_fv}')

General statistics Nb dataset: {'PDBs': 411, 'vhvl': 6, 'fvs': 799}
General statistics VL-only dataset: {'PDBs': 12, 'vhvl': 0, 'fvs': 27}
General statistics Fv dataset: {'PDBs': 1260, 'vhvl': 1260, 'fvs': 2060}


In [11]:
species_nb = species_stats(selected_pdbs_nb)
species_vl = species_stats(selected_pdbs_vlonly)
species_fv = species_stats(selected_pdbs_fv)
print(f'Species statistics Nb dataset: {species_nb}')
print(f'Species statistics VL-only dataset: {species_vl}')
print(f'Species statistics Fv dataset: {species_fv}')

Species statistics Nb dataset: {'LAMA GLAMA': 186, 'VICUGNA PACOS': 90, 'CAMELUS DROMEDARIUS': 43, 'SYNTHETIC CONSTRUCT': 34, 'HOMO SAPIENS': 23, 'CAMELIDAE': 11, 'OTHER': 13}
Species statistics VL-only dataset: {'HOMO SAPIENS': 12, 'OTHER': 0}
Species statistics Fv dataset: {'HOMO SAPIENS': 712, 'MUS MUSCULUS': 449, 'RATTUS NORVEGICUS': 21, 'SYNTHETIC CONSTRUCT': 18, 'CHIMERIC HOMO SAPIENS/MUS MUSCULUS': 12, 'ORYCTOLAGUS CUNICULUS': 11, 'OTHER': 18}


In [25]:
# #Store selected pdbs in files
# names = ["Nb", "VL", "Fv"]
# list_pdbs = [selected_pdbs_nb, selected_pdbs_vlonly, selected_pdbs_fv]

# for i in range(len(names)):
#     with open(f'/data/icarus/capel/{names[i]}.txt', 'w') as f:
#         for pdbid in list_pdbs[i]:
#             f.write(f"{pdbid}\n")