In [7]:
import os
import sys
import tarfile
import requests
import pandas as pd
import argparse
from Bio import SeqIO
from pathlib import Path
from bs4 import BeautifulSoup
from tqdm import tqdm
import wget
import time
from pprint import pprint as pp
from concurrent.futures import ThreadPoolExecutor, as_completed
import subprocess
import sys

# Custom imports from aggrepred package
# NOTE: '__file__' is a quoted string literal here, so os.path.dirname() returns ''
# and top_folder_path resolves to the parent of the current working directory.
# The `aggrepred` package is therefore expected to sit in the parent of the directory
# the notebook is run from (unless it is importable from elsewhere).
top_folder_path = os.path.abspath(os.path.join(os.path.dirname('__file__'), '..'))
sys.path.insert(0, top_folder_path)

# NOTE(review): the star import hides which names come from graph_utils;
# process_pdb2graph_withseq (called in the last cell) is presumably one of them.
from aggrepred.graph_utils import *



In [10]:
# Default locations used by the download helpers below (relative paths).
# NOTE(review): the summary default is a *file* path, while the score and PDB defaults
# are directories (trailing slash) — check that each consumer treats it accordingly.
_DEFAULT_SUMMARY_FILE_PATH = "../data/summary/summary.csv"
_DEFAULT_SCORE_FILE_PATH = "../data/score/"
_DEFAULT_PDB_FILE_PATH = "../data/pdb/"

In [7]:
def download_file_request(url, output_path, isprint=True, timeout=60):
    """
    Download `url` with `requests` and write the response body to `output_path`.

    Args:
        url (str): Address of the resource to fetch.
        output_path (str): File the response body is written to (binary mode).
        isprint (bool): When True, print a confirmation after a successful download.
        timeout (float or tuple): Timeout in seconds handed to `requests.get`, so a
            stalled server cannot block the caller (or a worker thread) forever.

    Returns:
        bool: True when the file was written, False when the request failed.

    Request failures (connection errors, timeouts, HTTP error statuses) are reported
    with `print` and not raised. Errors while opening or writing `output_path`
    (e.g. OSError) are not caught here and propagate to the caller.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for HTTP status codes indicating failure

        with open(output_path, 'wb') as f:  # Use 'wb' mode for writing binary data
            f.write(response.content)
        if isprint:
            print(f"Successfully downloaded {url} to {output_path}")
        return True
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return False

def download_file_wget(url, output_path):
    """
    Fetch `url` with the `wget` package unless `output_path` already exists.

    Args:
        url (str): The URL of the file to download.
        output_path (str): Destination path; an existing file here is left untouched.

    Any exception raised along the way is printed rather than propagated.
    """
    try:
        # Nothing to do when a previous run already produced the file
        if os.path.exists(output_path):
            print(f"File already exists at {output_path}. Skipping download.")
            return

        wget.download(url, output_path)
        print(f"Successfully downloaded {url} to {output_path}")
    except Exception as e:
        print(f"Error downloading {url}: {e}")


def extract_and_remove_tar(tar_path):
    """
    Extract a tar archive next to itself, then delete the archive.

    Members are only extracted when they stay inside the destination directory,
    so a crafted archive (e.g. a member named "../x") cannot write elsewhere.

    Parameters:
        tar_path (str): The path to the .tar file (compressed archives that
            `tarfile.open(..., 'r')` can read work as well).

    Returns:
        bool: True when the archive was extracted and removed, False otherwise.
            On failure the error is printed and the archive is left in place.
    """
    # A bare file name has an empty dirname; extract into the current directory then
    dest_dir = os.path.dirname(tar_path) or '.'
    try:
        with tarfile.open(tar_path, 'r') as tar:
            if hasattr(tarfile, 'data_filter'):
                # Built-in extraction filter: rejects absolute paths, path traversal,
                # links pointing outside the destination and special files
                tar.extractall(path=dest_dir, filter='data')
            else:
                # Older Python without extraction filters: check member names by hand
                # (link targets are not inspected in this fallback)
                dest_root = os.path.realpath(dest_dir)
                for member in tar.getmembers():
                    target = os.path.realpath(os.path.join(dest_root, member.name))
                    if os.path.commonpath([dest_root, target]) != dest_root:
                        raise ValueError(
                            f"refusing to extract {member.name!r} outside {dest_dir}")
                tar.extractall(path=dest_dir)

        # Only reached after a complete extraction: remove the archive itself
        os.remove(tar_path)

        print(f"Successfully extracted and removed {tar_path}")
        return True
    except Exception as e:
        print(f"Error occurred: {e}")
        return False

def download_pdb_files(job_names, job_ids, 
                        pdb_file_path= _DEFAULT_PDB_FILE_PATH,
                        max_workers=16):
    """
    Download the PDB files of proteins from the Aggrescan3D database, one per job,
    based on the job ids from the summary file.

    Args:
        job_names: Names used for the output files (`<job_name>.pdb`), one per job.
        job_ids: Aggrescan3D job ids, in the same order as `job_names`.
        pdb_file_path (str): Directory the files are written to; created if missing.
        max_workers (int): Number of download threads.

    Raises:
        ValueError: If `job_names` and `job_ids` differ in length (pairing them
            positionally would otherwise silently drop the surplus entries).

    Failed downloads do not abort the batch: request errors are printed by
    `download_file_request`, any other worker exception is printed at the end.
    """
    job_names = list(job_names)
    job_ids = list(job_ids)
    if len(job_names) != len(job_ids):
        raise ValueError(
            "job_names and job_ids must have the same length "
            f"({len(job_names)} != {len(job_ids)})")

    # The download link shown on each job page
    # (https://biocomp.chem.uw.edu.pl/A3D2/hproteome_job/{job_id}/) always has the form
    # below and differs only in the job id, so it is built directly instead of
    # scraping every page.
    template_url = 'https://biocomp.chem.uw.edu.pl/A3D2/compute_static/{}/output.pdb'
    download_jobs = [
        (template_url.format(job_id), os.path.join(pdb_file_path, job_name + '.pdb'))
        for job_name, job_id in zip(job_names, job_ids)
    ]

    #create directories if they don't exist
    os.makedirs(pdb_file_path, exist_ok=True)

    failures = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {
            executor.submit(download_file_request, url, path, False): url
            for url, path in download_jobs
        }
        print('Downloading pdb files to {}...'.format(pdb_file_path))
        for future in tqdm(as_completed(future_to_url), total=len(future_to_url)):
            # Exceptions raised inside a worker stay hidden in the future unless asked for
            error = future.exception()
            if error is not None:
                failures.append((future_to_url[future], error))

    for url, error in failures:
        print(f"Error downloading {url}: {error}")


def download_aggrescan3d_summary_file(summary_file_path=_DEFAULT_SUMMARY_FILE_PATH, score_file_path=_DEFAULT_SCORE_FILE_PATH, max_workers=8):
    """
    Download the summary file of every species listed on the Aggrescan3D database page.

    Args:
        summary_file_path (str): Directory the summary CSVs are saved in. A path ending
            in ".csv" (such as the shared default) is taken to be a file path and its
            parent directory is used instead.
        score_file_path (str): Score directory; it is only created here, no score
            archive is downloaded by this function.
        max_workers (int): Number of download threads.

    Raises:
        requests.exceptions.RequestException: If the database page cannot be fetched.

    Failed downloads do not abort the batch: request errors are printed by
    `download_file_request`, any other worker exception is printed at the end.
    """
    base_url = 'https://biocomp.chem.uw.edu.pl/'
    database_url = base_url + 'A3D2/MODB'
    query = requests.get(database_url, timeout=60)
    query.raise_for_status()
    html = BeautifulSoup(query.content, 'html.parser')

    # Second and third <h4> of the 'Contact' element hold the summary / score links
    link_groups = html.find(id='Contact').find_all('h4')
    summary_download_urls = [base_url+link['href'] for link in link_groups[1].find_all('a')]
    score_download_urls = [base_url+link['href'] for link in link_groups[2].find_all('a')]

    # Using a ".csv" file path as a directory prefix would create a directory literally
    # named "summary.csv" (breaking later writes to that file) and glue the file names
    # onto it, so fall back to the parent directory in that case.
    summary_dir = summary_file_path
    if summary_dir.lower().endswith('.csv'):
        summary_dir = os.path.dirname(summary_dir)
    summary_dir = summary_dir or '.'

    #name of score download file is more organised
    # NOTE(review): this pairs the i-th summary link with the i-th score link by
    # position — confirm both lists are in the same species order.
    summary_save_paths = [
        os.path.join(summary_dir,
                     os.path.basename(score_download_url).replace('SCORES.tar.gz', 'SUMMARY.csv'))
        for score_download_url in score_download_urls
    ]

    #create directories if they don't exist
    os.makedirs(summary_dir, exist_ok=True)
    os.makedirs(score_file_path, exist_ok=True)

    failures = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {
            executor.submit(download_file_request, url, path): url
            for url, path in zip(summary_download_urls, summary_save_paths)
        }
        print('Downloading summary aggregation files to {}/ from {} ...'.format(
            summary_dir, database_url))
        for future in tqdm(as_completed(future_to_url), total=len(future_to_url)):
            # Exceptions raised inside a worker stay hidden in the future unless asked for
            error = future.exception()
            if error is not None:
                failures.append((future_to_url[future], error))

    for url, error in failures:
        print(f"Error downloading {url}: {error}")


def download_sabdab_summary_file(summary_file_path=_DEFAULT_SUMMARY_FILE_PATH,
                                 seqid=95,
                                 paired=True,
                                 nr_complex='All',
                                 nr_rfactor='',
                                 nr_res=3):
    """
    Run a SAbDab search and download the resulting summary file.

    The search page is requested with the given filters, the first link inside its
    'downloads' element is taken as the summary file and saved to `summary_file_path`.

    Args:
        summary_file_path (str): File the summary is written to; its parent
            directory is created if needed.
        seqid, paired, nr_complex, nr_rfactor, nr_res: Forwarded unchanged as query
            parameters of the SAbDab search request.

    Raises:
        requests.exceptions.RequestException: If the search request fails.
        RuntimeError: If the result page contains no summary download link.
    """
    base_url = 'http://opig.stats.ox.ac.uk'
    search_url = base_url + '/webapps/newsabdab/sabdab/search/'
    params = dict(seqid=seqid,
                  paired=paired,
                  nr_complex=nr_complex,
                  nr_rfactor=nr_rfactor,
                  nr_res=nr_res)
    # Generous timeout: the server may need a while to answer, but never hang forever
    query = requests.get(search_url, params=params, timeout=300)
    query.raise_for_status()
    html = BeautifulSoup(query.content, 'html.parser')

    downloads = html.find(id='downloads')
    link = downloads.find('a') if downloads is not None else None
    href = link.get('href') if link is not None else None
    if not href:
        raise RuntimeError(
            'No summary download link found on {}; the page layout may have changed '
            '(the dataset can also be downloaded manually).'.format(query.url))
    summary_file_url = base_url + href
    print('Downloading sabdab summary to {} from: {} ...'.format(
        summary_file_path, summary_file_url))

    # A bare file name has no directory part, and os.makedirs('') would raise
    target_dir = os.path.dirname(summary_file_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    download_file_request(summary_file_url, summary_file_path)


In [8]:
# Fetch the SAbDab summary twice: once with seqid=90 and once with seqid=60
for summary_path, identity_cutoff in (
    ("../data/summary/summary90.csv", 90),
    ("../data/summary/summary60.csv", 60),
):
    download_sabdab_summary_file(summary_file_path=summary_path,
                                 seqid=identity_cutoff)

Downloading sabdab summary to ../data/summary/summary90.csv from: http://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/sabdab/summary/20241017_0760107/ ...
../data/summary
Successfully downloaded http://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/sabdab/summary/20241017_0760107/ to ../data/summary/summary90.csv
Downloading sabdab summary to ../data/summary/summary60.csv from: http://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/sabdab/summary/20241017_0187847/ ...
../data/summary
Successfully downloaded http://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/sabdab/summary/20241017_0187847/ to ../data/summary/summary60.csv


#### If the download above does not work, download the antibody dataset manually from [SAbDab](https://opig.stats.ox.ac.uk/webapps/sabdab-sabpred/sabdab/search/)



In [4]:
# The downloaded SAbDab summary is read as tab-separated text (despite the .csv file name)
summary90_df = pd.read_csv("../data/summary/summary90.csv", sep='\t')
# Bare last expression: show the frame as the cell output
summary90_df

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,date,...,scfv,engineered,heavy_subclass,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid
0,3u1s,H,L,0,,,,,IMMUNE SYSTEM,09/30/11,...,False,True,IGHV1,IGKV2,Kappa,,,,,
1,5uxq,H,L,0,,,,,IMMUNE SYSTEM,02/23/17,...,False,True,IGHV1,IGKV2,Kappa,,,,,
2,5uxq,A,B,0,,,,,IMMUNE SYSTEM,02/23/17,...,False,True,IGHV1,IGKV2,Kappa,,,,,
3,4nwu,H,L,0,,,,,IMMUNE SYSTEM,12/06/13,...,False,True,IGHV1,IGKV4,Kappa,,,,,
4,8ezl,H,H,0,A,protein,,25 kda ookinete surface antigen,IMMUNE SYSTEM,08/16/23,...,True,True,unknown,unknown,unknown,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3686,5ywf,B,A,0,,,,,IMMUNE SYSTEM,11/29/17,...,False,False,IGHV9,IGKV3,Kappa,,,,,
3687,8ry2,A,B,0,,,,,CYTOKINE,07/31/24,...,False,True,IGHV5,IGKV1,Kappa,,,,,
3688,8ry2,H,L,0,,,,,CYTOKINE,07/31/24,...,False,True,IGHV5,IGKV1,Kappa,,,,,
3689,8sve,H,L,0,,,,,CYTOKINE,05/22/24,...,False,True,IGHV2,IGKV1,Kappa,,,,,


In [5]:
# Size of the dataset before entries without Aggrescan3D results are dropped:
# distinct PDB codes vs. rows (one row per heavy/light chain pair)
unique_pdb_ids = summary90_df['pdb'].unique()
print("Before Excluding the failed calculation antibody")
print("total number of antibody:", len(unique_pdb_ids))
print("total number of paired chains:", summary90_df.shape[0])

Before Excluding the failed calculation antibody
total number of antibody: 1976
total number of paired chains: 3691


## Calculate the aggregation score of each antibody with Aggrescan3D

In [6]:
# The summary has one row per heavy/light chain pair, so the same PDB code occurs on
# several rows. Keep each code once (first-occurrence order) so that every structure
# is listed a single time.
pdb_values = summary90_df['pdb'].drop_duplicates().tolist()

# Specify the file path
file_path = 'pdb_values.txt'

# Write the 'pdb' values to a text file, one code per line
with open(file_path, 'w') as file:
    file.writelines(f"{value}\n" for value in pdb_values)


### After obtaining all the necessary PDB codes, run the script `run_a3d.py` from the command line.
### Note that the A3D tool requires Python 2, whereas this notebook runs in a Python 3 environment.
### That is why two conda environments are needed: `aggrepred` (for everything else) and `a3d` (only for running A3D).

## Before continuing with the next part, make sure Aggrescan3D has been run so that a score file exists for each antibody



## Exclude the antibodies whose Aggrescan3D calculation failed

In [144]:
# Folder scanned for score files (presumably the Aggrescan3D outputs, one CSV per structure).
# NOTE(review): the extraction cell further down reads from '../data/score_antibody/'
# instead — confirm which folder is the right one.
directory = '../data/score/'
# Name of every .csv file in that folder, cut at the first '.'
pdb_files = [filename.split('.')[0] for filename in os.listdir(directory) if filename.endswith('.csv')]

In [145]:
# Number of .csv files found in `directory`
len(pdb_files)

2109

In [146]:
# Keep only the rows whose PDB code has a matching entry in pdb_files.
# Note: this rebinds summary90_df, so the unfiltered frame is gone until the
# loading cell is run again.
summary90_df = summary90_df[summary90_df['pdb'].isin(pdb_files)]
# Bare last expression: show the filtered frame as the cell output
summary90_df

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,date,...,scfv,engineered,heavy_subclass,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid
0,3u1s,H,L,0,,,,,IMMUNE SYSTEM,09/30/11,...,False,True,IGHV1,IGKV2,Kappa,,,,,
1,5uxq,H,L,0,,,,,IMMUNE SYSTEM,02/23/17,...,False,True,IGHV1,IGKV2,Kappa,,,,,
2,5uxq,A,B,0,,,,,IMMUNE SYSTEM,02/23/17,...,False,True,IGHV1,IGKV2,Kappa,,,,,
3,4nwu,H,L,0,,,,,IMMUNE SYSTEM,12/06/13,...,False,True,IGHV1,IGKV4,Kappa,,,,,
4,8ezl,H,H,0,A,protein,,25 kda ookinete surface antigen,IMMUNE SYSTEM,08/16/23,...,True,True,unknown,unknown,unknown,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3647,7fab,H,L,0,,,,,IMMUNOGLOBULIN,11/20/91,...,False,True,IGHV4,IGLV1,Lambda,,,,,
3648,6nmv,H,L,0,S,protein,,tyrosine-protein phosphatase non-receptor type...,IMMUNE SYSTEM,01/11/19,...,False,True,IGHV3,unknown,unknown,,,,,
3649,1fh5,H,L,0,,,,,IMMUNE SYSTEM,07/31/00,...,False,False,IGHV5,IGKV5,Kappa,,,,,
3650,5ywf,D,C,0,,,,,IMMUNE SYSTEM,11/29/17,...,False,False,IGHV9,IGKV3,Kappa,,,,,


In [147]:
# Same two figures as before the filter: distinct PDB codes and chain-pair rows
remaining_pdb_ids = summary90_df['pdb'].unique()
print("After Excluding the failed calculation antibody")
print("total number of antibody:", len(remaining_pdb_ids))
print("total number of paired chains:", summary90_df.shape[0])

After Excluding the failed calculation antibody
total number of antibody: 1854
total number of paired chains: 3454


## Extract the seq and score

In [164]:
import os
import pandas as pd

# Directory containing the per-structure score tables (<pdb_id>.csv)
directory = '../data/score_antibody/'  # Adjust to your directory path if not correct

def extract_chain_data(pdb_id, chain_type):
    """
    Read the sequence and the score list of one chain from `<directory>/<pdb_id>.csv`.

    Args:
        pdb_id (str): PDB code; selects the score file.
        chain_type (str): Chain identifier matched against the file's 'chain' column.

    Returns:
        tuple: (sequence, score_list) — the concatenated 'residue_name' values and the
            'score' values of the matching rows (an empty string and an empty list when
            the chain does not occur in the file), or (None, None) when the score file
            does not exist.
    """
    file_path = os.path.join(directory, pdb_id + '.csv')
    if not os.path.exists(file_path):
        return None, None

    pdb_df = pd.read_csv(file_path)
    chain_df = pdb_df[pdb_df['chain'] == chain_type]
    sequence = ''.join(chain_df['residue_name'].values)
    score_list = chain_df['score'].tolist()
    return sequence, score_list

# Initialize a list to store the extracted data
extracted_data = []
# PDB codes of chain pairs that had to be skipped because no score file was found
missing_score_ids = []

# Loop through each PDB entry and extract sequences and scores for both H and L chains
for _, row in summary90_df.iterrows():
    pdb_id = row['pdb']
    h_chain = row["Hchain"]
    l_chain = row["Lchain"]
    h_sequence, h_scores = extract_chain_data(pdb_id, h_chain)
    l_sequence, l_scores = extract_chain_data(pdb_id, l_chain)

    # Without a score file both results are None and len() below would raise
    if h_sequence is None or l_sequence is None:
        missing_score_ids.append(pdb_id)
        continue

    extracted_data.append({
        'ID': pdb_id,
        'Hchain': h_chain,
        'Lchain': l_chain,
        'lenH': len(h_sequence),
        'lenL': len(l_sequence),
        'Hchain_sequence': h_sequence,
        'Hchain_scores': h_scores,
        'Lchain_sequence': l_sequence,
        'Lchain_scores': l_scores
    })

if missing_score_ids:
    print(f"Skipped {len(missing_score_ids)} chain pair(s) without a score file in {directory}")




In [165]:
# Create a new DataFrame from the extracted data: one row per dict collected in
# extracted_data, with the dict keys as columns
extracted_df = pd.DataFrame(extracted_data)

# Display the new DataFrame (bare last expression of the cell)
extracted_df

Unnamed: 0,ID,Hchain,Lchain,lenH,lenL,Hchain_sequence,Hchain_scores,Lchain_sequence,Lchain_scores
0,3u1s,H,L,238,217,QVQLVQSGAEVKKPGSSVKVSCKASGNSFSNHDVHWVRQATGQGLE...,"[-1.3041, -0.8502, -1.106, -0.2978, 0.672, 0.0...",VVITQSPLFLPVTPGEAASLSCKCSHSLQHSTGANYLAWYLQRPGQ...,"[2.2901, 2.2593, 0.0, -0.2676, 0.0, 0.0103, 0...."
1,5uxq,H,L,239,209,QLEQSGAEVKKPGSSVKVSCKASGNTFSKYDVHWVRQATGQGLEWV...,"[-2.1262, 0.0, -2.9957, 0.0, -1.366, -1.2997, ...",TVVTQSPLSLPVTPGEAASMSCTSTQSLRHSNGANYLAWYQHKPGQ...,"[0.3407, 1.524, 0.0, -0.0221, 0.0, -0.0858, 0...."
2,5uxq,A,B,234,217,QLEQSGAEVKKPGSSVKVSCKAKYDVHWVRQATGQGLEWVGWMSHE...,"[-1.752, -1.6886, -2.5538, 0.0, -1.2796, -1.24...",TVVTQSPLSLPVTPGEAASMSCTSTQSLRHSNGANYLAWYQHKPGQ...,"[0.3314, 1.3221, 0.0, -0.0843, -0.2196, -0.215..."
3,4nwu,H,L,231,220,VQLVQSGAEVKKPGSSVKVSCKASGGDTFSNYAISWVRQAPGQGFE...,"[0.8732, -0.2832, 0.0, 0.3784, 0.0, -0.5198, -...",DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNNKNYLTWYQQKP...,"[-1.5465, 0.0, 0.8229, 0.0, -0.6331, 0.0, -1.1..."
4,8ezl,H,H,254,254,QITLKESGPTLVKPTQTLTLTCTFSGSSLSTSGVGVGWIRQPPGKA...,"[-1.473, -0.5983, -0.8699, 0.0, -1.8994, 0.0, ...",QITLKESGPTLVKPTQTLTLTCTFSGSSLSTSGVGVGWIRQPPGKA...,"[-1.473, -0.5983, -0.8699, 0.0, -1.8994, 0.0, ..."
...,...,...,...,...,...,...,...,...,...
3449,7fab,H,L,209,204,AVQLEQSGPGLVRPSQTLSLTCTVSGTSFDDYYWTWVRQPPGRGLE...,"[-0.3334, -0.5101, -1.762, 0.0, -2.5669, -1.47...",ASVLTQPPSVSGAPGQRVTISCTGSSSNIGAGHNVKWYQQLPGTAP...,"[0.2098, 0.5373, 1.5762, 0.0, 0.1975, 0.0, -0...."
3450,6nmv,H,L,205,206,DVQLVESGGGVVRPGESLTLSCTASGFTFTSSTMNWVRQAPGEGLD...,"[-1.9175, -1.1187, -1.3395, 0.0, 0.4296, 0.0, ...",ALTQPASVSANPGETVKITCFGSSGNYGWFQQKSPGSAPVTVIHYN...,"[0.2785, 0.0, 0.3839, 0.1696, -0.1682, -0.0434..."
3451,1fh5,H,L,198,213,SGGGLVKPAGSLKLSCAASGFTFSSYYMYWVRQTPDKRLEWVATIS...,"[-0.5968, -0.7365, -0.1267, 0.3451, 0.2102, 0....",DIVLTQSPATLSVTPGESVSLSCRASQSISNNLHWYQQKSHESPRL...,"[-1.208, 0.0, 0.8122, 0.0, -0.4456, -0.6001, -..."
3452,5ywf,D,C,218,215,QVQLMESGPELKKPGETVKISCKASGYTFTDYSMHWVKQAPGKGLK...,"[-1.3718, -0.6457, -1.1689, 0.0, -0.596, 0.0, ...",DIVLTQSPASLAVSLGQRATISCRASQSVSTSYMHWYQQKPGQPPR...,"[-1.2521, 0.0, 0.8261, 0.0, -0.5678, 0.0, -0.4..."


In [166]:
# Keep a pair only if both chains yielded a non-empty sequence and a non-empty score list
has_h_sequence = extracted_df['Hchain_sequence'].str.len() > 0
has_l_sequence = extracted_df['Lchain_sequence'].str.len() > 0
has_h_scores = extracted_df['Hchain_scores'].apply(lambda scores: len(scores) > 0)
has_l_scores = extracted_df['Lchain_scores'].apply(lambda scores: len(scores) > 0)
extracted_df = extracted_df[has_h_sequence & has_l_sequence & has_h_scores & has_l_scores]

# Drop rows whose heavy and light chain carry the same chain ID (e.g. 8ezl H/H above,
# where both columns end up with the identical sequence)
distinct_chain_ids = extracted_df.Hchain != extracted_df.Lchain
extracted_df = extracted_df[distinct_chain_ids]

In [167]:
# Show the filtered frame as the cell output
extracted_df

Unnamed: 0,ID,Hchain,Lchain,lenH,lenL,Hchain_sequence,Hchain_scores,Lchain_sequence,Lchain_scores
0,3u1s,H,L,238,217,QVQLVQSGAEVKKPGSSVKVSCKASGNSFSNHDVHWVRQATGQGLE...,"[-1.3041, -0.8502, -1.106, -0.2978, 0.672, 0.0...",VVITQSPLFLPVTPGEAASLSCKCSHSLQHSTGANYLAWYLQRPGQ...,"[2.2901, 2.2593, 0.0, -0.2676, 0.0, 0.0103, 0...."
1,5uxq,H,L,239,209,QLEQSGAEVKKPGSSVKVSCKASGNTFSKYDVHWVRQATGQGLEWV...,"[-2.1262, 0.0, -2.9957, 0.0, -1.366, -1.2997, ...",TVVTQSPLSLPVTPGEAASMSCTSTQSLRHSNGANYLAWYQHKPGQ...,"[0.3407, 1.524, 0.0, -0.0221, 0.0, -0.0858, 0...."
2,5uxq,A,B,234,217,QLEQSGAEVKKPGSSVKVSCKAKYDVHWVRQATGQGLEWVGWMSHE...,"[-1.752, -1.6886, -2.5538, 0.0, -1.2796, -1.24...",TVVTQSPLSLPVTPGEAASMSCTSTQSLRHSNGANYLAWYQHKPGQ...,"[0.3314, 1.3221, 0.0, -0.0843, -0.2196, -0.215..."
3,4nwu,H,L,231,220,VQLVQSGAEVKKPGSSVKVSCKASGGDTFSNYAISWVRQAPGQGFE...,"[0.8732, -0.2832, 0.0, 0.3784, 0.0, -0.5198, -...",DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNNKNYLTWYQQKP...,"[-1.5465, 0.0, 0.8229, 0.0, -0.6331, 0.0, -1.1..."
5,6urh,H,L,242,213,VQLEQSGAEVKKPGSSVKVSCKASGGTFSSFVINWVRQAPGQGLEW...,"[1.0415, -0.8273, 0.0, -2.2169, 0.0, -1.0435, ...",EIELTQSPATLSVSPGESATLSCRASQSVSDNLAWYQQKPGQAPRL...,"[-2.626, 0.0, -2.8836, 0.0, -1.6899, -1.1665, ..."
...,...,...,...,...,...,...,...,...,...
3449,7fab,H,L,209,204,AVQLEQSGPGLVRPSQTLSLTCTVSGTSFDDYYWTWVRQPPGRGLE...,"[-0.3334, -0.5101, -1.762, 0.0, -2.5669, -1.47...",ASVLTQPPSVSGAPGQRVTISCTGSSSNIGAGHNVKWYQQLPGTAP...,"[0.2098, 0.5373, 1.5762, 0.0, 0.1975, 0.0, -0...."
3450,6nmv,H,L,205,206,DVQLVESGGGVVRPGESLTLSCTASGFTFTSSTMNWVRQAPGEGLD...,"[-1.9175, -1.1187, -1.3395, 0.0, 0.4296, 0.0, ...",ALTQPASVSANPGETVKITCFGSSGNYGWFQQKSPGSAPVTVIHYN...,"[0.2785, 0.0, 0.3839, 0.1696, -0.1682, -0.0434..."
3451,1fh5,H,L,198,213,SGGGLVKPAGSLKLSCAASGFTFSSYYMYWVRQTPDKRLEWVATIS...,"[-0.5968, -0.7365, -0.1267, 0.3451, 0.2102, 0....",DIVLTQSPATLSVTPGESVSLSCRASQSISNNLHWYQQKSHESPRL...,"[-1.208, 0.0, 0.8122, 0.0, -0.4456, -0.6001, -..."
3452,5ywf,D,C,218,215,QVQLMESGPELKKPGETVKISCKASGYTFTDYSMHWVKQAPGKGLK...,"[-1.3718, -0.6457, -1.1689, 0.0, -0.596, 0.0, ...",DIVLTQSPASLAVSLGQRATISCRASQSVSTSYMHWYQQKPGQPPR...,"[-1.2521, 0.0, 0.8261, 0.0, -0.5678, 0.0, -0.4..."


In [168]:
# Distinct PDB codes and chain-pair rows left after the extraction filters
extracted_ids = extracted_df['ID'].unique()
print("After Excluding the failed antibody from extraction")
print("total number of antibody:", len(extracted_ids))
print("total number of paired chains:", extracted_df.shape[0])

After Excluding the failed antibody from extraction
total number of antibody: 1755
total number of paired chains: 3223


## split to train-valid-test 80-10-10

In [169]:
from sklearn.model_selection import train_test_split, KFold
# NOTE(review): rows are shuffled individually, so chain pairs of the same PDB ID
# (e.g. 5uxq has an H/L and an A/B row) can land in different splits. If the test set
# is meant to measure generalisation to unseen antibodies, consider a group-aware
# split keyed on 'ID'.
sampled_df = extracted_df.copy()

# 80% train; the remaining 20% is halved into validation (10%) and test (10%)
train_set, temp_set = train_test_split(sampled_df, test_size=0.2, random_state=42)
valid_set, test_set = train_test_split(temp_set, test_size=0.5, random_state=42)

# Label each part. assign() returns a new frame, which avoids the
# SettingWithCopyWarning raised by setting a column on the split outputs directly.
train_set = train_set.assign(split='train')
valid_set = valid_set.assign(split='valid')
test_set = test_set.assign(split='test')

# Combine the train, validation, and test sets back into sampled_df
sampled_df = pd.concat([train_set, valid_set, test_set])

# Move the 'split' column to the front
cols = sampled_df.columns.tolist()
cols.insert(0, cols.pop(cols.index('split')))
sampled_df = sampled_df[cols]

# Reset the index of sampled_df
sampled_df = sampled_df.reset_index(drop=True)


sampled_df

Unnamed: 0,split,ID,Hchain,Lchain,lenH,lenL,Hchain_sequence,Hchain_scores,Lchain_sequence,Lchain_scores
0,train,5cd3,E,F,220,206,QVQLVESGTQFRRPGASVRLSCEASGYTFISSFIHWIRQGPGQGLE...,"[-1.4401, -0.9416, -1.4235, 0.0, -0.2421, 0.0,...",IQMTQSPVTLSASIGDRVTITCRASQRIDNWVAWYQQKPGRAPKLL...,"[-0.1236, -1.3844, 0.0, -1.1614, 0.0, -0.1844,..."
1,train,2b1h,H,L,226,215,EIQLEQSGAEVKKSGESLKISCQTSGYSFSDYWIGWVRQMPGKGLE...,"[-2.1125, -1.2399, -2.1454, 0.0, -2.5531, 0.0,...",QSVLTQPPSASGTPGQRISISCSGTSSNVENNYVYWYQHLPGTAPK...,"[-1.0325, -0.0439, 1.3282, 0.0, 0.0978, 0.0, -..."
2,train,3cfk,N,M,224,215,EVKLLESGGGLAQPGGSLKLSCAASGFDFRRYWMTWVRQAPGKGLE...,"[-1.2824, -0.443, 0.0, 0.0, -0.1279, -0.486, -...",ELVVTQESALTTSPGETVTLTCRSSSGAVTTSNYATWVQEKPDHLF...,"[-1.3419, 0.0066, 1.1607, 0.0, -0.2615, 0.0, -..."
3,train,8eee,H,L,125,115,VQLVESGGGLAKPGGSLRLSCAASGFTFSDYYMDWVRQAPGKGLEW...,"[1.314, 0.3637, 0.7836, 1.2758, 0.0, -0.2252, ...",PVLTQPPSLSASPGASARLPCTLSSDLSVGSKNMYWYQQKPGSAPR...,"[0.1796, 1.4011, 0.0, 0.1413, 0.0, -0.2944, -0..."
4,train,6yxd,H,L,119,107,EVLLQQSGPELVKPGASVRITCKASGYTFTDFNMDWVKQSPGKSLE...,"[-1.5851, -0.3245, -0.0573, 0.0, -1.6006, 0.0,...",DIQMTQSPASLSASVGETVTITCRASGNIHNFLAWYQQKQGKSPQV...,"[-2.0521, -1.5957, -1.9033, 0.0, -1.1156, -0.8..."
...,...,...,...,...,...,...,...,...,...,...
3218,test,6x1t,A,B,221,214,QSVKESEGGLFKPTDTLTLTCTASGFSLNGYGVIWVRQAPGKGLEW...,"[-1.4382, -1.4155, 0.0, -2.0769, 0.0, -1.3772,...",DMTQTPSSKSVPVGDTVTINCQASESVYSNNRLSWFQQKPGQPPKL...,"[-2.5146, -1.4996, -1.1187, 0.0, -0.9498, -1.0..."
3219,test,4k3j,H,L,221,217,EVQLVESGGGLVQPGGSLRLSCAASGYTFTSYWLHWVRQAPGKGLE...,"[-2.0939, -1.5332, -1.4309, 0.0, 0.5382, 0.0, ...",DIQMTQSPSSLSASVGDRVTITCKSSQSLLYTSSQKNYLAWYQQKP...,"[-2.1651, 0.0, -2.2596, 0.0, -1.3952, 0.0, -0...."
3220,test,4rgn,B,C,213,213,EVNLIESGGDLVKPGGSLKLSCATSGFTFSAYGLSWVRQTPERRLE...,"[-1.9599, -0.8577, -0.5375, 0.0, 1.4901, 0.444...",DIVMTQSPATLSVTPGDRVSLSCRASQSIGDYLHWYQQKSHESPRL...,"[-1.3525, 0.0, 0.7166, 0.0, -0.6232, -0.6413, ..."
3221,test,6z7w,H,G,219,214,QVQLQQSGAELVRPGTSVKVSCKASGYAFTNHLIEWVNQRPGQGLE...,"[-1.5092, -0.9358, -2.0002, 0.0, -2.2747, 0.0,...",DIVMTQSQKFMSTSVGDRVSITCKASQNVRTAVAWYQQRPGQSPKA...,"[-1.5331, 0.0, 0.7708, 0.0, -0.7584, 0.0, -1.2..."


In [179]:
# Per-row sequence lengths, computed once per chain and reused for both statistics
heavy_lengths = extracted_df['Hchain_sequence'].str.len()
light_lengths = extracted_df['Lchain_sequence'].str.len()

print(f"Maximum length of the heavy chain sequence: {heavy_lengths.max()}")
print(f"Maximum length of the light chain sequence: {light_lengths.max()}")
print(f"Average length of the heavy chain sequence: {heavy_lengths.mean()}")
print(f"Average length of the light chain sequence: {light_lengths.mean()}")


Maximum length of the heavy chain sequence: 444
Maximum length of the light chain sequence: 230
Average length of the heavy chain sequence: 212.85572448029785
Average length of the light chain sequence: 207.42817251008378


In [180]:
# Save the DataFrame to a CSV file
# Note: the *_scores columns hold Python lists, which end up in the CSV as their text
# representation and have to be parsed again after reading the file back.
save_col= ['split','ID','Hchain','Lchain','Hchain_sequence','Hchain_scores', 'Lchain_sequence','Lchain_scores']
output_csv_path = "../data/csv/antibody.csv"
# to_csv does not create missing parent directories
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
sampled_df[save_col].to_csv(output_csv_path, index=False)

## Convert to graph

In [None]:
# Convert the PDB structure of each listed antibody into a graph file (<code>.pt)

data_dir  = "../data/"
pdb_dir = "../data/pdb_antibody/"
graph_dir = "../data/graph_antibody/"

# Make sure the output folder exists before anything is written into it
os.makedirs(graph_dir, exist_ok=True)

# NOTE(review): only a fixed 10% sample of the rows is converted here — confirm this
# is intended and not a leftover from a quick check.
df = pd.read_csv(data_dir+"csv/antibody.csv").sample(frac=0.10, random_state=42)

for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing rows"):
    code = row["ID"]

    Hchain = row["Hchain"]
    Lchain = row["Lchain"]

    pdb_path = os.path.join(pdb_dir, f"{code}.pdb")

    graph_file_path = os.path.join(graph_dir, f"{code}.pt")

    # Skip processing if the graph file already exists
    # NOTE(review): the file name depends on the PDB code only, so when a code has
    # several chain pairs, every pair after the first one processed is skipped here.
    if os.path.exists(graph_file_path):
        continue

    _ = process_pdb2graph_withseq(pdb_path,graph_file_path,[Hchain,Lchain])
