In [1]:
import pandas as pd
import numpy as np
from abnumber import Chain
import os

from datetime import datetime
# Today's date formatted as YYYY-MM-DD; printed so the run date is visible in the output.
date = datetime.today().strftime('%Y-%m-%d')
print(date)

from pandarallel import pandarallel
# Initialise pandarallel so that `.parallel_apply` is available on pandas objects below.
# NOTE(review): nb_workers=64 is hard-coded — confirm the host actually has that many cores.
pandarallel.initialize(progress_bar=False, nb_workers=64)

2024-06-10
INFO: Pandarallel will run on 64 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


#### Cleaning up outputs

In [2]:
# Antigen amino-acid sequence; it is stripped out of every generated `full_seq` in the
# loading cell so that only the antibody portion remains.
RBD_seq = 'RVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNF'

In [3]:
# Loading in raw generated sequences
raw_file_path = './raw_sequences/RBD/'
# os.listdir() returns entries in arbitrary order. Sort them so the positional
# 'RBD-V2_<n>' IDs assigned below are the same on every run and every machine.
raw_file_names = sorted(os.listdir(raw_file_path))
seq_df = pd.concat(
    [pd.read_csv(os.path.join(raw_file_path, name), index_col=0) for name in raw_file_names],
    ignore_index=True,  # fresh 0..N-1 index across all files
)
seq_df.columns = ['full_seq']
seq_df['Antigen'] = 'RBD'
seq_df.index = 'RBD-V2_' + seq_df.index.astype(str)

# Removing antigen sequence
seq_df['ab_seq'] = seq_df['full_seq'].apply(lambda x: x.replace(RBD_seq, ''))

# Removing sequences with no light chain. This is done BEFORE deriving HC/LC so no
# work is spent on rows that are discarded, and .copy() makes the filtered frame
# independent so the column assignments below do not write into a slice
# (avoids SettingWithCopyWarning / silent no-ops).
has_light_chain = seq_df['ab_seq'].str.contains('[LC]', regex=False)
seq_df = seq_df[has_light_chain].copy()

# Separating out heavy chains: everything before the '[LC]' tag, minus '[SEP]' markers
seq_df['HC'] = seq_df['ab_seq'].apply(lambda x: x.split('[LC]')[0].replace('[SEP]', ''))
# Light chain: text between the '[LC]' tag and the next '<'
seq_df['LC'] = seq_df['ab_seq'].apply(lambda x: x.split('[LC]')[1].split('<')[0])

print(seq_df['Antigen'].value_counts())

Antigen
RBD    997
Name: count, dtype: int64


In [6]:
def annotate_regions(sequence):
    """
    Annotate one antibody chain sequence with abnumber and return its regions.

    The sequence is parsed with `Chain` using the IMGT scheme, restricted to
    human species, with germline assignment enabled.

    :param sequence: amino-acid sequence of a single chain
    :return: list ``[cdr1, cdr2, cdr3, fr1, fr2, fr3, fr4]`` on success, or the
             sentinel string ``'not human variable'`` if annotation raised
    """
    try:
        variable = Chain(sequence, scheme='imgt', allowed_species='human', assign_germline=True)

        cdr1, cdr2, cdr3 = variable.cdr1_seq, variable.cdr2_seq, variable.cdr3_seq                  # CDRs
        fr1, fr2, fr3, fr4 = variable.fr1_seq, variable.fr2_seq, variable.fr3_seq, variable.fr4_seq # FWRs

        return [cdr1, cdr2, cdr3, fr1, fr2, fr3, fr4]

    # `except Exception` rather than a bare `except:` so that KeyboardInterrupt and
    # SystemExit still propagate and a long parallel run can be interrupted.
    except Exception:
        return 'not human variable'
    
# Annotate heavy chains first, then light chains, using pandarallel workers.
# Each result cell holds either the 7-element region list or the failure sentinel.
for chain_col, annot_col in (('HC', 'annot_H'), ('LC', 'annot_L')):
    seq_df[annot_col] = seq_df[chain_col].parallel_apply(annotate_regions)

In [11]:
# Rows where either chain could not be annotated (annotate_regions returned its sentinel).
# The mask is computed once and reused for both the count and the filter.
annotation_failed = (seq_df['annot_H'] == 'not human variable') | (seq_df['annot_L'] == 'not human variable')
print(int(annotation_failed.sum()))

# .copy() detaches the filtered frame so that later column assignments and
# in-place drops act on a real DataFrame rather than on a slice of the old one.
seq_df = seq_df[~annotation_failed].copy()

4


In [12]:
# Extract Abnumber annotations: spread each 7-element region list into one column
# per region (suffix .H for heavy, .L for light), then discard the list column.
for chain_suffix in ('H', 'L'):
    annot_col = 'annot_' + chain_suffix
    region_cols = [region + '.' + chain_suffix
                   for region in ('CDR1', 'CDR2', 'CDR3', 'FR1', 'FR2', 'FR3', 'FR4')]
    seq_df[region_cols] = pd.DataFrame(seq_df[annot_col].to_list(), index=seq_df.index)
    seq_df.drop(annot_col, axis=1, inplace=True)

FASTA output for humanness analysis (BioPhi OASis):
https://biophi.dichlab.org/humanization/humanness/

In [16]:
def write_to_fasta(df, output_file, *, columns=('H_abnum', 'L_abnum')):
    """
    Writes several sequence columns of a pandas DataFrame to one FASTA file.

    For every row, one record is written per entry in ``columns``; the header is
    ``><row index>_<column name>`` and the body is the cell value.

    NOTE: a later cell in this notebook defines another ``write_to_fasta`` with the
    signature ``(df, column_name, output_file)``; whichever ran last is the active one.

    :param df: pandas DataFrame
    :param output_file: Name of the output FASTA file
    :param columns: Names of the columns that contain the sequences (keyword-only;
                    defaults to the heavy/light columns this notebook uses)
    :raises KeyError: if any of ``columns`` is missing from ``df``
    """
    columns = list(columns)
    # Select first so a missing column raises before the output file is created/truncated.
    selected = df[columns]
    with open(output_file, 'w') as fasta_file:
        # itertuples(name=None) yields plain (index, value, value, ...) tuples
        for index, *sequences in selected.itertuples(name=None):
            for column_name, sequence in zip(columns, sequences):
                fasta_file.write(f'>{index}_{column_name}\n{sequence}\n')

# Small smoke-test input: 10 rows drawn with a fixed seed so the sample is repeatable.
# NOTE(review): write_to_fasta reads 'H_abnum' / 'L_abnum', which no visible cell
# above creates — confirm where these columns come from.
oasis_input_name = 'oasis_test.fasta'

write_to_fasta(df=seq_df.sample(random_state=42, n=10), output_file=oasis_input_name)


In [13]:
import subprocess
import shlex

# Location of the OASis 9-mer database passed to `biophi oasis --oasis-db`.
path_to_db = '../../perry/databases/OASis_9mers_v1.db'

def run_OASis_in_bg(input_name, output_name):
    """
    Run ``biophi oasis`` on a FASTA file and block until it has finished.

    The command uses the module-level ``path_to_db`` database, a 10% minimum
    subject threshold, and IMGT for both the numbering scheme and the CDR
    definition. Progress and the outcome are printed; nothing is returned.

    :param input_name: path of the input FASTA file
    :param output_name: path of the report file passed to ``--output``
    """
    # Build argv directly instead of concatenating a string and re-splitting it,
    # so paths containing spaces or quotes reach biophi unchanged.
    command_list = [
        'biophi', 'oasis', input_name,
        '--oasis-db', path_to_db,
        '--min-percent-subjects', '10',
        '--scheme', 'imgt',
        '--cdr-definition', 'imgt',
        '--output', output_name,
    ]

    process = subprocess.Popen(command_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    print("OASis hummanness is running in background...")

    # communicate() waits for exit while draining both pipes. Calling wait() first
    # can deadlock once the child fills an OS pipe buffer that nobody is reading.
    stdout, stderr = process.communicate()

    # Report the name that was actually passed in, not a notebook global.
    print("Oasis has completed. Output: " + output_name)

    if process.returncode == 0:
        print("Process succeeded.")
    else:
        # stderr arrives as bytes; decode it so the message is readable.
        print(f"Process failed with return code {process.returncode}: {stderr.decode(errors='replace')}")


In [15]:
# FASTA file handed to OASis below; replaces the 'oasis_test.fasta' name set in the earlier test cell.
oasis_input_name = 'concat_V2_gen_humanness_input_24-03-15.fasta'

In [None]:
# Score the FASTA named in `oasis_input_name` with OASis; the second argument is the report path.
run_OASis_in_bg(oasis_input_name, 'concat_V2_humanness_24-03-15.xlsx')

In [34]:
import subprocess
import shlex

oasis_output_name = 'concat_V2_humanness_24-03-15.xlsx'

# Same invocation as run_OASis_in_bg, built as an argv list and reusing the
# `path_to_db` constant from the earlier cell instead of repeating the path.
command_list = [
    'biophi', 'oasis', oasis_input_name,
    '--oasis-db', path_to_db,
    '--min-percent-subjects', '10',
    '--scheme', 'imgt',
    '--cdr-definition', 'imgt',
    '--output', oasis_output_name,
]

process = subprocess.Popen(command_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

print("OASis hummanness is running in background...")

# communicate() waits for the process while draining stdout/stderr. Calling wait()
# first with PIPE handles can deadlock once the child fills a pipe buffer.
stdout, stderr = process.communicate()

print("Oasis has completed. Output: " + oasis_output_name)

if process.returncode == 0:
    print("Process succeeded.")
else:
    # stderr arrives as bytes; decode it so the message is readable.
    print(f"Process failed with return code {process.returncode}: {stderr.decode(errors='replace')}")

OASis hummanness is running in background...
Oasis has completed. Output: concat_V2_humanness_24-03-15.xlsx
Process succeeded.


In [35]:
# Load the OASis report (first column becomes the index) and copy its three
# percentile columns onto seq_df; pandas aligns the values on index labels.
# NOTE(review): rows whose index has no match in the report end up NaN — confirm
# the report index uses the same IDs as seq_df.
oasis_report = pd.read_excel(oasis_output_name, index_col=0)

oasis_columns = {
    'OASis_score': 'OASis Percentile',
    'Heavy OASis Percentile': 'Heavy OASis Percentile',
    'Light OASis Percentile': 'Light OASis Percentile',
}
for target_col, report_col in oasis_columns.items():
    seq_df[target_col] = oasis_report[report_col]

FASTA output for running ANARCI from the terminal

In [31]:
def write_to_fasta(df, column_name, output_file):
    """
    Writes a specified column of a pandas DataFrame to a FASTA file.

    One record is written per row: the header is ``><row index>`` and the body is
    the value found in ``column_name``.

    :param df: pandas DataFrame
    :param column_name: Name of the column that contains the sequences
    :param output_file: Name of the output FASTA file
    :raises KeyError: if ``column_name`` is not a column of ``df``
    """
    # Look the column up first so a bad name raises before the file is created/truncated.
    sequences = df[column_name]
    with open(output_file, 'w') as fasta_file:
        # Iterating the single column avoids iterrows(), which builds a Series for
        # every row and coerces each row to a common dtype.
        fasta_file.writelines(
            f'>{index}\n{sequence}\n' for index, sequence in sequences.items()
        )

# Base names (no extension) for the ANARCI inputs; the same prefixes are reused
# for the ANARCI output files.
# NOTE(review): the HC name is dated 24-03-15 but the LC name 24-03-13 — confirm this is intended.
anarci_HC_name = 'concat_V2_gen_HC_24-03-15'
anarci_LC_name = 'concat_V2_gen_LC_24-03-13'

anarci_HC_fasta = f'{anarci_HC_name}.fasta'
anarci_LC_fasta = f'{anarci_LC_name}.fasta'

# One FASTA per chain, keyed by the seq_df index.
write_to_fasta(df=seq_df, column_name='H_abnum', output_file=anarci_HC_fasta)
write_to_fasta(df=seq_df, column_name='L_abnum', output_file=anarci_LC_fasta)

#### Running ANARCI to annotate sequences

In [43]:
# Number heavy and light chains with ANARCI (IMGT scheme, human species, germline assignment, CSV output).
# The cells below read the results from '<prefix>_H.csv' (heavy) and '<prefix>_KL.csv' (light).
!ANARCI -i {anarci_HC_fasta} --scheme imgt --restrict heavy --ncpu 8 --assign_germline --use_species human --csv -o {anarci_HC_name}
!ANARCI -i {anarci_LC_fasta} --scheme imgt --restrict light --ncpu 8 --assign_germline --use_species human --csv -o {anarci_LC_name}

In [39]:
# ANARCI summary columns kept for each chain.
ANARCI_COLUMNS = ['domain_no', 'hmm_species', 'chain_type', 'e-value', 'score',
                  'seqstart_index', 'seqend_index', 'identity_species',
                  'v_gene', 'v_identity', 'j_gene', 'j_identity']


def _load_anarci_table(csv_path, suffix):
    """Read one ANARCI CSV (first column as index), keep ANARCI_COLUMNS, and append `suffix` to each column name."""
    table = pd.read_csv(csv_path, index_col=0)
    return table[ANARCI_COLUMNS].add_suffix(suffix)


# Both paths are derived from the name variables, so the HC file can no longer
# drift out of sync with `anarci_HC_name` the way the hard-coded literal could.
anarci_hc = _load_anarci_table(anarci_HC_name + '_H.csv', '.H')
anarci_lc = _load_anarci_table(anarci_LC_name + '_KL.csv', '.L')

In [40]:
# Pair heavy- and light-chain ANARCI results on the shared sequence ID (inner join).
anarci_df = pd.merge(anarci_hc, anarci_lc, left_index=True, right_index=True)

# Make the merge idempotent: if ANARCI columns are already present on seq_df from a
# previous run, drop them first. Otherwise pandas keeps both copies and renames them
# with '_x' / '_y' suffixes. On a first run nothing is dropped.
# Inner join: rows of seq_df without an ANARCI hit for both chains are removed.
stale_anarci_cols = seq_df.columns.intersection(anarci_df.columns)
seq_df = pd.merge(seq_df.drop(columns=stale_anarci_cols), anarci_df, left_index=True, right_index=True)

#### Further annotation with Abnumber

In [47]:
def annotate_regions(sequence):
    """
    Annotate one chain sequence with abnumber and return its regions.

    Unlike the earlier definition in this notebook, this version does not catch
    errors: any exception raised while building the Chain propagates to the caller.

    :param sequence: amino-acid sequence of a single chain
    :return: list ``[cdr1, cdr2, cdr3, fr1, fr2, fr3, fr4]``
    """
    chain = Chain(sequence, scheme='imgt', allowed_species='human', assign_germline=True)

    # Attribute names in the order the caller expects: the three CDRs, then the four frameworks.
    region_attrs = ('cdr1_seq', 'cdr2_seq', 'cdr3_seq',
                    'fr1_seq', 'fr2_seq', 'fr3_seq', 'fr4_seq')
    return [getattr(chain, attr) for attr in region_attrs]

In [48]:
# Re-annotate both chains row by row and expand each region list into per-region columns.
# NOTE(review): the 'VH' / 'VL' columns are not created by any visible cell — confirm their origin.
region_names = ['CDR1', 'CDR2', 'CDR3', 'FR1', 'FR2', 'FR3', 'FR4']
for source_col, chain_suffix, scratch_col in (('VH', 'H', 'abnum_hc'), ('VL', 'L', 'abnum_lc')):
    seq_df[scratch_col] = seq_df.apply(lambda row: annotate_regions(row[source_col]), axis=1)
    seq_df[[name + '.' + chain_suffix for name in region_names]] = pd.DataFrame(
        seq_df[scratch_col].to_list(), index=seq_df.index
    )
    # The list column is only scratch space; remove it once expanded.
    seq_df.drop(scratch_col, axis=1, inplace=True)

In [54]:
# Reload the ANARCI outputs, keep the summary columns, tag them per chain, and pair them up.
anarci_columns = ['domain_no', 'hmm_species', 'chain_type', 'e-value', 'score',
                  'seqstart_index', 'seqend_index', 'identity_species',
                  'v_gene', 'v_identity', 'j_gene', 'j_identity']

# Heavy chain results -> columns suffixed '.H'
anarci_hc = pd.read_csv(f'{anarci_HC_name}_H.csv', index_col=0)[anarci_columns].add_suffix('.H')

# Light chain (kappa/lambda) results -> columns suffixed '.L'
anarci_lc = pd.read_csv(f'{anarci_LC_name}_KL.csv', index_col=0)[anarci_columns].add_suffix('.L')

# Inner join on the shared sequence ID index.
anarci_df = anarci_hc.merge(anarci_lc, left_index=True, right_index=True)

In [None]:
# An earlier cell already merges ANARCI columns into seq_df; merging again would keep
# both copies under '_x' / '_y' names. Drop any ANARCI columns already present so
# this cell gives the same result whether it runs once or several times.
# Inner join: rows of seq_df with no entry in anarci_df are removed.
stale_anarci_cols = seq_df.columns.intersection(anarci_df.columns)
seq_df = pd.merge(seq_df.drop(columns=stale_anarci_cols), anarci_df, left_index=True, right_index=True)

In [None]:
# Persist the fully annotated table (index = sequence IDs) to CSV in the working directory.
seq_df.to_csv('RBD_1K_sequences_24-03-16.csv')