## 1. Setup and Imports {#setup}

Let's start by importing all necessary modules and setting up the environment.

In [1]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
import sys
import os
warnings.filterwarnings('ignore')

from AbXtract import *
from AbXtract import AntibodyDescriptorCalculator, Config, load_config
from AbXtract.sequence import (
    SequenceLiabilityAnalyzer,
    BashourDescriptorCalculator,
    PeptideDescriptorCalculator,
    AntibodyNumbering
)
from AbXtract.structure import (
    SASACalculator,
    ChargeAnalyzer,
    DSSPAnalyzer,
    PropkaAnalyzer,
    ArpeggioAnalyzer
)
from AbXtract.utils import (
    read_fasta,
    write_fasta,
    parse_sequence,
    validate_sequence
)
    

ModuleNotFoundError: No module named 'AbXtract'

# Load config

In [None]:
# Start from the library's default configuration.
custom_config = Config()

# Ask the configuration which external tools it can find and report each one.
tool_status = custom_config.check_external_tools()
print("🛠️ External Tool Status:")
for tool, available in tool_status.items():
    status = "✅" if available else "❌"
    print(f"  {tool}: {status}")

# Optional: replace the default with a custom configuration.
# This template reads `tool_status`, so it has to stay BELOW the probe above;
# it used to sit above it inside a bare string literal and would have raised
# NameError if enabled in place.
# custom_config = Config.from_dict({
#     'pH': 7.4,
#     'numbering_scheme': 'kabat',
#     'verbose': True,
#     'calculate_dssp': tool_status.get('dssp', False),
#     'calculate_propka': tool_status.get('propka', False),
#     'calculate_arpeggio': tool_status.get('arpeggio', False)
# })


# Load classes

In [None]:
# Numbering helper, constructed with the 'imgt' scheme.
numbering = AntibodyNumbering(scheme='imgt')
# Peptide descriptor calculator built with its default constructor arguments.
peptide_calc = PeptideDescriptorCalculator()
# Main descriptor calculator; it receives the `custom_config` object created earlier.
calc = AntibodyDescriptorCalculator(config=custom_config)

# Define path 

In [None]:
# Location of a local AbXtract checkout. Set the ABXTRACT_PATH environment
# variable to override it instead of editing this machine-specific default.
# NOTE(review): extending sys.path only helps if it happens before
# `AbXtract` is imported, yet the import cell sits above this one in the
# notebook — consider moving these lines to the very first cell.
abxtract_path = os.environ.get("ABXTRACT_PATH", "/home/HX46_FR5/repo_perso/AbXtract")
if abxtract_path not in sys.path:  # re-running the cell must not stack duplicates
    sys.path.insert(0, abxtract_path)

# Test data lives under <current working directory>/data/test.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data" / "test"
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Output directory, nested inside the test-data directory.
RESULTS_DIR = DATA_DIR / "results"
RESULTS_DIR.mkdir(parents=True, exist_ok=True)


# Input sequences and PDB file

In [None]:
# Example antibody chains used throughout the notebook (modelled on
# therapeutic antibodies). Each literal is written as two adjacent string
# pieces purely for readability; Python joins them into the same single string.
HEAVY_SEQUENCE = (
    "QVQLVQSGAEVKKPGASVKVSCKASGGTFSSYAISWVRQAPGQGLEWMGGIIPIFGTANYAQKFQGRVTITADESTSTAYMELSSLRSEDTAVYYCARSHYGLDYWGQGTLVTVSS"
    "ASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSCDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYASTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSPGK"
)
# Light chain: a VL domain followed by a human kappa constant region.
LIGHT_SEQUENCE = (
    "DIQMTQSPSSLSASVGDRVTITCRASHSISSYLAWYQQKPGKAPKLLIYAASSLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPLTFGGGTKVEIK"
    "RTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC"
)
# Expected location of the structure file inside DATA_DIR; it is not created
# by this notebook.
PDB_FILE = DATA_DIR / "test.pdb"  # the user has to supply this file

# Sequence validity for numbering

In [None]:
# Validate both chains before numbering; each call is unpacked into a
# (flag, message) pair.
heavy_valid, heavy_msg = validate_sequence(HEAVY_SEQUENCE)
light_valid, light_msg = validate_sequence(LIGHT_SEQUENCE)

# Report failures right away. The messages were previously computed but never
# shown, so an invalid chain went on to numbering unnoticed. Printed rather
# than raised so the notebook keeps its best-effort behaviour.
for chain_label, is_valid, message in (
    ("Heavy", heavy_valid, heavy_msg),
    ("Light", light_valid, light_msg),
):
    if not is_valid:
        print(f"{chain_label} chain failed validation: {message}")


# A. Numbering

In [None]:
# Number each chain with the `numbering` helper.
# NOTE(review): the full-length chain constants are passed here, not just the
# variable domains — confirm number_sequence copes with the constant region.
heavy_numbered = numbering.number_sequence(HEAVY_SEQUENCE, 'H')  # heavy chain, chain code 'H'
light_numbered = numbering.number_sequence(LIGHT_SEQUENCE, 'L')  # light chain, chain code 'L'

# Each call is unpacked into two objects: an annotated residue list and the CDRs.
annotated_H, cdrs_H = numbering.get_cdr_sequences(heavy_numbered, 'H')
annotated_L, cdrs_L = numbering.get_cdr_sequences(light_numbered, 'L')

# Profiles are computed from the raw sequences, not from the numbered output.
heavy_profiles = numbering.get_peptide_profiles(HEAVY_SEQUENCE)
light_profiles = numbering.get_peptide_profiles(LIGHT_SEQUENCE)

# B. Peptide descriptors

In [None]:
# Compute the peptide descriptors for both chains in a single call.
peptide_results = peptide_calc.calculate_all(
    heavy_sequence=HEAVY_SEQUENCE,
    light_sequence=LIGHT_SEQUENCE
)


# C. Sequence descriptors

In [None]:
# Sequence-level analysis through the main calculator. The call is unpacked
# into two objects: the descriptor results and the liabilities.
sequence_results, liabilities = calc.calculate_sequence_descriptors(
    heavy_sequence=HEAVY_SEQUENCE,
    light_sequence=LIGHT_SEQUENCE,
    sequence_id="TestAb_Sequence"
)

# D. Structure descriptors

In [None]:


# Run the structure analysis only when the PDB file is actually present.
if Path(PDB_FILE).is_file():
    structure_results_seq, structure_results_comp, df_residues = calc.calculate_structure_descriptors(
        pdb_file=PDB_FILE,
        structure_id="TestAb_Structure"
    )
else:
    # Without a structure file, fall back to empty frames so the cells that
    # display these names still run.
    # NOTE(review): assumes all three results are DataFrame-like — confirm
    # against calculate_structure_descriptors.
    print(f"PDB file not found, skipping structure analysis: {PDB_FILE}")
    structure_results_seq = pd.DataFrame()
    structure_results_comp = pd.DataFrame()
    df_residues = pd.DataFrame()




# Organise outputs

In [None]:
# Show the validation flags of the heavy and light chain side by side.
heavy_valid, light_valid

In [None]:
# Show the numbering output for both chains.
heavy_numbered, light_numbered

In [None]:
# Show the CDR objects returned by get_cdr_sequences for both chains.
cdrs_H, cdrs_L


In [None]:
# Show the annotated residue lists for both chains.
annotated_H, annotated_L

In [None]:
# Show the peptide profiles computed for both chains.
heavy_profiles, light_profiles


In [None]:
# Show the liabilities object returned by calculate_sequence_descriptors.
liabilities

In [None]:
# Show the peptide descriptor results.
peptide_results

In [None]:
# Show the sequence descriptor results.
sequence_results

In [None]:
# Show the first of the three structure-analysis results.
structure_results_seq

# Format standard

### 1. Residue annotation

In [None]:
# Build the per-residue table for one chain (used for heavy and light alike).
def create_comprehensive_df(annotations, hydrophobicity, chain_type='Heavy'):
    """Combine residue annotations with per-residue profile values.

    Parameters
    ----------
    annotations : iterable of (position_tuple, amino_acid, region)
        One entry per residue. Only ``position_tuple[0]`` is read; it is
        stored as ``position`` and, minus one, used as the index into every
        profile.
    hydrophobicity : mapping of str -> indexable sequence
        Must provide ``charge_sign``, ``hydrophobicity_hw``,
        ``hydrophobicity_eisenberg``, ``hydrophobicity_rose``,
        ``hydrophobicity_janin`` and ``hydrophobicity_engelman``.
    chain_type : str, optional
        Kept for call-site compatibility; not used by the function.

    Returns
    -------
    pandas.DataFrame
        Columns ``position``, ``amino_acid``, ``region`` followed by the six
        profile columns, one row per annotation. A position that falls outside
        a profile yields NaN in that column. Empty input yields an empty frame
        that still carries all columns.

    Raises
    ------
    KeyError
        If a required profile is missing and there is at least one annotation.
    """
    profile_columns = [
        'charge_sign',
        'hydrophobicity_hw',
        'hydrophobicity_eisenberg',
        'hydrophobicity_rose',
        'hydrophobicity_janin',
        'hydrophobicity_engelman',
    ]

    def _value_at(profile, idx):
        # Reject negative indices explicitly: position 0 would otherwise give
        # idx == -1 and silently pick up the LAST residue's value.
        return profile[idx] if 0 <= idx < len(profile) else np.nan

    rows = []
    for position_tuple, amino_acid, region in annotations:
        position_num = position_tuple[0]
        # NOTE(review): this treats the numbering position as a 1-based index
        # into the profile. If the numbering scheme has gaps or insertion
        # codes, position and sequence index diverge — confirm the alignment.
        idx = position_num - 1

        row = {
            'position': position_num,
            'amino_acid': amino_acid,
            'region': region,
        }
        for column in profile_columns:
            row[column] = _value_at(hydrophobicity[column], idx)
        rows.append(row)

    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=['position', 'amino_acid', 'region', *profile_columns])

import pandas as pd
import numpy as np

def add_liability_columns(df, chain_type, liabilities_list):
    """Return a copy of ``df`` with one boolean column per known liability type.

    Every liability type in the fixed list below gets a column, whether or not
    it occurs in this chain. A row is True when its ``position`` lies inside
    the start/end range of a liability of that type reported for this chain.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``position``, ``amino_acid``, ``region``, ``charge_sign`` and the
        five ``hydrophobicity_*`` columns. It is not modified.
    chain_type : str
        ``'Heavy'`` selects liabilities whose ``chain`` is ``'H'``; any other
        value selects ``'L'``.
    liabilities_list : iterable of dict
        Each entry is read for ``chain``, ``name``, ``start_position`` and
        ``end_position``; only element ``[0]`` of the two positions is used.

    Returns
    -------
    pandas.DataFrame
        Columns in this order: position/amino_acid/region, then the charge and
        hydrophobicity columns, then the liability columns.
    """
    chain_letter = 'H' if chain_type == 'Heavy' else 'L'

    # Fixed output schema: every liability type that may be reported.
    all_liability_types = [
        'Unpaired_Cys',
        'N-linked_glycosylation',
        'Met_oxidation',
        'Trp_oxidation',
        'Asn_deamidation',
        'Asp_isomerisation',
        'Lysine_Glycation',
        'N-terminal_glutamate',
        'Integrin_binding',
        'CD11c/CD18_binding',
        'Fragmentation',
        'Polyreactivity'
    ]
    # Look names up with '/' ignored on both sides. Stripping '/' only from
    # the incoming name produced 'CD11cCD18_binding', which matched no schema
    # column and was dropped by the final column selection.
    canonical_by_key = {name.replace('/', ''): name for name in all_liability_types}

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    for col_name in all_liability_types:
        df[col_name] = False

    for liability in liabilities_list:
        if liability['chain'] != chain_letter:
            continue

        start_pos = liability['start_position'][0]
        end_pos = liability['end_position'][0]

        # "Name (detail)" -> "Name", with spaces turned into underscores.
        raw_name = liability['name'].split('(')[0].strip().replace(' ', '_')
        col_name = canonical_by_key.get(raw_name.replace('/', ''))
        if col_name is None:
            # Outside the fixed schema: say so instead of losing it silently.
            print(f"Skipping unrecognised liability type '{raw_name}' in {chain_type} chain")
            continue

        mask = (df['position'] >= start_pos) & (df['position'] <= end_pos)
        if mask.any():
            df.loc[mask, col_name] = True
            print(f"Marked {mask.sum()} positions for {col_name} in {chain_type} chain (positions {start_pos}-{end_pos})")

    # Final order: base columns, then charge/hydrophobicity, then liabilities.
    base_cols = ['position', 'amino_acid', 'region']
    hydro_cols = ['charge_sign', 'hydrophobicity_hw', 'hydrophobicity_eisenberg',
                  'hydrophobicity_rose', 'hydrophobicity_janin', 'hydrophobicity_engelman']
    return df[base_cols + hydro_cols + all_liability_types]

# `liabilities` is treated as a DataFrame with a 'liabilities' column; the
# list of liability records is taken from its first row.
liabilities_list = liabilities['liabilities'].iloc[0]

# Per chain: build the residue table, then attach the liability flags.
df_heavy = add_liability_columns(
    create_comprehensive_df(annotated_H, heavy_profiles, 'Heavy'),
    'Heavy',
    liabilities_list,
)
df_light = add_liability_columns(
    create_comprehensive_df(annotated_L, light_profiles, 'Light'),
    'Light',
    liabilities_list,
)

# Display the light-chain table as the cell output.
df_light

In [None]:
# Show the liability records extracted from the first row of `liabilities`.
liabilities_list

In [None]:
# NOTE(review): this repeats the display in the previous cell; one of the two can go.
liabilities_list

In [None]:
# Show the heavy-chain residue table, including the liability columns.
df_heavy

In [None]:
# Positional access to the first row, matching how `liabilities_list` is
# extracted elsewhere in the notebook. The label lookup `[0]` would raise
# KeyError if the frame's index did not contain the label 0.
liabilities["liabilities"].iloc[0]

In [None]:
# Display the structure results through the object's `.style` attribute.
structure_results_seq.style

In [None]:
# NOTE(review): bare literal — looks like a leftover scratch cell; delete it if nothing relies on its output.
1

### 2. Chain annotation

In [None]:
# A cell renders only its LAST bare expression, so the earlier values were
# evaluated and discarded. Show them explicitly with display().
display(peptide_results)
display((heavy_valid, light_valid))
cdrs_H, cdrs_L

### 3. Antibody annotation

In [None]:
# A cell renders only its LAST bare expression, so the earlier values were
# evaluated and discarded. Show them explicitly with display().
display(sequence_results)
display(peptide_results)
display((heavy_valid, light_valid))
cdrs_H, cdrs_L