In [2]:
import pandas as pd
from pathlib import Path
# Load the processed APD peptide table; the path is relative to the notebook's working directory.
df = pd.read_csv(Path("../processed/apd6_cVAE.csv"))

In [3]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,APD ID,Name/Class,Source,Sequence,Length,Net charge,Hydrophobic residue%,Boman Index,3D Structure,Method,...,Crucial residues,Additional info,Title,Author,Reference,Activity_seq,addinfo_sequence_analysis_seq,addinfo_in_vitro_toxicity_seq,addinfo_structure_seq,addinfo_chemical_modification_seq
0,AP00001,"Dermaseptin-B2 (XXA, DRS-B2, Dermaseptin B2, D...","skin, the arboreal frog,\nGiant leaf frog\n,\n...",GLWSKIKEVGKEAAKAAAKAAGKAALGAVSEAV,33,4,54%,0.23,Helix,NMR,...,N-terminal segment,"History and discovery\n: A frog used for ""hunt...",Molecular cloning of a cDNA encoding the precu...,"Amiche M, Ducancel F, Lajeunesse E, Boulain JC...",Biochem Biophys Res Commun. 1993 Mar 31;191(3)...,Active against M. canis IP 1194 (MIC 10 ug/ml)...,Alanine rich (33.3%).,,A helix-hinge-helix structural motif (helix 1:...,
1,AP00002,Abaecin (natural AMPs; Pro-rich; PrAMPs; insec...,"honeybee,\nApis mellifera\nL.",YVPLPNVPQPGRRPFPTFPGQGPFNPKIKWPQGY,34,4,23%,1.19,Rich,,...,,Sequence analysis\n: Rich in P (29.4%).\nDisco...,"Isolation and characterization of abaecin, a m...","Casteels P, Ampe C, Riviere L, Van Damme J, El...",Eur J Biochem. 1990 Jan 26;187(2):381-6.\nPubM...,Active against A. tumefaciens Gembloux A (MIC ...,Rich in P (29.4%).,,,
2,AP00003,"Hs-AFP1 (HsAFP1, H. sanguinea antifungal prote...",Heuchera sanguinea,DGVKLCDVPSGTWSGHCGSSSKCSQQCKDREHFAYGGACHYQFPSV...,54,6,33%,1.95,Bridge,,...,,Activity\n: In medium A supplemented with 1 mM...,Isolation and characterisation of plant defens...,"Osborn RW, De Samblanx GW, Thevissen K, Goderi...",FEBS Lett. 1995 Jul 17;368(2):257-62.\nPubMed\n.,In medium A supplemented with 1 mM CaCl2 and 5...,,,,
3,AP00004,"Ct-AMP1 (CtAMP1, C. ternatea-antimicrobial pep...","Asian pigeonwings, bluebellvine, blue pea, but...",NLCERASLTWTGNCGNTGHCDTQCRNWESAKHGACHKRGNWKCFCYFDC,49,5,36%,2.43,Bridge,,...,,Activity\n: In medium A supplemented with 1 mM...,Isolation and characterisation of plant defens...,"Osborn RW, De Samblanx GW, Thevissen K, Goderi...",FEBS Lett. 1995 Jul 17;368(2):257-62.\nPubMed\n.,In medium A supplemented with 1 mM CaCl2 and 5...,,,,
4,AP00005,"Andropin (natural AMPs; insects, arthropods, i...","Fruit fly,\nDrosophila melanogaster",VFIDILDKVENAIHNAAQVGIGFAKPFEKLINPK,34,1,50%,0.55,Unknown,,...,,Activity\n: Active against B. megatherium Bml ...,"The andropin gene and its product, a male-spec...","Samakovlis, C., Kylsten, P., Kimbrell, DA., En...",EMBO J. 1991; 10:163-169.\nPubMed\n.,Active against B. megatherium Bml 1 (MIC 11 uM...,,,,


In [4]:
# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6342 entries, 0 to 6341
Data columns (total 21 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   APD ID                             6342 non-null   object 
 1   Name/Class                         6342 non-null   object 
 2   Source                             6338 non-null   object 
 3   Sequence                           6342 non-null   object 
 4   Length                             6342 non-null   int64  
 5   Net charge                         6342 non-null   int64  
 6   Hydrophobic residue%               6342 non-null   object 
 7   Boman Index                        6342 non-null   float64
 8   3D Structure                       6342 non-null   object 
 9   Method                             1276 non-null   object 
 10  Activity                           6127 non-null   object 
 11  Crucial residues                   223 non-null    objec

In [5]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,Length,Net charge,Boman Index
count,6342.0,6342.0,6342.0
mean,27.895301,4.900978,1.387179
std,20.73655,3.468511,1.873395
min,2.0,-12.0,-3.82
25%,15.0,3.0,0.12
50%,23.0,5.0,1.29
75%,33.0,7.0,2.44
max,199.0,64.0,14.92


Validate Sequences: Check for 20 Standard Amino Acids

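Filter and clean the peptide sequences so that they contain only the 20 standard amino acids (ACDEFGHIKLMNPQRSTVWY). Sequences containing any other character are removed, and the remaining sequences are normalized (surrounding whitespace stripped, converted to upper case).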

In [6]:
import numpy as np

# One-letter codes of the 20 standard amino acids; any other character counts as non-standard.
STD_AA = set("ACDEFGHIKLMNPQRSTVWY")

# Row count before validation.
print(len(df))

# Check each seq 
def check_sequence(seq, alphabet=None):
    """Check whether a peptide sequence uses only allowed residue letters.

    The value is converted with ``str()``, stripped of surrounding whitespace
    and upper-cased before it is checked, so ``' glw '`` is judged the same as
    ``'GLW'``.

    Parameters
    ----------
    seq : object
        Raw sequence value (typically a string; may be missing).
    alphabet : iterable of str, optional
        Allowed one-letter codes. When omitted, the module-level ``STD_AA``
        set is used, which keeps existing ``check_sequence(seq)`` calls
        unchanged.

    Returns
    -------
    tuple
        ``(is_valid, invalid_chars, status)``:

        * ``is_valid`` -- ``True`` when every character is in the alphabet.
        * ``invalid_chars`` -- set of characters outside the alphabet
          (empty for valid, missing or empty input).
        * ``status`` -- ``'OK'``, ``'NaN'``, ``'empty'`` or
          ``' invalid: {...}'`` listing the offending characters.
    """
    # Missing values cannot be validated; report them explicitly.
    if pd.isna(seq):
        return False, set(), 'NaN'

    allowed = STD_AA if alphabet is None else set(alphabet)

    seq = str(seq).strip().upper()
    if not seq:
        return False, set(), 'empty'

    # Only the distinct characters matter, not their positions or counts.
    invalid_chars = set(seq) - allowed
    is_valid = not invalid_chars

    status = 'OK' if is_valid else f' invalid: {invalid_chars}'

    return is_valid, invalid_chars, status

# Validate every sequence; each result is an (is_valid, invalid_chars, status) tuple
# that is unpacked into three helper columns.
check_cols = ['seq_valid', 'invalid_chars', 'seq_status']
results = df['Sequence'].apply(check_sequence)
df[check_cols] = pd.DataFrame(results.tolist(), index=df.index, columns=check_cols)

# Statistics
print("\n Results:")
print(f" Valid Seq: {df['seq_valid'].sum()}")
print(f" Invalid: {df['seq_valid'].eq(False).sum()} ({100*(1-df['seq_valid'].mean()):.1f}%)")

# Show typical problems
invalid_df = df[~df['seq_valid']].copy()
if len(invalid_df) > 0:
    print("\n Top problems (first 10):")
    display_cols = ['APD ID', 'Sequence', 'seq_status', 'invalid_chars']
    print(invalid_df[display_cols].head(10).to_string(index=False))

    # Union of all offending characters across the invalid rows
    all_invalid = set()
    for chars in invalid_df['invalid_chars']:
        all_invalid.update(chars)

    print(f"\n  Found non-standard characters: {sorted(all_invalid)}")

# Filtering - keep only the valid rows
df = df[df['seq_valid']].copy()

# Normalization (strip surrounding whitespace, upper case) - the same
# transformation check_sequence applied before validating.
df['Sequence'] = df['Sequence'].astype(str).str.strip().str.upper()

# Final check: fail loudly instead of announcing success (and saving) if any
# normalized sequence still contains a character outside STD_AA.
final_check = df['Sequence'].apply(lambda x: len(set(str(x)) - STD_AA) == 0).all()
assert final_check, "Some sequences still contain non-standard characters after filtering"
print(f"All {df.shape[0]} sequences are clean!")

# Remove the helper columns and save.
# NOTE: this is the same path the notebook loads its data from, so the input
# file is overwritten with the validated table.
df = df.drop(columns=check_cols, errors='ignore')
output_path = '../processed/apd6_cVAE.csv'
df.to_csv(output_path, index=False)



6342

 Results:
 Valid Seq: 6342
 Invalid: 0 (0.0%)
All 6342 sequences are clean!


In [7]:

# Filter by Sequence Length (5-60 amino acids)
MIN_LENGTH = 5
MAX_LENGTH = 60

# Compute sequence lengths as a standalone Series instead of a temporary
# column, so `df` itself is not modified. The Series is named 'length' so the
# describe() output keeps its label.
# NOTE(review): the table also carries its own 'Length' column, which is left
# untouched here - confirm it agrees with len(Sequence).
seq_lengths = df['Sequence'].apply(len).rename('length')
total_before = df.shape[0]

# Statistics before filtering
print("Before length filtering:")
print(f"  Total sequences: {total_before}")
print(f"  Length range: {seq_lengths.min()}-{seq_lengths.max()}")
print("  Length distribution:")
print(seq_lengths.describe())

# Apply length filter (both bounds inclusive)
in_range = seq_lengths.between(MIN_LENGTH, MAX_LENGTH)
df_filtered = df[in_range].copy()

# Statistics after filtering; guard the percentage against an empty input frame
removed_count = total_before - df_filtered.shape[0]
removed_pct = 100 * removed_count / total_before if total_before else 0.0
kept_lengths = seq_lengths[in_range]
print(f"\nAfter length filtering ({MIN_LENGTH}-{MAX_LENGTH}):")
print(f"  Remaining sequences: {df_filtered.shape[0]}")
print(f"  Removed sequences: {removed_count} ({removed_pct:.1f}%)")
print(f"  New length range: {kept_lengths.min()}-{kept_lengths.max()}")

# Save the filtered table
output_path = '../processed/apd6_cVAE.csv'
df_filtered.to_csv(output_path, index=False)
print(f"\nFiltered data saved to {output_path}")

# Update df to filtered version
df = df_filtered


Before length filtering:
  Total sequences: 6342
  Length range: 2-199
  Length distribution:
count    6342.000000
mean       27.884894
std        20.732943
min         2.000000
25%        15.000000
50%        23.000000
75%        33.000000
max       199.000000
Name: length, dtype: float64

After length filtering (5-60):
  Remaining sequences: 5984
  Removed sequences: 358 (5.6%)
  New length range: 5-60

Filtered data saved to ../processed/apd6_cVAE.csv


In [9]:
# Preview the first five rows of the current frame.
df.head(5)

Unnamed: 0,APD ID,Name/Class,Source,Sequence,Length,Net charge,Hydrophobic residue%,Boman Index,3D Structure,Method,...,Crucial residues,Additional info,Title,Author,Reference,Activity_seq,addinfo_sequence_analysis_seq,addinfo_in_vitro_toxicity_seq,addinfo_structure_seq,addinfo_chemical_modification_seq
0,AP00001,"Dermaseptin-B2 (XXA, DRS-B2, Dermaseptin B2, D...","skin, the arboreal frog,\nGiant leaf frog\n,\n...",GLWSKIKEVGKEAAKAAAKAAGKAALGAVSEAV,33,4,54%,0.23,Helix,NMR,...,N-terminal segment,"History and discovery\n: A frog used for ""hunt...",Molecular cloning of a cDNA encoding the precu...,"Amiche M, Ducancel F, Lajeunesse E, Boulain JC...",Biochem Biophys Res Commun. 1993 Mar 31;191(3)...,Active against M. canis IP 1194 (MIC 10 ug/ml)...,Alanine rich (33.3%).,,A helix-hinge-helix structural motif (helix 1:...,
1,AP00002,Abaecin (natural AMPs; Pro-rich; PrAMPs; insec...,"honeybee,\nApis mellifera\nL.",YVPLPNVPQPGRRPFPTFPGQGPFNPKIKWPQGY,34,4,23%,1.19,Rich,,...,,Sequence analysis\n: Rich in P (29.4%).\nDisco...,"Isolation and characterization of abaecin, a m...","Casteels P, Ampe C, Riviere L, Van Damme J, El...",Eur J Biochem. 1990 Jan 26;187(2):381-6.\nPubM...,Active against A. tumefaciens Gembloux A (MIC ...,Rich in P (29.4%).,,,
2,AP00003,"Hs-AFP1 (HsAFP1, H. sanguinea antifungal prote...",Heuchera sanguinea,DGVKLCDVPSGTWSGHCGSSSKCSQQCKDREHFAYGGACHYQFPSV...,54,6,33%,1.95,Bridge,,...,,Activity\n: In medium A supplemented with 1 mM...,Isolation and characterisation of plant defens...,"Osborn RW, De Samblanx GW, Thevissen K, Goderi...",FEBS Lett. 1995 Jul 17;368(2):257-62.\nPubMed\n.,In medium A supplemented with 1 mM CaCl2 and 5...,,,,
3,AP00004,"Ct-AMP1 (CtAMP1, C. ternatea-antimicrobial pep...","Asian pigeonwings, bluebellvine, blue pea, but...",NLCERASLTWTGNCGNTGHCDTQCRNWESAKHGACHKRGNWKCFCYFDC,49,5,36%,2.43,Bridge,,...,,Activity\n: In medium A supplemented with 1 mM...,Isolation and characterisation of plant defens...,"Osborn RW, De Samblanx GW, Thevissen K, Goderi...",FEBS Lett. 1995 Jul 17;368(2):257-62.\nPubMed\n.,In medium A supplemented with 1 mM CaCl2 and 5...,,,,
4,AP00005,"Andropin (natural AMPs; insects, arthropods, i...","Fruit fly,\nDrosophila melanogaster",VFIDILDKVENAIHNAAQVGIGFAKPFEKLINPK,34,1,50%,0.55,Unknown,,...,,Activity\n: Active against B. megatherium Bml ...,"The andropin gene and its product, a male-spec...","Samakovlis, C., Kylsten, P., Kimbrell, DA., En...",EMBO J. 1991; 10:163-169.\nPubMed\n.,Active against B. megatherium Bml 1 (MIC 11 uM...,,,,
