## Baca FL PAseq735 ##

In [1]:
from Bio import SeqIO

# Function to read the PAD4 sequence from a FASTA file
def load_fasta_sequence(file_path):
    with open(file_path, 'r') as file:
        for record in SeqIO.parse(file, "fasta"):
            return str(record.seq)  # Returns the sequence as a string

# Example: Load the PAD4 sequence from a FASTA file
fasta_file = "PAseq735.fasta"  # Replace with the path to your FASTA file
full_length_PAD4 = load_fasta_sequence(fasta_file)

# Print the loaded sequence
print("Full-length PAD4 sequence loaded from FASTA:")
print(full_length_PAD4)

Full-length PAD4 sequence loaded from FASTA:
EVKQENRLLNESESSSQGLLGYYFSDLNFQAPMVVTSSTTGDLSIPSSELENIPSENQYFQSAIWSGFIKVKKSDEYTFATSADNHVTMWVDDQEVINKASNSNKIRLEKGRLYQIKIQYQRENPTEKGLDFKLYWTDSQNKKEVISSDNLQLPELKQKSSNSRKKRSTSAGPTVPDRDNDGIPDSLEVEGYTVDVKNKRTFLSPWISNIHEKKGLTKYKSSPEKWSTASDPYSDFEKVTGRIDKNVSPEARHPLVAAYPIVHVDMENIILSKNEDQSTQNTDSETRTISKNTSTSRTHTSEVHGNAEVHASFFDIGGSVSAGFSNSNSSTVAIDHSLSLAGERTWAETMGLNTADTARLNANIRYVNTGTAPIYNVLPTTSLVLGKNQTLATIKAKENQLSQILAPNNYYPSKNLAPIALNAQDDFSSTPITMNYNQFLELEKTKQLRLDTDQVYGNIATYNFENGRVRVDTGSNWSEVLPQIQETTARIIFNGKDLNLVERRIAAVNPSDPLETTKPDMTLKEALKIAFGFNEPNGNLQYQGKDITEFDFNFDQQTSQNIKNQLAELNATNIYTVLDKIKLNAKMNILIRDKRFHYDRNNIAVGADESVVKEAHREVINSSTEGLLLNIDKDIRKILSGYIVEIEDTEGLKEVINDRYDMLNISSLRQDGKTFIDFKKYNDKLPLYISNPNYKVNVYAVTKENTIINPSENGDTSTNGIKKILIFSKKGYEIG


## Data preparation ##

In [23]:
import pandas as pd

def read_specific_columns(csv_file, columns):
    # Membaca kolom tertentu dari file CSV
    df = pd.read_csv(csv_file, usecols=columns)
    return df



### PAD4C8 ###

In [24]:
# Contoh penggunaan
csv_file = "Dataset/csv/PAD4C8BoLA.csv"
columns_to_read = ['Peptide', 'EL-score', 'BA-score']  # Ganti dengan nama kolom yang ingin dibaca
df = read_specific_columns(csv_file, columns_to_read)

# Menampilkan DataFrame dengan kolom yang dipilih
print(df)

      Peptide  EL-score  BA-score
0    FHYDRNNI    0.0027    0.0558
1    HYDRNNIA    0.0004    0.0289
2    YDRNNIAV    0.0004    0.0316
3    DRNNIAVG    0.0000    0.0143
4    RNNIAVGA    0.0002    0.0334
..        ...       ...       ...
128  ILIFSKKG    0.0002    0.0392
129  LIFSKKGY    0.0952    0.1801
130  IFSKKGYE    0.0002    0.0258
131  FSKKGYEI    0.0051    0.0520
132  SKKGYEIG    0.0001    0.0169

[133 rows x 3 columns]


### PAD4C9 ###

In [25]:
csv_file2 = "Dataset/csv/PAD4C9Bola.csv"
columns_to_read = ['Peptide', 'EL-score', 'BA-score']
df2 = read_specific_columns(csv_file2, columns_to_read)

# Menampilkan DataFrame dengan kolom yang dipilih
print(df2.head)

<bound method NDFrame.head of        Peptide  EL-score  BA-score
0    FHYDRNNIA    0.0138    0.1028
1    HYDRNNIAV    0.0017    0.0468
2    YDRNNIAVG    0.0000    0.0268
3    DRNNIAVGA    0.0009    0.0301
4    RNNIAVGAD    0.0000    0.0282
..         ...       ...       ...
127  KILIFSKKG    0.0004    0.0485
128  ILIFSKKGY    0.1836    0.2640
129  LIFSKKGYE    0.0032    0.0912
130  IFSKKGYEI    0.0035    0.0567
131  FSKKGYEIG    0.0001    0.0311

[132 rows x 3 columns]>


### Data cleaning ###

In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Step 3: Load 8mer and 9mer datasets
pad4_8mer_df = pd.read_csv('Dataset/csv/PAD4C8BoLA.csv')
pad4_9mer_df = pd.read_csv('Dataset/csv/PAD4C9BoLA.csv')

# Check dataset structure
print(pad4_8mer_df.head())
print(pad4_9mer_df.head())

# Step 4: Data Preprocessing

# Select important columns, assuming 'Peptide' and score columns like 'BA_Score' or 'EL_Score'
selected_columns_8mer = ['Peptide', 'BA-score', 'EL-score']
selected_columns_9mer = ['Peptide', 'BA-score', 'EL-score']

# Extract features (BA_Score, EL_Score) for 8mer and 9mer
features_8mer = pad4_8mer_df[selected_columns_8mer]
features_9mer = pad4_9mer_df[selected_columns_9mer]

# Combine datasets (you can add more if you have 10mer, 11mer, etc.)
combined_data = pd.concat([features_8mer, features_9mer], ignore_index=True)

# Normalize scores for better VAE performance (Min-Max Scaling)
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(combined_data[['BA-score', 'EL-score']])

# Prepare final dataset
peptides = combined_data['Peptide'].values
features = np.hstack([peptides.reshape(-1, 1), scaled_features])

# Check the preprocessed data
print(features[:5])  # Checking first 5 entries

   Pos   Peptide       ID       core     icore  EL-score  EL_Rank  BA-score  \
0    0  FHYDRNNI  PEPLIST  FHYDRN-NI  FHYDRNNI    0.0027  20.0803    0.0558   
1    0  HYDRNNIA  PEPLIST  H-YDRNNIA  HYDRNNIA    0.0004  44.5556    0.0289   
2    0  YDRNNIAV  PEPLIST  YDR-NNIAV  YDRNNIAV    0.0004  43.5714    0.0316   
3    0  DRNNIAVG  PEPLIST  DRNNIAVG-  DRNNIAVG    0.0000  73.7500    0.0143   
4    0  RNNIAVGA  PEPLIST  RNN-IAVGA  RNNIAVGA    0.0002  55.0000    0.0334   

   BA_Rank     Ave  NB  
0  34.2327  0.0027   0  
1  68.0273  0.0004   0  
2  63.6080  0.0004   0  
3  92.1082  0.0000   0  
4  60.7318  0.0002   0  
   Pos    Peptide       ID       core      icore  EL-score  EL_Rank  BA-score  \
0    0  FHYDRNNIA  PEPLIST  FHYDRNNIA  FHYDRNNIA    0.0138   8.4640    0.1028   
1    0  HYDRNNIAV  PEPLIST  HYDRNNIAV  HYDRNNIAV    0.0017  24.9753    0.0468   
2    0  YDRNNIAVG  PEPLIST  YDRNNIAVG  YDRNNIAVG    0.0000  74.2500    0.0268   
3    0  DRNNIAVGA  PEPLIST  DRNNIAVGA  DRNNIAVGA   

In [21]:
import pandas as pd

# Function to load the CSV file with fragment scores (BA and EL)
def load_fragment_scores(csv_file):
    return pd.read_csv(csv_file)

# Example: Load the 8-12mer fragment scores from a CSV file
csv_file = "Dataset/csv/PAD4C8BoLA.csv"  # Replace with the path to your CSV file
fragment_scores_df = load_fragment_scores(csv_file)

# Show the first few rows of the loaded dataset
print("Fragment scores (first 5 rows):")
print(fragment_scores_df.head())

Fragment scores (first 5 rows):
   Pos   Peptide       ID       core     icore  EL-score  EL_Rank  BA-score  \
0    0  FHYDRNNI  PEPLIST  FHYDRN-NI  FHYDRNNI    0.0027  20.0803    0.0558   
1    0  HYDRNNIA  PEPLIST  H-YDRNNIA  HYDRNNIA    0.0004  44.5556    0.0289   
2    0  YDRNNIAV  PEPLIST  YDR-NNIAV  YDRNNIAV    0.0004  43.5714    0.0316   
3    0  DRNNIAVG  PEPLIST  DRNNIAVG-  DRNNIAVG    0.0000  73.7500    0.0143   
4    0  RNNIAVGA  PEPLIST  RNN-IAVGA  RNNIAVGA    0.0002  55.0000    0.0334   

   BA_Rank     Ave  NB  
0  34.2327  0.0027   0  
1  68.0273  0.0004   0  
2  63.6080  0.0004   0  
3  92.1082  0.0000   0  
4  60.7318  0.0002   0  


## VAE MODELS ##

In [20]:
import numpy as np
import random

# Function to simulate VAE model for generating mutations (simplified)
def vae_mutate_fragments(fragments, num_mutations=3):
    mutated_fragments = []
    for fragment in fragments:
        mutated_fragment = list(fragment)
        for _ in range(num_mutations):
            idx = random.randint(0, len(mutated_fragment)-1)
            mutated_fragment[idx] = random.choice("ACDEFGHIKLMNPQRSTVWY")  # Random AA mutation
        mutated_fragments.append("".join(mutated_fragment))
    return mutated_fragments

# Apply VAE to mutate the peptide fragments
peptides = fragment_scores_df['Peptide'].tolist()
mutated_fragments = vae_mutate_fragments(peptides)

# Print the first few mutated fragments
print("Mutated fragments (first 5):")
print(mutated_fragments[:5])

Mutated fragments (first 5):
['THYDQNNC', 'HYTRENTA', 'HSANNIAV', 'GRNNPAVG', 'RNHIAVGW']


In [19]:
# Function to introduce mutations into the full-length sequence based on mutated fragments
def mutate_full_length_sequence(full_length_seq, original_fragments, mutated_fragments):
    mutated_sequence = full_length_seq
    for original, mutated in zip(original_fragments, mutated_fragments):
        # Find the original fragment in the full sequence and replace it with the mutated fragment
        index = mutated_sequence.find(original)
        if index != -1:
            mutated_sequence = mutated_sequence[:index] + mutated + mutated_sequence[index+len(original):]
    return mutated_sequence

# Mutate the full-length PAD4 sequence
mutated_full_length_PAD4 = mutate_full_length_sequence(full_length_PAD4, peptides, mutated_fragments)

# Print the mutated full-length PAD4 sequence
print("\nMutated full-length PAD4 sequence:")
print(mutated_full_length_PAD4)


Mutated full-length PAD4 sequence:
EVKQENRLLNESESSSQGLLGYYFSDLNFQAPMVVTSSTTGDLSIPSSELENIPSENQYFQSAIWSGFIKVKKSDEYTFATSADNHVTMWVDDQEVINKASNSNKIRLEKGRLYQIKIQYQRENPTEKGLDFKLYWTDSQNKKEVISSDNLQLPELKQKSSNSRKKRSTSAGPTVPDRDNDGIPDSLEVEGYTVDVKNKRTFLSPWISNIHEKKGLTKYKSSPEKWSTASDPYSDFEKVTGRIDKNVSPEARHPLVAAYPIVHVDMENIILSKNEDQSTQNTDSETRTISKNTSTSRTHTSEVHGNAEVHASFFDIGGSVSAGFSNSNSSTVAIDHSLSLAGERTWAETMGLNTADTARLNANIRYVNTGTAPIYNVLPTTSLVLGKNQTLATIKAKENQLSQILAPNNYYPSKNLAPIALNAQDDFSSTPITMNYNQFLELEKTKQLRLDTDQVYGNIATYNFENGRVRVDTGSNWSEVLPQIQETTARIIFNGKDLNLVERRIAAVNPSDPLETTKPDMTLKEALKIAFGFNEPNGNLQYQGKDITEFDFNFDQQTSQNIKNQLAELNATNIYTVLDKIKLNAKMNILIRDKRFHYSDYNIAVGIDQSVVHELHGEVIMSFTEGLLFNIQKDIRAIAGGYTIEIDDTHMLKSVPMDRYDNLSISKLRWDLWTYIDNKKYGDKIDLYISYCNYKHNRYIVAKENTIHNEYENGNRQTQDIKQILRGSKRGYEIG


In [26]:
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler
import numpy as np

# Step 4: Preprocess the data (scaling)
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features[:, 1:].astype(np.float32))  # Skip the first column (peptide seq)

# Encoder
input_peptide = layers.Input(shape=(features_scaled.shape[1],))  # Use the scaled features
x = layers.Dense(64, activation='relu')(input_peptide)
x = layers.Dense(32, activation='relu')(x)

# Latent space representation
latent_dim = 2  # Latent space dimension
z_mean = layers.Dense(latent_dim)(x)
z_log_var = layers.Dense(latent_dim)(x)

# Sampling function (updated to avoid TensorFlow issue)
def sampling(z_mean, z_log_var):
    epsilon = tf.keras.backend.random_normal(shape=(tf.shape(z_mean)[0], latent_dim))
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon

z = layers.Lambda(lambda args: sampling(*args))([z_mean, z_log_var])

# Decoder
decoder_input = layers.Input(shape=(latent_dim,))
x_decoder = layers.Dense(32, activation='relu')(decoder_input)
x_decoder = layers.Dense(64, activation='relu')(x_decoder)
output_peptide = layers.Dense(features_scaled.shape[1], activation='sigmoid')(x_decoder)

# Define Encoder and Decoder models
encoder = models.Model(input_peptide, [z_mean, z_log_var, z], name='encoder')
decoder = models.Model(decoder_input, output_peptide, name='decoder')

# Define VAE model by combining encoder and decoder
output_vae = decoder(encoder(input_peptide)[2])
vae = models.Model(input_peptide, output_vae)

# VAE loss function
def vae_loss(input_peptide, output_vae):
    reconstruction_loss = tf.reduce_mean(tf.square(input_peptide - output_vae))
    kl_loss = 1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var)
    kl_loss = tf.reduce_mean(kl_loss)
    kl_loss *= -0.5
    return reconstruction_loss + kl_loss

vae.compile(optimizer='adam', loss=vae_loss)

# Step 5: Train the VAE model
vae.fit(features_scaled, features_scaled, epochs=50, batch_size=32)

Epoch 1/50


ValueError: Tried to convert 'x' to a tensor and failed. Error: A KerasTensor cannot be used as input to a TensorFlow function. A KerasTensor is a symbolic placeholder for a shape and dtype, used when constructing Keras Functional models or Keras Functions. You can only use it as input to a Keras layer or a Keras operation (from the namespaces `keras.layers` and `keras.operations`). You are likely doing something like:

```
x = Input(...)
...
tf_fn(x)  # Invalid.
```

What you should do instead is wrap `tf_fn` in a layer:

```
class MyLayer(Layer):
    def call(self, x):
        return tf_fn(x)

x = MyLayer()(x)
```
