<a href="https://colab.research.google.com/github/mandalsudipta/Biopython_codes/blob/main/dna_seq_classification_using_neural_net.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Toy corpus: five 10-base DNA strings, used only to exercise the pipeline.
sequences = [
    "ATCGTACGTA",
    "CGTAGCTAGC",
    "TACGTAGCTA",
    "GCTAGCTAGC",
    "TAGCTAGCTA",
]

# One binary class label (0 or 1) per sequence, listed in the same order.
labels = [0, 1, 0, 1, 0]

# Character-level tokenization: every character of a sequence is its own token.
# NOTE(review): no oov_token is configured, so characters unseen at fit time
# (e.g. an ambiguous base "N") are presumably dropped when encoding later
# inputs -- confirm against the Keras Tokenizer documentation.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(sequences)
encoded_sequences = tokenizer.texts_to_sequences(sequences)

# Pad at the end ('post') so every encoded sequence has the length of the longest one.
max_len = max(map(len, encoded_sequences))
X = pad_sequences(encoded_sequences, padding='post', maxlen=max_len)

# Targets as a NumPy array, as passed to model.fit.
y = np.asarray(labels)

# Build the 1-D convolutional classifier as a single ordered stack of layers.
model = Sequential([
    # Map each integer token id to a trainable 8-dimensional vector.
    # The "+ 1" presumably reserves id 0, which padding uses -- TODO confirm.
    Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=8,
        input_length=max_len,
    ),
    # 32 filters, each looking at a window of 3 consecutive positions.
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    # Halve the sequence axis by keeping the maximum of each pair of positions.
    MaxPooling1D(pool_size=2),
    # Collapse the (positions, filters) feature map into one vector per sample.
    Flatten(),
    # Fully connected hidden layer.
    Dense(64, activation='relu'),
    # Single sigmoid unit: one score in (0, 1) per sample for the binary task.
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy matches the single sigmoid output; track accuracy too.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print the layer-by-layer table of output shapes and parameter counts.
model.summary()

# Fit on the five toy samples: 10 epochs, mini-batches of 2.
# This is placeholder data; a real experiment needs a much larger dataset.
model.fit(x=X, y=y, batch_size=2, epochs=10, verbose=2)

# Run inference on two sequences. Both strings also appear in the training
# list above, so this demonstrates the prediction path, not generalization.
# Inputs are encoded with the already-fitted tokenizer and padded to max_len,
# exactly as the training data was.
new_sequences = ["GCTAGCTAGC", "ATCGTACGTA"]
encoded_new_sequences = tokenizer.texts_to_sequences(new_sequences)
X_new = pad_sequences(encoded_new_sequences, padding='post', maxlen=max_len)
predictions = model.predict(X_new)

print("Predictions:", predictions)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 8)             40        
                                                                 
 conv1d (Conv1D)             (None, 8, 32)             800       
                                                                 
 max_pooling1d (MaxPooling1  (None, 4, 32)             0         
 D)                                                              
                                                                 
 flatten (Flatten)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                        