<a href="https://colab.research.google.com/github/olfabre/ABtoABR/blob/main/TensorFlow_with_GPU_V3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, Dropout
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import requests
from sklearn.metrics import accuracy_score
from keras.callbacks import EarlyStopping

# Download the training file from the URL.
url = "https://olivier-fabre.com/Prediction_proteines/protein-secondary-structure.train"
# A timeout keeps the notebook from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as protein data.
response.raise_for_status()
data_lines = response.text.splitlines()

# Initialise the parsing state.
data_lines = [line.strip() for line in data_lines]
input_sequences = []
output_sequences = []
acides_amines = "ACDEFGHIKLMNPQRSTVWY"
# Secondary-structure label -> class index:
#   'h' = alpha helix (0), 'e' = beta sheet (1), 'c' = random coil (2).
# Any other label falls back to random coil (2).
structure_classes = {'h': 0, 'e': 1, 'c': 2}

sequence = []
structure = []

# Read and process the data.
# The trailing "" acts as an end-of-input sentinel so that the last protein is
# flushed even when the file does not end with a blank line.
for line in [*data_lines, ""]:
    if line.startswith("#"):
        continue

    # Blank lines and the "<>" / "<end>" markers all delimit proteins.  These
    # markers must reach the flush below: skipping them with `continue` would
    # leave blank lines as the only record separator.
    is_boundary = line == "" or line.startswith(("<>", "<end>"))

    if not is_boundary:
        listeEnsemble = line.split()
        # Only "<amino acid> <structure>" pairs are data; anything else is ignored.
        if len(listeEnsemble) == 2:
            acide_amin, structure_sec = listeEnsemble
            sequence.append(acide_amin)
            structure.append(structure_sec)
        continue

    if sequence:
        # Convert the amino-acid sequence and its structure labels to indices.
        # NOTE: a residue outside `acides_amines` raises ValueError here.
        input_sequences.append([acides_amines.index(acides) for acides in sequence])
        sequence_structure = [structure_classes.get(label, 2) for label in structure]
        output_sequences.append(sequence_structure)

        # Reset for the next protein.
        sequence = []
        structure = []

# Split into sliding windows of size 17
window_size = 17  # Increased to capture an even wider context
def generate_windows(sequences, structures, window_size):
    """Cut every sequence into overlapping windows labelled by their centre.

    For each start offset ``s`` at which a full window fits, the window is
    ``seq[s:s + window_size]`` and its label is ``struct[s + window_size // 2]``.
    Sequences shorter than ``window_size`` contribute no windows.

    Args:
        sequences: Iterable of index-encoded sequences (sliceable, with ``len``).
        structures: Iterable of per-position labels, paired with ``sequences``.
        window_size: Number of positions in each window.

    Returns:
        A tuple ``(windows, labels)`` of NumPy arrays built from the collected
        windows and their centre labels.
    """
    centre_offset = window_size // 2
    windows = []
    labels = []

    for residues, states in zip(sequences, structures):
        # Every start position that leaves room for a complete window.
        starts = range(len(residues) - window_size + 1)
        windows.extend(residues[s:s + window_size] for s in starts)
        labels.extend(states[s + centre_offset] for s in starts)

    return np.array(windows), np.array(labels)

# Turn the per-protein index lists into fixed-size windows and centre labels.
input_sequences, output_sequences = generate_windows(
    input_sequences, output_sequences, window_size
)

# Parameters
num_amino_acids = 20
num_structures = 3

# Data preparation: pad the windows to `window_size` and one-hot encode the labels.
input_sequences = pad_sequences(
    input_sequences,
    maxlen=window_size,
    padding='post',
)
output_sequences = to_categorical(
    output_sequences,
    num_classes=num_structures,
)

# Model: embedding -> LSTM -> dropout -> LSTM -> dense -> softmax over the classes.
model = Sequential([
    Embedding(
        input_dim=num_amino_acids,
        output_dim=128,
        input_length=window_size,
    ),
    LSTM(128, return_sequences=True, kernel_regularizer='l2'),
    Dropout(0.5),  # Stronger regularisation
    LSTM(64, return_sequences=False, kernel_regularizer='l2'),  # Second LSTM layer
    Dense(64, activation='relu', kernel_regularizer='l2'),
    Dense(num_structures, activation='softmax'),
])

# Compile the model.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the model summary.
model.summary()

# Early-stopping callback: watches the validation loss and restores the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=8,
    restore_best_weights=True,
)

# Train the model, holding out 20% of the samples for validation.
model.fit(
    input_sequences,
    output_sequences,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Predict with the trained model on the same inputs it was fitted on.
predictions = model.predict(input_sequences)
print(predictions)

# Explicit accuracy computation: compare arg-max classes of predictions and targets.
predicted_classes = np.argmax(predictions, axis=-1)
true_classes = np.argmax(output_sequences, axis=-1)
accuracy = accuracy_score(true_classes.flatten(), predicted_classes.flatten())
print(f"Accuracy calculée sur les données d'entraînement : {accuracy:.2f}")


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 17, 128)           2560      
                                                                 
 lstm_5 (LSTM)               (None, 17, 128)           131584    
                                                                 
 dropout_2 (Dropout)         (None, 17, 128)           0         
                                                                 
 lstm_6 (LSTM)               (None, 64)                49408     
                                                                 
 dense_5 (Dense)             (None, 64)                4160      
                                                                 
 dense_6 (Dense)             (None, 3)                 195       
                                                                 
Total params: 187907 (734.01 KB)
Trainable params: 187