<a href="https://colab.research.google.com/github/olfabre/LSTM_avec_Keras/blob/main/TensorFlow_with_GPU_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense, TimeDistributed, Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import requests
from sklearn.metrics import accuracy_score

# Télécharger le fichier depuis l'URL
url = "https://olivier-fabre.com/Prediction_proteines/protein-secondary-structure.train"
response = requests.get(url)
data_lines = response.text.splitlines()

# Initialisation des variables
data_lines = [line.strip() for line in data_lines]
input_sequences = []
output_sequences = []
acides_amines = "ACDEFGHIKLMNPQRSTVWY"

sequence = []
structure = []

# Lecture et traitement des données
for line in data_lines:
    if line.startswith("#") or line.startswith("<>") or line.startswith("<end>"):
        continue

    if line:
        listeEnsemble = line.split()
        if len(listeEnsemble) == 2:
            acide_amin, structure_sec = listeEnsemble
            sequence.append(acide_amin)
            structure.append(structure_sec)

    if len(sequence) > 0 and (line == "" or line.startswith("<end>")):
        # Convertir la séquence d'acides aminés et la structure en indices
        input_sequences.append([acides_amines.index(acides) for acides in sequence])
        sequence_structure = []
        for index in structure:
            if index == 'h':
                sequence_structure.append(0)  # Hélice alpha
            elif index == 'e':
                sequence_structure.append(1)  # Feuillet bêta
            elif index == 'c':
                sequence_structure.append(2)  # Pelote aléatoire
            else:
                sequence_structure.append(2)  # Pelote aléatoire (par défaut)
        output_sequences.append(sequence_structure)

        # Réinitialiser pour la prochaine séquence
        sequence = []
        structure = []

# Paramètres
num_amino_acids = 20
num_structures = 3
max_sequence_length = max(len(seq) for seq in input_sequences)

# Préparation des données
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_length, padding='post')
output_sequences = pad_sequences(output_sequences, maxlen=max_sequence_length, padding='post')

# Conversion des sorties en one-hot encoding
output_sequences = to_categorical(output_sequences, num_classes=num_structures)

# Construction du modèle
model = Sequential()
model.add(Embedding(input_dim=num_amino_acids, output_dim=64, input_length=max_sequence_length))
model.add(LSTM(128, return_sequences=True))
model.add(TimeDistributed(Dense(num_structures, activation='softmax')))

# Compilation du modèle
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Résumé du modèle
model.summary()

# Entraînement du modèle
model.fit(input_sequences, output_sequences, epochs=10, batch_size=32)

# Prédiction avec le modèle
predictions = model.predict(input_sequences)
print(predictions)

# Calcul explicite de l'accuracy
predicted_classes = np.argmax(predictions, axis=-1)
true_classes = np.argmax(output_sequences, axis=-1)
accuracy = accuracy_score(true_classes.flatten(), predicted_classes.flatten())
print(f"Accuracy calculée sur les données d'entraînement : {accuracy:.2f}")


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 18105, 64)         1280      
                                                                 
 lstm (LSTM)                 (None, 18105, 128)        98816     
                                                                 
 time_distributed (TimeDist  (None, 18105, 3)          387       
 ributed)                                                        
                                                                 
Total params: 100483 (392.51 KB)
Trainable params: 100483 (392.51 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[[[0.3203105  0.3246717  0.35501778]
  [0.31455338 0.32121888 0.36422783]
  [0.30521706 0.31384158 0.38