In [1]:
# 1. Import des bibliothèques nécessaires
import numpy as np
import requests
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GRU
from tensorflow.keras.callbacks import EarlyStopping

# 2. Chargement et prétraitement des données
def load_and_preprocess_text(url):
    # Télécharger le texte
    response = requests.get(url)
    text = response.text
    
    # Trouver le début et la fin du texte principal
    start_index = text.find("*** START")
    end_index = text.find("*** END")
    main_text = text[start_index:end_index]
    
    # Nettoyage du texte
    def preprocess_text(text):
        # Supprimer les caractères spéciaux et les nombres
        text = re.sub(r'[^a-zA-Z\s.]', '', text)
        # Convertir en minuscules
        text = text.lower()
        # Supprimer les espaces multiples
        text = re.sub(r'\s+', ' ', text)
        return text
    
    cleaned_text = preprocess_text(main_text)
    
    # Diviser en phrases
    corpus = cleaned_text.split('.')
    # Nettoyer les phrases vides
    corpus = [sentence.strip() for sentence in corpus if len(sentence.strip()) > 0]
    
    return corpus

# Charger et prétraiter le texte
url = "https://www.gutenberg.org/cache/epub/11/pg11.txt"
corpus = load_and_preprocess_text(url)

# Afficher les 200 premiers caractères
print("Premiers 200 caractères du corpus :")
print(''.join(corpus)[:200])

Premiers 200 caractères du corpus :
start of the project gutenberg ebook alices adventures in wonderland illustration alices adventures in wonderland by lewis carroll the millennium fulcrum editioncontents chapter idown the rabbithole c


In [2]:
# 3. Création du vocabulaire avec Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1


In [3]:
# 4. Préparation des séquences d'entrée
def create_sequences(corpus, sequence_length=10):
    input_sequences = []
    
    for sentence in corpus:
        token_list = tokenizer.texts_to_sequences([sentence])[0]
        
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    
    return input_sequences

# Créer les séquences
input_sequences = create_sequences(corpus)

# Padding des séquences (pre-padding car c'est plus approprié pour le RNN)
max_sequence_length = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, 
                                       maxlen=max_sequence_length, 
                                       padding='pre'))

# Séparer les entrées et les sorties
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [4]:
# 5. Construction du modèle (version LSTM)
def create_model(total_words, max_sequence_length, embedding_dim=100):
    model = Sequential([
        Embedding(total_words, embedding_dim, input_length=max_sequence_length-1),
        LSTM(150, return_sequences=True),
        LSTM(100),
        Dense(100, activation='relu'),
        Dropout(0.2),
        Dense(total_words, activation='softmax')
    ])
    return model

# Version alternative avec GRU
def create_model_gru(total_words, max_sequence_length, embedding_dim=100):
    model = Sequential([
        Embedding(total_words, embedding_dim, input_length=max_sequence_length-1),
        GRU(150, return_sequences=True),
        GRU(100),
        Dense(100, activation='relu'),
        Dropout(0.2),
        Dense(total_words, activation='softmax')
    ])
    return model

In [5]:
# 6. Compilation et entraînement
model = create_model(total_words, max_sequence_length)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early stopping pour éviter le surapprentissage
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Entraînement
history = model.fit(X, y, 
                   epochs=50, 
                   batch_size=128, 
                   validation_split=0.2,
                   callbacks=[early_stopping])

Epoch 1/50


2025-03-10 12:25:33.252720: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2025-03-10 12:25:33.252771: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 64.00 GB
2025-03-10 12:25:33.252778: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 24.00 GB
2025-03-10 12:25:33.252820: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-03-10 12:25:33.252833: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2025-03-10 12:25:33.858741: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 338ms/step - accuracy: 0.0420 - loss: 6.7990 - val_accuracy: 0.0852 - val_loss: 6.2385
Epoch 2/50
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 290ms/step - accuracy: 0.0545 - loss: 6.0008 - val_accuracy: 0.0871 - val_loss: 6.1682
Epoch 3/50
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 305ms/step - accuracy: 0.0602 - loss: 5.8242 - val_accuracy: 0.0926 - val_loss: 6.1294
Epoch 4/50
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 357ms/step - accuracy: 0.0694 - loss: 5.6647 - val_accuracy: 0.0922 - val_loss: 6.1180
Epoch 5/50
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 358ms/step - accuracy: 0.0670 - loss: 5.5859 - val_accuracy: 0.0928 - val_loss: 6.1027
Epoch 6/50
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 318ms/step - accuracy: 0.0755 - loss: 5.4843 - val_accuracy: 0.0973 - val_loss: 6.0667
Epoch 7/50
[1m160/16

In [8]:
# 7. Fonction de génération de texte
def generate_text(seed_text, next_words, model, max_sequence_length):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_length-1, padding='pre')
        
        # Prédiction
        predicted = model.predict(token_list, verbose=0)
        
        # Appliquer la température et normaliser
        temperature = 0.5
        predicted = predicted[0] / temperature
        # Convertir en probabilités avec softmax
        exp_preds = np.exp(predicted)
        predicted = exp_preds / np.sum(exp_preds)
        
        # Maintenant on peut utiliser multinomial en toute sécurité
        predicted = np.random.multinomial(1, predicted)
        predicted_index = np.argmax(predicted)
        
        # Convertir l'index en mot
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break
        
        # Ajouter le mot prédit au texte
        seed_text += " " + output_word
    
    return seed_text

# Test
seed_texts = [
    "alice wondered",
    "the white rabbit",
    "the queen of"
]

for seed in seed_texts:
    print(f"Seed: {seed}")
    print(f"Generated: {generate_text(seed, 5, model, max_sequence_length)}\n")



Seed: alice wondered
Generated: alice wondered fifteenth fair family wife grow

Seed: the white rabbit
Generated: the white rabbit snappishly left through bound confusion

Seed: the queen of
Generated: the queen of pounds tumbling favoured picked hoarse

