In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import numpy as np
import pandas as pd
from neuralnetlib.models import Transformer
from neuralnetlib.preprocessing import Tokenizer, pad_sequences
from neuralnetlib.utils import train_test_split
from neuralnetlib.losses import CrossEntropyWithLabelSmoothing
from neuralnetlib.optimizers import Adam
from neuralnetlib.callbacks import EarlyStopping, Callback, LearningRateScheduler

In [2]:
def prepare_causal_lm_data(text_data, tokenizer, max_length=512, stride=256):
    """
    Build (input, target) arrays for causal LM training with a sliding window.

    The whole text is tokenized once and cut into windows of exactly
    ``max_length`` tokens whose start positions advance by ``stride``.
    Each window is then split into an input (every token but the last) and a
    target (every token but the first), so the target is the input shifted by
    one position. Trailing tokens that do not fill a complete window are
    dropped.

    Args:
        text_data: The corpus as a single string.
        tokenizer: Object exposing
            ``texts_to_sequences(texts, add_special_tokens=True)`` that returns
            one token sequence per input text.
        max_length: Window length in tokens (must be >= 2 so that input and
            target are non-empty).
        stride: Step between consecutive window starts (must be >= 1).

    Returns:
        Tuple ``(X, y)`` of 2-D numpy arrays, each of shape
        ``(n_windows, max_length - 1)``.

    Raises:
        ValueError: If ``max_length`` < 2, ``stride`` < 1, or the tokenized
            text is shorter than ``max_length`` (no full window can be built).
    """
    if max_length < 2:
        raise ValueError(f"max_length must be >= 2, got {max_length}")
    if stride < 1:
        raise ValueError(f"stride must be >= 1, got {stride}")

    # Tokenize the full text as one sequence
    tokens = tokenizer.texts_to_sequences([text_data], add_special_tokens=True)[0]

    # Without this guard the window list is empty, np.array() yields a 1-D
    # array and the 2-D slicing below fails with an opaque IndexError.
    if len(tokens) < max_length:
        raise ValueError(
            f"text yields {len(tokens)} tokens, fewer than max_length={max_length}; "
            "cannot build any training window"
        )

    # The range bound guarantees every slice holds exactly max_length tokens
    last_start = len(tokens) - max_length
    windows = [tokens[start:start + max_length]
               for start in range(0, last_start + 1, stride)]

    sequences = np.array(windows)

    # Inputs and targets are the same windows offset by one position
    X = sequences[:, :-1]
    y = sequences[:, 1:]

    return X, y

In [3]:
class TextGenerationCallback(Callback):
    """
    Callback that prints sample generations for a fixed set of prompts at the
    end of every epoch.

    Attributes set in ``__init__``:
        model: Model used for generation; must expose ``max_sequence_length``,
            ``PAD_IDX`` and ``predict``.
        tokenizer: Tokenizer providing ``texts_to_sequences`` and
            ``sequences_to_texts``.
        prompt_texts: Iterable of prompt strings to generate from.
        max_length: Forwarded to ``model.predict`` as ``max_length``.
        temperature: Forwarded to ``model.predict`` as ``temperature``.
    """

    def __init__(self, model, tokenizer, prompt_texts, max_length=50, temperature=0.8):
        self.model, self.tokenizer = model, tokenizer
        self.prompt_texts = prompt_texts
        self.max_length, self.temperature = max_length, temperature

    def on_epoch_end(self, epoch, logs=None):
        """Generate from each stored prompt and print the prompt and the result."""
        print(f"\nGénération d'exemples après l'epoch {epoch}:")
        for prompt in self.prompt_texts:
            # Encode the prompt, then right-pad it to the model's sequence length
            token_ids = self.tokenizer.texts_to_sequences([prompt], add_special_tokens=True)[0]
            padded_prompt = pad_sequences(
                [token_ids],
                max_length=self.model.max_sequence_length,
                padding='post',
                pad_value=self.model.PAD_IDX,
            )

            # beam_size=1: plain generation with a single beam
            output_ids = self.model.predict(
                padded_prompt,
                max_length=self.max_length,
                temperature=self.temperature,
                beam_size=1,
            )

            decoded_texts = self.tokenizer.sequences_to_texts(output_ids.tolist())
            generated_text = decoded_texts[0]
            print(f"Prompt: {prompt}")
            print(f"Généré: {generated_text}\n")

In [4]:
def train_causal_lm(text_data, model, tokenizer, max_length=512, batch_size=32, epochs=10,
                    test_prompts=None):
    """
    Train the model as a causal language model on a raw text corpus.

    Args:
        text_data: The corpus as a single string.
        model: Compiled model exposing ``fit``; it is also handed to
            ``TextGenerationCallback`` for per-epoch sample generation.
        tokenizer: Tokenizer already fitted on the corpus.
        max_length: Window length passed to ``prepare_causal_lm_data``.
        batch_size: Batch size passed to ``model.fit``.
        epochs: Number of epochs passed to ``model.fit``.
        test_prompts: Optional list of prompt strings used for the sample
            generations printed after each epoch. When ``None`` the original
            built-in French prompts are used; pass prompts in the corpus
            language so the samples are meaningful.

    Returns:
        Whatever ``model.fit`` returns (the training history).
    """
    # Build the shifted input/target windows
    X, y = prepare_causal_lm_data(text_data, tokenizer, max_length)

    # Default prompts kept for backward compatibility; `is None` (rather than
    # a truthiness test) lets callers pass an empty list to print no samples.
    if test_prompts is None:
        test_prompts = [
            "Il était une fois",
            "Le chat",
            "Je pense que",
            "Dans la forêt"
        ]

    # Callbacks: early stopping on the training loss, sample generation,
    # and a warmup-cosine learning-rate schedule
    callbacks = [
        EarlyStopping(monitor='loss', patience=5, restore_best_weights=True),
        TextGenerationCallback(model, tokenizer, test_prompts),
        LearningRateScheduler(
            schedule="warmup_cosine",
            initial_learning_rate=0.0001,
            verbose=True
        )
    ]

    # Training
    history = model.fit(
        X, y,
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks,
        metrics=['bleu'],
    )

    return history

In [5]:
def generate_text(model, tokenizer, prompt, max_length=50, temperature=0.8):
    """
    Generate text from a prompt.

    The prompt is encoded with the tokenizer (special tokens included),
    right-padded to ``model.max_sequence_length`` with ``model.PAD_IDX``, and
    passed to ``model.predict`` with ``beam_size=1``. The first decoded
    sequence is returned.

    Args:
        model: Model exposing ``max_sequence_length``, ``PAD_IDX`` and ``predict``.
        tokenizer: Tokenizer exposing ``texts_to_sequences`` and ``sequences_to_texts``.
        prompt: Prompt string.
        max_length: Forwarded to ``model.predict`` as ``max_length``.
        temperature: Forwarded to ``model.predict`` as ``temperature``.

    Returns:
        The first text produced by ``tokenizer.sequences_to_texts``.
    """
    token_ids = tokenizer.texts_to_sequences([prompt], add_special_tokens=True)[0]
    padded_prompt = pad_sequences(
        [token_ids],
        max_length=model.max_sequence_length,
        padding='post',
        pad_value=model.PAD_IDX,
    )

    predict_options = dict(max_length=max_length, temperature=temperature, beam_size=1)
    output_ids = model.predict(padded_prompt, **predict_options)

    decoded_texts = tokenizer.sequences_to_texts(output_ids.tolist())
    return decoded_texts[0]

In [6]:
# Load the text corpus (path is relative to the notebook's working directory)
with open('text8_light.txt', 'r', encoding='utf-8') as f:
    text_data = f.read()

# Create the tokenizer and fit it on the corpus; filters='' is passed so the
# tokenizer applies no character filter of its own
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts([text_data])

# NOTE(review): the +1 presumably reserves one extra index beyond word_index
# (e.g. padding) — confirm against how Tokenizer numbers special tokens.
vocab_size = len(tokenizer.word_index) + 1

In [7]:
# Build the Transformer; source and target share the same vocabulary size
# because the model is trained on a single corpus.
model = Transformer(
    src_vocab_size=vocab_size,
    tgt_vocab_size=vocab_size,
    d_model=256,
    n_heads=4,
    n_encoder_layers=2,
    n_decoder_layers=2,
    # NOTE(review): 516 looks like a typo for 512 — confirm the intended feed-forward width
    d_ff=516,
    dropout_rate=0.1,
    # Matches the default max_length=512 window used by train_causal_lm
    max_sequence_length=512,
    random_state=42
)

# Compile with label-smoothed cross-entropy and Adam with norm clipping.
# The learning rate here equals the scheduler's initial_learning_rate (0.0001)
# set in train_causal_lm.
model.compile(
    loss_function=CrossEntropyWithLabelSmoothing(label_smoothing=0.1),
    optimizer=Adam(
        learning_rate=0.0001,
        beta_1=0.9,
        beta_2=0.98,
        epsilon=1e-9,
        clip_norm=1.0
    )
)

In [8]:
# Train the model with the default window length, batch size and epoch count
history = train_causal_lm(text_data, model, tokenizer)



Initial learning rate: 0.000100
[=====-------------------------] 16% Epoch 1/10 - loss: 15.0754 - bleu: 9.1815612211e-09 - 206.08s

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x000001C5708C25F0>>
Traceback (most recent call last):
  File "c:\Program Files\Python310\lib\site-packages\ipykernel\ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


KeyboardInterrupt: 

In [9]:
# Generate text from a prompt with the current model weights
generated = generate_text(model, tokenizer, "this would ensure that", temperature=0.8)
print(generated)

proposal proposal evaluate quantification decorations quba evaluate quantification evaluate quantification evaluate quantification evaluate quantification evaluate quantification evaluate quantification evaluate quantification evaluate quantification evaluate quantification evaluate quantification evaluate quantification evaluate runner runner runner runner runner evaluate quantification quba evaluate quantification evaluate quba evaluate quantification evaluate quantification evaluate quantification evaluate quantification
