# Chargement du dataset

In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.datasets import imdb

# Charger le dataset IMDb avec TensorFlow en texte brut
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000, index_from=3)
word_index = imdb.get_word_index()

# Inverser word_index pour décoder les séquences d'entiers en texte
reverse_word_index = {value: key for key, value in word_index.items()}
reverse_word_index[0] = "<PAD>"
reverse_word_index[1] = "<START>"
reverse_word_index[2] = "<UNK>"

def decode_review(sequence):
    return " ".join([reverse_word_index.get(i, "?") for i in sequence])

# Décoder les données d'entraînement et de test en texte brut
train_texts = [decode_review(seq) for seq in train_data]
test_texts = [decode_review(seq) for seq in test_data]

# Diviser les données de test en validation et test
val_size = 5000
train_texts, val_texts = train_texts[val_size:], train_texts[:val_size]
train_labels, val_labels = train_labels[val_size:], train_labels[:val_size]

2024-10-22 16:03:00.295409: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-22 16:03:00.314045: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-22 16:03:00.336258: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-22 16:03:00.342815: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-22 16:03:00.360006: I tensorflow/core/platform/cpu_feature_guar

# Construction du modèle

In [2]:
# Créer une couche TextVectorization pour convertir le texte en séquences d'entiers
max_tokens = 10000  # Taille maximale du vocabulaire
max_len = 100       # Longueur maximale des séquences

vectorizer = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode='int',
    output_sequence_length=max_len
)

# Adapter la couche TextVectorization sur les textes d'entraînement
vectorizer.adapt(train_texts)

# Définir le modèle TensorFlow
embedding_dim = 64
hidden_dim = 16
output_dim = 1

model = models.Sequential([
    vectorizer,
    layers.Embedding(input_dim=max_tokens, output_dim=embedding_dim, mask_zero=True),
    layers.GlobalAveragePooling1D(),
    layers.Dense(hidden_dim, activation='relu'),
    layers.Dense(output_dim, activation='sigmoid')  # Sigmoid pour la classification binaire
])

# Compiler le modèle
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Passer une dummy input pour construire le modèle
dummy_text = tf.constant(["This is a dummy input for testing purposes"])

# Passer la dummy input textuelle dans le modèle
dummy_output = model(dummy_text)

# Maintenant, tu peux afficher le résumé du modèle avec les paramètres
model.summary()


2024-10-22 16:03:08.739491: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 12821 MB memory:  -> device: 0, name: NVIDIA A2, pci bus id: 0000:17:00.0, compute capability: 8.6


# Boucles d'entrainement

In [3]:
# Convertir les données en datasets TensorFlow
def convert_to_tf_dataset(data, labels):
    tf_dataset = tf.data.Dataset.from_tensor_slices((data, labels))
    return tf_dataset

train_dataset = convert_to_tf_dataset(train_texts, train_labels)
val_dataset = convert_to_tf_dataset(val_texts, val_labels)
test_dataset = convert_to_tf_dataset(test_texts, test_labels)

In [4]:
# Prétraitement des datasets pour le modèle TensorFlow
batch_size = 32
train_dataset = train_dataset.shuffle(len(train_texts)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Entraîner le modèle
epochs = 30
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=epochs
)

# Évaluer le modèle
test_loss, test_accuracy = model.evaluate(test_dataset)
print(f"Test Accuracy: {test_accuracy:.4f}")

Epoch 1/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.6940 - loss: 0.5898 - val_accuracy: 0.8222 - val_loss: 0.3971
Epoch 2/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.8798 - loss: 0.2983 - val_accuracy: 0.8218 - val_loss: 0.3933
Epoch 3/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.9071 - loss: 0.2398 - val_accuracy: 0.8236 - val_loss: 0.4057
Epoch 4/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.9276 - loss: 0.1955 - val_accuracy: 0.8188 - val_loss: 0.4418
Epoch 5/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.9446 - loss: 0.1602 - val_accuracy: 0.8128 - val_loss: 0.4913
Epoch 6/30
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.9546 - loss: 0.1415 - val_accuracy: 0.8098 - val_loss: 0.5399
Epoch 7/30
[1m625/625