In [None]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import json

def safe_train_test_split(X, y, test_size=0.5, random_state=None, stratify_labels=None):
    """Split X and y with ``train_test_split``, stratifying when that is possible.

    Args:
        X: Samples to split.
        y: Targets aligned with ``X``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.
        stratify_labels: Labels to stratify on. ``None`` requests a plain
            (non-stratified) split.

    Returns:
        Whatever ``train_test_split(X, y, ...)`` returns for the two inputs.

    Raises:
        ValueError: Any ``ValueError`` from the stratified attempt other than
            the single-member-class one, which triggers the fallback instead.
    """
    plain_kwargs = {"test_size": test_size, "random_state": random_state}

    if stratify_labels is None:
        return train_test_split(X, y, **plain_kwargs)

    try:
        return train_test_split(X, y, stratify=stratify_labels, **plain_kwargs)
    except ValueError as err:
        # Only the "class with a single member" failure is treated as recoverable;
        # every other ValueError propagates to the caller unchanged.
        if "The least populated class in y has only 1 member" not in str(err):
            raise
        print("Warning: Stratified split failed due to minority class(es). Falling back to non-stratified split for this step.")
        return train_test_split(X, y, **plain_kwargs)

In [None]:
# Configuration
# ---------------------------
# Paths and hyperparameters shared by the cells below.
DATA_PATH = "Hadoop_2k.log_structured.csv"  # <-- change if needed
MODEL_DIR = "saved_model"  # output directory for the model, tokenizer and label map
TOKENIZER_PATH = os.path.join(MODEL_DIR, "tokenizer.json")
MODEL_PATH = os.path.join(MODEL_DIR, "lstm_Hadoop.h5")
MAX_VOCAB = 20000  # passed to Tokenizer(num_words=...) and used to cap vocab_size
MAX_LEN = 120  # length every token sequence is padded/truncated to
EMBEDDING_DIM = 128  # output_dim of the Embedding layer
BATCH_SIZE = 64
EPOCHS = 10  # upper bound; the EarlyStopping callback may end training sooner
RANDOM_STATE = 42  # seed passed to both train/validation/test splits


In [None]:
# Utilitaires
# ---------------------------
def load_data(path=DATA_PATH):
    """Read the structured log CSV and keep only the text and label columns.

    Args:
        path: Location of the CSV file.

    Returns:
        A DataFrame with the 'Content' and 'Level' columns, rows containing
        missing values removed and the index renumbered from 0.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If either required column is absent from the file.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Le fichier {path} est introuvable. Placez le CSV dans le dossier courant.")

    frame = pd.read_csv(path, low_memory=False)

    # Both columns are required downstream: 'Content' is the input text, 'Level' the target.
    required_columns = ['Content', 'Level']
    if any(column not in frame.columns for column in required_columns):
        raise ValueError("Le fichier doit contenir au moins les colonnes 'Content' et 'Level'.")

    return frame[required_columns].dropna().reset_index(drop=True)

def clean_text(text):
    """Normalize a raw log message into lowercase, space-separated word tokens.

    The input is coerced with ``str`` and lowercased, then a fixed sequence of
    regex substitutions is applied in order, each replacing its matches with a
    single space. The result is stripped of leading/trailing whitespace.

    Args:
        text: The raw message; any object is accepted and converted with ``str``.

    Returns:
        The cleaned string (possibly empty).
    """
    # Order matters: paths are removed before digits, and punctuation before
    # the final whitespace collapse.
    substitutions = (
        r'\/\S+',             # a slash followed by non-whitespace (path-like tokens)
        r'\d+([:._-]?\d+)*',  # digit runs, including groups joined by : . _ or -
        r'[^a-z0-9_ ]+',      # anything other than letters, digits, underscore, space
        r'\s+',               # whitespace runs
    )
    cleaned = str(text).lower()
    for pattern in substitutions:
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned.strip()

In [None]:
# Dataset preparation
# ---------------------------
print("Loading data...")
df = load_data(DATA_PATH)
print(f"Loaded {len(df)} lines.")

print("Cleaning content...")
# The normalized text goes into a new 'clean' column; the raw 'Content' is kept.
df['clean'] = df['Content'].apply(clean_text)

# Labels: map each distinct 'Level' value to an integer id
print("Encoding labels...")
le = LabelEncoder()
df['label_enc'] = le.fit_transform(df['Level'])
# id -> level name, with keys cast to plain int
label_map = {int(i): lab for i, lab in enumerate(le.classes_)}
print("Label mapping:", label_map)

# Split: 70% train, then the remaining 30% is halved into validation and test
X = df['clean'].values
y = tf.keras.utils.to_categorical(df['label_enc'].values)  # one-hot targets
X_train, X_temp, y_train, y_temp = safe_train_test_split(X, y, test_size=0.30, random_state=RANDOM_STATE, stratify_labels=df['label_enc'])
# y_temp is one-hot, so argmax recovers the integer labels to stratify on
X_val, X_test, y_val, y_test = safe_train_test_split(X_temp, y_temp, test_size=0.5, random_state=RANDOM_STATE, stratify_labels=np.argmax(y_temp, axis=1))

print(f"Split -> Train: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}")

Loading data...
Loaded 2000 lines.
Cleaning content...
Encoding labels...
Label mapping: {0: 'ERROR', 1: 'FATAL', 2: 'INFO', 3: 'WARN'}
Split -> Train: 1400 | Val: 300 | Test: 300


In [None]:
# Tokenizer: fitted on the training texts only; validation/test texts are not seen here
print("Fitting tokenizer...")
tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
# Used as the Embedding input_dim. The +1 presumably accounts for index 0 being
# reserved for padding — TODO confirm against the Keras Tokenizer documentation.
vocab_size = min(MAX_VOCAB, len(word_index) + 1)
print("Vocab size:", vocab_size)


Fitting tokenizer...
Vocab size: 317


In [None]:
# Sequences: tokenize each split and pad it to a fixed length
def texts_to_padded_sequences(texts):
    """Convert texts to integer sequences of exactly MAX_LEN entries.

    Uses the module-level ``tokenizer``; padding and truncation are both
    applied at the end of each sequence ('post').
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=MAX_LEN,
        padding='post',
        truncating='post',
    )

X_train_seq, X_val_seq, X_test_seq = map(
    texts_to_padded_sequences, (X_train, X_val, X_test)
)


In [None]:
# Build model
# ---------------------------
print("Building model...")
num_classes = y.shape[1]  # width of the one-hot label matrix
model = Sequential([
    # Explicit input spec: the model is built immediately, so summary() can
    # report output shapes and parameter counts. This replaces
    # Embedding(input_length=...), which Keras 3 deprecates.
    tf.keras.Input(shape=(MAX_LEN,), dtype="int32"),
    Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM),
    # return_sequences=False: only the final state of each direction is passed on
    Bidirectional(LSTM(128, return_sequences=False)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    # One probability per class; pairs with the categorical cross-entropy loss below
    Dense(num_classes, activation='softmax')
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Building model...




In [None]:
# Callbacks & output directory
os.makedirs(MODEL_DIR, exist_ok=True)
# Keep only the best model (by validation accuracy) on disk at MODEL_PATH
checkpoint = ModelCheckpoint(MODEL_PATH, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
# Stop after 3 epochs without improvement and roll back to the best weights
early = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True, mode='max')

# Class weights (to counter class imbalance), computed from the training labels only:
#   weight(c) = n_samples / (n_classes_present * n_samples_in_c)
y_integers = np.argmax(y_train, axis=1)
classes, counts = np.unique(y_integers, return_counts=True)
# Plain int keys / float values: this is the mapping type Keras documents for
# class_weight, and it keeps the printed dict readable.
class_weight = {
    int(cls): float(len(y_integers) / (len(classes) * cnt))
    for cls, cnt in zip(classes, counts)
}
print("Class weights:", class_weight)


Class weights: {np.int64(0): np.float64(3.3333333333333335), np.int64(1): np.float64(350.0), np.int64(2): np.float64(0.4807692307692308), np.int64(3): np.float64(0.6183745583038869)}


In [None]:
# Train
# ---------------------------
print("Training...")
# Fit on the padded training sequences; the validation split is evaluated each epoch
history = model.fit(
    X_train_seq, y_train,
    validation_data=(X_val_seq, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint, early],  # best-model checkpointing + early stopping
    class_weight=class_weight,  # per-class loss weights derived from the training labels
    verbose=2
)

Training...
Epoch 1/10

Epoch 1: val_accuracy improved from -inf to 0.81333, saving model to saved_model/lstm_Hadoop.h5




22/22 - 19s - 878ms/step - accuracy: 0.5786 - loss: 1.5153 - val_accuracy: 0.8133 - val_loss: 0.5430
Epoch 2/10

Epoch 2: val_accuracy improved from 0.81333 to 0.96333, saving model to saved_model/lstm_Hadoop.h5




22/22 - 18s - 802ms/step - accuracy: 0.9214 - loss: 0.5611 - val_accuracy: 0.9633 - val_loss: 0.1276
Epoch 3/10

Epoch 3: val_accuracy improved from 0.96333 to 0.97333, saving model to saved_model/lstm_Hadoop.h5




22/22 - 20s - 929ms/step - accuracy: 0.9850 - loss: 0.3261 - val_accuracy: 0.9733 - val_loss: 0.0925
Epoch 4/10

Epoch 4: val_accuracy improved from 0.97333 to 0.98667, saving model to saved_model/lstm_Hadoop.h5




22/22 - 20s - 915ms/step - accuracy: 0.9764 - loss: 0.3177 - val_accuracy: 0.9867 - val_loss: 0.0312
Epoch 5/10

Epoch 5: val_accuracy did not improve from 0.98667
22/22 - 11s - 495ms/step - accuracy: 0.9943 - loss: 0.1308 - val_accuracy: 0.9833 - val_loss: 0.0929
Epoch 6/10

Epoch 6: val_accuracy did not improve from 0.98667
22/22 - 12s - 526ms/step - accuracy: 0.9957 - loss: 0.1704 - val_accuracy: 0.9800 - val_loss: 0.0997
Epoch 7/10

Epoch 7: val_accuracy did not improve from 0.98667
22/22 - 21s - 968ms/step - accuracy: 0.9950 - loss: 0.0214 - val_accuracy: 0.9833 - val_loss: 0.0688


In [None]:
# Save the tokenizer and the label map into MODEL_DIR.
# The model file itself is not written here; the ModelCheckpoint callback saves it during training.
with open(TOKENIZER_PATH, "w", encoding="utf-8") as f:
    f.write(tokenizer.to_json())
# Note: JSON object keys are strings, so the integer keys of label_map are written as strings
with open(os.path.join(MODEL_DIR, "label_map.json"), "w", encoding="utf-8") as f:
    json.dump(label_map, f, ensure_ascii=False, indent=2)

print("Model and tokenizer saved to", MODEL_DIR)

# ---------------------------
# Evaluation
# ---------------------------
print("Loading best model for evaluation...")
# Reload the checkpoint stored at MODEL_PATH instead of reusing the in-memory model
best_model = load_model(MODEL_PATH)

y_pred_prob = best_model.predict(X_test_seq)
y_pred = np.argmax(y_pred_prob, axis=1)
y_true = np.argmax(y_test, axis=1)

print("Classification report:")
# Restrict the report to labels that actually occur in y_true so that
# target_names lines up with the reported rows (a class can be absent from the test split).
unique_y_true_labels = np.unique(y_true)
target_names_for_report = [le.classes_[i] for i in unique_y_true_labels]
print(classification_report(y_true, y_pred, labels=unique_y_true_labels, target_names=target_names_for_report))

print("Confusion matrix:")
# Pin the matrix to the full encoder label set so row/column i always means
# le.classes_[i], regardless of which classes appear in y_true or y_pred,
# and print it with class names so the axes are identifiable.
all_labels = np.arange(len(le.classes_))
cm = confusion_matrix(y_true, y_pred, labels=all_labels)
print(
    pd.DataFrame(cm, index=le.classes_, columns=le.classes_)
    .rename_axis(index="true", columns="pred")
)

# Optional: save per-sample results
res_df = pd.DataFrame({
    "text": X_test,
    "true": [le.classes_[i] for i in y_true],
    "pred": [le.classes_[i] for i in y_pred],
    "pred_prob": [float(np.max(p)) for p in y_pred_prob]  # confidence of the predicted class
})
predictions_path = os.path.join(MODEL_DIR, "test_predictions.csv")
res_df.to_csv(predictions_path, index=False)
print("Predictions saved:", predictions_path)



Model and tokenizer saved to saved_model
Loading best model for evaluation...
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 124ms/step
Classification report:
              precision    recall  f1-score   support

       ERROR       1.00      0.94      0.97        17
        INFO       0.99      0.99      0.99       161
        WARN       0.99      1.00      1.00       122

    accuracy                           0.99       300
   macro avg       1.00      0.98      0.99       300
weighted avg       0.99      0.99      0.99       300

Confusion matrix:
[[ 16   1   0]
 [  0 160   1]
 [  0   0 122]]
Predictions saved: saved_model/test_predictions.csv
