### Imports

In [89]:

import pandas as pd
from sklearn.model_selection import (train_test_split)
import torch
import torch.nn as nn
from transformers import AutoModel , AutoTokenizer,  TrainingArguments, DataCollatorWithPadding
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm

from sklearn.metrics import accuracy_score, precision_recall_fscore_support


## Preparing the dataset

In [90]:
# Load the encoder sequence table from a CSV path relative to the working
# directory. Later cells read its `FaultType` and `EventText` columns.
seq_df  = pd.read_csv('master_tables/encoder_seq.csv')


In [91]:
# Keep only the rows that have a fault type, and drop the 'slowHDFS' class.
# `.copy()` gives an independent frame so the new column below does not
# touch `seq_df`.
seq_df_clean = seq_df.loc[
    seq_df["FaultType"].notna() & seq_df["FaultType"].ne("slowHDFS")
].copy()

# Class ids follow the alphabetical order of the fault-type names, so the
# name -> id mapping is deterministic for a given set of classes.
labels = sorted(seq_df_clean["FaultType"].unique())
num_classes = len(labels)
label_to_id = dict(zip(labels, range(num_classes)))
seq_df_clean["label_id"] = seq_df_clean["FaultType"].map(label_to_id)

# Row counts before / after cleaning.
print("Taille originale :", len(seq_df))
print("Taille nettoyée  :", len(seq_df_clean))


Taille originale : 370323
Taille nettoyée  : 53253


In [92]:


# 2) Stratified 70 / 15 / 15 split: first hold out 30 % of the rows, then cut
#    that hold-out in half to get the validation and test sets. Both cuts are
#    stratified on `FaultType` and use a fixed random_state.
train_df, temp_df = train_test_split(
    seq_df_clean, test_size=0.30, random_state=42, stratify=seq_df_clean["FaultType"]
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, random_state=42, stratify=temp_df["FaultType"]
)

# Per-class row counts of each split.
print("Train:", train_df["FaultType"].value_counts())
print("Val:  ", val_df["FaultType"].value_counts())
print("Test: ", test_df["FaultType"].value_counts())

Train: FaultType
killDN          4528
corruptBlk      3748
corruptMeta     3699
disconnectDN    3632
cutMeta         3514
cutBlk          3487
lossBlk         3444
lossMeta        3352
slowDN          3077
suspendDN       1928
panicDN         1384
deadDN          1226
readOnlyDN       258
Name: count, dtype: int64
Val:   FaultType
killDN          970
corruptBlk      803
corruptMeta     793
disconnectDN    778
cutMeta         753
cutBlk          748
lossBlk         738
lossMeta        718
slowDN          659
suspendDN       413
panicDN         297
deadDN          263
readOnlyDN       55
Name: count, dtype: int64
Test:  FaultType
killDN          971
corruptBlk      803
corruptMeta     793
disconnectDN    779
cutMeta         753
cutBlk          747
lossBlk         738
lossMeta        719
slowDN          659
suspendDN       413
panicDN         296
deadDN          262
readOnlyDN       55
Name: count, dtype: int64


In [93]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Each item is a dict holding the tokenizer outputs for the row's
    ``EventText`` (with the leading batch dimension removed) plus a
    ``labels`` entry built from the row's ``label_id``.

    Args:
        df: frame with ``EventText`` and ``label_id`` columns; rows are
            accessed by position.
        tokenizer: callable invoked as
            ``tokenizer(text, truncation=..., padding=..., max_length=...,
            return_tensors="pt")`` and returning a mapping of tensors.
        max_len: length every sequence is truncated / padded to.
    """

    def __init__(self, df, tokenizer, max_len=256):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize the row at position ``idx`` and attach its label."""
        record = self.df.iloc[idx]

        # Every sample is padded to the same fixed length.
        encoded = self.tokenizer(
            record["EventText"],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # The tokenizer returns tensors with a leading batch axis of size 1;
        # squeeze it away on every returned entry.
        sample = {}
        for key, tensor in encoded.items():
            sample[key] = tensor.squeeze(0)

        sample["labels"] = torch.tensor(record["label_id"], dtype=torch.long)
        return sample

## Model 

In [94]:
# Pretrained checkpoint name: used here to load the tokenizer and passed to
# the classifier constructor in a later cell.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(batch):
    """Tokenize ``batch["EventText"]`` with the module-level ``tokenizer``.

    Sequences are truncated to 256 tokens and returned unpadded.

    Args:
        batch: mapping with an ``EventText`` entry.

    Returns:
        Whatever the tokenizer call returns for that input.
    """
    texts = batch["EventText"]
    # No padding here: per the original note, it is meant to be left to the
    # DataCollator.
    return tokenizer(texts, max_length=256, truncation=True, padding=False)

In [95]:
# Wrap each split in a TextDataset (default max_len).
train_ds, val_ds, test_ds = (
    TextDataset(split_df, tokenizer) for split_df in (train_df, val_df, test_df)
)

# Only the training loader shuffles. Validation and test use a batch size of
# 32 and keep the dataset order.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=16)
val_loader, test_loader = (
    DataLoader(eval_ds, batch_size=32) for eval_ds in (val_ds, test_ds)
)

In [96]:
# BERT encoder + MLP classification head
class BertBinaryClassifier(nn.Module):
    """Pretrained encoder followed by a two-layer MLP head.

    Despite the name, the head outputs ``num_classes`` logits, so the model
    is a general multi-class classifier.

    Args:
        model_name: checkpoint identifier given to ``AutoModel.from_pretrained``.
        num_classes: size of the output logits vector.
        dropout: dropout probability used before each linear layer of the head.
    """

    def __init__(self, model_name, num_classes ,  dropout=0.1):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        hidden_dim = self.bert.config.hidden_size

        # Head: dropout -> linear(hidden, 256) -> ReLU -> dropout -> linear(256, C)
        head_layers = [
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, num_classes),
        ]
        self.mlp = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the first token's hidden state.

        Args:
            input_ids: token ids forwarded to the encoder.
            attention_mask: attention mask forwarded to the encoder.

        Returns:
            Output of the MLP head, one row of ``num_classes`` logits per
            input sequence.
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence axis is the [CLS] token.
        cls_embedding = encoder_out.last_hidden_state[:, 0]
        return self.mlp(cls_embedding)



In [None]:
# Seed torch's global RNG before anything random happens, so the MLP head
# initialisation and the shuffled batch order of the training DataLoader are
# reproducible from a fresh kernel. 42 matches the random_state of the splits.
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertBinaryClassifier(model_name , num_classes).to(device)

# Class weights for the multi-class loss ("balanced": inversely proportional
# to the class frequencies in the training split).
y_train = train_df["label_id"].to_numpy()
classes = np.unique(y_train)

# The weight vector is indexed by class id, so it needs exactly one entry per
# class. Fail early with a readable message if a class is missing from the
# training split, instead of a shape error inside the loss.
assert len(classes) == num_classes, (
    f"training split contains {len(classes)} classes, expected {num_classes}"
)

class_weights_np = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=y_train,
)
class_weights = torch.tensor(class_weights_np, dtype=torch.float32, device=device)

# Weighted cross-entropy on the raw logits; AdamW updates encoder and head
# together with a single learning rate.
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)



In [None]:

def eval_loop(loader, average="macro"):
    """Evaluate the module-level ``model`` on every batch of ``loader``.

    Puts the model in eval mode (it is not switched back here), runs the
    forward passes without gradient tracking, takes the argmax over the class
    axis as the prediction, and scores all predictions at once.

    Args:
        loader: iterable of batches with ``input_ids``, ``attention_mask``
            and ``labels`` entries.
        average: averaging mode passed to ``precision_recall_fscore_support``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    model.eval()
    pred_chunks = []
    true_chunks = []

    with torch.no_grad():
        for batch in loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            targets = batch["labels"].to(device)   # (batch,) integer class ids

            scores = model(ids, mask)               # (batch, num_classes)
            predicted = torch.argmax(scores, dim=1)  # (batch,)

            # Accumulate per-batch results as numpy arrays on the CPU.
            pred_chunks.append(predicted.cpu().numpy())
            true_chunks.append(targets.cpu().numpy())

    y_pred = np.concatenate(pred_chunks)
    y_true = np.concatenate(true_chunks)

    # zero_division=0: a class with no predicted / true samples scores 0
    # instead of raising a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true,
        y_pred,
        average=average,
        zero_division=0,
    )
    accuracy = accuracy_score(y_true, y_pred)

    return accuracy, precision, recall, f1


### Training

In [None]:
# Fine-tune for a fixed number of epochs: one optimisation pass over
# `train_loader`, then a validation pass, per epoch.
EPOCHS = 3
for epoch in range(1, EPOCHS + 1):
    # eval_loop() leaves the model in eval mode, so re-enable train mode
    # (dropout active) at the start of every epoch.
    model.train()
    total_loss = 0.0

    train_pbar = tqdm(train_loader, desc=f"Epoch {epoch} [train]", leave=False)

    # `step` counts the batches processed so far in this epoch (1-based).
    for step, batch in enumerate(train_pbar, start=1):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Named `batch_labels` (not `labels`) so this top-level loop does not
        # overwrite the module-level `labels` list of class names.
        batch_labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        loss = criterion(logits, batch_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Live running mean of the loss: divide by the number of batches seen
        # so far, not by the total number of batches in the epoch.
        train_pbar.set_postfix({"loss": total_loss / step})

    # Validation
    val_acc, val_p, val_r, val_f1 = eval_loop(val_loader)

    print(f"Epoch {epoch} | "
          f"train loss {total_loss/len(train_loader):.4f} | "
          f"val acc {val_acc:.3f} pr {val_p:.3f} rc {val_r:.3f} f1 {val_f1:.3f}")
