In [None]:
#######################################################################################
# Install the PyTorch stack and the supporting data/metrics/plotting libraries into
# the notebook runtime (shell commands, run through IPython's `!` escape).
!pip install torch torchvision torchaudio --quiet
!pip install numpy pandas scikit-learn tqdm matplotlib --quiet
########################################################################################

# Clone the LogBERT repository and make the checkout the working directory, so that
# its `bert_pytorch` package can be imported further down.
!git clone https://github.com/HelenGuohx/logbert.git
%cd logbert


# List the checkout's contents as a quick visual check that the clone succeeded.
!ls


import sys
sys.path.append("/content/logbert")  # Add root folder to Python path


import torch
from bert_pytorch.model.log_model import BERTLog
from bert_pytorch.model.bert import BERT


# --- Input dimensions ---
vocab_size = 10000  # number of distinct log keys the embedding table can hold
max_len = 128       # tokens per input sequence

# --- Transformer size ---
hidden = 768     # width of every hidden representation
n_layers = 12    # number of stacked transformer blocks
attn_heads = 12  # attention heads per block
dropout = 0.3

# --- Feature switches forwarded to the BERT constructor ---
is_logkey = True
is_time = False

# Run on the GPU when one is visible to PyTorch; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)


# Instantiate the encoder imported from the cloned repository, using the
# hyperparameters defined above, then move it onto the selected device.
bert_model = BERT(
    vocab_size=vocab_size, max_len=max_len,
    hidden=hidden, n_layers=n_layers, attn_heads=attn_heads,
    dropout=dropout,
    is_logkey=is_logkey, is_time=is_time,
)
bert_model = bert_model.to(device)

# Wrap the encoder in the log model and place the wrapper on the same device.
model = BERTLog(bert_model, vocab_size=vocab_size)
model = model.to(device)



# Smoke test: push one random batch of two sequences through the model to confirm
# that it builds and runs end to end. The tensors are allocated directly on the
# target device instead of being created on the CPU and copied over.
dummy_input = torch.randint(0, vocab_size, (2, max_len), device=device)
time_info = torch.zeros((2, max_len), device=device)  # Required even if is_time=False

# No backward pass follows, so skip building the autograd graph for this call.
with torch.no_grad():
    output = model(dummy_input, time_info)

print("Output type:", type(output))
if isinstance(output, torch.Tensor):
    print("Output shape:", output.shape)
elif isinstance(output, dict):
    # A dict of tensors is summarised by shape rather than dumping every value.
    for name, value in output.items():
        if isinstance(value, torch.Tensor):
            print(f"Output[{name!r}] shape:", value.shape)
        else:
            print(f"Output[{name!r}]:", value)
else:
    print("Output:", output)


In [None]:

import json, torch, random, os
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report


# --- Reproducibility: seed Python's and PyTorch's random number generators ---
random.seed(42)
torch.manual_seed(42)

# --- Input data ---
train_file = "/content/3.2_TRAIN.json"
test_file = "/content/3.2_test.json"

# --- Model size ---
max_len = 1000  # adjust as needed
hidden = 240
n_layers = 7
attn_heads = 8
dropout = 0.3

# --- Optimisation ---
batch_size = 64
lr = 2e-3
epochs = 10

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- Output directory for the label map and the saved model (created if missing) ---
output_dir = "output3"
os.makedirs(output_dir, exist_ok=True)

# ==============================
#  Build vocab
# ==============================
# Shared token -> index mapping; index 0 is reserved for the padding token.
log2idx = {"<PAD>": 0}

def build_vocab(file_path, vocab=None):
    """Add every token found in a JSON-lines file to a vocabulary mapping.

    Each non-blank line of ``file_path`` is parsed as a JSON object whose
    ``"numeric_sequence"`` entry is a list of sequences. Tokens receive the
    next free index in order of first appearance; tokens already present
    keep their index.

    Args:
        file_path: Path of the JSON-lines file to scan.
        vocab: Mapping to extend in place. When omitted, the module-level
            ``log2idx`` mapping is extended, as in the original behaviour.

    Returns:
        The mapping that was extended (``vocab`` or ``log2idx``).
    """
    if vocab is None:
        vocab = log2idx
    with open(file_path, "r") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) are not records.
            if not line.strip():
                continue
            item = json.loads(line)
            for seq in item["numeric_sequence"]:
                for x in seq:
                    if x not in vocab:
                        vocab[x] = len(vocab)
    return vocab

# Build vocab from train + test
# Scan both splits so that every token occurring in either file gets an index.
for split_path in (train_file, test_file):
    build_vocab(split_path)

vocab_size = len(log2idx)
# Reverse lookup: index -> original token.
idx2log = {index: token for token, index in log2idx.items()}
print(f"Vocab size: {vocab_size}")

# Write the token -> index mapping next to the other training artefacts.
label_map_path = os.path.join(output_dir, "label_map.json")
with open(label_map_path, "w") as f:
    json.dump(log2idx, f, indent=4)

# ==============================
# Dataset
# ==============================
class LazyLogDataset(Dataset):
    """Map-style dataset that reads one JSON-lines record from disk per access.

    The file is scanned once at construction time to record the byte offset
    of every non-blank line. ``__getitem__`` then seeks straight to the
    requested record instead of re-reading the file from the top on every
    access, so the records themselves are never all held in memory.

    Args:
        file_path: Path of the JSON-lines file. Each record must provide
            ``"numeric_sequence"`` (a list of sequences, of which only the
            first is used) and ``"sequence_id"``.
        vocab: Optional token -> index mapping containing ``"<PAD>"``.
            Defaults to the module-level ``log2idx``, looked up at access time.
        seq_len: Optional fixed output length. Defaults to the module-level
            ``max_len``, looked up at access time.
    """

    def __init__(self, file_path, vocab=None, seq_len=None):
        self.file_path = file_path
        self._vocab = vocab
        self._seq_len = seq_len
        self._offsets = []
        position = 0
        # Binary mode keeps offsets in bytes, so they can be passed to seek() later.
        with open(file_path, "rb") as f:
            for raw_line in f:
                if raw_line.strip():  # blank lines are not records
                    self._offsets.append(position)
                position += len(raw_line)
        self.length = len(self._offsets)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        """Return ``(token_ids, time_features, sequence_id)`` for record ``idx``.

        ``token_ids`` is a ``torch.long`` tensor of the first sequence mapped
        through the vocabulary and padded/truncated to the fixed length;
        ``time_features`` is an all-zero ``torch.float32`` tensor of the same
        length.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
            KeyError: If a kept token is missing from the vocabulary.
        """
        if idx < 0:
            idx += self.length
        if not 0 <= idx < self.length:
            raise IndexError(f"index {idx} out of range for dataset of size {self.length}")

        vocab = self._vocab if self._vocab is not None else log2idx
        limit = self._seq_len if self._seq_len is not None else max_len

        with open(self.file_path, "rb") as f:
            f.seek(self._offsets[idx])
            item = json.loads(f.readline())

        # Only the first sequence of the record is used; cut it to the fixed
        # length before mapping, then right-pad with the padding index.
        tokens = item["numeric_sequence"][0][:limit]
        ids = [vocab[x] for x in tokens]
        ids += [vocab["<PAD>"]] * (limit - len(ids))

        seq_tensor = torch.tensor(ids, dtype=torch.long)
        time_tensor = torch.zeros(limit, dtype=torch.float32)
        return seq_tensor, time_tensor, item["sequence_id"]

# One lazily-read dataset per split.
train_dataset = LazyLogDataset(train_file)
test_dataset = LazyLogDataset(test_file)

# Training batches are reshuffled every epoch; test batches keep file order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# ==============================
#  Model
# ==============================
class BERT(nn.Module):
    """Token embedding followed by a stack of transformer encoder layers.

    Args:
        vocab_size: Number of rows in the embedding table; index 0 is the
            padding index.
        max_len: Stored on the instance as ``max_len``; not otherwise used
            by this class.
        hidden: Embedding width and encoder model dimension.
        n_layers: Number of encoder layers in the stack.
        attn_heads: Attention heads per encoder layer.
        dropout: Dropout probability passed to each encoder layer.
    """

    def __init__(self, vocab_size, max_len, hidden, n_layers, attn_heads, dropout):
        super().__init__()
        self.max_len = max_len
        self.hidden = hidden
        self.emb = nn.Embedding(vocab_size, hidden, padding_idx=0)
        layer = nn.TransformerEncoderLayer(hidden, attn_heads, dropout=dropout)
        self.encoder = nn.TransformerEncoder(layer, n_layers)

    def forward(self, x, t=None):
        """Encode a batch of token ids.

        Args:
            x: Integer tensor of token ids, laid out as (batch, seq_len).
            t: Accepted for interface compatibility; not used here.

        Returns:
            Tensor laid out as (batch, seq_len, hidden).
        """
        scale = self.hidden ** 0.5
        scaled_tokens = self.emb(x) * scale
        # The encoder is built with its default sequence-first layout, so swap
        # the batch and sequence axes on the way in and again on the way out.
        sequence_first = scaled_tokens.transpose(0, 1)
        encoded = self.encoder(sequence_first)
        return encoded.transpose(0, 1)

class BERTLog(nn.Module):
    """Encoder plus a linear head that scores every vocabulary entry per position.

    Args:
        bert_model: Encoder module; must expose a ``hidden`` attribute giving
            the width of its output features.
        vocab_size: Number of output scores produced for each position.
    """

    def __init__(self, bert_model, vocab_size):
        super().__init__()
        self.bert = bert_model
        self.fc = nn.Linear(in_features=bert_model.hidden, out_features=vocab_size)

    def forward(self, x, t=None):
        """Run the encoder on ``(x, t)`` and project its output.

        Returns:
            A dict with the single key ``"logkey_output"`` holding the
            projected scores.
        """
        return {"logkey_output": self.fc(self.bert(x, t))}

# ==============================
#  Model, criterion, optimizer
# ==============================
# Encoder sized from the hyperparameters above, moved onto the selected device.
bert_model = BERT(
    vocab_size=vocab_size,
    max_len=max_len,
    hidden=hidden,
    n_layers=n_layers,
    attn_heads=attn_heads,
    dropout=dropout,
)
bert_model = bert_model.to(device)

# Full model: encoder plus per-position vocabulary head.
model = BERTLog(bert_model, vocab_size=vocab_size)
model = model.to(device)

# Targets equal to 0 (the padding index) do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# ==============================
#  Training loop with metrics
# ==============================
# Each epoch makes one optimisation pass over train_loader, then scores the model on
# test_loader (used here as the validation split).
#
# NOTE(review): the model receives `seqs` as input and is scored against the same,
# unmasked `seqs` as target, so the objective is per-position reconstruction of the
# input. Confirm that this is intended rather than a masked-token objective.
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for seqs, time_feats, _ in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        seqs, time_feats = seqs.to(device), time_feats.to(device)
        optimizer.zero_grad()
        outputs = model(seqs, time_feats)['logkey_output']
        # Flatten (batch, seq_len, vocab) -> (batch * seq_len, vocab) for the loss.
        loss = criterion(outputs.reshape(-1, vocab_size), seqs.reshape(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # max(..., 1) avoids a ZeroDivisionError when the training loader is empty.
    avg_loss = running_loss / max(len(train_loader), 1)

    # Validation
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for seqs, time_feats, _ in test_loader:
            seqs, time_feats = seqs.to(device), time_feats.to(device)
            outputs = model(seqs, time_feats)['logkey_output']
            preds = torch.argmax(outputs, dim=-1)
            # Score only the positions the loss is computed on: padded positions
            # are excluded so the metrics are not skewed by padding.
            keep = seqs != criterion.ignore_index
            val_preds.append(preds[keep].cpu())
            val_labels.append(seqs[keep].cpu())

    if sum(chunk.numel() for chunk in val_labels) == 0:
        # Nothing to score (empty loader or padding-only batches).
        print(f"\nEpoch {epoch+1}: Train Loss={avg_loss:.4f}, no non-padding validation tokens to score")
        continue

    # One concatenation per epoch instead of growing Python lists token by token.
    val_preds = torch.cat(val_preds).numpy()
    val_labels = torch.cat(val_labels).numpy()

    acc = accuracy_score(val_labels, val_preds) * 100
    f1 = f1_score(val_labels, val_preds, average='weighted', zero_division=0)
    precision = precision_score(val_labels, val_preds, average='weighted', zero_division=0)
    recall = recall_score(val_labels, val_preds, average='weighted', zero_division=0)
    cm = confusion_matrix(val_labels, val_preds)
    report = classification_report(val_labels, val_preds, zero_division=0)

    print(f"\nEpoch {epoch+1}: Train Loss={avg_loss:.4f}, Val Acc={acc:.2f}%")
    print(f"F1={f1:.4f}, Precision={precision:.4f}, Recall={recall:.4f}")
    print("Confusion Matrix:\n", cm)
    print("Classification Report:\n", report)

# ==============================
# Save model
# ==============================
# Only the parameters (state_dict) are written, not the module object itself, so
# reloading requires rebuilding the same architecture first.
model_path = os.path.join(output_dir, "model_final.pt")
torch.save(model.state_dict(), model_path)
print(f"Model saved at {model_path}")

# ==============================
#  Load model example
# ==============================
# To reload model for inference/testing/XAI/RAG
# Rebuild the network with the same hyperparameters, then restore the saved weights.
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU runtime still loads on a CPU-only one.
loaded_model = BERTLog(BERT(vocab_size, max_len, hidden, n_layers, attn_heads, dropout), vocab_size).to(device)
loaded_model.load_state_dict(torch.load(model_path, map_location=device))
loaded_model.eval()  # inference mode: disables dropout
print("Model loaded and ready for inference.")
