In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import random
import sys
import math
import gc

###############################################################################
# 1) FORCE CUDA USAGE OR EXIT
###############################################################################
if not torch.cuda.is_available():
    print("ERROR: CUDA device not available! Exiting...", flush=True)
    sys.exit(1)
DEVICE = "cuda"
torch.cuda.set_device(0)

###############################################################################
# 2) HYPERPARAMETERS & SETTINGS
###############################################################################
MAX_SEQ_LEN = 32
VOCAB_SIZE = 300
EOS_TOKEN_ID = VOCAB_SIZE - 1   # Stop token
MASK_TOKEN_ID = VOCAB_SIZE - 2  # Mask token
PAD_TOKEN_ID = 0

EMBED_DIM = 256
HIDDEN_DIM = 512
NUM_LAYERS = 8
NUM_HEADS = 8

BATCH_SIZE = 400       # Adjust according to your GPU memory
LEARNING_RATE = 1e-4 
NUM_EPOCHS = 5

# L2 regularization (weight decay)
L2_WEIGHT_DECAY = 1e-4

# Dropout rate for input and transformer layers.
DROPOUT_RATE = 0.1

# PENALTY FACTORS
DIGIT_PENALTY = 1.0
SPECIAL_PENALTY = 4.0
EOS_WEIGHT = 0.3

# KL divergence weight for regularizing the learned noise schedule.
KL_WEIGHT = 0.1

# Number of diffusion timesteps
TIMESTEPS = 10

# New: Optimized tau hyperparameter
TAU = 0.05

# Fixed cosine noise schedule (used to initialize the learnable noise schedule)
def fixed_noise_schedule(timesteps, min_noise=0.02, max_noise=0.98):
    return [min_noise + (max_noise - min_noise) * (1 - math.cos(math.pi * (i + 0.5) / timesteps)) / 2
            for i in range(timesteps)]
init_schedule = fixed_noise_schedule(TIMESTEPS, min_noise=0.02, max_noise=0.98)
init_schedule_tensor = torch.tensor(init_schedule, dtype=torch.float32)

PIN_MEMORY = True
NUM_WORKERS = 0

###############################################################################
# 3) DATASET CLASS
###############################################################################
class PasswordDataset(Dataset):
    def __init__(self, file_path, max_seq_len=MAX_SEQ_LEN):
        self.passwords = []
        with open(file_path, "r", encoding="latin-1", errors="ignore") as f:
            for line in f:
                pwd = line.strip()
                if pwd:
                    self.passwords.append(pwd)
        random.shuffle(self.passwords)
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.passwords)

    def __getitem__(self, idx):
        pwd = self.passwords[idx]
        token_ids = []
        for c in pwd[:self.max_seq_len - 1]:
            token = ord(c) - 31
            token = max(1, min(token, VOCAB_SIZE - 3))
            token_ids.append(token)
        token_ids.append(EOS_TOKEN_ID)
        token_ids += [PAD_TOKEN_ID] * (self.max_seq_len - len(token_ids))
        return torch.tensor(token_ids, dtype=torch.long)

###############################################################################
# 4) STOCHASTIC FORWARD DIFFUSION FUNCTION (with learnable noise)
###############################################################################
def forward_diffusion_continuous(x, t, noise_schedule, model, tau=TAU):
    x0_emb = model.token_emb(x)  # shape: (B, T, D)
    B, T, D = x0_emb.shape
    mask_token_tensor = torch.tensor([MASK_TOKEN_ID], device=x.device)
    mask_emb = model.token_emb(mask_token_tensor)  # shape: (1, D)
    r = torch.rand(B, T, device=x.device)
    if t.dim() == 0:
        nf = noise_schedule[int(t.item())].item()
        noise_fraction = torch.full((B, T), nf, device=x.device)
    else:
        noise_fraction = noise_schedule[t].unsqueeze(1).expand(B, T)
    m = torch.sigmoid((r - noise_fraction) / tau)  # (B, T)
    m = m.unsqueeze(-1)  # (B, T, 1)
    return (1 - m) * x0_emb + m * mask_emb

###############################################################################
# 5) DIFFUSION TRANSFORMER MODEL (WITHOUT LENGTH PREDICTION HEAD)
###############################################################################
class DiffusionTransformer(nn.Module):
    def __init__(self,
                 vocab_size,
                 embed_dim,
                 hidden_dim,
                 num_layers,
                 num_heads,
                 max_seq_len):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.max_seq_len = max_seq_len

        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(max_seq_len, embed_dim)
        self.time_emb = nn.Embedding(TIMESTEPS, embed_dim)

        self.input_norm = nn.LayerNorm(embed_dim)
        self.input_dropout = nn.Dropout(DROPOUT_RATE)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=DROPOUT_RATE,
            activation='gelu',
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(embed_dim, vocab_size)
        self.learned_noise_schedule = nn.Parameter(init_schedule_tensor.clone())

    def forward(self, x, t, pre_embedded=False):
        if not pre_embedded:
            tokens_emb = self.token_emb(x)
        else:
            tokens_emb = x

        positions = torch.arange(self.max_seq_len, device=x.device).unsqueeze(0)
        pos_emb = self.pos_emb(positions)

        if t.dim() == 0:
            t_embed = self.time_emb(t).unsqueeze(0).unsqueeze(1).expand(x.size(0), self.max_seq_len, -1)
        else:
            t_embed = self.time_emb(t).unsqueeze(1).expand(-1, self.max_seq_len, -1)

        x_input = tokens_emb + pos_emb + t_embed
        x_input = self.input_norm(x_input)
        x_input = self.input_dropout(x_input)

        encoded = self.transformer_encoder(x_input)
        logits = self.fc_out(encoded)
        return logits

###############################################################################
# 6) REVERSE DIFFUSION UPDATE FUNCTION (with stochastic noise)
###############################################################################
def reverse_diffusion_update(predicted_x0_emb, t, model, base_noise_scale=0.05):
    if t == 0:
        return predicted_x0_emb
    noise_fraction = model.learned_noise_schedule[t - 1]
    mask_token_tensor = torch.tensor([MASK_TOKEN_ID], device=predicted_x0_emb.device)
    mask_emb = model.token_emb(mask_token_tensor)
    mask_emb = mask_emb.unsqueeze(0).expand_as(predicted_x0_emb)
    noise_scale = base_noise_scale * (t / TIMESTEPS)
    noise = noise_scale * torch.randn_like(predicted_x0_emb)
    x_prev = (1 - noise_fraction) * predicted_x0_emb + noise_fraction * mask_emb + noise
    return x_prev

###############################################################################
# 7) TRAINING FUNCTION (LOSS COMPUTED ONLY AT t=0, with KL divergence and
#    repetition penalties for repeating characters and explicit EOS penalty)
###############################################################################
def train_model(model, train_loader, num_epochs):
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=L2_WEIGHT_DECAY)
    total_steps = num_epochs * len(train_loader)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=LEARNING_RATE * 2,
        total_steps=total_steps,
        pct_start=0.3,
        anneal_strategy='linear'
    )
    
    token_loss_weights = torch.ones(VOCAB_SIZE)
    for token in range(1, VOCAB_SIZE - 2):
        ascii_val = token + 31
        if 48 <= ascii_val <= 57:
            token_loss_weights[token] = DIGIT_PENALTY
        elif not (65 <= ascii_val <= 90 or 97 <= ascii_val <= 122):
            token_loss_weights[token] = SPECIAL_PENALTY
    token_loss_weights[EOS_TOKEN_ID] = EOS_WEIGHT

    token_criterion = nn.CrossEntropyLoss(weight=token_loss_weights.to(DEVICE), ignore_index=PAD_TOKEN_ID)
    scaler = torch.cuda.amp.GradScaler()

    target_schedule = torch.softmax(init_schedule_tensor.to(DEVICE), dim=0)
    
    REPEAT_WEIGHT = 0.4
    NUM_REPEAT_WEIGHT = 1.0

    for epoch in range(num_epochs):
        total_train_loss = 0.0
        for step, batch in enumerate(train_loader):
            x0 = batch.to(DEVICE)
            batch_size = x0.size(0)
            t_max = TIMESTEPS - 1
            x = forward_diffusion_continuous(x0, torch.tensor(t_max, device=DEVICE),
                                             model.learned_noise_schedule, model, tau=TAU)
            for t in reversed(range(1, TIMESTEPS)):
                t_tensor = torch.full((batch_size,), t, device=DEVICE, dtype=torch.long)
                with torch.cuda.amp.autocast(enabled=True):
                    logits = model(x, t_tensor, pre_embedded=True)
                    probs = torch.softmax(logits, dim=-1)
                    token_emb_weights = model.token_emb.weight
                    predicted_x0_emb = torch.matmul(probs, token_emb_weights)
                    x = reverse_diffusion_update(predicted_x0_emb, t, model)
            t_tensor = torch.full((batch_size,), 0, device=DEVICE, dtype=torch.long)
            with torch.cuda.amp.autocast(enabled=True):
                final_logits = model(x, t_tensor, pre_embedded=True)
                token_loss = token_criterion(final_logits.reshape(-1, model.vocab_size), x0.reshape(-1))
                loss = token_loss

                p = torch.softmax(model.learned_noise_schedule, dim=0)
                kl_loss = torch.sum(p * (torch.log(p + 1e-8) - torch.log(target_schedule + 1e-8)))
                
                final_probs = torch.softmax(final_logits, dim=-1)
                rep_penalty = 0.0
                for i in range(1, MAX_SEQ_LEN):
                    rep_penalty += torch.sum(final_probs[:, i] * final_probs[:, i-1])
                rep_penalty = rep_penalty / (batch_size * (MAX_SEQ_LEN - 1))
                
                digit_indices = torch.arange(17, 27, device=DEVICE)
                num_rep_penalty = 0.0
                tail_range = range(MAX_SEQ_LEN - 3, MAX_SEQ_LEN) if MAX_SEQ_LEN >= 3 else range(1, MAX_SEQ_LEN)
                for i in tail_range:
                    p_i = final_probs[:, i][:, digit_indices]
                    p_im1 = final_probs[:, i-1][:, digit_indices]
                    num_rep_penalty += torch.sum(p_i * p_im1)
                num_rep_penalty = num_rep_penalty / (batch_size * len(tail_range))
                
                total_loss = loss + KL_WEIGHT * kl_loss + REPEAT_WEIGHT * rep_penalty + NUM_REPEAT_WEIGHT * num_rep_penalty
            
            optimizer.zero_grad()
            scaler.scale(total_loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            total_train_loss += total_loss.item()
        avg_train_loss = total_train_loss / len(train_loader)
        print(f"=> Epoch {epoch+1}/{num_epochs} completed. Train Loss: {avg_train_loss:.4f}", flush=True)
    return model

###############################################################################
# 8) DECODER FUNCTION
###############################################################################
def decode_tokens_to_string(token_seq):
    chars = []
    for token_id in token_seq:
        if token_id == EOS_TOKEN_ID:
            break
        elif token_id in (PAD_TOKEN_ID, MASK_TOKEN_ID):
            continue
        chars.append(chr(token_id + 31))
    return "".join(chars)

###############################################################################
# Helper: Top-k Filtering Function
###############################################################################
def top_k_filtering(logits, k):
    if k <= 0:
        return logits
    values, _ = torch.topk(logits, k)
    min_values = values[:, -1].unsqueeze(1)
    return torch.where(logits < min_values, torch.full_like(logits, -float('Inf')), logits)

###############################################################################
# 9) SAMPLING FUNCTION (with stochastic reverse diffusion and top-k filtering)
###############################################################################
@torch.no_grad()
def sample_passwords_diffusion(model, num_samples=100, final_temperature=1.0, base_noise_scale=0.05, top_k=10):
    model.eval()
    x0_sample = torch.randint(low=1, high=VOCAB_SIZE-2, size=(num_samples, MAX_SEQ_LEN), device=DEVICE)
    x0_sample[:, -1] = EOS_TOKEN_ID
    t_max = TIMESTEPS - 1
    x = forward_diffusion_continuous(x0_sample, torch.tensor(t_max, device=DEVICE),
                                     model.learned_noise_schedule, model, tau=TAU)
    for t in reversed(range(TIMESTEPS)):
        t_tensor = torch.full((num_samples,), t, device=DEVICE, dtype=torch.long)
        logits = model(x, t_tensor, pre_embedded=True)
        if t > 0:
            probs = torch.softmax(logits, dim=-1)
            token_emb_weights = model.token_emb.weight
            predicted_x0_emb = torch.matmul(probs, token_emb_weights)
            x = reverse_diffusion_update(predicted_x0_emb, t, model, base_noise_scale=base_noise_scale)
        else:
            logits = logits / final_temperature
            flat_logits = logits.view(-1, model.vocab_size)
            flat_logits = top_k_filtering(flat_logits, top_k)
            probs = torch.softmax(flat_logits, dim=-1)
            token_indices = torch.multinomial(probs, 1)
            token_indices = token_indices.view(num_samples, MAX_SEQ_LEN)
    samples = [decode_tokens_to_string(token_indices[i].cpu().numpy()) for i in range(num_samples)]
    return samples

###############################################################################
# 10) MAIN FUNCTION: TRAIN MODEL & GENERATE SAMPLES
###############################################################################
def main():
    dataset_file = r"B:\Nasko\rockyou5MIL.txt"
    full_dataset = PasswordDataset(dataset_file)
    train_loader = DataLoader(full_dataset, batch_size=BATCH_SIZE, shuffle=True,
                              drop_last=True, pin_memory=PIN_MEMORY, num_workers=NUM_WORKERS)
    model = DiffusionTransformer(vocab_size=VOCAB_SIZE, embed_dim=EMBED_DIM,
                                 hidden_dim=HIDDEN_DIM, num_layers=NUM_LAYERS,
                                 num_heads=NUM_HEADS, max_seq_len=MAX_SEQ_LEN).to(DEVICE)
    model = train_model(model, train_loader, NUM_EPOCHS)
    model_save_path = "5MIL_Reguralization.pth"
    torch.save(model.state_dict(), model_save_path)
    print(f"[DEBUG] Saved new model to {model_save_path}", flush=True)
    
    samples = sample_passwords_diffusion(model, num_samples=100, final_temperature=0.7, base_noise_scale=0.05, top_k=10)
    print("[DEBUG] Generated Samples:")
    for i, sample in enumerate(samples, 1):
        print(f"Sample #{i}: {sample}")
    
    torch.cuda.empty_cache()
    gc.collect()

if __name__ == "__main__":
    main()


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast(enabled=True):
  with torch.cuda.amp.autocast(enabled=True):


=> Epoch 1/5 completed. Train Loss: 3.5438
=> Epoch 2/5 completed. Train Loss: 3.4943
=> Epoch 3/5 completed. Train Loss: 3.4922
=> Epoch 4/5 completed. Train Loss: 3.4914
=> Epoch 5/5 completed. Train Loss: 3.4909
[DEBUG] Saved new model to 5MIL_Reguralization.pth
[DEBUG] Generated Samples:
Sample #1: paoeaea1
Sample #2: baira0
Sample #3: mimsa212
Sample #4: tanlne
Sample #5: stnsler1
Sample #6: brra1eon
Sample #7: tonlisoa121
Sample #8: vamota23111
Sample #9: paan1229
Sample #10: benaoo
Sample #11: riakae10an0eo¸
Sample #12: tannai2e
Sample #13: saslera21
Sample #14: siorioa
Sample #15: sirots
Sample #16: seaetn
Sample #17: bolninao3o
Sample #18: suneiio01oa
Sample #19: riaaeree21a
Sample #20: ponirenae
Sample #21: suosneea
Sample #22: taellni
Sample #23: soallsee
Sample #24: tiare21a
Sample #25: punl1e2n
Sample #26: yiilai1n0
Sample #27: poaoler3
Sample #28: sitolii0n
Sample #29: mimee20221
Sample #30: saot1erea
Sample #31: yanairna
Sample #32: toneya2a2
Sample #33: tiae1e2en
Sample

In [1]:
#Generate passwords
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import random
import sys
import math
import gc

###############################################################################
# 1) FORCE CUDA USAGE OR EXIT
###############################################################################
if not torch.cuda.is_available():
    print("ERROR: CUDA device not available! Exiting...", flush=True)
    sys.exit(1)
DEVICE = "cuda"
torch.cuda.set_device(0)

###############################################################################
# 2) HYPERPARAMETERS & SETTINGS
###############################################################################
MAX_SEQ_LEN = 32
VOCAB_SIZE = 300
EOS_TOKEN_ID = VOCAB_SIZE - 1   # Stop token
MASK_TOKEN_ID = VOCAB_SIZE - 2  # Mask token
PAD_TOKEN_ID = 0

EMBED_DIM = 256
HIDDEN_DIM = 512
NUM_LAYERS = 8
NUM_HEADS = 8

# Sampling hyperparameters
FINAL_TEMPERATURE = 0.8
BASE_NOISE_SCALE = 0.05
TOP_K = 10

# Optimized tau hyperparameter
TAU = 0.05

# Fixed cosine noise schedule (for initializing the learned noise schedule)
def fixed_noise_schedule(timesteps, min_noise=0.02, max_noise=0.98):
    return [min_noise + (max_noise - min_noise) * (1 - math.cos(math.pi * (i + 0.5) / timesteps)) / 2
            for i in range(timesteps)]
TIMESTEPS = 10
init_schedule = fixed_noise_schedule(TIMESTEPS, min_noise=0.02, max_noise=0.98)
init_schedule_tensor = torch.tensor(init_schedule, dtype=torch.float32)

PIN_MEMORY = True
NUM_WORKERS = 0

###############################################################################
# 5) DIFFUSION TRANSFORMER MODEL (WITHOUT LENGTH PREDICTION HEAD)
###############################################################################
class DiffusionTransformer(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, num_heads, max_seq_len):
        super().__init__()
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.max_seq_len = max_seq_len

        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(max_seq_len, embed_dim)
        self.time_emb = nn.Embedding(TIMESTEPS, embed_dim)

        self.input_norm = nn.LayerNorm(embed_dim)
        self.input_dropout = nn.Dropout(0.1)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim,
            dropout=0.1,
            activation='gelu',
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc_out = nn.Linear(embed_dim, vocab_size)
        self.learned_noise_schedule = nn.Parameter(init_schedule_tensor.clone())

    def forward(self, x, t, pre_embedded=False):
        if not pre_embedded:
            tokens_emb = self.token_emb(x)
        else:
            tokens_emb = x
        positions = torch.arange(self.max_seq_len, device=x.device).unsqueeze(0)
        pos_emb = self.pos_emb(positions)
        if t.dim() == 0:
            t_embed = self.time_emb(t).unsqueeze(0).unsqueeze(1).expand(x.size(0), self.max_seq_len, -1)
        else:
            t_embed = self.time_emb(t).unsqueeze(1).expand(-1, self.max_seq_len, -1)
        x_input = tokens_emb + pos_emb + t_embed
        x_input = self.input_norm(x_input)
        x_input = self.input_dropout(x_input)
        encoded = self.transformer_encoder(x_input)
        logits = self.fc_out(encoded)
        return logits

###############################################################################
# 6) STOCHASTIC FUNCTIONS
###############################################################################
def forward_diffusion_continuous(x, t, noise_schedule, model, tau=TAU):
    x0_emb = model.token_emb(x)  # shape: (B, T, D)
    B, T, D = x0_emb.shape
    mask_token_tensor = torch.tensor([MASK_TOKEN_ID], device=x.device)
    mask_emb = model.token_emb(mask_token_tensor)
    r = torch.rand(B, T, device=x.device)
    if t.dim() == 0:
        nf = noise_schedule[int(t.item())].item()
        noise_fraction = torch.full((B, T), nf, device=x.device)
    else:
        noise_fraction = noise_schedule[t].unsqueeze(1).expand(B, T)
    m = torch.sigmoid((r - noise_fraction) / tau)
    m = m.unsqueeze(-1)
    return (1 - m) * x0_emb + m * mask_emb

def reverse_diffusion_update(predicted_x0_emb, t, model, base_noise_scale=BASE_NOISE_SCALE):
    if t == 0:
        return predicted_x0_emb
    noise_fraction = model.learned_noise_schedule[t - 1]
    mask_token_tensor = torch.tensor([MASK_TOKEN_ID], device=predicted_x0_emb.device)
    mask_emb = model.token_emb(mask_token_tensor)
    mask_emb = mask_emb.unsqueeze(0).expand_as(predicted_x0_emb)
    noise_scale = base_noise_scale * (t / TIMESTEPS)
    noise = noise_scale * torch.randn_like(predicted_x0_emb)
    return (1 - noise_fraction) * predicted_x0_emb + noise_fraction * mask_emb + noise

def top_k_filtering(logits, k):
    if k <= 0:
        return logits
    values, _ = torch.topk(logits, k)
    min_values = values[:, -1].unsqueeze(1)
    return torch.where(logits < min_values, torch.full_like(logits, -float('Inf')), logits)

def sample_passwords_diffusion(model, num_samples=100, final_temperature=FINAL_TEMPERATURE,
                               base_noise_scale=BASE_NOISE_SCALE, top_k=TOP_K):
    model.eval()
    x0_sample = torch.randint(low=1, high=VOCAB_SIZE-2, size=(num_samples, MAX_SEQ_LEN), device=DEVICE)
    x0_sample[:, -1] = EOS_TOKEN_ID  # force EOS
    t_max = TIMESTEPS - 1
    x = forward_diffusion_continuous(x0_sample, torch.tensor(t_max, device=DEVICE),
                                     model.learned_noise_schedule, model, tau=TAU)
    for t in reversed(range(TIMESTEPS)):
        t_tensor = torch.full((num_samples,), t, device=DEVICE, dtype=torch.long)
        logits = model(x, t_tensor, pre_embedded=True)
        if t > 0:
            probs = torch.softmax(logits, dim=-1)
            token_emb_weights = model.token_emb.weight
            predicted_x0_emb = torch.matmul(probs, token_emb_weights)
            x = reverse_diffusion_update(predicted_x0_emb, t, model, base_noise_scale=BASE_NOISE_SCALE)
        else:
            logits = logits / final_temperature
            flat_logits = logits.view(-1, model.vocab_size)
            flat_logits = top_k_filtering(flat_logits, top_k)
            probs = torch.softmax(flat_logits, dim=-1)
            token_indices = torch.multinomial(probs, 1)
            token_indices = token_indices.view(num_samples, MAX_SEQ_LEN)
    samples = []
    for i in range(num_samples):
        s = []
        for token in token_indices[i]:
            if token.item() == EOS_TOKEN_ID:
                break
            if token.item() in (PAD_TOKEN_ID, MASK_TOKEN_ID):
                continue
            s.append(chr(token.item() + 31))
        samples.append("".join(s))
    return samples

###############################################################################
# 7) MAIN FUNCTION: LOAD MODEL, GENERATE SAMPLES IN BATCHES, AND WRITE TO FILE
###############################################################################
def main():
    # Initialize model architecture and load saved weights.
    model = DiffusionTransformer(vocab_size=VOCAB_SIZE, embed_dim=EMBED_DIM,
                                 hidden_dim=HIDDEN_DIM, num_layers=NUM_LAYERS,
                                 num_heads=NUM_HEADS, max_seq_len=MAX_SEQ_LEN).to(DEVICE)
    model_load_path = "KL_divergence.pth"  # Adjust path as needed.
    model.load_state_dict(torch.load(model_load_path, map_location=DEVICE))
    model.eval()
    
    output_file = "PassDiffusion100K"
    total_passwords = 500
    batch_size = 50
    num_batches = total_passwords // batch_size
    remainder = total_passwords % batch_size

    with open(output_file, "w", encoding="latin-1") as f:
        print(f"[DEBUG] Generating {total_passwords} passwords in batches of {batch_size}...")
        for batch in range(num_batches):
            samples = sample_passwords_diffusion(model, num_samples=batch_size, final_temperature=FINAL_TEMPERATURE,
                                                  base_noise_scale=BASE_NOISE_SCALE, top_k=TOP_K)
            for sample in samples:
                f.write(sample + "\n")
            if (batch + 1) % 100 == 0:
                print(f"[DEBUG] Completed batch {batch + 1}/{num_batches}")
        
        # If there's a remainder, generate and write them as well.
        if remainder > 0:
            samples = sample_passwords_diffusion(model, num_samples=remainder, final_temperature=FINAL_TEMPERATURE,
                                                  base_noise_scale=BASE_NOISE_SCALE, top_k=TOP_K)
            for sample in samples:
                f.write(sample + "\n")
    
    print(f"[DEBUG] Password generation complete. Passwords saved to '{output_file}'.")
    
    torch.cuda.empty_cache()
    gc.collect()

if __name__ == "__main__":
    main()


  model.load_state_dict(torch.load(model_load_path, map_location=DEVICE))


[DEBUG] Generating 500 passwords in batches of 50...
[DEBUG] Password generation complete. Passwords saved to 'PassDiffusion100K'.
