In [None]:
import os
import random
import json
import re
from typing import List, Dict

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
import matplotlib.pyplot as plt
from torch.utils.data import random_split
from wordcloud import WordCloud

In [None]:
# One seed value drives every random number generator used in this notebook.
SEED = 42

# Seed the global PyTorch, NumPy and Python RNGs.
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# Request deterministic cuDNN kernels and switch off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


def seed_worker(worker_id):
    """Re-seed the `random` and NumPy RNGs with ``SEED + worker_id``.

    Used as the ``worker_init_fn`` of the DataLoaders built in this notebook.
    """
    per_worker_seed = worker_id + SEED
    for reseed in (random.seed, np.random.seed):
        reseed(per_worker_seed)


# Explicitly seeded generator shared by the DataLoaders and random_split.
g = torch.Generator().manual_seed(SEED)

In [None]:
# --- Data files ---
# CSV that is read into `personal_data` further down.
DATA_PATH = 'dataset/maria.csv'
# Not referenced in the visible cells; presumably the JSON destination for
# extracted password patterns — TODO confirm.
PATTERN_OUTPUT_PATH = 'personal_password_patterns.json'

# --- Model checkpoints ---
# Checkpoint the base model is restored from (when the file exists).
BASE_MODEL_PATH = 'password_lstm.pth'
# Checkpoint the fine-tuned personal model is written to.
PERSONAL_MODEL_PATH = 'personal_lstm.pth'

In [None]:
# --- Model size ---
embedding_dim = 64   # width of the character embedding
hidden_dim = 128     # LSTM hidden state size

# --- Data ---
max_sequence_length = 50  # passwords are truncated / padded to this many tokens
batch_size = 64

# --- Optimisation ---
learning_rate = 0.001
num_epochs = 10

In [None]:
class PasswordLSTM(nn.Module):
    """Character-level language model: embedding -> 2-layer LSTM -> linear head.

    Args:
        vocab_size: number of token ids; also the size of the output logits.
        embedding_dim: width of the token embedding.
        hidden_dim: hidden state size of the LSTM.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        # Attribute names and creation order are part of the checkpoint format
        # (state_dict keys) and of the weight-initialisation RNG sequence.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Map a tensor of token ids to per-position logits over the vocabulary.

        The LSTM is batch-first, so a batched input is laid out as
        (batch, seq_len); the result then has a trailing dimension of
        ``vocab_size``.
        """
        embedded = self.embedding(x)
        hidden_states, _final_state = self.lstm(embedded)
        return self.fc(hidden_states)

In [None]:
def save_model(model, optimizer, epoch, file_path):
    """Write a training checkpoint to ``file_path`` and print a confirmation.

    The checkpoint is a dict with the keys ``'epoch'``, ``'model_state_dict'``
    and ``'optimizer_state_dict'``.
    """
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        },
        file_path,
    )
    print(f"✅ Модель сохранена в '{file_path}'")

In [None]:
def load_model(model, optimizer, file_path):
    """Restore model and optimizer state from a checkpoint, if one exists.

    Args:
        model: module whose parameters are overwritten from the checkpoint.
        optimizer: optimizer whose state is overwritten from the checkpoint.
        file_path: checkpoint path holding the keys ``'epoch'``,
            ``'model_state_dict'`` and ``'optimizer_state_dict'``.

    Returns:
        The ``'epoch'`` value stored in the checkpoint, or 0 when the file
        does not exist (nothing is loaded in that case).
    """
    if not os.path.exists(file_path):
        print("⚠️ Файл с моделью не найден — начинаем обучение с нуля.")
        return 0

    # Deserialize onto the CPU so a checkpoint written on a GPU machine still
    # loads where CUDA is unavailable; load_state_dict copies the values into
    # the parameters' own storage afterwards.
    checkpoint = torch.load(file_path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch']
    print(f"✅ Модель загружена из '{file_path}' — продолжаем с эпохи {start_epoch + 1}")
    return start_epoch


In [None]:
# Characters with a dedicated token id; everything else is encoded as <UNK>.
base_chars = set(
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789!@#$%^&*()-_=+[]{}|;:,.<>?/\\'
)
# Enumerate in sorted order: iteration order of a set of strings changes between
# interpreter runs (string hash randomisation), which would silently remap the
# ids and make a saved checkpoint meaningless after a kernel restart.
char_to_idx = {char: idx + 1 for idx, char in enumerate(sorted(base_chars))}
char_to_idx['<PAD>'] = 0
# <UNK> takes the next free id. The ids must stay within 0..len(char_to_idx)-1
# because len(char_to_idx) is used as the embedding / output size.
char_to_idx['<UNK>'] = len(char_to_idx)
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

In [None]:
def prepare_data(passwords: List[str], char_to_idx: Dict[str, int]):
    """Encode passwords as fixed-length id tensors and wrap them in a DataLoader.

    Each password is mapped character by character through ``char_to_idx``
    (unknown characters become ``<UNK>``), cut to ``max_sequence_length`` and
    right-padded with ``<PAD>``. The target row is the input row shifted left
    by one position with a trailing ``<PAD>``.

    Returns:
        A shuffling DataLoader over a ``TensorDataset(X, y)`` that uses the
        notebook-level ``batch_size``, ``seed_worker`` and generator ``g``.
    """
    def encode(pwd):
        # Full lookup first, then truncate, then pad up to the fixed length.
        ids = [char_to_idx.get(ch, char_to_idx['<UNK>']) for ch in pwd][:max_sequence_length]
        missing = max_sequence_length - len(ids)
        return ids + [char_to_idx['<PAD>']] * missing

    inputs = [encode(pwd) for pwd in passwords]
    # Next-character targets: drop the first id, pad the end.
    targets = [row[1:] + [char_to_idx['<PAD>']] for row in inputs]

    return DataLoader(
        TensorDataset(
            torch.tensor(inputs, dtype=torch.long),
            torch.tensor(targets, dtype=torch.long),
        ),
        batch_size=batch_size,
        shuffle=True,
        worker_init_fn=seed_worker,
        generator=g,
    )

In [None]:
# The embedding and output sizes are the number of entries in the vocabulary mapping.
vocab_size = len(char_to_idx)
base_model = PasswordLSTM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
)
base_optimizer = optim.Adam(base_model.parameters(), lr=learning_rate)
# Restores weights and optimizer state when BASE_MODEL_PATH exists; returns 0 otherwise.
start_epoch = load_model(
    model=base_model,
    optimizer=base_optimizer,
    file_path=BASE_MODEL_PATH,
)

In [None]:
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"⚠️ Файл '{DATA_PATH}' не найден!")
# Passwords are arbitrary text, so read the column strictly as strings:
#  * dtype=str stops digit-only passwords from being parsed as numbers
#    (which also drops leading zeros);
#  * keep_default_na=False stops passwords such as "NA", "null" or "nan" from
#    being turned into float NaN, which cannot be iterated character by character.
# Empty cells then arrive as '' and are skipped, since they carry no characters.
personal_data = [
    pwd
    for pwd in pd.read_csv(DATA_PATH, dtype={'String': str}, keep_default_na=False)['String']
    if pwd
]
personal_loader = prepare_data(personal_data, char_to_idx)

In [None]:
# The personal model has the same architecture as the base model and gets its own optimizer.
personal_model = PasswordLSTM(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
)
personal_optimizer = optim.Adam(personal_model.parameters(), lr=learning_rate)
# Start fine-tuning from the base model's current weights.
base_weights = base_model.state_dict()
personal_model.load_state_dict(base_weights)

In [None]:
# 80/20 train/validation split of the encoded personal passwords.
dataset = personal_loader.dataset
dataset_size = len(dataset)
train_size = int(dataset_size * 0.8)
val_size = dataset_size - train_size
train_dataset, val_dataset = random_split(
    dataset,
    lengths=[train_size, val_size],
    generator=g,
)

# Only the training loader shuffles; both share the seeded generator and worker seeding.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    worker_init_fn=seed_worker,
    generator=g,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    worker_init_fn=seed_worker,
    generator=g,
)

In [None]:
def _run_epoch(model, loader, loss_fn, optimizer=None):
    """Run one pass over ``loader`` and return the mean per-batch loss.

    With an ``optimizer`` every batch is followed by a backward pass and an
    optimizer step; without one the batches are only evaluated. An empty
    loader yields NaN instead of raising ZeroDivisionError.
    """
    num_classes = model.fc.out_features
    running_loss = 0.0
    num_batches = 0
    for x_batch, y_batch in loader:
        if optimizer is not None:
            optimizer.zero_grad()
        logits = model(x_batch)
        # reshape (not view) so non-contiguous batches are flattened too.
        loss = loss_fn(logits.reshape(-1, num_classes), y_batch.reshape(-1))
        if optimizer is not None:
            loss.backward()
            optimizer.step()
        running_loss += loss.item()
        num_batches += 1
    return running_loss / num_batches if num_batches else float('nan')


def train_and_validate(model, train_loader, val_loader, optimizer, loss_fn, num_epochs):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    Args:
        model: module exposing an ``fc`` layer whose ``out_features`` is the
            number of classes the logits are flattened to.
        train_loader: iterable of ``(x_batch, y_batch)`` used for optimisation.
        val_loader: iterable of ``(x_batch, y_batch)`` evaluated under
            ``torch.no_grad()``.
        optimizer: optimizer stepped once per training batch.
        loss_fn: callable ``(logits, targets) -> scalar loss tensor``.
        num_epochs: number of epochs to run.

    Returns:
        ``{'train_loss': [...], 'val_loss': [...]}`` with one mean per-batch
        loss per epoch; an entry is NaN when the corresponding loader is empty.
        The model is left in eval mode.
    """
    history = {'train_loss': [], 'val_loss': []}
    for epoch in range(1, num_epochs + 1):
        # Train
        model.train()
        avg_train = _run_epoch(model, train_loader, loss_fn, optimizer)
        history['train_loss'].append(avg_train)

        # Validation
        model.eval()
        with torch.no_grad():
            avg_val = _run_epoch(model, val_loader, loss_fn)
        history['val_loss'].append(avg_val)

        print(f"Epoch {epoch}/{num_epochs} — Train Loss: {avg_train:.4f}, Val Loss: {avg_val:.4f}")

    return history

# Cross-entropy over the flattened per-position logits, with default settings.
loss_fn = nn.CrossEntropyLoss()
# Fine-tune the personal model; `history` holds the per-epoch train/val losses.
history = train_and_validate(
    model=personal_model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=personal_optimizer,
    loss_fn=loss_fn,
    num_epochs=num_epochs,
)

In [None]:
# Per-epoch training and validation loss on one set of axes.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(history['train_loss'], label='Train Loss', marker='o')
ax.plot(history['val_loss'], label='Val Loss', marker='o')
ax.set_title('Train vs Val Loss по эпохам')
ax.set_xlabel('Эпоха')
ax.set_ylabel('Loss')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Persist the fine-tuned model; the stored epoch is num_epochs - 1.
save_model(
    model=personal_model,
    optimizer=personal_optimizer,
    epoch=num_epochs - 1,
    file_path=PERSONAL_MODEL_PATH,
)

In [None]:
def extract_mask(pwd: str) -> str:
    """Return the character-class mask of a password.

    Each character becomes one symbol: ``X`` for a letter, ``D`` for a digit,
    ``S`` for one of ``!@#$%^&*()-_=+`` and ``_`` for anything else. The checks
    are applied in that order.
    """
    def symbol_for(c: str) -> str:
        if c.isalpha():
            return 'X'
        if c.isdigit():
            return 'D'
        if re.match(r'[!@#$%^&*()\-_=+]', c):
            return 'S'
        return '_'

    return ''.join(map(symbol_for, pwd))

def classify_mask(mask: str) -> str:
    """Name the pattern category of a character-class mask.

    ``mask`` is a string over ``X`` (letter), ``D`` (digit), ``S`` (symbol) and
    ``_`` (other). Pure masks are named after their single class, mixed masks
    after the classes they contain; everything else is ``"другое"``.

    An empty mask is ``"другое"``: without this guard ``all(...)`` is vacuously
    true for an empty string and the mask would be reported as digits-only.
    """
    if not mask:
        return "другое"

    if all(c == 'D' for c in mask): return "цифровой"
    if all(c == 'X' for c in mask): return "словесный"
    if all(c == 'S' for c in mask): return "спецсимволы"
    if 'X' in mask and 'D' in mask and 'S' in mask: return "гибрид: буквы + цифры + символы"
    if 'X' in mask and 'D' in mask: return "гибрид: буквы + цифры"
    if 'D' in mask and 'S' in mask: return "гибрид: цифры + символы"
    if 'X' in mask and 'S' in mask: return "гибрид: буквы + символы"
    # NOTE(review): a mask built only from X/D/S/_ is already upper-case, so
    # `mask.upper() != mask` is false and this branch cannot fire for such
    # masks. The mask does not keep letter case, so detecting mixed case would
    # need the original password. Kept for callers passing other strings.
    if mask.lower() != mask and mask.upper() != mask and 'X' in mask: return "зигзаг-регистр"
    return "другое"

def analyze_passwords(pwds):
    """Placeholder analysis step: accepts a list of passwords and returns None."""
    return None


# No analysis has been run yet. Binding `results` to the placeholder function
# itself would make it truthy and non-subscriptable, so code that checks
# `if results` and then reads `results["words"]` would raise TypeError.
results = None

In [None]:
# No analysis results are available in this notebook; the visualisation code
# that follows tests `results` for truthiness and draws nothing while it is None.
results = None

In [None]:
# Word cloud of substring frequencies — drawn only when results exist and
# their "words" mapping is non-empty.
if results:
    word_freq = results["words"]
else:
    word_freq = {}

if word_freq:
    wordcloud = WordCloud(width=800, height=400, background_color='white')
    wordcloud = wordcloud.generate_from_frequencies(word_freq)
    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title("🔥 Часто встречающиеся подстроки в паролях")
    plt.show()

# Pie chart of mutation usage; the value is stored as a percentage string.
if results:
    mutation_usage = results["profile"]["mutation_usage"]
    mutation_percent = float(mutation_usage.strip('%'))
    shares = [mutation_percent, 100 - mutation_percent]
    plt.pie(shares, labels=["С мутациями", "Без мутаций"], autopct='%1.1f%%')
    plt.title("🧬 Использование мутаций в паролях")
    plt.show()
