In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import random
import string
import pandas as pd

In [4]:
df= pd.read_csv('password_analytics.csv')

In [5]:
data = list(zip(df["password"], df["pw_entropy"]))
data

[('RXujZkrxUgKhWW2', 0.991),
 ('2eXrHHw9S7iU2EN', 0.984),
 ('M7nc3G2iwA7gf9M', 0.984),
 ('vGA6X7j9ixFj2mH', 0.991),
 ('ucX3vskRPzBEKZv', 0.991),
 ('ADSXPfZkwFm4xGM', 1.0),
 ('9Bkin54AhK8H6Qt', 1.0),
 ('j8U79DwdDCweipp', 0.978),
 ('G3X8P45HEjathtb', 0.991),
 ('PsQPHDfNU7jh2pq', 0.991),
 ('RyU23tG4xiZLrEt', 0.991),
 ('jEkMNJA3imXdVba', 1.0),
 ('RHuAhZHFuEwm9r6', 0.984),
 ('TDcXP7TNcB8kyti', 0.984),
 ('CWLw85JXa8w4uKt', 0.984),
 ('4YHdcG8Z9ufjze3', 1.0),
 ('eTmwHRWWGX8rKga', 0.991),
 ('4cVy5AxZgBHfxtV', 0.984),
 ('k9Pzf3Zx5EHW87u', 1.0),
 ('jTKchFe853Zg3qv', 0.991),
 ('T8ceE6FmM5eKfxf', 0.984),
 ('5YvaXGVdNq5mJUt', 0.991),
 ('JWbR7GyCZNdPfyW', 0.984),
 ('t7w9ACbDpMuxsi8', 1.0),
 ('4MnvFaUyVkJsxS9', 1.0),
 ('TdJi47F8BsaNutT', 0.991),
 ('StAikt4QvJiSkj6', 0.975),
 ('s9q9cM7aC5d8b7G', 0.984),
 ('3znRGHw32HUJ3c8', 0.964),
 ('cUWpp4uMEZMjHrL', 0.984),
 ('RK2kdy62Sp8HvBi', 0.991),
 ('iAVZKWbUe2ddm4m', 0.984),
 ('QfLpLczj44mQ5Ee', 0.978),
 ('L8KaBYNybuMGXG8', 0.984),
 ('NEXQN836XNvDf8w', 0.961),

In [6]:
# Extract passwords and pw_entropy from the dataset
passwords = [password for password, _ in data]
pw_entropy = torch.tensor([entropy for _, entropy in data], dtype=torch.float)


In [7]:
# Tokenize the passwords by converting characters to unique indices
all_characters = string.printable  # Consider all printable characters
vocab = {char: idx for idx, char in enumerate(all_characters)}

In [8]:
# Convert passwords to numerical sequences
numerical_data = [[vocab[char] for char in password] for password in passwords]

# Pad sequences to a fixed length (adjust as needed)
max_sequence_length = max(len(seq) for seq in numerical_data)
padded_data = [seq + [0] * (max_sequence_length - len(seq)) for seq in numerical_data]

In [9]:
# Convert to PyTorch tensors
X = torch.tensor(padded_data, dtype=torch.long)
y = pw_entropy  # Use pw_entropy as the target variable

In [10]:
# Define the dataset class
class PasswordDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [11]:
# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Create DataLoader instances for training and testing
train_dataset = PasswordDataset(X_train, y_train)
test_dataset = PasswordDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

In [13]:
# Define the GRU model
class PasswordGenerator(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(PasswordGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.gru(embedded)
        output = self.fc(output[:, -1, :])
        return output

In [14]:
# Initialize the model
embedding_dim = 10  # Adjust as needed
hidden_dim = 20  # Adjust as needed
vocab_size = len(all_characters)
model = PasswordGenerator(vocab_size, embedding_dim, hidden_dim)

# Define loss function and optimizer
criterion = nn.MSELoss()  # Use Mean Squared Error loss for regression
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [15]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()

    # Evaluation on the test set
    model.eval()
    with torch.no_grad():
        mse_loss = nn.MSELoss()
        test_loss = 0.0
        for inputs, labels in test_loader:
            outputs = model(inputs)
            test_loss += mse_loss(outputs.squeeze(), labels).item()

        average_test_loss = test_loss / len(test_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Test MSE Loss: {average_test_loss:.4f}")

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/10], Test MSE Loss: 0.0107
Epoch [2/10], Test MSE Loss: 0.0026
Epoch [3/10], Test MSE Loss: 0.0013
Epoch [4/10], Test MSE Loss: 0.0010
Epoch [5/10], Test MSE Loss: 0.0009
Epoch [6/10], Test MSE Loss: 0.0008
Epoch [7/10], Test MSE Loss: 0.0007
Epoch [8/10], Test MSE Loss: 0.0006
Epoch [9/10], Test MSE Loss: 0.0006
Epoch [10/10], Test MSE Loss: 0.0005


**Testing over Epochs we like seeing the MSE or Mean Squared Error decrease as there are more tests**

In [16]:
# Password generation function
def generate_password(model, vocab, max_length=15):
    model.eval()
    with torch.no_grad():
        generated_password = []
        input_char = random.choice(list(vocab.keys()))
        generated_password.append(input_char)

        for _ in range(max_length - 1):
            input_tensor = torch.tensor([[vocab[input_char]]], dtype=torch.long)
            output = model(input_tensor)
            predicted_entropy = output.squeeze().item()  # Assuming model predicts entropy
            threshold = 0.5  # Adjust as needed
            random_value = random.random()

            if random_value < threshold:
                # Add an uppercase letter
                next_character = random.choice(string.ascii_uppercase)
            elif random_value < 2 * threshold:
                # Add a lowercase letter
                next_character = random.choice(string.ascii_lowercase)
            else:
                # Add a digit
                next_character = random.choice(string.digits)

            generated_password.append(next_character)

        return "".join(generated_password)


In [25]:
for _ in range(5):
    generated_password = generate_password(model, vocab)
    print(f"Generated Password: {generated_password}")

Generated Password: ]GoRdEMKOSyyfKn
Generated Password: PpOsqQdWUkHihei
Generated Password: wvfDTPRrhFQIMmU
Generated Password: CVjlmrYIyxRpFNw
Generated Password: EtRFrpjbIcFtiIF


#### Going to join all passwords into a string
**By creating this text corpus the model will have an easier time reading the passwords and be able to make better predictions**