In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import random
import string
import pandas as pd

### Importing the data that the model will be trained on

In [2]:
df= pd.read_csv('password_analytics.csv')

In [3]:
data = list(zip(df["password"], df["pw_entropy"]))
data[400:410]

[('PgRWQH*L8MV@2hx', 1.0),
 ('U@LcyNTGZp67scz', 0.991),
 ('!5EW4h$p!k$ysH8', 0.984),
 ('pFt2P*mxKK$ZYgn', 0.991),
 ('$7ez3DFRc6eJe.E', 0.97),
 ('#BG$Qw4GT$zqtj6', 0.984),
 ('FmbQd333P3d!G9#', 0.937),
 ('*E6!WMT3wM3ejq@', 0.984),
 ('Th7j*aGiAkqvSFq', 0.991)]

**With the data being ingested by the model being defined, I must take every password and break it down into tokens so the model is able to comprehend what it is ingesting.**

In [4]:
# Extract passwords and pw_entropy from the dataset
passwords = [password for password, _ in data]
pw_entropy = torch.tensor([entropy for _, entropy in data], dtype=torch.float)


In [5]:
# Tokenize the passwords by converting characters to unique indices
all_characters = string.printable  # Looking for all printable characters
vocab = {char: idx for idx, char in enumerate(all_characters)}

In [6]:
# Convert passwords to numerical sequences
numerical_data = [[vocab[char] for char in password] for password in passwords]

# Pad sequences to a fixed length (adjust as needed)
max_sequence_length = max(len(seq) for seq in numerical_data)
padded_data = [seq + [0] * (max_sequence_length - len(seq)) for seq in numerical_data]

In [7]:
# Convert to PyTorch tensors
X = torch.tensor(padded_data, dtype=torch.long)
y = pw_entropy  # Use pw_entropy as the target variable

In [8]:
# Define the dataset class
class PasswordDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

**Setting up the test and train datasets for the Pytorch model which will be used to generate passwords**

In [9]:
# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Create DataLoader instances for training and testing
train_dataset = PasswordDataset(X_train, y_train)
test_dataset = PasswordDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

In [11]:
# Define the GRU model
class PasswordGenerator(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(PasswordGenerator, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        embedded = self.embedding(x)
        output, _ = self.gru(embedded)
        output = self.fc(output[:, -1, :])
        return output

In [12]:
# Initialize the model
embedding_dim = 10  # Adjust as needed
hidden_dim = 20  # Adjust as needed
vocab_size = len(all_characters)
model = PasswordGenerator(vocab_size, embedding_dim, hidden_dim)

# Define loss function and optimizer
criterion = nn.MSELoss()  # Use Mean Squared Error loss for regression
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [13]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()

    # Evaluation on the test set
    model.eval()
    with torch.no_grad():
        mse_loss = nn.MSELoss()
        test_loss = 0.0
        for inputs, labels in test_loader:
            outputs = model(inputs)
            test_loss += mse_loss(outputs.squeeze(), labels).item()

        average_test_loss = test_loss / len(test_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Test MSE Loss: {average_test_loss:.4f}")

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/10], Test MSE Loss: 0.0117
Epoch [2/10], Test MSE Loss: 0.0032
Epoch [3/10], Test MSE Loss: 0.0015
Epoch [4/10], Test MSE Loss: 0.0011
Epoch [5/10], Test MSE Loss: 0.0009
Epoch [6/10], Test MSE Loss: 0.0010
Epoch [7/10], Test MSE Loss: 0.0007
Epoch [8/10], Test MSE Loss: 0.0007
Epoch [9/10], Test MSE Loss: 0.0005
Epoch [10/10], Test MSE Loss: 0.0005


**Testing over Epochs, we like seeing the Mean Squared Error (MSE) decrease as there are more tests**

In [14]:
# Password generation function
def generate_password(model, vocab, max_length=15):
    model.eval()
    with torch.no_grad():
        generated_password = []
        input_char = random.choice(list(vocab.keys()))
        generated_password.append(input_char)

        for _ in range(max_length - 1):
            input_tensor = torch.tensor([[vocab[input_char]]], dtype=torch.long)
            output = model(input_tensor)
            predicted_entropy = output.squeeze().item()  # Assuming model predicts entropy
            threshold = 0.5  # Adjust as needed
            random_value = random.random()

            if random_value < threshold:
                # Add an uppercase letter
                next_character = random.choice(string.ascii_uppercase)
            elif random_value < 2 * threshold:
                # Add a lowercase letter
                next_character = random.choice(string.ascii_lowercase)
            else:
                # Add a digit
                next_character = random.choice(string.digits)

            generated_password.append(next_character)

        return "".join(generated_password)


### Password Generation

In [16]:
for _ in range(5):
    generated_password = generate_password(model, vocab)
    print(f"Generated Password: {generated_password}")

Generated Password: +DWXSaFWohorbcW
Generated Password: 4PALJOHduuvlnFD
Generated Password: GSleGLVAeQkquVe
Generated Password: ANokgLCbAXOQRrJ
Generated Password: LpXLYrdBKWGgPCh
