This notebook was used for scratch work and testing. `PassGAN.ipynb` contains the more up-to-date code.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
import numpy as np
import datetime
import pickle

# Fix both RNG seeds so data shuffling and weight initialisation are repeatable.
np.random.seed(42)
torch.manual_seed(0)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# torch.cuda.get_device_name() raises when no CUDA device is usable, so it is
# only queried on the GPU path; CPU-only machines report "cpu" instead.
print(torch.cuda.get_device_name() if device == "cuda" else "cpu", "|", torch.cuda.is_available())

# Data Loading

In [None]:
def load_dataset(max_vocab_size=2048, path="Data/rockyou_processed.txt"):
    """Read a password file and build a frequency-ranked character vocabulary.

    Args:
        max_vocab_size: Upper bound on the vocabulary size, counting the
            reserved 'unk' entry that always sits at index 0.
        path: Text file that is read line by line. Defaults to the location
            that used to be hard-coded, so existing calls are unaffected.

    Returns:
        A tuple ``(filtered_lines, charmap, inv_charmap)`` where
        ``filtered_lines`` is a list with one tuple of symbols per input line
        (out-of-vocabulary characters replaced by the string 'unk'),
        ``charmap`` maps each symbol to its integer index, and
        ``inv_charmap`` is the list that maps an index back to its symbol.
    """
    import collections

    with open(path, 'r') as f:
        lines = list(f)

    # In-place shuffle; the resulting order depends on NumPy's global RNG state.
    np.random.shuffle(lines)

    # Newlines are excluded from the frequency count, so "\n" never enters the vocabulary.
    counts = collections.Counter(char for line in lines for char in line if char != "\n")

    charmap = {'unk': 0}
    inv_charmap = ['unk']

    # Counter keys are unique single characters, so none can collide with 'unk'
    # or with each other; indices are handed out in descending frequency order.
    for char, _ in counts.most_common(max_vocab_size - 1):
        charmap[char] = len(inv_charmap)
        inv_charmap.append(char)

    # NOTE(review): lines are not stripped, so each trailing "\n" becomes an
    # extra 'unk' symbol at the end of every tuple — confirm this is intended.
    # NOTE(review): the tuples hold characters rather than integer indices,
    # while dataloader() passes them straight to torch.tensor — confirm whether
    # an index-mapping step is expected before training.
    filtered_lines = [
        tuple(char if char in charmap else 'unk' for char in line)
        for line in lines
    ]

    print("loaded {} lines in dataset".format(len(lines)))
    return filtered_lines, charmap, inv_charmap

# Build the dataset and both vocabulary tables with the default vocabulary cap.
filtered_lines, charmap, inv_charmap = load_dataset()

In [None]:
# Cache the processed dataset and both vocabulary tables on disk. Each file is
# written inside a `with` block so the handle is flushed and closed even if
# pickling fails part-way through.
with open("Data/rockyou.pickle", 'wb') as f:
    pickle.dump(filtered_lines, f)
with open("Checkpoints/rockyou_charmap.pickle", 'wb') as f:
    pickle.dump(charmap, f)
with open("Checkpoints/rockyou_inv_charmap.pickle", 'wb') as f:
    pickle.dump(inv_charmap, f)

In [None]:
# Reload the cached dataset and vocabulary tables and report how long it took.
# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.
t = datetime.datetime.now()
with open("Data/rockyou.pickle", 'rb') as f:
    filtered_lines = pickle.load(f)
with open("Checkpoints/rockyou_charmap.pickle", "rb") as f:
    charmap = pickle.load(f)
with open("Checkpoints/rockyou_inv_charmap.pickle", "rb") as f:
    inv_charmap = pickle.load(f)
print(datetime.datetime.now() - t)

In [None]:
def dataloader(lines, batch_size):
    """Yield shuffled mini-batches of ``lines`` forever.

    Every pass reshuffles ``lines`` IN PLACE (the caller's list is reordered)
    and then yields ``len(lines) // batch_size`` tensors built from
    consecutive slices; a trailing partial batch is dropped. Tensors are moved
    to the module-level ``device``.

    Args:
        lines: Mutable sequence of equal-length numeric sequences.
        batch_size: Number of rows per yielded tensor; must be positive.

    Yields:
        A tensor holding ``batch_size`` rows of ``lines``.

    Raises:
        ValueError: On the first ``next()`` if ``batch_size`` is not positive
            or ``lines`` holds fewer than ``batch_size`` rows. Without this
            check the generator would spin forever without yielding.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    while True:
        n_batches = len(lines) // batch_size
        if n_batches == 0:
            raise ValueError(
                f"need at least batch_size={batch_size} lines, got {len(lines)}"
            )
        np.random.shuffle(lines)
        for i in range(n_batches):
            yield torch.tensor(lines[i * batch_size:(i + 1) * batch_size]).to(device=device)
        
def translate(passwords):
    """Decode index sequences into strings.

    Each element of ``passwords`` is iterated and every item is looked up in
    the module-level ``inv_charmap``; the looked-up symbols are concatenated.

    Returns:
        A list with one decoded string per input sequence.
    """
    decoded = []
    for password in passwords:
        symbols = [inv_charmap[c] for c in password]
        decoded.append("".join(symbols))
    return decoded

# train = dataloader(filtered_lines, 4)
# translate(next(train))

# Model

In [None]:
class ResidualBlock(nn.Module):
    """
    Two-convolution residual unit for BATCH_SIZE x CHANNELS x LENGTH tensors.

    The channel count is preserved, and with the default kernel size the
    length is preserved too, so the output can be added back onto the input.
    """

    def __init__(self, n_channels, kernel_size=3):
        super().__init__()
        conv_kwargs = dict(
            in_channels=n_channels,
            out_channels=n_channels,
            kernel_size=kernel_size,
            padding=kernel_size // 2,
        )
        # conv1 is built before conv2 so parameter order (and therefore
        # state_dict layout and seeded initialisation) is unchanged.
        self.conv1 = nn.Conv1d(**conv_kwargs)
        self.conv2 = nn.Conv1d(**conv_kwargs)

    def forward(self, inputs):
        # relu -> conv1 -> relu -> conv2, then a damped skip connection.
        residual = self.conv2(F.relu(self.conv1(F.relu(inputs))))
        return inputs + residual * 0.3
    
class Generator(nn.Module):
    """Map latent vectors to per-position character distributions.

    A (batch, 128) input is projected to 128 channels x 10 positions, passed
    through five residual blocks, and convolved down to ``len(charmap)``
    channels. The result is returned as (batch, 10, len(charmap)) with a
    softmax over the last axis.

    ``kernel_size`` only configures the final convolution; the residual
    blocks are built with their own default.
    """

    def __init__(self, charmap, kernel_size=3):
        super().__init__()
        channels, length = 128, 10
        self.lin = nn.Linear(in_features=128, out_features=channels * length)
        self.block1 = ResidualBlock(channels)
        self.block2 = ResidualBlock(channels)
        self.block3 = ResidualBlock(channels)
        self.block4 = ResidualBlock(channels)
        self.block5 = ResidualBlock(channels)
        self.conv = nn.Conv1d(
            in_channels=channels,
            out_channels=len(charmap),
            kernel_size=kernel_size,
            padding=kernel_size // 2,
        )

    def forward(self, inputs):
        # Unflatten the projection into (batch, channels, length) for the conv stack.
        hidden = self.lin(inputs).reshape(-1, 128, 10)
        for block in (self.block1, self.block2, self.block3, self.block4, self.block5):
            hidden = block(hidden)
        # Move the vocabulary axis last so the softmax normalises per position.
        scores = self.conv(hidden).permute(0, 2, 1)
        return F.softmax(scores, dim=2)
    
class Discriminator(nn.Module):
    """Critic that scores a batch of (soft) one-hot character sequences.

    ``forward`` expects a float tensor laid out as
    (batch, length, len(charmap)); it is NOT given integer indices, so the
    caller performs any one-hot encoding. Because the final linear layer has
    128 * 10 input features, the sequence length must be 10 with the default
    kernel size. The output has shape (batch, 1).

    ``kernel_size`` only configures the first convolution; the residual
    blocks are built with their own default.
    """

    def __init__(self, charmap, kernel_size=3):
        super().__init__()
        channels, length = 128, 10
        self.length_charmap = len(charmap)
        self.conv1 = nn.Conv1d(
            self.length_charmap,
            channels,
            kernel_size=kernel_size,
            padding=kernel_size // 2,
        )
        self.block1 = ResidualBlock(channels)
        self.block2 = ResidualBlock(channels)
        self.block3 = ResidualBlock(channels)
        self.block4 = ResidualBlock(channels)
        self.block5 = ResidualBlock(channels)
        self.flatten = nn.Flatten()
        self.lin = nn.Linear(in_features=channels * length, out_features=1)

    def forward(self, inputs):
        # Conv1d wants channels (the vocabulary axis) before the length axis.
        hidden = self.conv1(inputs.permute(0, 2, 1))
        for block in (self.block1, self.block2, self.block3, self.block4, self.block5):
            hidden = block(hidden)
        return self.lin(self.flatten(hidden))

In [None]:
# Smoke test: push a random (4, 128, 10) batch through a freshly initialised
# ResidualBlock and display the output shape.
inputs = torch.randn(4, 128, 10).to(device=device)
ResidualBlock(128).to(device=device)(inputs).shape

In [None]:
# Smoke test: feed 5 random latent vectors to an untrained Generator and take
# the arg-max over the last (vocabulary) axis, giving one index per position.
gen_inputs = torch.randn(5, 128).to(device=device)
gen = Generator(charmap).to(device=device)
gen_outputs = gen(gen_inputs).argmax(dim=2)
gen_outputs.shape

In [None]:
# Smoke test: score one real batch with an untrained Discriminator.
# NOTE(review): `train` is only assigned in later cells
# (`train = dataloader(filtered_lines, batch_size)`), so this cell presumably
# relies on one of them having been executed first — confirm.
to_train = next(train)

discrim = Discriminator(charmap).to(device=device)
# The critic consumes float one-hot vectors, not integer indices.
inputs = F.one_hot(to_train, num_classes=len(charmap)).to(device=device).float()
discrim_outputs = discrim(inputs)
print(inputs.shape, discrim_outputs.shape)

In [None]:
# Score the generator's arg-max samples: re-encode the indices as float
# one-hot vectors and display the critic's output.
gen_outputs_onehot = F.one_hot(gen_outputs, num_classes=len(charmap)).float()
discrim(gen_outputs_onehot)

# Training Loop

In [None]:
def calc_gradient_penalty(netD, real_data, fake_data, gp_lambda=10):
    """Compute the WGAN-GP gradient penalty for one batch.

    Random per-sample interpolates between ``real_data`` and ``fake_data``
    are scored by ``netD``; the penalty is the mean squared deviation of each
    sample's input-gradient L2 norm from 1, scaled by ``gp_lambda``.

    Args:
        netD: Callable critic mapping a batch to one score per sample.
        real_data: Batch of real samples, first axis is the batch axis.
        fake_data: Batch of generated samples with the same shape as
            ``real_data``. Neither input receives gradients from the penalty.
        gp_lambda: Penalty coefficient. Defaults to 10, the value this
            notebook assigns to ``LAMBDA`` everywhere it is defined.

    Returns:
        A scalar tensor that is differentiable w.r.t. ``netD``'s parameters.
    """
    n_samples = real_data.size(0)

    # One mixing coefficient per sample, broadcast over all remaining axes.
    # The batch size and device come from the data itself rather than from
    # notebook globals, so the helper cannot disagree with its inputs.
    alpha_shape = (n_samples,) + (1,) * (real_data.dim() - 1)
    alpha = torch.rand(alpha_shape).to(device=real_data.device)

    # Detach so the interpolates become a fresh leaf we can differentiate against.
    interpolates = (alpha * real_data + (1 - alpha) * fake_data).detach().requires_grad_(True)

    disc_interpolates = netD(interpolates)

    # create_graph=True keeps the gradient itself differentiable, which is
    # what lets the penalty influence the critic's weights.
    gradients = autograd.grad(outputs=disc_interpolates, inputs=interpolates,
                              grad_outputs=torch.ones_like(disc_interpolates),
                              create_graph=True, retain_graph=True)[0]

    # The penalty is defined on each sample's full gradient norm, so flatten
    # every non-batch axis before taking the norm (norm over dim=1 alone would
    # only cover one axis of a (batch, length, vocab) gradient).
    per_sample_norm = gradients.reshape(n_samples, -1).norm(2, dim=1)
    gradient_penalty = ((per_sample_norm - 1) ** 2).mean() * gp_lambda
    return gradient_penalty

In [None]:
# NOTE(review): the divisors 10 and 128 look like the critic steps per
# generator step and the batch size used by the later cells — confirm.
print(f"max_iters: {len(filtered_lines) // 10 // 128}")

In [None]:
# torch.manual_seed(0)
# --- Hyper-parameters --------------------------------------------------------
lambda_ = 10
# Kept in sync with lambda_ because the gradient-penalty helper may read the
# global name LAMBDA, which this cell previously never defined.
LAMBDA = lambda_
n_critic_iters_per_generator_iter = 10
batch_size = 16
lr = 1e-4
adam_beta1 = 0.5
adam_beta2 = 0.9
iterations = 9000

# Scalar +1 / -1 passed to backward() to pick the sign of each loss term.
one = torch.tensor(1, dtype=torch.float).to(device=device)
mone = -1 * one

netG = Generator(charmap).to(device=device)
netD = Discriminator(charmap).to(device=device)

# Use the declared betas so editing adam_beta1/adam_beta2 actually takes effect.
optimG = optim.Adam(netG.parameters(), lr=lr, betas=(adam_beta1, adam_beta2))
optimD = optim.Adam(netD.parameters(), lr=lr, betas=(adam_beta1, adam_beta2))

train = dataloader(filtered_lines, batch_size)

for iteration in range(1, iterations + 1):
    # ---- Critic phase: re-enable critic gradients (disabled for the G step).
    for p in netD.parameters():
        p.requires_grad = True

    for i in range(n_critic_iters_per_generator_iter):
        real_inputs_discrete = next(train)
        real_data = F.one_hot(real_inputs_discrete, num_classes=len(charmap)).float()

        netD.zero_grad()

        # Maximise D(real): back-propagate with weight -1.
        D_real = netD(real_data).mean()
        D_real.backward(mone)

        # Generated batch; no graph is kept because the generator is not
        # updated during the critic phase.
        noise = torch.randn(batch_size, 128).to(device=device)
        with torch.no_grad():
            fake = netG(noise)

        # Minimise D(fake): back-propagate with weight +1.
        D_fake = netD(fake).mean()
        D_fake.backward(one)

        gradient_penalty = calc_gradient_penalty(netD, real_data, fake)
        gradient_penalty.backward()

        optimD.step()
        netD.zero_grad()

    # ---- Generator phase: freeze the critic to avoid computing its grads.
    for p in netD.parameters():
        p.requires_grad = False
    netG.zero_grad()

    noise = torch.randn(batch_size, 128).to(device=device)
    fake = netG(noise)
    G = netD(fake).mean()
    G.backward(mone)  # maximise D(G(z))
    G_cost = -G
    optimG.step()

    # ---- Periodic sample dump and checkpoint.
    if iteration % 500 == 0 or iteration == 1:
        print(f"iterations {iteration}")
        real_translation = translate(real_inputs_discrete[:5].cpu().numpy())
        fake_translation = translate(fake[:5].detach().cpu().numpy().argmax(axis=2))
        print(f"\tFake: {fake_translation}\n\tReal: {real_translation}")
        # Timestamp in a fixed UTC-5 offset, appended to the checkpoint name.
        time = datetime.datetime.now(tz=datetime.timezone(datetime.timedelta(hours=-5))).strftime("%I:%M:%S%p_%m-%d-%y")
        torch.save(netG.state_dict(), f"/home/nvijayakumar/gcp-gan/Checkpoints/netG{iteration}{time}")
        torch.save(netD.state_dict(), f"/home/nvijayakumar/gcp-gan/Checkpoints/netD{iteration}{time}")

In [None]:
# netG = Generator(charmap).to(device)
# netG.load_state_dict(torch.load("/home/nvijayakumar/gcp-gan/Checkpoints/netG103:47:35AM_12-03-20"))

# netD = Discriminator(charmap).to(device)
# netD.load_state_dict(torch.load("/home/nvijayakumar/gcp-gan/Checkpoints/netD103:47:35AM_12-03-20"))

In [None]:
# Sample 10 latent vectors, take the arg-max symbol index at every position of
# the generator output, and display the decoded strings.
latent_noise = torch.randn(10, 128).to(device=device)
generated_passwords = netG(latent_noise).argmax(dim=2)
translate(generated_passwords)

# Training Loop Scratch

In [None]:
# torch.manual_seed(0)

# --- Hyper-parameters --------------------------------------------------------
lambda_ = 10
LAMBDA = 10
n_critic_iters_per_generator_iter = 10
batch_size = 128
lr = 1e-4
adam_beta1 = 0.5
adam_beta2 = 0.9
iterations = 9000
continue_training = True
netG_checkpoint = "/home/nvijayakumar/gcp-gan/Checkpoints/netGnottrash-600011:44:28PM_12-03-20"
netD_checkpoint = "/home/nvijayakumar/gcp-gan/Checkpoints/netDnottrash-600011:44:28PM_12-03-20"

netG = Generator(charmap).to(device=device)
netD = Discriminator(charmap).to(device=device)

train = dataloader(filtered_lines, batch_size)


if continue_training:
    # map_location lets a checkpoint saved on a GPU be restored when this
    # notebook runs on a CPU-only machine (and vice versa).
    netG.load_state_dict(torch.load(netG_checkpoint, map_location=device))
    netD.load_state_dict(torch.load(netD_checkpoint, map_location=device))
    # Checkpoint names end in "-<iteration><HH>:<MM>:<SS>...": take the text
    # before the first ':' and after the last '-', then drop the two hour digits.
    start_iter = int(netG_checkpoint.split(":")[0].split("-")[-1][:-2])
    # Advance the data stream by start_iter batches.
    # NOTE(review): each training iteration draws n_critic_iters_per_generator_iter
    # batches and the shuffle is not re-seeded, so this does not reproduce the
    # original stream position — look up a better way to do this.
    for _ in range(start_iter):
        next(train)
    print(f"Model loaded, starting at {start_iter}...")
else:
    start_iter = 1

# Use the declared betas so editing adam_beta1/adam_beta2 actually takes effect.
# NOTE(review): optimizer state is not checkpointed, so Adam restarts from
# scratch when resuming.
optimG = optim.Adam(netG.parameters(), lr=lr, betas=(adam_beta1, adam_beta2))
optimD = optim.Adam(netD.parameters(), lr=lr, betas=(adam_beta1, adam_beta2))

In [None]:
for iteration in range(start_iter, iterations + 1):
    # ---- Critic phase: re-enable critic gradients (disabled for the G step).
    for p in netD.parameters():
        p.requires_grad = True

    for i in range(n_critic_iters_per_generator_iter):
        real_inputs_discrete = next(train)
        real_data = F.one_hot(real_inputs_discrete, num_classes=len(charmap)).float()  # x
        latent_variable = torch.randn(batch_size, 128).to(device=device)  # z
        alpha = torch.rand(batch_size, 1, 1).to(device=device)  # epsilon

        # The generator is not updated here, so skip building its graph
        # instead of back-propagating the critic loss into it.
        with torch.no_grad():
            fake_data = netG(latent_variable)  # x_tilde

        # x_hat: a fresh leaf tensor so we can differentiate D w.r.t. it.
        interpolates = (alpha * real_data + (1 - alpha) * fake_data).detach().requires_grad_(True)
        disc_interpolates = netD(interpolates)  # D_w(x_hat)
        # autograd.grad returns the gradient without populating .grad
        # attributes; create_graph=True keeps it differentiable.
        gradients = autograd.grad(outputs=disc_interpolates, inputs=interpolates,  # grad D_w(x_hat)
                                  grad_outputs=torch.ones_like(disc_interpolates),
                                  create_graph=True, retain_graph=True)[0]

        # Per-sample L2 norm over ALL input axes; norm over dim=1 alone would
        # only cover one axis of the (batch, length, vocab) gradient.
        gradient_norm = gradients.reshape(gradients.size(0), -1).norm(2, dim=1)
        gradient_penalty = ((gradient_norm - 1) ** 2).mean() * lambda_

        disc_real = netD(real_data).mean()  # D_w(x)
        disc_fake = netD(fake_data).mean()  # D_w(x_tilde)

        loss = disc_fake - disc_real + gradient_penalty  # L
        loss.backward()
        optimD.step()
        netD.zero_grad()

    # ---- Generator phase: freeze the critic to avoid computing its grads.
    for p in netD.parameters():
        p.requires_grad = False
    netG.zero_grad()

    latent_variable = torch.randn(batch_size, 128).to(device=device)  # z
    fake_data = netG(latent_variable)
    G = -netD(fake_data).mean()
    G.backward()
    optimG.step()

    # ---- Periodic sample dump and checkpoint.
    if iteration % 500 == 0 or iteration == 1:
        print(f"iterations {iteration}")
        real_translation = translate(real_inputs_discrete[:10].cpu().numpy())
        fake_translation = translate(fake_data[:10].detach().cpu().numpy().argmax(axis=2))
        print(f"\tFake: {fake_translation}\n\tReal: {real_translation}")
        # Timestamp in a fixed UTC-5 offset, appended to the checkpoint name.
        time = datetime.datetime.now(tz=datetime.timezone(datetime.timedelta(hours=-5))).strftime("%I:%M:%S%p_%m-%d-%y")
        torch.save(netG.state_dict(), f"/home/nvijayakumar/gcp-gan/Checkpoints/netGfinalrock-{iteration}{time}")
        torch.save(netD.state_dict(), f"/home/nvijayakumar/gcp-gan/Checkpoints/netDfinalrock-{iteration}{time}")
        
# iterations 1
# Fake: ['♠ｘﾁこο|خþฒו', 'vXూာင)ףำฝA', '˹ß>ιجאㄓOỳ\x08', 'J¸ฐXЬに♂Aẻฑ', 'ק唔δฎ̉שþ８Æల', 'үﾝŁ年�淘Ổ中ổй', 'cﾐδκجƒာూㅕą', '我ๅä!နòေåΗΙ', 'Áผศ箮Ϊọעฟ๔E', 'κฺ3ｘ7HѕŁЗ͓']
# Real: ['sebasqz|||', 'pipoylove|', 'waffle07||', 'black5678|', '6818597|||', 'ERICILUVU*', 'JUAND|||||', '21801881||', 'farouk93||', 'choiran|||']
# iterations 500
# Fake: ['uu||ll1uu|', 'uuue||||2e', 'uue|2euuu|', '8uuuuu9|||', 'l11uuuue||', 'l8uuee|22|', 'll11uuue9|', 'uuueee8uu|', 'uuell1uu9|', '8uuu|||29|']
# Real: ['sf42007|||', 'kimvan221|', '733390||||', 'kitten7791', 'Schatz4me|', '729tev||||', '2commit|||', '5625381|||', '311y5y9|||', '2262065|||']
# iterations 1000
# Fake: ['p-$1b0057|', 's150||057|', 'ma|elp-17|', 'san0s15|||', 'pw-15|||||', 'p-11nmm5||', 'mnns1a||||', 'p@a708-1a|', 's150|005||', 'm1nm57||||']
# Real: ['2041005|||', 'jaskolka24', 'spindel|||', 'katlen221|', 'maggie1021', 'STAYOUT1||', '0818615011', 'miss0ula!|', '070928945|', 'hondagrand']     

# Backup #0

In [None]:
# --- Hyper-parameters for the "Backup #0" loop -------------------------------
lambda_ = 10
LAMBDA = 10
n_critic_iters_per_generator_iter = 10
batch_size = 128
lr = 1e-4
adam_beta1 = 0.5
adam_beta2 = 0.9
iterations = 9000
netG = Generator(charmap).to(device=device)
netD = Discriminator(charmap).to(device=device)

# Use the declared betas so editing adam_beta1/adam_beta2 actually takes effect.
optimG = optim.Adam(netG.parameters(), lr=lr, betas=(adam_beta1, adam_beta2))
optimD = optim.Adam(netD.parameters(), lr=lr, betas=(adam_beta1, adam_beta2))

train = dataloader(filtered_lines, batch_size)

In [None]:
# range() now includes `iterations` itself, matching the other training loops;
# previously the final iteration (and its checkpoint) was never reached.
for iteration in range(1, iterations + 1):
    # ---- Critic phase.
    for i in range(n_critic_iters_per_generator_iter):
        real_inputs_discrete = next(train)
        real_inputs = F.one_hot(real_inputs_discrete, num_classes=len(charmap)).float()
        latent_data = torch.randn(batch_size, 128).to(device=device)
        # No generator graph during critic updates: otherwise the critic loss
        # accumulates into netG's .grad and corrupts the next generator step.
        with torch.no_grad():
            fake_inputs = netG(latent_data)
        fake_inputs_discrete = fake_inputs.argmax(dim=2)

        disc_real = netD(real_inputs)
        disc_fake = netD(fake_inputs)

        alpha = torch.rand(batch_size, 1, 1).to(device=device)
        differences = fake_inputs - real_inputs

        # Fresh leaf tensor so D can be differentiated w.r.t. it.
        interpolates = (real_inputs + alpha * differences).detach().requires_grad_(True)

        # autograd.grad with create_graph=True yields a gradient the penalty
        # can back-propagate through; reading .grad after a plain backward()
        # gave a constant and also polluted the parameter gradients.
        disc_interpolates = netD(interpolates)
        gradients = autograd.grad(outputs=disc_interpolates, inputs=interpolates,
                                  grad_outputs=torch.ones_like(disc_interpolates),
                                  create_graph=True, retain_graph=True)[0]

        # Per-sample L2 norm over all input axes.
        gradient_norm = gradients.reshape(gradients.size(0), -1).norm(2, dim=1)
        gradient_penalty = ((gradient_norm - 1) ** 2).mean()

        disc_cost = torch.mean(disc_fake) - torch.mean(disc_real) + lambda_ * gradient_penalty

        disc_cost.backward()
        optimD.step()
        netD.zero_grad()

    # ---- Generator phase.
    latent_data = torch.randn(batch_size, 128).to(device=device)
    fake_inputs = netG(latent_data)
    disc_fake = netD(fake_inputs)
    gen_cost = -torch.mean(disc_fake)
    gen_cost.backward()
    optimG.step()
    netG.zero_grad()
    netD.zero_grad()  # discard critic grads produced by the generator step

    if iteration % 5 == 0:
        print(iteration)

    # ---- Periodic sample dump and checkpoint.
    if iteration % 1000 == 0 or iteration == 1:
        print(f"iterations {iteration}")
        real_translation = translate(real_inputs_discrete[:5].cpu().numpy())
        fake_translation = translate(fake_inputs_discrete[:5].cpu().numpy())
        print(f"\tFake: {fake_translation}\n\tReal: {real_translation}")
        # Build the timestamp here; this cell previously relied on a `time`
        # global left behind by another cell.
        time = datetime.datetime.now(tz=datetime.timezone(datetime.timedelta(hours=-5))).strftime("%I:%M:%S%p_%m-%d-%y")
        torch.save(netG.state_dict(), f"/home/nvijayakumar/gcp-gan/Checkpoints/netG-{iteration}{time}")
        torch.save(netD.state_dict(), f"/home/nvijayakumar/gcp-gan/Checkpoints/netD-{iteration}{time}")

# Backup

In [None]:
# --- Hyper-parameters for the "Backup" loop ----------------------------------
lambda_ = 10
LAMBDA = 10
n_critic_iters_per_generator_iter = 10
batch_size = 128
lr = 1e-4
adam_beta1 = 0.5
adam_beta2 = 0.9
iterations = 9000
generator = Generator(charmap).to(device=device)
discriminator = Discriminator(charmap).to(device=device)

# Use the declared betas so editing adam_beta1/adam_beta2 actually takes effect.
optimG = optim.Adam(generator.parameters(), lr=lr, betas=(adam_beta1, adam_beta2))
optimD = optim.Adam(discriminator.parameters(), lr=lr, betas=(adam_beta1, adam_beta2))

train = dataloader(filtered_lines, batch_size)

In [None]:
# range() now includes `iterations` itself, matching the other training loops;
# previously the final iteration (and its checkpoint) was never reached.
for iteration in range(1, iterations + 1):
    # ---- Critic phase: re-enable critic gradients (disabled for the G step).
    for p in discriminator.parameters():
        p.requires_grad = True

    for i in range(n_critic_iters_per_generator_iter):
        real_data = next(train)
        real_data_onehot = F.one_hot(real_data, num_classes=len(charmap)).float()

        latent_data = torch.randn(batch_size, 128).to(device=device)
        # The generator is not updated here, so no graph is built for it.
        with torch.no_grad():
            fake_data_onehot = generator(latent_data)

        epsilon = torch.rand(batch_size, 1, 1).to(device=device)  # [0,1]
        # Fresh leaf tensor so the critic can be differentiated w.r.t. it.
        xhat = (epsilon * real_data_onehot + (1 - epsilon) * fake_data_onehot).detach().requires_grad_(True)

        # autograd.grad with create_graph=True yields a gradient the penalty
        # can back-propagate through; reading xhat.grad after a plain
        # backward() gave a constant and polluted the parameter gradients.
        xhat_discrim = discriminator(xhat)
        xhat_grad = autograd.grad(outputs=xhat_discrim, inputs=xhat,
                                  grad_outputs=torch.ones_like(xhat_discrim),
                                  create_graph=True, retain_graph=True)[0]

        # Per-sample L2 norm over all input axes.
        grad_norm = xhat_grad.reshape(xhat_grad.size(0), -1).norm(2, dim=1)
        gradient_term = lambda_ * torch.square(grad_norm - 1.)
        loss = discriminator(fake_data_onehot).mean() - discriminator(real_data_onehot).mean() + gradient_term.mean()

        loss.backward()
        optimD.step()
        discriminator.zero_grad()

    # ---- Generator phase: freeze the critic to avoid computing its grads.
    for p in discriminator.parameters():
        p.requires_grad = False

    latent_data_gen = torch.randn(batch_size, 128).to(device=device)
    # Use the freshly sampled noise; the old code fed the last critic batch's
    # `latent_data` here and left latent_data_gen unused.
    fake_data_gen = generator(latent_data_gen)
    loss_gen = -discriminator(fake_data_gen)
    loss_gen.mean().backward()
    optimG.step()
    generator.zero_grad()
    discriminator.zero_grad()

    # ---- Periodic sample dump and checkpoint.
    if iteration % 1000 == 0 or iteration == 1:
        print(f"iterations {iteration}")
        real_translation = translate(real_data[:5].cpu().numpy())
        fake_translation = translate(fake_data_gen[:5].argmax(dim=2).cpu().numpy())
        print(f"\tFake: {fake_translation}\n\tReal: {real_translation}")
        # Timestamp in a fixed UTC-5 offset, appended to the checkpoint name.
        time = datetime.datetime.now(tz=datetime.timezone(datetime.timedelta(hours=-5))).strftime("%I:%M:%S%p_%m-%d-%y")
        torch.save(generator.state_dict(), f"/home/nvijayakumar/gcp-gan/Checkpoints/netG-{iteration}{time}")
        torch.save(discriminator.state_dict(), f"/home/nvijayakumar/gcp-gan/Checkpoints/netD-{iteration}{time}")

# Test Area

In [None]:
# Tiny autograd playground: z = sum(x * y) is a scalar, w = z ** 2.
x = torch.tensor([1., 2., 3.], requires_grad=True)
y = torch.tensor([6., 2, 10], requires_grad=True)
z = (x * y).sum()

w = z ** 2

In [None]:
# Back-propagate from z; this accumulates dz/dx into x.grad and dz/dy into y.grad.
z.backward()

In [None]:
# Display the gradient stored on x; for z = sum(x * y), dz/dx equals y.
x.grad

In [None]:
# Display y.
y

In [None]:
# Functional form: returns dz/dx as a tuple without touching x.grad.
# NOTE(review): if the z.backward() cell already ran on this same z (without
# retain_graph), a second pass through the graph is expected to raise —
# re-run the cell that builds z first.
autograd.grad(outputs=z, inputs=x, grad_outputs=torch.ones(z.size()))