# PassGAN

In [1]:
import csv
import datetime
import glob
import pickle
from collections import OrderedDict

import numpy as np
import pandas as pd
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from data import load_dataset, dump_txt_to_pickle, load_data_from_pickle, dataloader, translate
from models import Generator, write_predictions, predict_many, predict_one
from training import training_loop

# Select the compute device once; later cells move models and tensors to it.
device = "cuda" if torch.cuda.is_available() else "cpu"
# torch.cuda.get_device_name() raises when no CUDA device is present, so only
# query it on the GPU path and fall back to a plain label on CPU.
device_name = torch.cuda.get_device_name() if device == "cuda" else "CPU"
print(device_name, "|", torch.cuda.is_available())

Tesla T4 | True


# Loading Data

In [2]:
dataset_name = "dubsmash"
path = "Data/dubsmash_processed.txt"

# Preprocessing calls kept for reference (disabled); presumably they build the
# pickles that are loaded below -- confirm against data.py before re-enabling.
# filtered_lines, charmap, inv_charmap = load_dataset(path)
# dump_txt_to_pickle(path, dataset_name, test_size=0.1)

# Load the pickled train/test lines and both character maps, timing the load.
t = datetime.datetime.now()
train_lines, test_lines, charmap, inv_charmap = load_data_from_pickle(
    dataset_name,
    test_data=True,
)
elapsed = datetime.datetime.now() - t
print(elapsed)

0:00:23.469253


In [3]:
# Report how many lines ended up in each split.
n_train, n_test = len(train_lines), len(test_lines)
print(f"\nTraining Size:\t{n_train:>7}\nTesting Size:\t{n_test:>7}")


Training Size:	19458235
Testing Size:	1024117


In [4]:
# Pull one batch from the loader and decode it back to text as a sanity check.
# The second argument is presumably the batch size (the output shows 8 rows).
train = dataloader(train_lines, 8)
first_batch = next(train)
translate(first_batch, inv_charmap)

['12345|||||',
 'momdad1727',
 '123456789|',
 'cutiekay89',
 'haley|||||',
 '123abc||||',
 'christina|',
 'nina123|||']

# Training loop

Function parameters

In [5]:
lines = train_lines

# Hyperparameters and checkpoint paths handed to training_loop.
# `dataloader` is the function imported from data.py and is passed through as-is.
args = {
    'lambda_': 10,
    'n_critic_iters_per_generator_iter': 10,
    'batch_size': 128,
    'lr': 1e-4,
    'adam_beta1': 0.5,
    'adam_beta2': 0.9,
    'iterations': 3000,
    # Resume from the generator/critic checkpoints below instead of starting fresh.
    'continue_training': True,
    'netG_checkpoint': "Checkpoints/netG-200002:00:07AM_12-04-20",
    'netD_checkpoint': "Checkpoints/netD-200002:00:07AM_12-04-20",
}

training_loop(lines, charmap, inv_charmap, dataloader, args)

Model loaded, starting at 2000...
iterations 2000
	Fake: ['113406||||', 'dMar30||||', 'parenp1133', 'piMamk11||', 'Pareverle|', 'poskas||||', 'beMamag|||', '12345|||||', 'drlowis|||', 'MeeRas8as|']
	Real: ['102030||||', 'ivona.a|||', '232323||||', 'Passw0rd1|', 'nextbest15', 'BOOMPANES!', '696969||||', '12345|||||', 'carlos||||', 'grace|||||']
iterations 3000
	Fake: ['Fa84an8450', 'cupetvr123', '12345|||||', 'seamed1|||', 'doenen||||', 'madseme123', 'fneenis|||', 'meeo2840||', 'cosros8800', 'fehi123|||']
	Real: ['panthers||', 'mango50|||', 'jcs102888|', '12346|||||', 'Qwerty123|', 'Lenkasm84|', 'tahari00||', 'speedpr23|', '123456789|', '456852||||']


# Prediction

In [6]:
# Load only the test split (plus both character maps) for the prediction section.
dataset_name = "dubsmash"
t = datetime.datetime.now()
test_data, charmap, inv_charmap = load_data_from_pickle(
    dataset_name,
    train_data=False,
    test_data=True,
)
elapsed = datetime.datetime.now() - t
print(elapsed)

0:00:05.600600


In [7]:
# Rebuild the generator and restore the weights of the latest checkpoint,
# mapping tensors onto whichever device was selected at the top of the notebook.
latest_state = torch.load(
    "Checkpoints/netG-15800002:26:12PM_12-05-20",
    map_location=torch.device(device),
)  # latest model
netG = Generator(charmap).to(device)
netG.load_state_dict(latest_state)
netG.eval()

Generator(
  (lin): Linear(in_features=128, out_features=1280, bias=True)
  (block1): ResidualBlock(
    (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  )
  (block2): ResidualBlock(
    (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  )
  (block3): ResidualBlock(
    (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  )
  (block4): ResidualBlock(
    (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  )
  (block5): ResidualBlock(
    (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  )
  (conv): Conv1d(128,

In [8]:
# Output sizing for write_predictions: one file holding one batch of 100000 samples.
num_files = 1
batches_per_file = 1
batch_size = 100_000

write_predictions(netG, inv_charmap, batches_per_file, num_files, batch_size)

Generating 100000 passwords...
File 1
	Generating output...
	Translating output...
	Writing output...
	0:00:01.004930
	Total: 0:00:01.006752


How Predictions Work

Samples the latent space
```python
latent_noise = torch.randn(batch_size, 128).to(device=device)
```

Produces, for each character position, a vector of probabilities over the character classes
```python
pred = netG(latent_noise)
```

Finds the most probable class at each character position and translates the numeric indices back to characters
```python
translated_pred = translate(pred.argmax(dim=2), inv_charmap)
```

In [9]:
# Generate a handful of passwords step by step, mirroring the explanation above.
batch_size = 4
# Inference only: disable autograd so no graph is built for the forward pass.
with torch.no_grad():
    latent_noise = torch.randn(batch_size, 128).to(device=device)
    pred = netG(latent_noise)
# argmax over dim 2 picks one class index per character position.
translated_pred = translate(pred.argmax(dim=2), inv_charmap)
translated_pred

['mohom514||', 'aliria||||', 'hunna|||||', '55575|||||']

# Predictions Across Checkpoints

In [10]:
# Generator checkpoints to compare, keyed by the training iteration they were saved at.
# Insertion order is the order in which they are evaluated and printed.
checkpoints_to_predict = OrderedDict([
    ('1', "Checkpoints/netG-101:32:11AM_12-04-20"),
    ('1000', 'Checkpoints/netG-100001:46:05AM_12-04-20'),
    ('5000', "Checkpoints/netG-500002:41:42AM_12-04-20"),
    ('10000', "Checkpoints/netG-1000003:50:37AM_12-04-20"),
    ('25000', "Checkpoints/netG-2500007:19:11AM_12-04-20"),
    ('75000', "Checkpoints/netG-7500006:46:08PM_12-04-20"),
    ('158000', "Checkpoints/netG-15800002:26:12PM_12-05-20"),
])

In [36]:
# Print five sample passwords from each saved generator to show how output
# quality changes with training.
for key, value in checkpoints_to_predict.items():
    # Use a separate name so the `netG` loaded earlier in the notebook is not
    # silently replaced by whichever checkpoint happens to be last in the dict.
    ckpt_netG = Generator(charmap).to(device)
    ckpt_netG.load_state_dict(torch.load(value, map_location=torch.device(device)))
    # Put the model in inference mode, as is done when the latest checkpoint is loaded.
    ckpt_netG.eval()
    samples = predict_one(ckpt_netG, inv_charmap, 5)
    print(f"Checkpoint iterations {key:>6}: \t {samples}")

Checkpoint iterations      1: 	 ['$?5lKqnN#<', '_,|V+Vzo0$', 'Ej[22_Nw78', '-D{7unkQkI>O', '7,[#Mz/M_Q']
Checkpoint iterations   1000: 	 ['aracarie16', 'bines2||||', '123456te6|', 'kiinel||||', 'hote5|||||']
Checkpoint iterations   5000: 	 ['borinyon||', '12345|||||', 'soinh1||||', 'mam1231|||', 'seyis|1|||']
Checkpoint iterations  10000: 	 ['16128yn6||', 'Huweme||||', '123456||||', 'Perweblo|.', 'snbslo||||']
Checkpoint iterations  25000: 	 ['pugharda||', 'almarty8ip', '1234567893', 'minhe01|||', 'piprica1||']
Checkpoint iterations  75000: 	 ['aneyha||||', 'vergMet55|', 'tadona||||', '123456789|', 'teottme|||']
Checkpoint iterations 158000: 	 ['2121912|||', '12345|||||', '12345678||', 'kanda|||||', '123756789|']


# Experimental Results

In [12]:
# glob returns paths in arbitrary order; sort so the concatenated row index is
# the same on every run.
pred_paths = sorted(glob.glob("Predictions/predfile_*_10000000.txt"))
if not pred_paths:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No prediction files match Predictions/predfile_*_10000000.txt")

t = datetime.datetime.now()
# Read every line verbatim as text:
#   dtype=str              -> no numeric inference on all-digit passwords
#   quoting=csv.QUOTE_NONE -> a '"' inside a password is data, not a quote character
#   na_filter=False        -> strings such as "NA"/"null" are never turned into NaN
list_of_dfs = [
    pd.read_table(
        path,
        names=["Password"],
        dtype=str,
        quoting=csv.QUOTE_NONE,
        na_filter=False,
    )
    for path in pred_paths
]
predictions = pd.concat(list_of_dfs, axis=0, ignore_index=True)
print(datetime.datetime.now() - t)

# NOTE(review): `test_data` is rebound here from the loaded (encoded) test set to
# a de-duplicated Series of decoded strings, so this cell is not safe to re-run
# without first re-running the cell that loads `test_data`.
orig_test_data = pd.Series(translate(test_data, inv_charmap))
test_data = orig_test_data.drop_duplicates()

0:07:14.361398


In [13]:
# Display the concatenated frame of generated passwords (pandas abbreviates the repr).
predictions

Unnamed: 0,Password
0,obeve797||
1,rinline1||
2,woley14|||
3,12345|||||
4,12345|||||
...,...
999999995,buepyr||||
999999996,almbers|||
999999997,12345|||||
999999998,041010||||


In [14]:
# De-duplicate the generated passwords, timing the operation; the first
# occurrence of each value is kept along with its original row label.
t = datetime.datetime.now()
password_column = predictions['Password']
unique_predictions = password_column.drop_duplicates()
elapsed = datetime.datetime.now() - t
print(elapsed)
unique_predictions

0:09:40.631642


0            obeve797||
1            rinline1||
2            woley14|||
3            12345|||||
5            blar||||||
                ...    
999999955    tirian20||
999999973    taywl2s14|
999999975    makimuraty
999999978    moobovass|
999999983    marenebaR|
Name: Password, Length: 163341589, dtype: object

In [15]:
# At this point test_data is the de-duplicated Series of decoded test passwords.
test_data

0          123456789|
1          09090909||
2          naveen||||
3          jerry|||||
4          jultomten|
              ...    
1024107    malakhai15
1024108    alejo17|||
1024111    91gosane||
1024115    Lilybug5||
1024116    drpepper22
Length: 346442, dtype: object

In [16]:
t = datetime.datetime.now()

# Which de-duplicated predictions also occur in the de-duplicated test set.
preds_mask = unique_predictions.isin(test_data)
matched_preds = unique_predictions[preds_mask]
num_unique_generated = len(unique_predictions)

# Keep a small random sample of the misses for later inspection. Cap n so that
# Series.sample does not raise when fewer than 100 predictions are unmatched.
num_unmatched = int((~preds_mask).sum())
unmatched_generated_pws = unique_predictions[~preds_mask].sample(n=min(100, num_unmatched))

# Test rows (duplicates included) whose password was generated. Computed once
# and reused for the proportion below rather than repeating the isin scan.
test_matched_mask = orig_test_data.isin(matched_preds)

proportion_in_unique_predictions_that_matched = preds_mask.mean()
proportion_of_test_set_that_matched = test_matched_mask.mean()
proportion_of_deduped_test_set_that_matched = test_data.isin(matched_preds).mean()
proportion_that_was_uniquely_generated = num_unique_generated/len(predictions)
print(datetime.datetime.now() - t)

0:00:54.602040


**Sample of Matched Passwords**

In [37]:
# Remove the '|' padding for display. regex=False makes the literal match
# explicit: '|' is a regex metacharacter and the default for `regex` differs
# between pandas versions.
matched_preds.str.replace("|", "", regex=False).sample(n=10)

15563444        mollyy
1168622       angles99
193304551    sahara123
1821719       m1234567
211847         melinda
170792819     amera123
211771734       jersey
11814803        101105
472676459    darla2013
6666740       28082003
Name: Password, dtype: object

**Proportion of unique predictions generated that matched with a password in the test set**

In [18]:
# Mean of preds_mask: share of de-duplicated predictions that occur in test_data.
proportion_in_unique_predictions_that_matched

0.00020288770424536521

**Proportion of test set whose passwords were found**

In [19]:
# Share of all test rows (duplicates included) whose password is among matched_preds.
proportion_of_test_set_that_matched

0.47678341439503497

**Proportion of deduped test set whose passwords were found**

In [20]:
# Share of de-duplicated test passwords that are among matched_preds.
proportion_of_deduped_test_set_that_matched

0.09565814768417225

**Number of unique passwords that were generated**

In [21]:
# len(unique_predictions): number of distinct generated passwords.
num_unique_generated

163341589

**Proportion of generated passwords that are unique**

In [22]:
# num_unique_generated divided by the total number of generated rows, len(predictions).
proportion_that_was_uniquely_generated

0.163341589

**Sample of unmatched generated passwords**

In [42]:
# Show 10 rows of the unmatched-prediction sample drawn when the metrics were
# computed. The draw is unseeded, so the rows differ from run to run.
unmatched_generated_pws.sample(n=10)

2321788      miay1333||
269112706    to412515||
450393001    Porthe608|
156489414    Perkla98||
315043617    Aohstriey|
982030372    gabaebn50|
950503444    cedeni14||
203685236    17061311||
28032982     oMkazkioty
281175338    peypal01||
Name: Password, dtype: object

**Sample of unmatched test set passwords**

In [24]:
# Ten random test rows whose password was never produced by the generator.
orig_test_data.loc[~test_matched_mask].sample(n=10)

162258    joey1021||
6906      1607681|||
375976    9946805264
583564    purple01||
799334    9oo7548154
532069    titanic|||
296639    killbill||
579585    0987654321
812319    07272001||
927112    sherlock||
dtype: object