In [1]:
import csv
import datetime
import os
import pickle
from collections import OrderedDict

import dask
import numpy as np
import pandas as pd
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from data import load_data_from_pickle, translate, load_dataset, dump_txt_to_pickle
from models import Generator, predict_many, predict_one

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
#print(torch.cuda.get_device_name(), "|", torch.cuda.is_available())

# Data Loading

In [2]:
# One-time preprocessing, kept for reference but disabled: it reads the raw
# text dump and calls dump_txt_to_pickle with a 10% test split (test_size=0.1).
# NOTE(review): presumably this produces the pickles that
# load_data_from_pickle reads in the next cell - confirm before deleting.
# path = "Data/dubsmash_processed.txt"
# dataset_name = "dubsmash"

# t = datetime.datetime.now()
# filtered_lines, charmap, inv_charmap = load_dataset(path)
# dump_txt_to_pickle(path, dataset_name, test_size=0.1)
# print(datetime.datetime.now() - t)

In [3]:
dataset_name = "dubsmash"

# Request only the test split (train_data=False) and time the load.
t = datetime.datetime.now()
test_data, charmap, inv_charmap = load_data_from_pickle(
    dataset_name,
    train_data=False,
    test_data=True,
)
print(datetime.datetime.now() - t)

0:00:01.770379


In [4]:
# Run the loaded test set through translate() with inv_charmap (presumably
# index -> character decoding - confirm in data.py) and keep the full series,
# duplicates included, for the per-password coverage statistic computed later.
orig_test_data = pd.Series(translate(test_data, inv_charmap))
# NOTE(review): this rebinds `test_data` from the loaded object to a
# de-duplicated Series, so re-running this cell without first re-running the
# loading cell passes already-translated data back into translate().
test_data = orig_test_data.drop_duplicates()

In [5]:
len(test_data), len(charmap)

(650695, 95)

# Loading Latest Model

In [8]:
# Rebuild the generator and restore the weights of the latest checkpoint.
netG = Generator(charmap).to(device)
latest_weights = torch.load(
    "Checkpoints/netG-15800002:26:12PM_12-05-20",
    map_location=torch.device(device),
)
netG.load_state_dict(latest_weights)
# eval() returns the module itself, so the cell still displays the architecture.
netG.eval()

Generator(
  (lin): Linear(in_features=128, out_features=1280, bias=True)
  (block1): ResidualBlock(
    (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  )
  (block2): ResidualBlock(
    (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  )
  (block3): ResidualBlock(
    (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  )
  (block4): ResidualBlock(
    (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  )
  (block5): ResidualBlock(
    (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  )
  (conv): Conv1d(128,

# Generating Passwords from Latest Model

In [None]:
# Generation plan: every output file receives `batches_per_file` batches of
# `batch_size` samples, and `num_files` such files are produced.
batch_size = 100_000
batches_per_file = 100
num_files = 100

# Report the total number of samples the settings above add up to.
print(f"Generating {batch_size * batches_per_file * num_files} passwords...")

In [None]:
def write_predictions(netG, inv_charmap, batches_per_file, num_files, batch_size=100000, latent_dim=128):
    """Sample passwords from ``netG`` and write them to text files under ``Predictions/``.

    For each of ``num_files`` files, ``batches_per_file`` batches of Gaussian
    noise are pushed through the generator, the arg-max over the last axis is
    taken, the resulting index rows are converted with ``translate`` and
    written one per line to
    ``Predictions/predfile_{i}_{batch_size*batches_per_file}.txt``.

    Args:
        netG: Generator module. It is called with a ``(batch_size, latent_dim)``
            noise tensor and must return a tensor with at least three
            dimensions (``argmax(dim=2)`` is applied) - presumably
            ``(batch, sequence, vocabulary)``; confirm against ``models.py``.
        inv_charmap: Lookup passed straight to ``translate``.
        batches_per_file: Number of generator batches collected per file.
        num_files: Number of output files; numbering starts at 1.
        batch_size: Samples per generator call (the original author notes
            that at most 100k fits in memory).
        latent_dim: Width of the noise vector fed to the generator. The
            default of 128 is the value that was previously hard-coded.

    Returns:
        None. Progress and timings are printed; results are written to disk.

    Notes:
        Reads the notebook-level ``device`` to decide where noise is created.
    """
    print(f"Generating {batch_size * batches_per_file * num_files} passwords...")
    # Make sure the output directory exists so the first open() cannot fail
    # after an expensive generation pass.
    os.makedirs("Predictions", exist_ok=True)
    t_total = datetime.datetime.now()
    for i in range(1, num_files+1):
        t = datetime.datetime.now()
        print(f"File {i}")
        print("\tGenerating output...")
        preds = []
        # Pure inference: disabling autograd avoids storing activations for a
        # backward pass that never happens.
        with torch.no_grad():
            for _ in range(batches_per_file):
                noise = torch.randn(batch_size, latent_dim, device=device)
                # Move each batch of indices off the accelerator right away so
                # only one batch at a time occupies device memory.
                preds.append(netG(noise).argmax(dim=2).cpu())
        # Concatenating along the batch axis keeps whatever sequence length the
        # generator produced instead of assuming it is exactly 10.
        list_of_preds = torch.cat(preds).tolist()
        del preds
        print("\tTranslating output...")
        translated_preds = translate(list_of_preds, inv_charmap)

        del list_of_preds
        print("\tWriting output...")
        with open(f"Predictions/predfile_{i}_{batch_size*batches_per_file}.txt", 'w') as f:
            f.writelines(pred + "\n" for pred in translated_preds)
        print(f"\t{datetime.datetime.now() - t}")
        del translated_preds
    print(f"\tTotal: {datetime.datetime.now() - t_total}")

In [9]:
# Long-running: generates batch_size * batches_per_file * num_files samples
# and writes them to num_files text files.
write_predictions(netG, inv_charmap, batches_per_file, num_files, batch_size)

Generating 1000000000 passwords...


In [12]:
# Sanity check of the generated output: line count of the first file, then
# the size of every file in the output directory.
!wc -l Predictions/predfile_1_10000000.txt # 10million = 105MB
!ls -lh Predictions

10000000 Predictions/predfile_1_10000000.txt
total 11G
-rw-rw-r-- 1 nvijayakumar nvijayakumar 105M Dec  6 04:00 predfile_100_10000000.txt
-rw-rw-r-- 1 nvijayakumar nvijayakumar 105M Dec  6 01:49 predfile_10_10000000.txt
-rw-rw-r-- 1 nvijayakumar nvijayakumar 105M Dec  6 01:50 predfile_11_10000000.txt
-rw-rw-r-- 1 nvijayakumar nvijayakumar 105M Dec  6 01:52 predfile_12_10000000.txt
-rw-rw-r-- 1 nvijayakumar nvijayakumar 105M Dec  6 01:53 predfile_13_10000000.txt
-rw-rw-r-- 1 nvijayakumar nvijayakumar 105M Dec  6 01:55 predfile_14_10000000.txt
-rw-rw-r-- 1 nvijayakumar nvijayakumar 105M Dec  6 01:56 predfile_15_10000000.txt
-rw-rw-r-- 1 nvijayakumar nvijayakumar 105M Dec  6 01:58 predfile_16_10000000.txt
-rw-rw-r-- 1 nvijayakumar nvijayakumar 105M Dec  6 01:59 predfile_17_10000000.txt
-rw-rw-r-- 1 nvijayakumar nvijayakumar 105M Dec  6 02:01 predfile_18_10000000.txt
-rw-rw-r-- 1 nvijayakumar nvijayakumar 105M Dec  6 02:02 predfile_19_10000000.txt
-rw-rw-r-- 1 nvijayakumar nvijayakumar 105

The cells below draw a few sample passwords from generator checkpoints saved at different training iterations.

# Predictions from Various Checkpoints

In [12]:
# Generator checkpoints to compare, keyed by the iteration label that is
# printed next to each sample. Insertion order is the evaluation order.
checkpoints_to_predict = OrderedDict([
    ('1', "Checkpoints/netG-101:32:11AM_12-04-20"),
    ('5000', "Checkpoints/netG-500002:41:42AM_12-04-20"),
    ('10000', "Checkpoints/netG-1000003:50:37AM_12-04-20"),
    ('25000', "Checkpoints/netG-2500007:19:11AM_12-04-20"),
    ('75000', "Checkpoints/netG-7500006:46:08PM_12-04-20"),
    ('158000', "Checkpoints/netG-15800002:26:12PM_12-05-20"),
])

In [21]:
# Draw five samples from each saved generator so the outputs of different
# checkpoints can be compared side by side.
for key, value in checkpoints_to_predict.items():
    netG = Generator(charmap).to(device)
    # map_location lets checkpoints that were saved on a GPU load on a
    # CPU-only machine, matching how the latest checkpoint is loaded earlier
    # in this notebook.
    netG.load_state_dict(torch.load(value, map_location=torch.device(device)))
    # Put the module in inference mode, as is done for the latest checkpoint;
    # this also leaves `netG` in eval mode once the loop has finished.
    netG.eval()
    print(f"Checkpoint iterations {key}: \t {predict_one(netG, inv_charmap, 5)}")

Checkpoint iterations 1: 	 ['EbApeY>bK=', 'A[;OFq!ajK', "Pb(d7unk'qtP", '..}&}c05np', 'k!=G a.YCz']
Checkpoint iterations 5000: 	 ['howoriy8||', 'ilintnlo||', 'cortoe2345', '12brrtto||', 'samirta|||']
Checkpoint iterations 10000: 	 ['auman!||||', 'variane|||', 'bebrw6133|', 'awerebyw9|', 'fanrarg12|']
Checkpoint iterations 25000: 	 ['blericas||', 'Wapererah|', 'allafosty|', '12345|||||', 'sikcrisho|']
Checkpoint iterations 75000: 	 ['041515bo||', 'jamera||||', 'simsah1|||', 'carny}20||', 'danithi2||']
Checkpoint iterations 158000: 	 ['l3048806||', 'towe170509', 'camrrsa|||', '12345|||||', '00301|||||']


# Experimental Results and Analysis via Dask

In [6]:
# NOTE(review): these imports sit mid-notebook and are repeated in the later
# "Dask" section. Client() is created with no arguments; the output below
# shows it connected to a scheduler on 127.0.0.1 with 4 workers.
import dask.dataframe as dd
from dask.distributed import Client
client = Client()
# Bare expression so the notebook renders the client summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:55941  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 12  Memory: 17.18 GB


In [7]:
t = datetime.datetime.now()
# Each file holds one raw password per line, so every CSV convenience that
# would rewrite the text is switched off:
#   * quoting=csv.QUOTE_NONE - a '"' inside a password is ordinary data, not
#     the start of a quoted (possibly multi-line) field;
#   * na_filter=False        - passwords such as "NA", "null" or "None" stay
#     strings instead of turning into NaN;
#   * dtype=str              - all-digit passwords keep their leading zeros.
predictions = dd.read_table(
    "Predictions/predfile_*_10000000.txt",
    names=["Passwords"],
    dtype=str,
    na_filter=False,
    quoting=csv.QUOTE_NONE,
)  # reads all generated passwords
# len() forces a full pass over every file, which is what the timing measures.
print(len(predictions))
print(datetime.datetime.now() - t)
predictions

1000000000
0:02:07.150311


Unnamed: 0_level_0,Passwords
npartitions=200,Unnamed: 1_level_1
,object
,...
...,...
,...
,...


In [8]:
# NOTE: the three expressions below are lazy Dask collections. Nothing is
# read or de-duplicated here, so the printed durations (milliseconds in the
# recorded output) only measure task-graph construction.
t = datetime.datetime.now()
# Distinct generated passwords.
unique_predictions = predictions['Passwords'].drop_duplicates()
print(datetime.datetime.now() - t)
t = datetime.datetime.now()
# True where a distinct generated password also occurs in the de-duplicated
# test set (`test_data` is the in-memory pandas Series built earlier).
preds_mask = unique_predictions.isin(test_data)
print(datetime.datetime.now() - t)
t = datetime.datetime.now()
# The generated passwords that hit the test set.
matched_preds = unique_predictions[preds_mask]
print(datetime.datetime.now() - t)

0:00:00.001887
0:00:00.141336
0:00:00.000258


In [9]:
# Every Dask result below depends on the same de-duplication of the generated
# passwords. Evaluating them in ONE dask.compute call lets the scheduler share
# that work; separate len()/.compute() calls would redo it each time.
t = datetime.datetime.now()
(
    num_unique_generated,
    unmatched_generated_pws,
    matched_passwords,
    proportion_in_unique_predictions_that_matched,
    num_generated,
) = dask.compute(
    # Lazy row count of the distinct passwords (equivalent to len(...)).
    unique_predictions.size,
    # 0.01% sample of distinct passwords that are NOT in the test set; the
    # fixed seed makes the displayed sample reproducible between runs.
    unique_predictions[~preds_mask].sample(frac=0.0001, random_state=0),
    matched_preds,
    preds_mask.mean(),
    # Total number of generated rows, duplicates included.
    predictions["Passwords"].size,
)
num_unique_generated = int(num_unique_generated)
num_generated = int(num_generated)
print(datetime.datetime.now() - t)

# The remaining statistics are plain pandas on the in-memory test set.
t = datetime.datetime.now()
test_matched_mask = orig_test_data.isin(matched_passwords)
# Reuse the mask instead of running the same isin() a second time.
proportion_of_test_set_that_matched = test_matched_mask.mean()
print(datetime.datetime.now() - t)

proportion_that_was_uniquely_generated = num_unique_generated / num_generated



KeyboardInterrupt: 

**Sample of Matched Passwords**

In [None]:
# Show the first 100 matches with every "|" character removed. Slice first so
# only 100 strings are processed, and pass regex=False explicitly so "|" is
# always treated as a literal character rather than a regex alternation
# (the default of `regex` differs between pandas versions).
matched_passwords.head(100).str.replace("|", "", regex=False).to_list()

**Proportion of unique predictions generated that matched with a password in the test set**

In [None]:
proportion_in_unique_predictions_that_matched

**Proportion of test set whose passwords were found**

In [None]:
proportion_of_test_set_that_matched

**Number of unique passwords that were generated**

In [None]:
num_unique_generated

**Proportion of generated passwords that are unique**

In [None]:
proportion_that_was_uniquely_generated

**Sample of unmatched generated passwords**

In [None]:
unmatched_generated_pws

**Sample of unmatched test set passwords**

In [None]:
# First 25 test-set passwords for which the matched mask is False.
orig_test_data.loc[~test_matched_mask].head(25)

# Alternative to Dask

In [None]:
# NOTE(review): scratch experiment, superseded by the read_table cell below.
# With default arguments read_fwf infers column boundaries and a header row,
# so the first password would presumably become the column name and passwords
# containing spaces could be split - confirm before relying on this.
please_work = pd.read_fwf("Predictions/predfile_1_10000000.txt")

In [23]:
t = datetime.datetime.now()
# Read one prediction file with plain pandas and keep its distinct passwords.
# The parser options keep each line verbatim: '"' is data rather than a quote
# character, strings like "NA"/"null" are not turned into NaN, and all-digit
# passwords are not converted to numbers.
please_work = pd.read_table(
    "Predictions/predfile_1_10000000.txt",
    names=["Password"],
    dtype=str,
    na_filter=False,
    quoting=csv.QUOTE_NONE,
)['Password'].drop_duplicates()
print(datetime.datetime.now() - t)

0:00:07.678020


In [27]:
please_work.memory_usage(deep=True)

340648125

In [33]:
# 340648125 is the memory_usage(deep=True) figure displayed by the previous
# cell (bytes); dividing by 1e9 expresses it in GB.
340648125/1000000000

0.340648125

# Dask

In [1]:
# NOTE(review): repeats the imports and Client() creation from the
# "Experimental Results and Analysis via Dask" section. The execution count
# suggests this was run in a fresh kernel - confirm whether both are needed.
import dask.dataframe as dd
from dask.distributed import Client
client = Client()

0,1
Client  Scheduler: tcp://127.0.0.1:54929  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 12  Memory: 17.18 GB


In [5]:
# One raw password per line: turn off quote handling, NaN detection and
# numeric inference so that passwords containing '"', spelling "NA"/"null",
# or consisting only of digits are read back exactly as written.
predictions = dd.read_table(
    "Predictions/predfile_*_10000000.txt",
    names=["Passwords"],
    dtype=str,
    na_filter=False,
    quoting=csv.QUOTE_NONE,
)
predictions

Unnamed: 0_level_0,Passwords
npartitions=200,Unnamed: 1_level_1
,object
,...
...,...
,...
,...


In [7]:
# NOTE(review): scratch experiment (the recorded run ended in a
# KeyboardInterrupt). It de-duplicates the whole DataFrame and then samples
# 0.01% of the rows without a random_state, so the sample is not reproducible.
please_work = predictions.drop_duplicates().sample(frac=0.0001).compute()



KeyboardInterrupt: 

In [6]:
please_work.shape

(45420, 1)