In [1]:
import os
import pickle
import random
import re
import string
import sys
import time
from random import shuffle

import numpy as np
import tensorflow as tf

# The modules below (tflib, utils, models) are presumably project-local and
# resolved from the notebook's working directory, so that directory must be on
# sys.path before they are imported.
sys.path.append(os.getcwd())

import tflib.plot
import utils
import tflib as lib
import tflib.linear
import tflib.conv1d
import models

In [2]:
class VirtualArgparse:
    """Notebook stand-in for parsed command-line arguments.

    Every setting is a plain class attribute, so the class object itself can
    be used as the ``args`` namespace without being instantiated.
    """

    # ---- Input / output locations ----
    # Text file the training lines are loaded from
    # ("GAN_input.txt" was the previously used alternative).
    training_data = "final_lstm_op.txt"
    # Directory that receives the charmap pickles, samples and checkpoints.
    output_dir = "pretrained"

    # ---- Training schedule ----
    iters = 20000        # number of training iterations (200000 noted as an alternative)
    save_every = 5000    # a checkpoint is written every `save_every` iterations
    critic_iters = 10    # discriminator updates per training iteration

    # ---- Batch / model dimensions ----
    batch_size = 64
    seq_length = 10      # also passed to the dataset loader as max_length
    layer_dim = 128

    # ---- Loss ----
    lamb = 10            # multiplier applied to the gradient penalty term

In [3]:
# Bind the settings class itself (not an instance) to `args`; later cells read
# its class attributes as args.<name>.
args = VirtualArgparse

In [4]:
# Create the output directory and the sub-directories used later for
# checkpoints and generated samples. `exist_ok=True` makes the cell safe to
# re-run and avoids the check-then-create race of a separate isdir() test;
# creating a sub-directory also creates args.output_dir itself if needed.
for subdir in ('checkpoint', 'samples'):
    os.makedirs(os.path.join(args.output_dir, subdir), exist_ok=True)

In [5]:
# Load the training data. The call returns three values: the dataset lines,
# a map indexed by character (used later as charmp[c]) and its inverse
# (used later as inv_charmp[index]). max_length is set to the configured
# sequence length.
lines, charmp, inv_charmp = utils.load_dataset(
    path=args.training_data,
    max_length=args.seq_length)

loaded 774592 lines in dataset


In [6]:
# Persist both directions of the character map next to the model outputs so
# they can be reloaded later without re-reading the dataset.
for pickle_name, mapping in (('charmap.pickle', charmp),
                             ('charmap_inv.pickle', inv_charmp)):
    with open(os.path.join(args.output_dir, pickle_name), 'wb') as f:
        pickle.dump(mapping, f)

print("Number of unique characters in dataset: {}".format(len(charmp)))

Number of unique characters in dataset: 124


In [7]:

# Placeholder for one batch of real sequences as integer character indices,
# shape (batch_size, seq_length).
real_inputs_discrete = tf.placeholder(tf.int32, shape=[args.batch_size, args.seq_length])
# One-hot encode over the character map, adding a trailing axis of size len(charmp).
real_inputs = tf.one_hot(real_inputs_discrete, len(charmp))

In [8]:
# Show the real-data tensors: the integer placeholder and its one-hot encoding
# (the latter is what is passed to the discriminator).
print(real_inputs_discrete)
print(real_inputs)

Tensor("Placeholder:0", shape=(64, 10), dtype=int32)
Tensor("one_hot:0", shape=(64, 10, 124), dtype=float32)


In [9]:
# Build the generator output; it takes no input tensor from this notebook
# (the logged warning shows models.py calling tf.random_normal internally).
fake_inputs = models.Generator(args.batch_size, args.seq_length, args.layer_dim, len(charmp))
# Discrete view of the generated batch: argmax over the last axis.
fake_inputs_discrete = tf.argmax(fake_inputs, fake_inputs.get_shape().ndims-1)

W1212 00:01:54.253783  3860 deprecation_wrapper.py:119] From C:\Users\Piyush's PC\DLproject_2019\models.py:64: The name tf.random_normal is deprecated. Please use tf.random.normal instead.



In [10]:
# Show the generator output tensor (name, static shape and dtype).
print(fake_inputs)

Tensor("Reshape_2:0", shape=(64, 10, 124), dtype=float32)


In [11]:
# Discriminator (critic) outputs for the real batch and for the generated batch.
disc_real = models.Discriminator(real_inputs, args.seq_length, args.layer_dim, len(charmp))
disc_fake = models.Discriminator(fake_inputs, args.seq_length, args.layer_dim, len(charmp))

# Critic cost: mean score on fakes minus mean score on reals.
discriminator_cost = tf.reduce_mean(disc_fake) - tf.reduce_mean(disc_real)
# Generator cost: negated mean critic score on fakes.
generator_cost = -tf.reduce_mean(disc_fake)

In [12]:
# WGAN Lipschitz (gradient) penalty.
# One random mixing coefficient per sample, broadcast over the remaining axes.
alpha = tf.random_uniform(
    shape=[args.batch_size,1,1],
    minval=0.,
    maxval=1.
)

# Random points on the straight lines between real and generated samples.
differences = fake_inputs - real_inputs
interpolates = real_inputs + (alpha*differences)
# Gradient of the critic output with respect to the interpolated inputs.
gradients = tf.gradients(models.Discriminator(interpolates, args.seq_length, args.layer_dim, len(charmp)), [interpolates])[0]
# Per-sample gradient norm. `axis` replaces the deprecated `reduction_indices`
# keyword; both name the same reduction axes.
slopes = tf.sqrt(tf.reduce_sum(tf.square(gradients), axis=[1,2]))
# Penalise deviation of the gradient norm from 1, weighted by args.lamb.
gradient_penalty = tf.reduce_mean((slopes-1.)**2)
discriminator_cost += args.lamb * gradient_penalty

# Parameter lists selected by name, so each optimizer only updates its own network.
gen_params = lib.params_with_name('Generator')
disc_params = lib.params_with_name('Discriminator')

gen_train_op = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9).minimize(generator_cost, var_list=gen_params)
disc_train_op = tf.train.AdamOptimizer(learning_rate=1e-4, beta1=0.5, beta2=0.9).minimize(discriminator_cost, var_list=disc_params)

W1212 00:01:58.103499  3860 deprecation.py:323] From C:\Users\installations\lib\site-packages\tensorflow\python\ops\math_grad.py:1205: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [13]:
# Dataset iterator
def inf_train_gen():
    """Yield integer-encoded training batches forever.

    Each pass shuffles the global `lines` in place, then walks it in steps of
    `args.batch_size`, yielding only full batches (a trailing partial batch is
    dropped). Every batch is an int32 array built by looking up each character
    of each line in `charmp`.
    """
    while True:
        np.random.shuffle(lines)
        step = args.batch_size
        for start in range(0, len(lines) - step + 1, step):
            batch = lines[start:start + args.batch_size]
            encoded = [[charmp[ch] for ch in seq] for seq in batch]
            yield np.array(encoded, dtype='int32')

In [14]:
# JS divergence between the true and the generated n-gram distributions is
# tracked during training for n = 1..4. To see what values to expect, the same
# statistic is first computed between the first 10 batches' worth of lines
# (treated as a held-out set) and the remaining lines.
held_out_size = 10*args.batch_size
ngram_orders = range(1, 5)

true_char_ngram_lms = [utils.NgramLanguageModel(n, lines[held_out_size:], tokenize=False) for n in ngram_orders]
validation_char_ngram_lms = [utils.NgramLanguageModel(n, lines[:held_out_size], tokenize=False) for n in ngram_orders]
for n, true_lm, validation_lm in zip(ngram_orders, true_char_ngram_lms, validation_char_ngram_lms):
    print("validation set JSD for n={}: {}".format(n, true_lm.js_with(validation_lm)))

# Rebuild the reference models on the full dataset for use during training.
true_char_ngram_lms = [utils.NgramLanguageModel(n, lines, tokenize=False) for n in ngram_orders]

validation set JSD for n=1: 0.004985971679623904
validation set JSD for n=2: 0.0667152892948728
validation set JSD for n=3: 0.3281906483195437
validation set JSD for n=4: 0.6475340885547007


In [15]:
# Training loop: each iteration performs one generator update followed by
# `args.critic_iters` discriminator (critic) updates. Costs are logged through
# lib.plot, samples are written every 100 iterations and checkpoints every
# `args.save_every` iterations. Training is slow; expect a noticeable wait per
# 100 iterations.
with tf.Session() as session:

    # Time stamp
    localtime = time.asctime( time.localtime(time.time()) )
    print("Starting TensorFlow session...")
    print("Local current time :", localtime)

    # Initialise all graph variables.
    session.run(tf.global_variables_initializer())

    # Build the Saver once. Creating a new Saver at every checkpoint adds a
    # fresh set of save/restore ops to the graph each time. max_to_keep=None
    # keeps every checkpoint on disk, as the per-checkpoint Savers did.
    model_saver = tf.train.Saver(max_to_keep=None)

    def generate_samples():
        """Run the generator once and decode the batch.

        Returns a list with one tuple per generated sequence, each tuple
        holding the entries of `inv_charmp` selected by the argmax over the
        last axis of the generator output.
        """
        samples = session.run(fake_inputs)
        samples = np.argmax(samples, axis=2)
        return [tuple(inv_charmp[index] for index in row) for row in samples]

    gen = inf_train_gen()
    lib.plot.output_dir = args.output_dir

    for iteration in range(args.iters + 1):

        start_time = time.time()

        # Train Generator: exactly one update per iteration, skipped on the
        # very first iteration; the cost is fetched either way so it can be
        # logged. The generator must NOT also be stepped inside the critic loop
        # below: that would give it critic_iters extra updates per iteration.
        # The feed previously used there targeted fake_inputs_discrete, which
        # generator_cost does not depend on, so it had no effect.
        if iteration > 0:
            _gen_cost, _ = session.run([generator_cost, gen_train_op])
        else:
            _gen_cost = session.run(generator_cost)

        # Train Discriminator
        for i in range(args.critic_iters):
            _data = next(gen)
            _disc_cost, _ = session.run(
                [discriminator_cost, disc_train_op],
                feed_dict={real_inputs_discrete: _data}
            )

        lib.plot.plot('time', time.time() - start_time)
        lib.plot.plot('train generator cost', _gen_cost)

        lib.plot.plot_generator('time', time.time() - start_time)
        lib.plot.plot_generator('train discriminator cost', _disc_cost)

        # Every 100 iterations: draw 10 batches of samples, log their n-gram
        # JS divergence against the dataset, and write them to a text file.
        if iteration % 100 == 0 and iteration > 0:

            samples = []
            for i in range(10):
                samples.extend(generate_samples())

            for i in range(4):
                lm = utils.NgramLanguageModel(i+1, samples, tokenize=False)
                lib.plot.plot('js{}'.format(i+1), lm.js_with(true_char_ngram_lms[i]))

            # Format only the file name, so braces in output_dir cannot break it.
            samples_path = os.path.join(args.output_dir, 'samples', 'samples_{}.txt'.format(iteration))
            with open(samples_path, 'w') as f:
                for s in samples:
                    f.write("".join(s) + "\n")

        if iteration % args.save_every == 0 and iteration > 0:
            checkpoint_path = os.path.join(args.output_dir, 'checkpoint', 'checkpoint_{}.ckpt'.format(iteration))
            model_saver.save(session, checkpoint_path)
            print("{} / {} ({}%)".format(iteration, args.iters, iteration/args.iters*100.0 ))

        if iteration == args.iters:
            print("...Training done.")

        if iteration % 100 == 0:
            lib.plot.flush_generator()
            lib.plot.flush()

        lib.plot.tick()

# Time stamp
localtime = time.asctime( time.localtime(time.time()) )
print("Ending TensorFlow session.")
print("Local current time :", localtime)

Starting TensorFlow session...
Local current time : Thu Dec 12 00:02:53 2019


KeyboardInterrupt: 

In [None]:
def randomString(stringLength=4):
    """Return a string of `stringLength` randomly chosen lowercase ASCII letters."""
    alphabet = string.ascii_lowercase
    picked = []
    for _ in range(stringLength):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

In [None]:
# Copy every line of the rockyou dataset that contains at least one generated
# sample as a substring into output_23nov.txt.
with open('pretrained/samples/samples_3900.txt', 'r') as k:
    # Drop blank lines: an empty keyword is a substring of every line and
    # would make the filter copy the whole dataset.
    keywords = [key for key in k.read().splitlines() if key]

# Both files are closed by the `with` statement, so no explicit close() is needed.
with open('rockyou_dataset.txt',errors='ignore') as f, open('output_23nov.txt', 'w') as o:
    for line in f:
        if any(key in line for key in keywords):
            o.write(line)

In [None]:
# Check whether the typed text is a proper prefix of any generated sample; if
# so record it in output_guess.txt, otherwise report that it is unknown.
result = []
with open('pretrained/samples/samples_3900.txt', 'r') as k:
    keywords = k.read().splitlines()

result.append(keywords)
user = input()

# re.match works on one string at a time, so test each sample individually.
# The typed text is escaped so characters such as '.', '*' or '(' are matched
# literally instead of being interpreted as regex syntax.
prefix_pattern = re.compile(r"{}.+".format(re.escape(user)))
if any(prefix_pattern.match(keyword) for keyword in keywords):
    with open('output_guess.txt', 'w') as o:
        o.write(user + "\n")
else:
    print("add to lookup for next time")

In [None]:
# Recommend lines from output_23nov.txt that start with the typed text,
# appending a random suffix to each recommendation.
user = input()
# Random suffix; defined here so the cell does not depend on a later cell
# having been run first.
keys = randomString()
# Escape the typed text so regex metacharacters in it are matched literally.
prefix_pattern = re.compile(r"{}.+".format(re.escape(user)))
appnd = []

with open('output_23nov.txt', 'r') as out_file:
    for line in out_file:
        # NOTE(review): the length-200 clause is kept from the original; its
        # purpose is not evident from this notebook.
        if prefix_pattern.match(line) or len(line)==200:
            print("recommendations are",line)
            appnd.append(line.rstrip('\n')+keys)
print("your input", user)

print(appnd)

In [None]:
# Recommend generated samples (iteration 1800) that start with the typed
# text, appending a random suffix to each recommendation.
user = input()
keys = randomString()
# Escape the typed text so regex metacharacters in it are matched literally.
prefix_pattern = re.compile(r"{}.+".format(re.escape(user)))
appnd = []

# The `with` block closes the samples file once the scan is finished.
with open("pretrained/samples/samples_1800.txt", "r") as thefile:
    for line in thefile:
        # NOTE(review): the length-200 clause is kept from the original; its
        # purpose is not evident from this notebook.
        if prefix_pattern.match(line) or len(line)==200:
            print("recommendations are",line)
            appnd.append(line.rstrip('\n')+keys)
print("your input", user)

print(appnd)

In [None]:
topgan = open("pretrained/samples/top10GAN.txt", "r")
#userip = open("pretrained/samples/userip.txt","r")

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import matplotlib.pyplot as plt 
import numpy as np
%matplotlib inline
user = 'yars'
count = 0
list_best = []
n_list=[]

ganrange=topgan.readlines()
for i in ganrange :
#     for j in userip.readlines():

    list_best.append(fuzz.ratio(user, i))
        


print(list_best)
x_list=[]
for count in range(0,len(ganrange)):
    x_list.append(count)
low_score = 20
c=0
for i in list_best : 
    if i > low_score : 
        c+= 1

        
maxpos = list_best.index(max(list_best)) 

topgan
print ("The numbers greater than low score : " + str(c)) 
x=x_list
y=list_best
plt.xlabel("iter")
plt.ylabel("score")
plt.plot(x,y)

print("done")

#epoch=range(0,len(list_best))


plt.show()
                
            

In [None]:
ganip = open("data/threethpasswords.txt","r",errors='ignore')
ganop = open("pretrained/samples/sample6000.txt", "r")


from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import matplotlib.pyplot as plt 
import numpy as np
%matplotlib inline
user = 'yars'
count = 0
list_best = []


ganrange=ganip.readlines()
for i in ganrange :
    for j in ganop.readlines():

        list_best.append(fuzz.ratio(j, i))
        


print(len(list_best))
x_list=[]
for count in range(0,len(ganrange)):
    x_list.append(count)


low_score = 50
c=0
for i in list_best : 
    if i > low_score : 
        c +=1
        
print ("The numbers greater than low score : " + str(c)) 

x=x_list
y=list_best
plt.xlabel("iterations")
plt.ylabel("score %")
plt.title('Loss plot (similarity score)')
plt.plot(x,y)

print("done")

#epoch=range(0,len(list_best))


plt.show()
                
            

In [None]:
# Write the distinct lines of sample6000.txt to output_unique6k.
with open('pretrained/samples/sample6000.txt','r') as f:
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # output is the same on every run. Splitting without the line terminators
    # also stops an unterminated last line from being glued onto another line
    # (or counted as distinct from the same text followed by a newline).
    distinct_content = list(dict.fromkeys(f.read().splitlines()))

with open('output_unique6k','w') as w:
    w.writelines(element + "\n" for element in distinct_content)

In [None]:
#####################unique words#######################


In [None]:
# NOTE(review): `documents` is empty here, so the call at the bottom of this
# cell has nothing to vectorise — presumably it was meant to be filled first.
documents = []
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): `LemNormalize` is not defined or imported anywhere in this
# notebook as shown; this line needs it to exist before the cell can run.
TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')
def cos_similarity(textlist):
    """Return the product of the TF-IDF matrix of `textlist` with its transpose.

    The result is a dense square array with one row and one column per entry
    of `textlist`. Presumably these are pairwise cosine similarities, which
    holds if the vectoriser L2-normalises each row — TODO confirm.
    Note that the shared module-level `TfidfVec` is re-fitted on every call.
    """
    tfidf = TfidfVec.fit_transform(textlist)
    return (tfidf * tfidf.T).toarray()
cos_similarity(documents)

In [None]:
# Histogram of similarity scores on the [0, 1] range, using 200 bins.
# NOTE(review): `Tfidf_scores` is not assigned anywhere in this notebook as
# shown; it must be defined before this cell can run.
plt.figure(figsize=(12,4))
plt.hist(Tfidf_scores, bins = 200)
plt.xlim(0,1)
plt.show()

In [None]:
pwd

In [None]:
latest = tf.train.latest_checkpoint('pretrained/checkpoint/')

In [None]:
print(latest)

In [None]:
# Edit distance between the full text of the two files.
import stringdist

# The distance function compares strings, so read the file contents instead
# of passing the open file objects; `with` also closes both files.
# Note: this compares the two whole files character by character and can be
# slow for large inputs.
with open("data/threethpasswords.txt",errors='ignore') as txt1_file:
    txt1 = txt1_file.read()
with open("pretrained/samples/samples_500.txt") as txt2_file:
    txt2 = txt2_file.read()

stringdist.levenshtein(txt1, txt2)

In [None]:
#333 similar chars
#51957
# Count lines, whitespace-separated words and non-whitespace characters in the
# generated samples file. The counters use their own names so the global
# `lines` dataset (used by the training cells) is not overwritten by an int.
with open('pretrained/samples/samples_1000.txt',errors='ignore') as infile:
    line_count_1=0
    word_count_1=0
    characters_1=0
    for line in infile:
        wordslist=line.split()
        line_count_1 += 1
        word_count_1 += len(wordslist)
        characters_1 += sum(len(word) for word in wordslist)
print(line_count_1)
print(word_count_1)
print(characters_1)

In [None]:
# Count lines, whitespace-separated words and non-whitespace characters in the
# password file. The counters use their own names so the global `lines`
# dataset (used by the training cells) is not overwritten by an int.
with open('data/threethpasswords.txt',errors='ignore') as infile:
    line_count_2=0
    word_count_2=0
    characters_2=0
    for line in infile:
        wordslist=line.split()
        line_count_2 += 1
        word_count_2 += len(wordslist)
        characters_2 += sum(len(word) for word in wordslist)
print(line_count_2)
print(word_count_2)
print(characters_2)

In [None]:
tot_characters = characters_1 + characters_2
tot_characters

In [None]:
def similar(a, b):
    """Return the difflib similarity ratio of sequences `a` and `b`.

    The value is a float in [0, 1]; 1.0 means the sequences are identical.
    """
    # Imported here because SequenceMatcher is not imported anywhere else in
    # the notebook, which made every call fail with a NameError.
    from difflib import SequenceMatcher
    return SequenceMatcher(None, a, b).ratio()

In [None]:
import textdistance

# Read both files as lists of lines; `with` closes the handles, which the
# bare open() calls left open.
with open("pretrained/samples/samples_3900.txt", "r") as l1:
    d1 = l1.readlines()

with open("data/top100leakedwords.txt","r") as l2:
    d2 = l2.readlines()

# NOTE(review): d1 and d2 are lists of lines, so both measures presumably
# operate on whole lines as sequence elements rather than on characters —
# confirm this is the intended granularity.
score = textdistance.levenshtein(d2, d1)

score_sim = textdistance.hamming.similarity(d1, d2)

In [None]:
score

In [None]:
score_sim

In [None]:
# import numpy as np
# import sklearn.cluster
# import distance

# words = "apple".split(" ") #Replace this line
# words = np.asarray(words) #So that indexing with a list will work
# text_file = open("pretrained/samples/samples_500.txt", "r")
# lines = text_file.readlines()
# print(lines)
# print(len(lines))

# lev_similarity = -1*np.array([[distance.levenshtein(w1,w2) for w1 in words] for w2 in text_file])
# lev_similarity.reshape(-1,1)

# affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=0.5)
# affprop.fit(lev_similarity)
# for cluster_id in np.unique(affprop.labels_):
#     exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
#     cluster = np.unique(words[np.nonzero(affprop.labels_==cluster_id)])
#     cluster_str = ", ".join(cluster)
#     print(" - *%s:* %s" % (exemplar, cluster_str))

In [None]:
# import numpy as np
# import matplotlib.pyplot as plt

# N = 50
# x = np.random.rand(score)
# y = np.random.rand(tot_characters)

# plt.scatter(x, y)
# plt.show()

In [None]:
def randomString(stringLength=4):
    """Build a random string of `stringLength` lowercase ASCII letters."""
    alphabet = string.ascii_lowercase
    chosen = []
    for _ in range(stringLength):
        chosen.append(random.choice(alphabet))
    return ''.join(chosen)


In [None]:
import re
import difflib
from difflib import get_close_matches 
import random 
import string

# Recommend generated samples (iteration 1800) that start with the typed
# text, appending a random suffix to each recommendation.
user = input()
keys = randomString()
# Escape the typed text so regex metacharacters in it are matched literally.
prefix_pattern = re.compile(r"{}.+".format(re.escape(user)))

appnd = []
file_ip = []

# The `with` block closes the samples file once the scan is finished.
with open("pretrained/samples/samples_1800.txt", "r") as thefile:
    for line in thefile:
        # NOTE(review): the length-200 clause is kept from the original; its
        # purpose is not evident from this notebook.
        if prefix_pattern.match(line) or len(line)==200:
            print("recommendations are",line)
            appnd.append(line.rstrip('\n')+keys)
print("your input", user)

print(appnd)




#print(thefile.find("data/threethpasswords.txt"))
############################################################################################
#print(thefile.findall(''.format(user),2))

In [None]:
import textdistance

# Read both files as lists of lines; `with` closes the handles, which the
# bare open() calls left open.
with open("pretrained/samples/samples_1800.txt", "r") as l1:
    d1 = l1.readlines()

with open("data/top100leakedwords.txt","r") as l2:
    d2 = l2.readlines()

# NOTE(review): d1 and d2 are lists of lines, so both measures presumably
# operate on whole lines as sequence elements rather than on characters —
# confirm this is the intended granularity.
score = textdistance.levenshtein(d2, d1)
score_sim = textdistance.hamming.similarity(d2, d1)

In [None]:
score

In [None]:
score_sim

In [None]:
# thefile = open("pretrained/samples/samples_1000.txt", "r")


# user = input() 
# print(str(user))
# # returns first occurrence of Substring 
# result = thefile.find(user) 
# print ("Substring  found at index:", result ) 


# #fileNameOnly = thefile[:thefile.find(user)]
# print(fileNameOnly)

In [None]:
# Count the lines of the password file that do not appear (as exact lines)
# among the generated samples.
with open("data/threethpasswords.txt","r",errors = 'ignore') as input_english:
    dict1 = input_english.readlines()
with open("pretrained/samples/samples_3900.txt", "r") as ganfile:
    dict2 = ganfile.readlines()

# Set membership keeps the filter linear; testing `x not in dict2` against a
# list rescans the whole list for every input line.
generated_lines = set(dict2)
df = [ x for x in dict1 if x not in generated_lines ]
print(len(df))
#print(thefile.find("data/threethpasswords.txt"))

In [None]:
import pickle

In [None]:
# Reload the inverse character map written earlier by this notebook.
# The `with` block closes the file on exit.
file = "pretrained/charmap_inv.pickle"
with open(file, 'rb') as f1:
    lgr = pickle.load(f1)

In [None]:
lgr