In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git


In [1]:
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# add the EOS token as PAD token to avoid warnings
model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [11]:
# encode context the generation is conditioned on
# input_ids = tokenizer.encode('I enjoy walking with my cute dog', return_tensors='tf')

urls = ['google.com/', 'youtube.com/', 'github.com/', 'facebook.com/', 'amazon.com/', 'whatsapp.com/',
       'linkedin.com', 'microsoft.com']

"""
urls = [
    'weba3e13b0e15094bdfb1.com',
    'downloadbed04b373caed221f5.co.uk',
    'internetc85b5187e7b1572ce4.org',
    'linuxda81dea9231d975291.fr',
    'servere3d8fc263b3bb7eac9.co.il',
    'serverf187b7803eb8021406.com',
    'winzipg4599a8f9403b88bf6.co.uk',
    'winziphfdd7727ea03806fcb.org',
    'webi71ae8a35ce343c91e.fr'
    ]
"""

maxLength = 20

print("Output:\n" + 100 * '-')

for url in urls:
    input_ids = tokenizer.encode(url, return_tensors='tf')

    # generate text until the output length (which includes the context length) reaches 50
    greedy_output = model.generate(input_ids, maxLength)

    print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))


Output:
----------------------------------------------------------------------------------------------------
google.com/downloads/file.php?id=834

The following is
youtube.com/watch?v=XqYXqYX-X0A

github.com/bitcoin/bitcoin-wallet/

https://github.com/bitcoin/
facebook.com/events/8397777/the-great-great-great-great
amazon.com/product/B00DQQQQ/

The Best of the
whatsapp.com/

http://www.facebook.com/pages/The-
linkedin.com/

http://www.facebook.com/groups/102345
microsoft.com/en-us/library/bb81701(v=vs.85


In [64]:
# activate beam search and early_stopping

print("Output:\n" + 100 * '-')

for url in urls:
    input_ids = tokenizer.encode(url, return_tensors='tf')
    
    beam_output = model.generate(
        input_ids, 
        maxLength, 
        num_beams=5, 
        early_stopping=True
    )

    print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
google.com/media/catalog/product
youtube.com/watch?v=Xq
github.com/bitcoin/bitcoin-qt/
facebook.com/en/photos/tnc_
amazon.com/images/content/dam/
whatsapp.com/wp-content/
linkedin.com

http://www.
microsoft.com/en-us/library/


In [65]:
# set no_repeat_ngram_size to 2
print("Output:\n" + 100 * '-')

for url in urls:
    
    input_ids = tokenizer.encode(url, return_tensors='tf')

    beam_output = model.generate(
        input_ids, 
        maxLength, 
        num_beams=5, 
        no_repeat_ngram_size=2, 
        early_stopping=True
    )

    print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
google.com/media/catalog/product
youtube.com/watch?v=Xq
github.com/bitcoin/wiki/Bitcoin-
facebook.com/en/photos/tnc_
amazon.com/images/content/dam/
whatsapp.com/wp-content/
linkedin.com

http://www.
microsoft.com/en-us/library/


In [66]:
# set return_num_sequences > 1

print("Output:\n" + 100 * '-')

for url in urls:
    
    input_ids = tokenizer.encode(url, return_tensors='tf')

    beam_outputs = model.generate(
        input_ids, 
        maxLength, 
        num_beams=5, 
        no_repeat_ngram_size=2, 
        num_return_sequences=5, 
        early_stopping=True
    )

    # now we have 3 output sequences

    for i, beam_output in enumerate(beam_outputs):
      print("{}: {}".format(i, tokenizer.decode(beam_output, skip_special_tokens=True)))

Output:
----------------------------------------------------------------------------------------------------
0: google.com/media/catalog/product
1: google.com/en-us/library/
2: google.com/en-us/files/
3: google.com/downloads/file.php
4: google.com/media/catalog/metadata
0: youtube.com/watch?v=Xq
1: youtube.com/watch?v=XQ
2: youtube.com/watch?v=8q
3: youtube.com/watch?v=3q
4: youtube.com/watch?v=8X
0: github.com/bitcoin/wiki/Bitcoin-
1: github.com/bitcoin/wiki/Bitcoin_
2: github.com/bitcoin/commit/b9
3: github.com/bitcoin/commit/b7
4: github.com/bitcoin/commit/3c
0: facebook.com/en/photos/tnc_
1: facebook.com/en/photos/a.
2: facebook.com/en/photos/tnc/
3: facebook.com/en/photos/tnc-
4: facebook.com/photos/a.13315
0: amazon.com/images/content/dam/
1: amazon.com/images/forsale/
2: amazon.com/images/futures/
3: amazon.com/images/finance/prim
4: amazon.com/images/themes/photos
0: whatsapp.com/wp-content/
1: whatsapp.com/en-us/
2: whatsapp.com/en-US/
3: whatsapp.com/

http://
4: whatsapp.com

In [67]:
# set seed to reproduce results. Feel free to change the seed though to get different results
print("Output:\n" + 100 * '-')

for url in urls:
    
    input_ids = tokenizer.encode(url, return_tensors='tf')

    tf.random.set_seed(0)

    # activate sampling and deactivate top_k by setting top_k sampling to 0
    sample_output = model.generate(
        input_ids, 
        do_sample=True, 
        max_length=10,
        top_k=0
    )

    print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
google.com/store/apps/details?
youtube.com/watch?v=7N
github.com/sequio/sequio-
facebook.com/banner

E-
amazon.com/ Jim Packard 19/12
whatsapp.com/2016/09/
linkedin.com/pyconf/


microsoft.com/product/79206036


In [68]:
# set seed to reproduce results. Feel free to change the seed though to get different results

print("Output:\n" + 100 * '-')

for url in urls:
    
    input_ids = tokenizer.encode(url, return_tensors='tf')

    tf.random.set_seed(0)

    # use temperature to decrease the sensitivity to low probability candidates
    sample_output = model.generate(
        input_ids, 
        do_sample=True, 
        max_length=10, 
        top_k=0, 
        temperature=0.7
    )


    print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
google.com/watch?v=5K
youtube.com/watch?v=5L
github.com/rs/bb/sig
facebook.com/us/rodney-j
amazon.com/item/27382526
whatsapp.com/2015/07/
linkedin.com/2017/08/the
microsoft.com/...

4. B


In [72]:
# set seed to reproduce results. Feel free to change the seed though to get different results

print("Output:\n" + 100 * '-')

for url in urls:
    
    input_ids = tokenizer.encode(url, return_tensors='tf')

    tf.random.set_seed(0)

    # set top_k to 50
    sample_output = model.generate(
        input_ids, 
        do_sample=True, 
        max_length=10, 
        top_k=50
    )

    print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
google.com/item/com.hud
youtube.com/watch?v=4K
github.com/zendesk/p
facebook.com/en/photos/a.
amazon.com/collections/s/S
whatsapp.com/2014/09/
linkedin.com/post/202220
microsoft.com/media/catalog/product


In [71]:
# set seed to reproduce results. Feel free to change the seed though to get different results

print("Output:\n" + 100 * '-')

for url in urls:
    
    input_ids = tokenizer.encode(url, return_tensors='tf')

    tf.random.set_seed(0)

    # deactivate top_k sampling and sample only from 92% most likely words
    sample_output = model.generate(
        input_ids, 
        do_sample=True, 
        max_length=10, 
        top_p=0.92, 
        top_k=0
    )

    print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
google.com/group/SHOS.

youtube.com/watch?v=6M
github.com/chengamp@gmail.
facebook.com/21TheLifeSpin/
amazon.com/2015/07/01/
whatsapp.com/listen/39
linkedin.com/group/head_ru
microsoft.com/infosec/workflow


In [15]:
# set seed to reproduce results. Feel free to change the seed though to get different results

print("Output:\n" + 100 * '-')

for url in urls:
    
    input_ids = tokenizer.encode(url, return_tensors='tf')

    tf.random.set_seed(0)

    # set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
    sample_outputs = model.generate(
        input_ids,
        do_sample=True, 
        max_length=10, 
        top_k=50, 
        top_p=0.95, 
        num_return_sequences=3
    )

    for i, sample_output in enumerate(sample_outputs):
      print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Output:
----------------------------------------------------------------------------------------------------
0: google.com/item/v3-4
1: google.com/watch?v=t-
2: google.com/downloads/details?id
0: youtube.com/watch?v=4K
1: youtube.com/watch?v=r-
2: youtube.com/watch?v=gZ
0: github.com/xenocontrol/
1: github.com/u/gameloft/
2: github.com/google/android/framework/
0: facebook.com/en/photos/a.
1: facebook.com/en-us/articles/
2: facebook.com/news/local/local/
0: amazon.com/collections/s/S
1: amazon.com/collections/books/1
2: amazon.com/media/catalog/product
0: whatsapp.com/2014/09/
1: whatsapp.com/2014/11/
2: whatsapp.com/product/1806
0: linkedin.com/2013/05/the
1: linkedin.com/v/1478-
2: linkedin.com/wp-content/uploads
0: microsoft.com/product/15202214
1: microsoft.com/en-US/x86
2: microsoft.com/images/article/news/


In [3]:
import pandas as pd

### create data set of new generated urls based on popular websites

In [25]:

urls = []

urls_data = pd.read_csv("data/url_data_scraping.csv")
url_str_list = urls_data["url"]
for column in url_str_list:
    urls.append(str(column))

# create lists to store the urls and the labels
url_list = []
label_list = []

for url in urls:
    
    input_ids = tokenizer.encode(url, return_tensors='tf')

    tf.random.set_seed(0)

    # set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
    sample_outputs = model.generate(
        input_ids,
        do_sample=True, 
        max_length=10, 
        top_k=50, 
        top_p=0.95, 
        num_return_sequences=3
    )

    for i, sample_output in enumerate(sample_outputs):
        url_list.append(tokenizer.decode(sample_output, skip_special_tokens=True))
        label_list.append('good')
        
# create data frame of urls
url_data_frame = pd.DataFrame(url_list, columns=['url'])
# create data frame of labels
label_data_frame = pd.DataFrame(label_list, columns=['label'])

# create new data frame of both urls and labels
X = [url_data_frame["url"], label_data_frame["label"]]
headers = ["url", "label"]
X = pd.concat(X, axis=1, keys=headers)
X.drop_duplicates(subset=["url", "label"], keep=False, inplace=True)

print(X)

# write the new data set to csv file
to_csv = X.to_csv("data/transformers_data.csv", index=False)        



                                  url label
0        support.google.com/books?id=  good
1           support.google.com/en-GB/  good
2     support.google.com/site/public/  good
3              youtube.com/watch?v=7X  good
4              youtube.com/watch?v=0O  good
...                               ...   ...
1495            bp0.blogger.com/2015/  good
1496            bp0.blogger.com/2011/  good
1497         xda-developers.com/2015/  good
1498            xda-developers.com/~j  good
1499     xda-developers.com/developer  good

[1474 rows x 2 columns]


### create new generated urls based on DGA urls 

In [4]:


urls = [
    'weba3e13b0e15094bdfb1.com/',
    'downloadbed04b373caed221f5.co.uk/',
    'internetc85b5187e7b1572ce4.org/',
    'linuxda81dea9231d975291.fr/',
    'serverf187b7803eb8021406.com/',
    'winziphfdd7727ea03806fcb.org/',
    'webi71ae8a35ce343c91e.fr/'
    ]

url_list = []
label_list = []

for url in urls:
    
    input_ids = tokenizer.encode(url, return_tensors='tf')

    tf.random.set_seed(0)

    # set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
    sample_outputs = model.generate(
        input_ids,
        do_sample=True, 
        max_length=20, 
        top_k=50, 
        top_p=0.95, 
        num_return_sequences=3
    )

    for i, sample_output in enumerate(sample_outputs):
        url_list.append(tokenizer.decode(sample_output, skip_special_tokens=True))
        label_list.append('bad')
        
# create data frame of urls
url_data_frame = pd.DataFrame(url_list, columns=['url'])
# create data frame of labels
label_data_frame = pd.DataFrame(label_list, columns=['label'])

# create new data frame of both urls and labels
X = [url_data_frame["url"], label_data_frame["label"]]
headers = ["url", "label"]
X = pd.concat(X, axis=1, keys=headers)
X.drop_duplicates(subset=["url", "label"], keep=False, inplace=True)

print(X)

# write the new data set to csv file
to_csv = X.to_csv("data/transformers_dga_data.csv", index=False) 

                                                  url label
0                   weba3e13b0e15094bdfb1.com/go/main   bad
1                    weba3e13b0e15094bdfb1.com/go/get   bad
2               weba3e13b0e15094bdfb1.com/content/dam   bad
3        downloadbed04b373caed221f5.co.uk/item/282801   bad
4         downloadbed04b373caed221f5.co.uk/item/v2537   bad
5    downloadbed04b373caed221f5.co.uk/showthread.php?   bad
6            internetc85b5187e7b1572ce4.org/lib/openv   bad
7              internetc85b5187e7b1572ce4.org/lib/x86   bad
8       internetc85b5187e7b1572ce4.org/default.aspx (   bad
9            linuxda81dea9231d975291.fr/lib/openvda-7   bad
10             linuxda81dea9231d975291.fr/lib/x86_64-   bad
11   linuxda81dea9231d975291.fr/build/src/sys/modules   bad
12        serverf187b7803eb8021406.com/file.php?id=11   bad
13       serverf187b7803eb8021406.com/file.php\n\n"\n   bad
14  serverf187b7803eb8021406.com/content/dam/tnc/n...   bad
17        winziphfdd7727ea03806fcb.org/p