In [None]:
"""
The goal here is to process each dataset so that each of them can be concatenated together into one big dataset.
Then that one big dataset can be used to train, validate, and test the model.
"""


In [87]:
import pandas as pd
from sklearn.utils import shuffle

In [64]:
df_dataset1 = pd.read_csv("Dataset #1.csv")
print(df_dataset1.head())

                         url        type
0     https://www.google.com  legitimate
1    https://www.youtube.com  legitimate
2   https://www.facebook.com  legitimate
3      https://www.baidu.com  legitimate
4  https://www.wikipedia.org  legitimate


In [65]:
print(f"Dataset #1 has {df_dataset1.shape[0]} URLs" )

Dataset #1 has 450176 URLs


In [74]:
#phishing == 1
#benign == 0
df_dataset1.loc[:, 'type'] = df_dataset1['type'].replace({'phishing': 1, 'legitimate': 0})
print(df_dataset1)

                                                      url  type
0                                  https://www.google.com     0
1                                 https://www.youtube.com     0
2                                https://www.facebook.com     0
3                                   https://www.baidu.com     0
4                               https://www.wikipedia.org     0
...                                                   ...   ...
450171        http://ecct-it.com/docmmmnn/aptgd/index.php     1
450172  http://faboleena.com/js/infortis/jquery/plugin...     1
450173  http://faboleena.com/js/infortis/jquery/plugin...     1
450174                             http://atualizapj.com/     1
450175  http://writeassociate.com/test/Portal/inicio/I...     1

[450176 rows x 2 columns]


In [88]:
df_dataset1 = shuffle(df_dataset1)
print(df_dataset1)

                                                      url  type
379713  http://dmx2.com.br/data1/images/index.php?emai...     1
205029  https://www.goal.com/en-us/news/2617/youth-soc...     0
317469              https://www.veromi.com/Ryan-Cupp.aspx     0
383183  http://revistamercado.com.do/suscripciones/scr...     1
162219       https://www.chelkogroup.com/coreTeam_Jim.php     0
...                                                   ...   ...
370143                                 http://atacre.com/     1
168398  https://www.corporationwiki.com/people/index.a...     0
30700   https://www.en.wikipedia.org/wiki/Brian_Johnso...     0
418035                      http://dcfkafkeacbmnlaa.com /     1
119790   https://www.wn.com/Nickelodeon_Animation_Studios     0

[450176 rows x 2 columns]


In [79]:
df_dataset2 = pd.read_csv("Dataset #2.csv")
print(df_dataset2.head())

                                                 url  length_url  \
0              http://www.crestonwood.com/router.php          37   
1  http://shadetreetechnology.com/V4/validation/a...          77   
2  https://support-appleld.com.secureupdate.duila...         126   
3                                 http://rgipt.ac.in          18   
4  http://www.iracing.com/tracks/gateway-motorspo...          55   

   length_hostname  ip  nb_dots  nb_hyphens  nb_at  nb_qm  nb_and  nb_or  ...  \
0               19   0        3           0      0      0       0      0  ...   
1               23   1        1           0      0      0       0      0  ...   
2               50   1        4           1      0      1       2      0  ...   
3               11   0        2           0      0      0       0      0  ...   
4               15   0        2           2      0      0       0      0  ...   

   domain_in_title  domain_with_copyright  whois_registered_domain  \
0                0                

In [80]:
print(f"Dataset #2 has {df_dataset2.shape[0]} URLs" )

Dataset #2 has 11430 URLs


In [81]:
df_dataset2 = df_dataset2[['url', 'status']]

df_dataset2.loc[:, 'status'] = df_dataset2['status'].replace({'phishing': 1, 'legitimate': 0})
print(df_dataset2)

                                                     url status
0                  http://www.crestonwood.com/router.php      0
1      http://shadetreetechnology.com/V4/validation/a...      1
2      https://support-appleld.com.secureupdate.duila...      1
3                                     http://rgipt.ac.in      0
4      http://www.iracing.com/tracks/gateway-motorspo...      0
...                                                  ...    ...
11425      http://www.fontspace.com/category/blackletter      0
11426  http://www.budgetbots.com/server.php/Server%20...      1
11427  https://www.facebook.com/Interactive-Televisio...      0
11428             http://www.mypublicdomainpictures.com/      0
11429  http://174.139.46.123/ap/signin?openid.pape.ma...      1

[11430 rows x 2 columns]


  df_dataset2.loc[:, 'status'] = df_dataset2['status'].replace({'phishing': 1, 'legitimate': 0})


In [89]:
df_dataset2 = shuffle(df_dataset2)
print(df_dataset2)

                                                     url status
4124                  http://sekabetgiriss1.blogspot.com      1
3039   https://docs.google.com/forms/d/e/1FAIpQLSe0H5...      1
3769                     https://www.hersenstichting.nl/      0
9261                         https://www.autoshowny.com/      0
7741                                 https://py.pl/WE8nQ      1
...                                                  ...    ...
10161  http://www.investopedia.com/terms/p/peertopeer...      0
10914  https://wakeup-world.com/2017/06/09/how-music-...      0
5828        http://dic.academic.ru/dic.nsf/ruwiki/233279      0
7844                    http://limitlessearn.com/log_in/      1
94                                    https://apshop.vn/      0

[11430 rows x 2 columns]


In [82]:
df_dataset3 = pd.read_csv("Dataset #3.csv")
print(df_dataset3.head())

                                                 url        type
0                                   br-icloud.com.br    phishing
1                mp3raid.com/music/krizz_kaliko.html      benign
2                    bopsecrets.org/rexroth/cr/1.htm      benign
3  http://www.garage-pirenne.be/index.php?option=...  defacement
4  http://adventure-nicaragua.net/index.php?optio...  defacement


In [84]:
df_dataset3 = df_dataset3.loc[df_dataset3['type'] != 'defacement'] #since this dataset has a third class of defacement URLs, we will remove the defacement URLs to only have phishing or benign
print(f"Dataset #3 has {df_dataset3.shape[0]} URLs" )

Dataset #3 has 554734 URLs


In [85]:
df_dataset3.loc[:, 'type'] = df_dataset3['type'].replace({'phishing': 1, 'benign': 0})
print(df_dataset3)

                                                      url type
0                                        br-icloud.com.br    1
1                     mp3raid.com/music/krizz_kaliko.html    0
2                         bopsecrets.org/rexroth/cr/1.htm    0
5       http://buzzfil.net/m/show-art/ils-etaient-loin...    0
6           espn.go.com/nba/player/_/id/3457/brandon-rush    0
...                                                   ...  ...
651186            xbox360.ign.com/objects/850/850402.html    1
651187       games.teamxbox.com/xbox-360/1860/Dead-Space/    1
651188         www.gamespot.com/xbox360/action/deadspace/    1
651189      en.wikipedia.org/wiki/Dead_Space_(video_game)    1
651190          www.angelfire.com/goth/devilmaycrytonite/    1

[554734 rows x 2 columns]


In [90]:
df_dataset3 = shuffle(df_dataset3)
print(df_dataset3)

                                                      url type
562856  markewarn.com/wordpress/wp-includes/theme-comp...    0
365725  itunes.apple.com/us/podcast/ring-of-fire-green...    0
490530  gobearcats.com/sports/m-baskbl/spec-rel/cinn-m...    0
100843                                 phsfoundation.org/    0
225969  http://graphicriver.net/item/the-killer-combo-...    0
...                                                   ...  ...
131036  https://soundcloud.com/ffrr-records/oliver-hel...    0
640729      lxer.com/module/newswire/view/8009/index.html    1
123470  diaryofahollywoodstreetking.com/eddie-griffin-...    0
14356                     youtube.com/watch?v=HulylNbIK8c    0
14322            fanpix.net/gallery/max-gail-pictures.htm    0

[554734 rows x 2 columns]
