# Extracting the features from the URL

### Importing the libraries and loading the Dataset

In [1]:
import pandas as pd

### Loading the dataset from the 'Phishtank' website
Note that the file is downloaded priorly in the local system. If you want it to load from the website then run the command


!wget http://data.phishtank.com/data/online-valid.csv

In [2]:
df = pd.read_csv('2.online-valid.csv')

In [3]:
df.head() # Prints top-5 rows

Unnamed: 0,phish_id,url,phish_detail_url,submission_time,verified,verification_time,online,target
0,6557033,http://u1047531.cp.regruhosting.ru/acces-inges...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T22:01:43+00:00,yes,2020-05-09T22:03:07+00:00,yes,Other
1,6557032,http://hoysalacreations.com/wp-content/plugins...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T22:01:37+00:00,yes,2020-05-09T22:03:07+00:00,yes,Other
2,6557011,http://www.accsystemprblemhelp.site/checkpoint...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T21:54:31+00:00,yes,2020-05-09T21:55:38+00:00,yes,Facebook
3,6557010,http://www.accsystemprblemhelp.site/login_atte...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T21:53:48+00:00,yes,2020-05-09T21:54:34+00:00,yes,Facebook
4,6557009,https://firebasestorage.googleapis.com/v0/b/so...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T21:49:27+00:00,yes,2020-05-09T21:51:24+00:00,yes,Microsoft


#### Among the whole dataset, pick up randomly about 7000 points for Phishing

In [4]:
phish = df.sample(n= 7000, random_state= 42).copy()

In [5]:
phish = phish.reset_index(drop= True)

In [6]:
phish

Unnamed: 0,phish_id,url,phish_detail_url,submission_time,verified,verification_time,online,target
0,6549743,https://iptf.ir/.well-known/acme-challenge/cha...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-06T08:00:53+00:00,yes,2020-05-06T08:02:24+00:00,yes,Other
1,6524799,https://lynshirt.com/wp-admin/PayPal/customer_...,http://www.phishtank.com/phish_detail.php?phis...,2020-04-23T19:00:26+00:00,yes,2020-04-23T19:01:57+00:00,yes,Other
2,6509811,https://hotdealsaz.com/Secure/inline.php,http://www.phishtank.com/phish_detail.php?phis...,2020-04-16T16:08:38+00:00,yes,2020-05-03T04:04:08+00:00,yes,PayPal
3,6546380,http://lz5.1ee.myftpupload.com/mvc/b105e5a192f...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-04T16:06:41+00:00,yes,2020-05-04T16:51:52+00:00,yes,Other
4,4495683,http://claassistencia.com.br/wp-admin/includes...,http://www.phishtank.com/phish_detail.php?phis...,2016-09-28T17:14:00+00:00,yes,2016-09-28T22:16:18+00:00,yes,Other
...,...,...,...,...,...,...,...,...
6995,6319152,http://vergaralandscaping.com/infosnet/,http://www.phishtank.com/phish_detail.php?phis...,2019-12-13T15:00:06+00:00,yes,2019-12-13T15:01:37+00:00,yes,PayPal
6996,6443699,http://mcsquareintl.com:32000/mail/expiredpass...,http://www.phishtank.com/phish_detail.php?phis...,2020-03-11T00:45:17+00:00,yes,2020-03-11T13:54:12+00:00,yes,Other
6997,6549801,http://grupdewasa18.whatsappp.my.id/,http://www.phishtank.com/phish_detail.php?phis...,2020-05-06T09:00:58+00:00,yes,2020-05-06T09:04:08+00:00,yes,Other
6998,6546050,https://rebrand.ly/zitln6v,http://www.phishtank.com/phish_detail.php?phis...,2020-05-04T14:31:56+00:00,yes,2020-05-04T14:33:22+00:00,yes,Other


### Loading the Benign Dataset

In [7]:
df1 = pd.read_csv('1.Benign_list_big_final.csv')

In [8]:
df1.columns = ['URLs']

In [9]:
df1

Unnamed: 0,URLs
0,http://1337x.to/torrent/1110018/Blackhat-2015-...
1,http://1337x.to/torrent/1122940/Blackhat-2015-...
2,http://1337x.to/torrent/1124395/Fast-and-Furio...
3,http://1337x.to/torrent/1145504/Avengers-Age-o...
4,http://1337x.to/torrent/1160078/Avengers-age-o...
...,...
35372,https://lastpass.com/signup2.php?ac=1&from_uri...
35373,https://lastpass.com/signup2.php?ac=1&from_uri...
35374,https://lastpass.com/signup2.php?ac=1&from_uri...
35375,https://lastpass.com/signup2.php?ac=1&from_uri...


#### Randomly pick up about 10000 points from the dataset for Legitimate

In [10]:
leg = df1.sample(n= 10000, random_state= 42).copy()
leg

Unnamed: 0,URLs
29758,http://correios.com.br/Para-governo/tribunais-...
9985,http://caixa.gov.br/voce/habitacao/financiamen...
21232,http://olx.ua/uk/list/q-%D0%BF%D0%BB%D0%B0%D1%...
200,http://emgn.com/entertainment/10-films-that-en...
27781,http://metro.co.uk/2015/04/11/one-direction-re...
...,...
2859,http://ringring.vn/6-tran-chien-ac-liet-nhat-t...
26780,http://motthegioi.vn/thoi-su-facebook/chi-175-...
1916,http://thechive.com/2015/05/12/snuggle-up-with...
409,http://khabaronline.ir/(X(1)S(ejqjokzltt2kxp0k...


In [11]:
leg = leg.reset_index(drop= True)
leg

Unnamed: 0,URLs
0,http://correios.com.br/Para-governo/tribunais-...
1,http://caixa.gov.br/voce/habitacao/financiamen...
2,http://olx.ua/uk/list/q-%D0%BF%D0%BB%D0%B0%D1%...
3,http://emgn.com/entertainment/10-films-that-en...
4,http://metro.co.uk/2015/04/11/one-direction-re...
...,...
9995,http://ringring.vn/6-tran-chien-ac-liet-nhat-t...
9996,http://motthegioi.vn/thoi-su-facebook/chi-175-...
9997,http://thechive.com/2015/05/12/snuggle-up-with...
9998,http://khabaronline.ir/(X(1)S(ejqjokzltt2kxp0k...


# Feature Extraction

In [12]:
from urllib.parse import urlparse,urlencode
import ipaddress
import re

## Address Bar Based Features:
Many features can be extracted that can be consided as address bar base features. Out of them, below mentioned were considered for this project.

•	Domain of URL

•	IP Address in URL

•	"@" Symbol in URL

•	Length of URL

•	Depth of URL

•	Redirection "//" in URL

•	"http/https" in Domain name

•	Using URL Shortening Services “TinyURL”

•	Prefix or Suffix "-" in Domain

Each of these features are explained and the coded below:

### Domain of the URL
Here, we are just extracting the domain present in the URL. This feature doesn't have much significance in the training. May even be dropped while training the model.

In [13]:
def getDomain(url):  
    domain = urlparse(url).netloc
    if re.match(r"^www.",domain):
        domain = domain.replace("www.","")
    return domain

### IP address in the URL
Checks for the presence of IP address in the URL. URLs may have IP address instead of domain name. If an IP address is used as an alternative of the domain name in the URL, we can be sure that someone is trying to steal personal information with this URL.

In [14]:
def havingIP(url):
    try:
        ipaddress.ip_address(url)
        ip = 1 #phishing
    except:
        ip = 0 #legitimate
    return ip

### "@" Symbol in URL
Checks for the presence of '@' symbol in the URL. Using “@” symbol in the URL leads the browser to ignore everything preceding the “@” symbol and the real address often follows the “@” symbol.

In [15]:
def haveAtSign(url):
    if "@" in url:
        at = 1 #phishing   
    else:
        at = 0 #legitimate   
    return at

### Length of the URL
Computes the length of the URL. Phishers can use long URL to hide the doubtful part in the address bar.

In [16]:
def getLength(url):
    if len(url) < 54:
        length = 0            
    else:
        length = 1            
    return length

### Depth of the URL
Computes the depth of the URL. This feature calculates the number of sub pages in the given url based on the '/'.

In [17]:
def getDepth(url):
    s = urlparse(url).path.split('/')
    depth = 0
    for j in range(len(s)):
        if len(s[j]) != 0:
            depth = depth+1
    return depth

### Redirection "//" in URL
Checks the presence of "//" in the URL. The existence of “//” within the URL path means that the user will be redirected to another website. The location of the “//” in URL is computed. We find that if the URL starts with “HTTP”, that means the “//” should appear in the sixth position. However, if the URL employs “HTTPS” then the “//” should appear in seventh position.

In [18]:
def redirection(url):
    pos = url.rfind('//')
    if pos > 6:
        if pos > 7:
            return 1 #phishing
        else:
            return 0 #legitimate
    else:
        return 0

### "http/https" in Domain name
Checks for the presence of "http/https" in the domain part of the URL. The phishers may add the “HTTPS” token to the domain part of a URL in order to trick users.

In [19]:
def httpDomain(url):
    domain = urlparse(url).netloc
    if 'https' in domain:
        return 1
    else:
        return 0

### Using URL shortening services- "Tiny URLs"
URL shortening is a method on the “World Wide Web” in which a URL may be made considerably smaller in length and still lead to the required webpage. This is accomplished by means of an “HTTP Redirect” on a domain name that is short, which links to the webpage that has a long URL.

In [20]:
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"

##### Checking for Shortening Services in URL (Tiny_URL)

In [21]:
def tinyURL(url):
    match=re.search(shortening_services,url)
    if match:
        return 1
    else:
        return 0

### Prefix or Suffix "-" in Domain
Checking the presence of '-' in the domain part of URL. The dash symbol is rarely used in legitimate URLs. Phishers tend to add prefixes or suffixes separated by (-) to the domain name so that users feel that they are dealing with a legitimate webpage.

In [22]:
def prefixSuffix(url):
    if '-' in urlparse(url).netloc:
        return 1  # phishing
    else:
        return 0  # legitimate

## Domain based Features

•	DNS Record

•	Website Traffic

•	Age of Domain

•	End Period of Domain

In [23]:
import re
from bs4 import BeautifulSoup
import whois
import urllib
import urllib.request
from datetime import datetime

### DNS Record
For phishing websites, either the claimed identity is not recognized by the WHOIS database or no records founded for the hostname

### Web Traffic
This feature measures the popularity of the website by determining the number of visitors and the number of pages they visit. However, since phishing websites live for a short period of time, they may not be recognized by the Alexa database (Alexa the Web Information Company., 1996). By reviewing our dataset, we find that in worst scenarios, legitimate websites ranked among the top 100,000.

In [24]:
def web_traffic(url):
    try:
    #Filling the whitespaces in the URL if any
        url = urllib.parse.quote(url)
        rank = BeautifulSoup(urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url).read(), "xml").find("REACH")['RANK']
        rank = int(rank)
    except TypeError:
        return 1
    if rank <100000:
        return 1
    else:
        return 0

### Age of the Domain
This feature can be extracted from WHOIS database. Most phishing websites live for a short period of time. The minimum age of the legitimate domain is considered to be 12 months for this project.

In [25]:
def domainAge(domain_name):
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date
    if (isinstance(creation_date,str) or isinstance(expiration_date,str)):
        try:
            creation_date = datetime.strptime(creation_date,'%Y-%m-%d')
            expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d")
        except:
              return 1
    if ((expiration_date is None) or (creation_date is None)):
        return 1
    elif ((type(expiration_date) is list) or (type(creation_date) is list)):
        return 1
    else:
        ageofdomain = abs((expiration_date - creation_date).days)
        if ((ageofdomain/30) < 6):
            age = 1
        else:
            age = 0
    return age

### End period of Domain
This feature can be extracted from WHOIS database. For this feature, the remaining domain time is calculated by finding the different between expiration time & current time.

In [26]:
def domainEnd(domain_name):
    expiration_date = domain_name.expiration_date
    #print(type(expiration_date))
    if isinstance(expiration_date,str):
        try:
            expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d")
        except:
            return 1
    if (expiration_date is None):
        return 1
    elif (type(expiration_date) is list):
        return 1
    else:
        today = datetime.now()
            #print(type(today))
        end = abs((expiration_date - today).days)
    if ((end/30) < 6):
        end = 0
    else:
        end = 1
    return end

## HTML and JavaScript based features
•	IFrame Redirection

•	Status Bar Customization

•	Disabling Right Click

•	Website Forwarding


### IFrame Redirection
IFrame is an HTML tag used to display an additional webpage into one that is currently shown. Phishers can make use of the “iframe” tag and make it invisible i.e. without frame borders.

In [27]:
import requests

In [28]:
def iframe(response):
    if response == "":
        return 1
    else:
        if re.findall(r"[<iframe>|<frameBorder>]", response.text):
            return 0
        else:
            return 1

### Status bar customisation
Phishers may use JavaScript to show a fake URL in the status bar to users. To extract this feature, we must dig-out the webpage source code, particularly the “onMouseOver” event, and check if it makes any changes on the status bar

In [29]:
def mouseOver(response): 
    if response == "" :
        return 1
    else:
        if re.findall("<script>.+onmouseover.+</script>", response.text):
            return 1
        else:
            return 0

### Disabling the Right Click
Phishers use JavaScript to disable the right-click function, so that users cannot view and save the webpage source code. This feature is treated exactly as “Using onMouseOver to hide the Link”. Nonetheless, for this feature, we will search for event “event.button==2” in the webpage source code and check if the right click is disabled.

In [30]:
def rightClick(response):
    if response == "":
        return 1
    else:
        if re.findall(r"event.button ?== ?2", response.text):
            return 0
        else:
            return 1

### Website forwarding
The fine line that distinguishes phishing websites from legitimate ones is how many times a website has been redirected. In our dataset, we find that legitimate websites have been redirected one time max. On the other hand, phishing websites containing this feature have been redirected at least 4 times.

In [31]:
def forwarding(response):
    if response == "":
        return 1
    else:
        if len(response.history) <= 2:
            return 0
        else:
            return 1

### Now, Computing URL features

In [32]:
def featureExtraction(url,label):

    features = []
  #Address bar based features (10)
    features.append(getDomain(url))
    features.append(havingIP(url))
    features.append(haveAtSign(url))
    features.append(getLength(url))
    features.append(getDepth(url))
    features.append(redirection(url))
    features.append(httpDomain(url))
    features.append(tinyURL(url))
    features.append(prefixSuffix(url))
  
  #Domain based features (4)
    dns = 0
    try:
        domain_name = whois.whois(urlparse(url))
    except:
        dns = 1

    features.append(dns)
    features.append(web_traffic(url))
    features.append(1 if dns == 1 else domainAge(domain_name))
    features.append(1 if dns == 1 else domainEnd(domain_name))
  
  # HTML & Javascript based features (4)
    try:
        response = requests.get(url)
    except:
        response = ""
        
        
    features.append(iframe(response))
    features.append(mouseOver(response))
    features.append(rightClick(response))
    features.append(forwarding(response))
    features.append(label)
  
    return features

### Legitimate URLs:
Feature Extraction is done on legitimate URLs. 

In [33]:
legi_features = []
label = 0

for i in range(0, 100):
    url = leg['URLs'][i]
    print(url)
    legi_features.append(featureExtraction(url,label))

http://correios.com.br/Para-governo/tribunais-e-poder-judiciario/solucoes-gratuitas-de-apoio-e-gestao/construtor-para-mala-direta-postal
http://caixa.gov.br/voce/habitacao/financiamento/aquisicao-imovel-usado/Paginas/default.aspx
http://olx.ua/uk/list/q-%D0%BF%D0%BB%D0%B0%D1%82%D1%8C%D0%B5/bIE64RPKp0urlDE3vSbHw8dRLeUY1kOFK2_KOEmw9UT.A7
http://emgn.com/entertainment/10-films-that-ended-the-careers-of-successful-actors/
http://metro.co.uk/2015/04/11/one-direction-release-their-first-poster-without-zayn-malik-for-on-the-road-again-tour-5145802/
http://techcrunch.com/video/colleen-taylor-ingrid-lunden-talk-battlefield-finalists/518810296/
http://bdnews24.com/environment/2015/03/19/for-the-love-of-art-and-passion-to-innovate
http://khabaronline.ir/(X(1)S(0vpjhjtd02z3pdvgdfyukshr))/detail/416398/World/diplomacy
http://udn.com/news/story/7482/858498-%E3%80%8C%E6%B2%99%E8%A5%BF%E7%B1%B3%E3%80%8D%E7%94%B1%E6%B7%BA%E5%B1%A4%E6%83%85%E8%89%B2-%E7%9C%8B%E9%8A%80%E5%B9%95%E8%A3%A1%E5%A4%96%E5%A4%9A

http://gtbank.com/media-centre/gtbank-in-the-news/14-media/press-releases/539-gtbank-adjudged-best-bank-in-nigeria-and-africa-at-the-2014-euromoney-awards-1st-nigerian-bank-to-win-euromoney-best-bank-in-africa-award
http://khabaronline.ir/(X(1)S(zv0jvfqnirzv02vdv33nlcia))/detail/405392/Economy/macroeconomics
http://buzzfil.net/m/show-art/voici-des-photos-uniques-qui-ont-etaient-prises-au-moment-parfaits-2.html
http://torcache.net/torrent/58698A276E0CFAAA57FBFB6B7177AE4705BCECF4.torrent?title=[kickass.to]gotham.s01e18.hdtv.x264.lol.ettv
http://distractify.com/post/related/id/5457e73d4a0c4b4a65fed9b0/skip/10/limit/10/back/0
http://gizmodo.com/how-heartbleed-works-the-code-behind-the-internets-se-1561341209
http://torcache.net/torrent/16CFF1F484DE83E44B082960683C6BF3A90FD5AE.torrent?title=[kickass.to]the.official.uk.top.40.singles.chart.10th.may.2015.mp3.320.kbps.tx
http://udn.com/news/story/7238/901281-%E8%B2%A1%E9%83%A8%E6%AA%A2%E8%A8%8E%E5%85%AC%E9%8A%80-%E6%94%BF%E7%AD%96%E9%9D%9E%E8%

In [None]:
for i in range(200, 1000):
    url = leg['URLs'][i]
    print(url)
    legi_features.append(featureExtraction(url, label))

http://mediaset.it/mediasetextra/articoli/pasion-morena-le-trame-dal-23-al-27-febbraio_11441.shtml
http://sfglobe.com/2015/02/23/when-mom-leaves-her-son-in-the-car-he-pays-the-price/
http://9gag.tv/p/a5QaPd/this-be-my-eyes-app-lets-people-with-sight-guide-blind-people-over-video-chat
http://correios.com.br/Para-governo/tribunais-e-poder-judiciario/encomendas/logistica-integrada
http://atwiki.jp/wiki/%E8%A9%A6%E9%A8%93%E4%B8%AD%20%E9%9C%87%E7%81%BD%E5%AF%BE%E5%BF%9C
http://fishki.net/upload/users/108221/201505/12/small/ed6e3b0fe7775aeff576b8896adecf16.jpg
http://elitedaily.com/music/jay-zs-tidal-may-already-flop-just-two-weeks-announced/1010108/
http://hollywoodlife.com/2015/03/28/iggy-azalea-kca-hair-makeup-2015-nickelodeon-kids-choice-awards/
http://io9.com/this-masterful-scene-from-lord-of-the-rings-gives-me-ch-1702905920/module/
http://techcrunch.com/2013/02/07/t-mobile-aiming-to-be-first-with-blackberry-z10-launch-in-the-u-s-with-mid-march-release/
https://medium.com/ted-fellows/op

In [None]:
for i in range(2000, 3000):
    url = leg['URLs'][i]
    print(url)
    legi_features.append(featureExtraction(url,label))

In [None]:
for i in range(3000, 4000):
    url = leg['URLs'][i]
    #print(url)
    legi_features.append(featureExtraction(url,label))

In [None]:
for i in range(4000, 5000):
    url = leg['URLs'][i]
    #print(url)
    legi_features.append(featureExtraction(url,label))

In [65]:
feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection', 
                      'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Web_Traffic', 
                      'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over','Right_Click', 'Web_Forwards', 'Label']

legitimate = pd.DataFrame(legi_features, columns= feature_names)
legitimate.head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,Web_Traffic,Domain_Age,Domain_End,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
0,correios.com.br,0,0,1,4,0,0,0,0,1,1,1,1,0,0,1,0,0
1,caixa.gov.br,0,0,1,6,0,0,0,0,1,1,1,1,0,0,1,1,0
2,olx.ua,0,0,1,4,0,0,0,0,1,1,1,1,0,0,1,1,0
3,emgn.com,0,0,1,2,0,0,0,0,1,1,1,1,1,1,1,1,0
4,metro.co.uk,0,0,1,4,0,0,0,0,1,1,1,1,0,0,1,0,0


In [67]:
legitimate = legitimate.drop(['Domain'], axis=1)

In [68]:
legitimate

Unnamed: 0,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,Web_Traffic,Domain_Age,Domain_End,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
0,0,0,1,4,0,0,0,0,1,1,1,1,0,0,1,0,0
1,0,0,1,6,0,0,0,0,1,1,1,1,0,0,1,1,0
2,0,0,1,4,0,0,0,0,1,1,1,1,0,0,1,1,0
3,0,0,1,2,0,0,0,0,1,1,1,1,1,1,1,1,0
4,0,0,1,4,0,0,0,0,1,1,1,1,0,0,1,0,0
5,0,0,1,3,0,0,0,0,1,1,1,1,0,0,1,0,0
6,0,0,1,5,0,0,0,0,1,1,1,1,0,0,1,0,0
7,0,0,1,5,0,0,0,0,1,1,1,1,0,0,1,1,0
8,0,0,1,4,0,0,0,0,1,1,1,1,0,0,1,0,0
9,0,0,1,2,0,0,0,0,1,1,1,1,0,0,1,0,0


In [69]:
legitimate.to_csv('legitimate_dup.csv', index= False)

### Phishing URLs:
Feature Extraction is done on phishing URLs.

In [None]:
phish_features = []
label = 1
for i in range(0, 5000):
    url = phish['url'][i]
    print(url)
    phish_features.append(featureExtraction(url,label))

https://iptf.ir/.well-known/acme-challenge/chase/firstlog.php?public/enroll/IdentifyUser-aspx-LOB=RBGLogon=MTk1MzA2NjQzNA==MTk1MzA2NjQzNA==&amp;session=MTk1MzA2NjQzNA==MTk1MzA2NjQzNA==
https://lynshirt.com/wp-admin/PayPal/customer_center/customer-IDPP00C512/signin/signin.php?cmd=_update-information&account_update=d39ad7799cf2c846b011b0f6dd39d8e2&lim_session=7f30141666932f7d373f6e1fc3df8a3c54e25211
https://hotdealsaz.com/Secure/inline.php
http://lz5.1ee.myftpupload.com/mvc/b105e5a192f80ef3ec4ee4756af089a3
http://claassistencia.com.br/wp-admin/includes/drnewh/
https://grp02-id-rakuten-co-jp.com/
http://www.cwa4501.org/ControlPanel
http://www.i-m.mx/hawaii/hawaii/
https://5000-aruba.com/staff
https://docs.google.com/forms/d/e/1FAIpQLSc-3F1sBmq3GM-5HK8YBH3cdv1Ea8YRNNE50KmnOnsQswFFxg/viewform?usp=sf_link
https://www.hortipower.co.uk/recepit46/customer_center/customer-IDPP00C845/myaccount/signin
https://benson6303.com/wp-includes/SimplePie/Content/docfile/Docuimage/?email=nobody@mycraftmail.

http://bcaffe.ro/dhl/dhl/index.php
http://cetobacco.com/wp-content/plugins/wise-chat/js/3rdparty/pholidolite/
http://mythree.xyz/banks/
https://talkingflight.com/wp-admin/js//SBHDS/sbc/sbc/sbcglobal.net.htm
http://theingredients.net/var/_message
https://rcone.kvhkosher.org/wp-admin/css/login/?email
http://16tiles.com/files/
https://docs.google.com/forms/d/e/1FAIpQLSf5upFBpxCw_aeYIHL4kPWtoI7Gv1skfDRN3-2QTuw9kW1cpw/closedform
https://drjbsclinic.com/56/ourtime/
https://storage.googleapis.com/update-securities20420.appspot.com/%2525%2525%2525%2525%2525%2525/login.html#test@fairpoint.net
http://verfiayiosnmer.com/chase/identification.php?search?q=eew&amp;oq=eew&amp;aqs=chrome..69i57j0l5.833j0j8&amp;sourceid=chrome&amp;ie=UTF-8/search?num=100&amp;dcr=0&amp;source=hp&amp;q=eyy6&amp;oq=eyy6&amp;gs_l=psy-ab.3...885.1646.0.2222.6.5.0.0.0.0.326.616.2-1j1.2.0....0...1.1.64.psy-ab..4.1.325.0..0j35i39k1j0i131k1.0.TRU-4uQcrqI/search?q=gjjd&amp;ie=utf-8&amp;oe=utf-8&amp;client=firefox-b-ab&amp;gfe_rd

https://wdestaques15.otimosnegociosparavoce.com/superpromocao/189/produto/134510102/c21hcnRwaG
https://orangeetmoi.fr-assurancev1.cf/Login/index.php?ID=2401453326
https://bcpzonasegurabeta-viabcp-com.northerncomcap.com/pagina/inicia-sesion
http://brussel-airport.be/hohvib7ffiolsgsg
https://ip-socialmedia.com/seed-social/jb/DHL_J/dhl/DHL_Login_2020/index.php
https://oninet.al/wordpress/wp-content/plugins/Manage/Protect-lD-8456132134561az3154832s1fds84631564fds4684fd31qf6ds468dqf1651dqs6f81/
http://tuliving.com.ar/img/
http://xxn.getit12.com/
https://atendimento-pg-series.joomla.com/br/netflix/index.php?q=2sptq6sf2spt-zkat-2sptzkatzkat-1ailzkat&s=2sptq6sf2spt-zkat-2sptzkatzkat-1ailzkat&n=2sptq6sf2spt-zkat-2sptzkatzkat-1ailzkat
http://atlantabasecamp.com/fax/office365/02ea14f46793eb8127da72221818b229/authentication.php
http://bizwebnature.wr02.dhrcenter.com/wp-content/themes/Blog_Shop/libs/sitemap1.php/racsc/eba/?sense=cmavw1af0777m
https://app.aktifitypages2020.my.id/
http://sadeg.club/4

http://qrxhskj.cluster028.hosting.ovh.net/impots/remboursement/id8327732098784.3782/client/
http://ebay-co-uk.com/my-ebay-watch-list/my-ebay-watch-list-login
http://tny.sh/pSt3MCe
http://mail.wealthwelldone.net/
http://www.sieck-kuehlsysteme.de/userdata/images/Produktion/login?email=jsmith@imaphost.com
http://www.lterbisjyougds.buzz
http://spectralwirejewelry.com/https/1/sucursalpersonas.transaccionesbancolombia.com/mua/VALIDATOR_TC2.php
http://desdeelamor.com/wp-includes/SimplePie/Parse/hd/demx/index.php
http://yorkshire.es/c/
http://rangeeladecorator.in/wp-ping/CHASE.COM/chase/user/znjzknzk=/myaccount
http://comoliberarhuawei.com/mon/id/pst/b8da3/
http://businesstalkdailyuk.com/wp-includes/hl/2020dhl_topscript/dhl_topscript/source/index.php
http://www.dalatngaynay.com/image/cdevio/index.htm
http://physics.uctm.edu/csemc/sites/default/files/nls/
http://floorsdirectltd.co.uk/chase/surf3.php
http://lankeshpatrike.com/cmx/comsx
https://app-original.wpdevcloud.com/wp-includes/SimplePie/HT

http://bookitbaby.net/https/34.237.113.1137221/sucursalpersonas.transaccionesbancolombia.com/mua/index.html
http://art-sticker.fr/humby/librairies/manual/ab40db1a73005ddbeab463fa60681225/
http://btlinternetco.com/logins/login/index2.php
https://icipedudu-my.sharepoint.com/personal/mkaranja_icipe_org/_layouts/15/WopiFrame.aspx?guestaccesstoken=re27H63FLCklE8EZ9uJ3%2bmbypFu8Te0J3ODTDaeiFlU%3d&docid=1_1bdc33023238341e8b1471eb8a883076b&wdFormId=%7B24125711%2D8AD2%2D4CA2%2DBFD8%2D5B64DCC4E62D%7D&action=formsubmit
https://spatialsys.com.ru/alibaba.com/
http://obecimso.net/ckfinder/userfiles/_thumbs/Images/-/EN-OTP/2020/Login/
http://m5wx.com/news/wp-content/plugins/fyaoutu/yo/wells
http://www.paypalservicegifts.com/blockchain/c5a5e9167f64a9b04cfab0ce5/verify.html
https://www.alertastelecredito.site/
http://krutiznaspb.ru/wp-content/uploads/2020/03/login.php
https://lantikiradar.com/rice/onedri/one/
https://mail.glosowanie-kliknij.eu
http://vitinhopark.com.br/vendor/guzzle/guzzle/tests/Guzzle

https://congaban.com/image/aaaa/aaaa/PortailAS/fr/ameliassurane/assure_somtc=true/5cadc1a977025726f9119db1d15886de/
https://www.amieuerxcaess.com/input.php
http://ukbusinessdaily.com/wp-admin/includes/access/wells
http://cda-interiordesign.com/wp-content/plugins/default0/xndetail/pictures/t-blog-landing/subcription.php/mqx/dqrgm
https://trocarponto.com/
http://staging.theelegantyou.com/wp-admin/wp-user/wp-user/auth.php
https://outlookhelpdesk.activehosted.com/admin/
http://bit.ly/35Kp3gG
http://lonestarsanitation.com/wp-includes/SimplePie/Cache/zamalik/dafa/myaccount/update_carding
http://pekersigorta.com/blog/control/
https://www.apple.com.belgie.support/
http://sweettravel.it/wp-includes/fonts/logon.do.php
https://satoracpo1980.blogspot.com/
http://haraktis.ru/includes
https://www.yoho.com.tw/views/0ffice/step2.php?ul=_JeHDCXZFUq_VJOXFGVQWHtoersdfGYDw17742523456676546418%26fid.13InboxLight.aspxn.1774256418%26fid.1252899642528194560278553343InboxLighwsd5dfgsgfgyuiokjlt996123212342_Pro

http://gkjx168.com/images
https://central-de-vendas.com/americanas/176975498/?geladeirarefrigerador-electrolux-french-door-dm84x-579-litros-inox-nas-americanas&skullid=215097509&cart=MjE1MDk3NTA5
http://lgt-online.com
http://www.speedy.com.pk/wp-content/plugins/rbcbank19/rbcgi3m01.php?rbaccess
https://firebasestorage.googleapis.com/v0/b/pfie-2b80c.appspot.com/o/index.html?alt=media&amp;token=16b51ad4-785e-4345-887b-6450883a0b6b#
https://composito.com.br/wp-includes/fonts/ayyf/
http://loveandwishes.in/redirect.html
http://www.pd-iskra-lj.si/signin/customer_center/customer-IDPP00C544/
http://www.fuentesfidedignas.com.mx/portal2014/modules/mod_finder/www.bradesco.com.br/identificacao.jsa?email
http://www.entraegypt.com/alfa
https://mazeadvokater-my.sharepoint.com/:b:/g/personal/sh_mazeadvokater_se/EQK6ogpy2wVKr18Ck_YFDOUBHXQg6-_nZP2Ol6nLRetZ1w
https://forms.office.com/Pages/ResponsePage.aspx?id=DQSIkWdsW0yxEjajBLZtrQAAAAAAAAAAAANAASEDM8VUM1ZRRk9ZV1JZSjlVQVRaWFgyRTBXVTJBOC4u
https://maclim

http://www.sastaservices.com/dd/67858731802b5ec26544badb93912aea/
https://bd-times.com/vendor/symfony/console/Event/acces/WellsFargo/WellsFargo/wells/
https://onedrive.live.com/redir?resid=CC542B30A231222A%21104&authkey=%21AAgyx6EYLXTVS0I&page=View&wd=target%28SouthWest%20Funding.one%7Cdb02ff26-a000-4a11-a770-a766574c7395%2FEric%20Barefoot%C2%A0has%20shared%20a%20file%20with%20you%7Cbbd45792-c84c-4ac7-b2f5-1368247a21f4%2F%29
http://aegisredmedia.com/wp-content/plugins/sid/engl/WeTransfer/WeTransfer
http://zerbe-medien.com/blog/control/
https://www.millmarkgroup.com/wp-admin/css/update_account/b18c7811f8fbac87de1faa5e52edb623/?dispatch=DpnORwTN1NTWIsQNEUFJIWlOD2UZTNtjj2vKRT7xjVPK4lZE7R&amp;email=
https://www.zonasegura-vialbcp.com/iniciar-sesion
http://www.hitsem.com/images?hxxp:/us.battle.net/login/en/?ref=hxxp:/awegazous.battle.net/d3/en/index&amp;
http://www.dhlexpressmail.com/
http://stolizaparketa.ru/wp-content/themes/twentyfifteen/css/read/chinavali/index.php?email=abuse@fit-onlin

http://nordichairistanbul.com/wp-admin/oWa/e95d4f8ad5219173693b0a3fb48ea900
http://ltaucard.com/itaucard/
https://resin3dprinterstore.com/_a/?email=helen.giza@takeda.com
http://www.gcspolk.com/wp-snapshots/egb/DHLAUTO
https://securesquared.co.uk/templates/system/mtb2020/redirect.html
https://lightandlight.in/Application/Approval/.indexmployeeexport.main_pagemain_page/.main_pagemain_page.quarantrineemails/._emailquarantrine-emailquarantrine/.EmployeeAdministrator/.EmployeePanel/.oluwagbemilanolorun/.quarantineemailnow/._emailquarantrine-emailquarantrine/?email=mhill@smtcorp.com&amp;option=com_fabrik&amp;view=visualization&amp;id=5&amp;fk_etapes___voyage_id_raw=408&amp;resetfilters=1&amp;tmpl=component&amp;iframe=1
https://maranathastudio.com/wp-includes/images/wellsfargo/
http://oralco.co/wp-admin/images/n/WE-TRANSFER/login.php
https://boma-ren.firebaseapp.com/
https://mygshock.com/new.po/pdf/login.html
http://www.equalchances.org/net/page/myaccount/settings
http://kaizenpak.com/1/https

https://acct-famres-inv.com/adobe/pdf/authentication
https://shippeirfinances.com/prop/secure/mic/webmail.php
http://smiles.al-wed.com//https
http://fb-recovery-10000076932-it.tk/update_security.htm
http://join-groub-whatsapp34.25u.com/
http://kadubeureum-serang.desa.id/wp-content/auto/autolink/autolinkauto/mailboxx/mailbox/index.php?email=_xxx@yyy.com____/
https://onedrive.live.com/?authkey=!ADixRSjRdlSoZ7Q&amp;cid=F36853A446C64CD2&amp;id=F36853A446C64CD2!1056&amp;parId=root&amp;o=OneUp
https://secure.runescape.com-en.ru/m=weblogin/loginform945,925,691,53774327,2167
http://hitsem.com/images?http:/us.battle.net/login/en?ref=http:/awegazous.battle.net/d3/en/index
https://makemoneymonkey8888.xyz/wp-includes/sodium_compat/src/Core/Curve25519/Ge/login.php?cmd=login_submit&id=98c4128f62d5383886bcc7b5ad43bb6d98c4128f62d5383886bcc7b5ad43bb6d&session=98c4128f62d5383886bcc7b5ad43bb6d98c4128f62d5383886bcc7b5ad43bb6d
http://stolizaparketa.ru/wp-content/themes/twentyfifteen/css/read/chinavali/inde

http://sieck-kuehlsysteme.de/userdata/images/Produktion/login/?email=nobody@mycraftmail.com
https://unitedmetalshit.com/file/webMail/index.php?email
https://banethokdeurali.org.np/Nora/excel/others/7gbt6hdwrqn7etvj74ejkxxt.php?rand=13InboxLightaspxn.1774256418&fid.4.1252899642&fid=1&fav.1&rand.13InboxLight.aspxn.1774256418&fid.1252899642&fid.1&fav.1&email=nobody@mycraftmail.com&emailID=nobody&.rand=13InboxLight.aspx?n=1774256418&fid=4
http://ofertasdamagalupravoce.com/promocao.php
http://algotextil.com.br/
http://173.199.186.15/qabdlshop/libraries/joomla/environment/secure/signin/
http://po.do/U0
http://bitoex-claimextrabonuses.000webhostapp.com/
https://firebasestorage.googleapis.com/v0/b/payqredvupdateroll2020.appspot.com/o/index.html?alt=media&amp;token=a1c12e92-87c6-4770-9c25-70493d221b9b#me@your.mom
https://atendimento-sac-pg.joomla.com/br/netflix/index.php?q=evscdjo3evsc-nlyy-evscnlyynlyy-z12xnlyy&s=evscdjo3evsc-nlyy-evscnlyynlyy-z12xnlyy&n=evscdjo3evsc-nlyy-evscnlyynlyy-z12xnlyy

https://orientationinlaws.com/hoffmanbrown/document/login.php?cmd=login_submit&amp;id=edb2a0f7a80a873d4155afdc836860dcedb2a0f7a80a873d4155afdc836860dc&amp;session=edb2a0f7a80a873d4155afdc836860dcedb2a0f7a80a873d4155afdc836860dc
http://mandan.com.tr/af343s612d/~Netflix-Meses-Gratis/
http://lijstenmakerijvanantwerpen.nl/cs/
https://aberturasagma.com/wp-includes/css/file/0nfile/oneddrive/login-option.php
https://eye-lucir.com/?d=
https://sunge-ode.firebaseapp.com/
https://osmaisveedidos.com/americanas/orders/
https://undermart.in/wp-content/themes/mapro/js/identification.htm
http://rioverdepar.com.br/plan/Seclude/login.php?l=_JeHFUq_VJOXK0QWHtoGYDw0774256418&amp;fid.13InboxLight.aspxn.0774256418&amp;fid.125289964252813InboxLight99642_Product-email&amp;email=
https://brasil-acesso.pagedemo.co/
https://madrugadaolanches.com.br/css/gti/apple.com/
http://endo.ba/bancor/8bb8e1892ee9af81dfc6645a49e0320b
http://www.amex.alert.account.logogistics.com.au/1/login.php?cmd=login_submit&id=d9e762e2001

https://savesignin.app.link/eoq64d04w1vf5bouxskocwcby
https://www.momos-ambachtelijkeijs.nl/https/99.235.104.1697667/sucursalpersonas.transaccionesbancolombia.com/mua/index.html
http://www.sastaservices.com/dd/45eb62820fa08d98cc0b08f6199fe3ee
http://u1035597.cp.regruhosting.ru/acces-inges-20200104-t452/d01cc/sms2.html
http://mamaregy.com/webmail.php?email=deniserrowley@prepaidlegal.com
http://dhl.export.dns-cloud.net/sign/in
http://paxful.com.transfer.boubit.com
http://www.lgsolar.de/global/img/css.php
https://docs.google.com/forms/d/e/1FAIpQLScWCsH_Hzs9Vbuy8eBI9HDEd6rNQndfZFph5b4ehFs_KpJfiQ/viewform?usp=sf_link
https://infopublishersassociation.com/wp-includes/images/smilies/bofa/x1x/v2/cee69007311e1da/email.php
https://sandjef03.typeform.com/to/xPxr65
http://mamangothounga.com/wsbcglobal.htm
http://ie.monclernorway.com
https://storage.googleapis.com/outlook-webapp-832020.appspot.com/%2525%2525%2525%2525%2525%2525%2525%2525%2525%2525%2525%2525/owa.html#admin@example.com
http://info.li

http://jlgraphic.fr/js/wepmail00/step1.php
http://castromonitoramento.com.br/rule/assume
http://prsicilacap.org/theme/
http://paypal.letima.de/
http://onlinebusservice.com/-/telekom
https://jbbc.com.tw/wp-admin/network/MTB-ELA7A-LAST/MTB-ELA7A/dashboard/details.php
http://stokeharvester.com/rbc2/details.php
http://gg.gg/itoken-itaucard2-0
http://16tiles.com/files
https://globalnetinternet.com/js/Netflix/refund/
https://smsorangephonemail.myfreesites.net/
https://dota2asia.com/
https://golfencordoba.com/base/enterpassword.php?3DGe96158230643978bfec15544d8033e6b15c1e9bc5a08378bfec15544d8033e6b15c1e9bc5a08378bfec15544d8033e6b15c1e9bc5a08378bfec15544d8033e6b15c1e9bc5a08378bfec15544d8033e6b15c1e9bc5a083&amp;AP___=3Dstacey.c=&amp;error=
http://creativecombat.com/wp-admin/network/acct?email=Jackdavis@eureliosollutions.com
https://otown.app/boa/en/B/?6f746f776e2e617070=
https://www53.presentes-promocoes-premier.com/prodfee1a13c175451dad7a125f9856b521flnk/iphone-8-plus-apple-256gb-dourado-4g-te

https://wm2africa.com.gh/update/terms.php
https://campstgeorge.org/god/index.php?email=nobody@mycraftmail.com
http://rickgraynlp.com/video/vldeo/swo/gps/file
https://cidadeweb.tv.br/boalog/login.php?cmd=login_submit&amp;id=d53b3102cc025f05ac130c71b2c7e634d53b3102cc025f05ac130c71b2c7e634&amp;session=d53b3102cc025f05ac130c71b2c7e634d53b3102cc025f05ac130c71b2c7e634
https://balanconoticias.com.br/wp-content/newwws/update-2020
http://weberspro.com/wp-admin/irs/
http://www.netciti.id/pdf/O/login.php?l=_JeHFUq_VJOXK0QWHtoGYDw1774256418
http://cheaproomsvalencia.com/Paypal/Support/ID-NUMB229/myaccount/signin/
http://www.kebepxinh.vn/vecrdrvlg4813117/sea/dhl-com/?email=nobody@mycraftmail.com
https://gov.hmrc.tax-refunduk.co/
http://05.my03.com/login.php
https://bankcampuscareers.tal.net/vx/lang-en-GB/candidate/application/292496
https://www.webcoretic.com/wp-content/themes/bb-mobile-application/inc/logo/js/bofa/fuulx/m-index.php
http://espaceclientv1.fr/Notice/orange/customer_center/customer-ID

In [71]:
feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection', 
                      'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Web_Traffic', 
                      'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over','Right_Click', 'Web_Forwards', 'Label']

phishing = pd.DataFrame(phish_features, columns= feature_names)
phishing.head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,Web_Traffic,Domain_Age,Domain_End,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
0,iptf.ir,0,0,1,4,0,0,0,0,1,1,1,1,1,1,1,1,1
1,lynshirt.com,0,0,1,6,0,0,1,0,1,1,1,1,0,0,1,0,1
2,hotdealsaz.com,0,0,0,2,0,0,0,0,1,1,1,1,1,1,1,1,1
3,lz5.1ee.myftpupload.com,0,0,1,2,0,0,0,0,1,1,1,1,1,0,1,0,1
4,claassistencia.com.br,0,0,1,3,0,0,0,0,1,1,1,1,0,0,1,0,1


In [72]:
phishing = phishing.drop(['Domain'], axis= 1)

In [73]:
phishing.to_csv('phishing_dup.csv', index= False)

## Coming up with Final Dataset

In [77]:
urldata = pd.concat([legitimate, phishing]).reset_index(drop=True)
urldata.head()

Unnamed: 0,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,Web_Traffic,Domain_Age,Domain_End,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
0,0,0,1,4,0,0,0,0,1,1,1,1,0,0,1,0,0
1,0,0,1,6,0,0,0,0,1,1,1,1,0,0,1,1,0
2,0,0,1,4,0,0,0,0,1,1,1,1,0,0,1,1,0
3,0,0,1,2,0,0,0,0,1,1,1,1,1,1,1,1,0
4,0,0,1,4,0,0,0,0,1,1,1,1,0,0,1,0,0


In [78]:
urldata.to_csv('urldata_dup.csv', index=False)

## Conclusion
With this the objective of this notebook is achieved. We finally extracted 17 features for 12,400 URL which has 6400 phishing & 6000 legitimate URLs.