In [1]:
import pandas as pd

## Phishing URLs

In [2]:
data0 = pd.read_csv('verified_online.csv')
data0.head()

Unnamed: 0,phish_id,url,phish_detail_url,submission_time,verified,verification_time,online,target
0,8446675,http://accesso.91-92-254-219.cprapid.com,http://www.phishtank.com/phish_detail.php?phis...,2024-02-09T09:16:49+00:00,yes,2024-02-09T09:21:48+00:00,yes,Other
1,8446671,https://cloudflare-ipfs.com/ipfs/bafybeigghtyr...,http://www.phishtank.com/phish_detail.php?phis...,2024-02-09T08:56:53+00:00,yes,2024-02-09T09:04:12+00:00,yes,Other
2,8446670,https://allegroinform.com/,http://www.phishtank.com/phish_detail.php?phis...,2024-02-09T08:51:37+00:00,yes,2024-02-09T09:04:12+00:00,yes,Allegro
3,8446669,https://allegroinform.com/product/?ad=delivery...,http://www.phishtank.com/phish_detail.php?phis...,2024-02-09T08:46:10+00:00,yes,2024-02-09T08:52:48+00:00,yes,Allegro
4,8446667,https://push-akt2.net/,http://www.phishtank.com/phish_detail.php?phis...,2024-02-09T08:30:12+00:00,yes,2024-02-09T08:33:19+00:00,yes,Other


In [3]:
phishurl = data0.sample(n = 5000, random_state = 12).copy()
phishurl = phishurl.reset_index(drop=True)
phishurl.head()

Unnamed: 0,phish_id,url,phish_detail_url,submission_time,verified,verification_time,online,target
0,8187024,https://docs.google.com/presentation/d/e/2PACX...,http://www.phishtank.com/phish_detail.php?phis...,2023-06-19T17:33:27+00:00,yes,2023-06-19T17:43:15+00:00,yes,Other
1,8399620,https://s-teamg.com/p/wvc-jtrd/vrawqtgf/,http://www.phishtank.com/phish_detail.php?phis...,2023-12-21T09:02:07+00:00,yes,2023-12-21T09:14:28+00:00,yes,Other
2,8344943,https://cloudflare-ipfs.com/ipfs/bafybeibnczkx...,http://www.phishtank.com/phish_detail.php?phis...,2023-10-26T19:23:10+00:00,yes,2023-10-26T19:43:17+00:00,yes,Other
3,8216497,https://bafybeictw2hplh3akxmcbsiibnh6lrp7xgw7m...,http://www.phishtank.com/phish_detail.php?phis...,2023-07-08T00:28:19+00:00,yes,2023-07-08T00:34:25+00:00,yes,Other
4,8323078,https://brwtihfyemdrt.web.app/,http://www.phishtank.com/phish_detail.php?phis...,2023-10-06T02:57:30+00:00,yes,2023-10-06T03:14:10+00:00,yes,Other


In [4]:
phishurl.shape

(5000, 8)

## Legitimate URLs

In [5]:
data1 = pd.read_csv('Benign_list_big_final.csv')
data1.columns = ['URLs']
data1.head()

Unnamed: 0,URLs
0,http://1337x.to/torrent/1110018/Blackhat-2015-...
1,http://1337x.to/torrent/1122940/Blackhat-2015-...
2,http://1337x.to/torrent/1124395/Fast-and-Furio...
3,http://1337x.to/torrent/1145504/Avengers-Age-o...
4,http://1337x.to/torrent/1160078/Avengers-age-o...


In [6]:
legiurl = data1.sample(n = 5000, random_state = 12).copy() 
legiurl = legiurl.reset_index(drop=True)
legiurl.head()

Unnamed: 0,URLs
0,http://graphicriver.net/search?date=this-month...
1,http://ecnavi.jp/redirect/?url=http://www.cros...
2,https://hubpages.com/signin?explain=follow+Hub...
3,http://extratorrent.cc/torrent/4190536/AOMEI+B...
4,http://icicibank.com/Personal-Banking/offers/o...


In [7]:
legiurl.shape

(5000, 1)

## Feature Extraction

## Address Bar Based Features

In [8]:
from urllib.parse import urlparse,urlencode
import ipaddress
import re

In [9]:
def getDomain(url): 
    # url components --> scheme , network location , path , query , fragment
    domain = urlparse(url).netloc # extract network location part of the URL (domain + port)
    if re.match(r"^www.",domain): 
        domain = domain.replace("www.","") 
    return domain

In [10]:
def havingIP(url):
    try:
        ipaddress.ip_address(url)
        ip = 1
    except:
        ip = 0
    return ip

In [11]:
def haveAtSign(url):
    if "@" in url:
        at = 1
    else:
        at = 0
    return at

In [12]:
def getLength(url):
    if len(url) < 54:
        length = 0
    else:
        length = 1
    return length

In [13]:
def getDepth(url):
    s = urlparse(url).path.split('/')
    depth = 0
    for j in range(len(s)):
        if len(s[j]) != 0:
            depth = depth + 1
    return depth

In [14]:
def redirection(url):
    pos = url.rfind('//')
    if pos > 6:
        if pos > 7:
            return 1
        else:
            return 0
    else:
        return 0

In [15]:
def httpDomain(url):
    domain = urlparse(url).netloc
    if 'https' in domain:
        return 1
    else:
        return 0

In [16]:
shortening_services = (
    r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|"
    r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|"
    r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|"
    r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|"
    r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|"
    r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|"
    r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|"
    r"tr\.im|link\.zip\.net"
)

In [17]:
def tinyURL(url):
    match = re.search(shortening_services, url)
    if match:
        return 1
    else:
        return 0

In [18]:
def prefixSuffix(url):
    if '-' in urlparse(url).netloc:
        return 1  # phishing
    else:
        return 0  # legitimate

## Domain Based Features

In [19]:
import re
from bs4 import BeautifulSoup
import whois
import urllib
import urllib.request
from datetime import datetime

In [20]:
import requests

def web_traffic(url):
    try:
        response = requests.get("http://data.alexa.com/data", params={"cli": 10, "dat": "s", "url": url})
        rank = BeautifulSoup(response.content, "xml").find("REACH")['RANK']
        rank = int(rank)
    except (TypeError, requests.RequestException) as e:
        print(f"Error: {e}")
        return 1

    if rank < 100000:
        return 1
    else:
        return 0


In [21]:
def domainAge(domain_name):
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date

    if isinstance(creation_date, str) or isinstance(expiration_date, str):
        try:
            creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
            expiration_date = datetime.strptime(expiration_date, '%Y-%m-%d')
        except:
            return 1

    if expiration_date is None or creation_date is None:
        return 1
    elif type(expiration_date) is list or type(creation_date) is list:
        return 1
    else:
        age_of_domain = abs((expiration_date - creation_date).days)
        if age_of_domain / 30 < 6:
            age = 1
        else:
            age = 0
        return age


In [22]:
def domainEnd(domain_name):
    expiration_date = domain_name.expiration_date

    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except:
            return 1

    if expiration_date is None:
        return 1
    elif type(expiration_date) is list:
        return 1
    else:
        today = datetime.now()
        end = abs((expiration_date - today).days)
        if end / 30 < 6:
            end = 0
        else:
            end = 1
        return end


## HTML and Js based features

In [23]:
import requests

In [24]:
def iframe(response):
    if response == "":
        return 1
    else:
        if re.findall(r"[|]", response.text):
            return 0
        else:
            return 1

In [25]:
def mouseOver(response):
    if response == "":
        return 1
    else:
        if re.findall("", response.text):
            return 1
        else:
            return 0

In [26]:
def rightClick(response):
    if response == "":
        return 1
    else:
        if re.findall(r"event\.button ?== ?2", response.text):
            return 0
        else:
            return 1


In [27]:
def forwarding(response):
    if response == "":
        return 1
    else:
        if len(response.history) <= 2:
            return 0
        else:
            return 1

## Computing URL features

In [28]:
def featureExtraction(url, label):
    features = []

    # Address bar based features (10)
    features.append(getDomain(url))
    features.append(havingIP(url))
    features.append(haveAtSign(url))
    features.append(getLength(url))
    features.append(getDepth(url))
    features.append(redirection(url))
    features.append(httpDomain(url))
    features.append(tinyURL(url))
    features.append(prefixSuffix(url))

    # # Domain based features (4)
    # dns = 0
    # try:
    #     domain_name = whois.whois(urlparse(url).netloc)
    # except:
    #     dns = 1
    # features.append(dns)
    # features.append(web_traffic(url))
    # features.append(1 if dns == 1 else domainAge(domain_name))
    # features.append(1 if dns == 1 else domainEnd(domain_name))

    # HTML & Javascript based features (4)
    try:
        response = requests.get(url,timeout=2)
    except:
        response = ""
    features.append(iframe(response))
    features.append(mouseOver(response))
    features.append(rightClick(response))
    features.append(forwarding(response))

    features.append(label)
    return features

In [39]:
legi_features = []
label = 0
lim = 100

for i in range(0, 5000):
    url = legiurl['URLs'][i]
    print(i,url)
    # print(f'URL : {url} -> {featureExtraction(url, label)}')
    legi_features.append(featureExtraction(url, label))
    if len(legi_features)>=lim:
        break

0 http://graphicriver.net/search?date=this-month&length_max=&length_min=&price_max=&price_min=&rating_min=&sales=&sort=sales&term=&view=list
1 http://ecnavi.jp/redirect/?url=http://www.cross-a.net/x.php?id=1845_3212_22061_26563&m=1004&pid=%user_id%
2 https://hubpages.com/signin?explain=follow+Hubs&url=%2Fhub%2FComfort-Theories-of-Religion
3 http://extratorrent.cc/torrent/4190536/AOMEI+Backupper+Technician+%2B+Server+Edition+2.8.0+%2B+Patch+%2B+Key+%2B+100%25+Working.html
4 http://icicibank.com/Personal-Banking/offers/offer-detail.page?id=offer-ezeego-domestic-airtravel-20141407112611060
5 http://nypost.com/2015/05/07/us-indifference-leaves-saudis-partnering-with-terrorists/
6 http://kienthuc.net.vn/diem-thi/diem-chuan-dh-cong-nghe-giao-thong-van-tai-nam-2014-482407.html
7 http://thenextweb.com/in/2015/04/16/india-wants-a-neutral-web-and-facebooks-internet-org-cant-be-a-part-of-it/gtm.js
8 http://tobogo.net/cdsb/board.php?board=greet&bm=view&no=5716&category=&auth=&page=1&search=&keywor

In [30]:
# feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth', 'Redirection', 'https_Domain',
#                   'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Web_Traffic', 'Domain_Age', 'Domain_End',
#                   'iFrame', 'Mouse_Over', 'Right_Click', 'Web_Forwards', 'Label']

feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth', 'Redirection', 'https_Domain',
                  'TinyURL', 'Prefix/Suffix','iFrame', 'Mouse_Over', 'Right_Click', 'Web_Forwards', 'Label']

legitimate = pd.DataFrame(legi_features, columns=feature_names)
legitimate.head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
0,graphicriver.net,0,0,1,1,0,0,0,0,0,1,1,0,0
1,ecnavi.jp,0,0,1,1,1,0,0,0,0,1,1,0,0
2,hubpages.com,0,0,1,1,0,0,0,0,0,1,1,0,0
3,extratorrent.cc,0,0,1,3,0,0,0,0,1,1,1,1,0
4,icicibank.com,0,0,1,3,0,0,0,0,1,1,1,0,0


In [31]:
# Storing the extracted legitimate URLs fatures to csv file
legitimate.to_csv('legitimate.csv', index= False)

## Phishing URLs:

In [32]:
phishurl.shape

(5000, 8)

In [41]:
#Extracting the feautres & storing them in a list
phish_features = []
label = 1
lim = 100
for i in range(0, 5000):
    url = phishurl['url'][i]
    print(i,url)
    # print(f'URL : {url} -> {featureExtraction(url, label)}')
    phish_features.append(featureExtraction(url, label))
    if len(phish_features)>=lim:
        break

0 https://docs.google.com/presentation/d/e/2PACX-1vTp6AsboSqcqhsJcv9jXQyPbINUGivfrNU1kX_AgFxtI6ZtpPkAZZxFxnYA0cbvn1_52mY3yAYZZ0_b/pub?start=false&loop=false&delayms=3000
1 https://s-teamg.com/p/wvc-jtrd/vrawqtgf/
2 https://cloudflare-ipfs.com/ipfs/bafybeibnczkxh6gtu5cbvpvstglkl4dgzexhifhdcnm2rascdthzjf4d7y
3 https://bafybeictw2hplh3akxmcbsiibnh6lrp7xgw7msarpxov25jdhksy3o5qaa.ipfs.cf-ipfs.com
4 https://brwtihfyemdrt.web.app/
5 https://click.pstmrk.it/2sm/link.waveapps.com%2Fv2jf5j-3zatkz/lixABy8N/oawo/nQGekMlRY4/QXJJbnZvaWNlRGVmYXVsdEVtYWls
6 https://cloudflare-ipfs.com/ipfs/bafkreiee6rgyuqcynb5w5e443p5hyjisa2sthn5qt47jsipcihphzhc2ae
7 https://bafybeig3tm7gtkz5abjlu4ucvimk6iutcbpqd6kaj2mot2qghqj3asger4.ipfs.cf-ipfs.com/
8 https://sites.google.com/view/suraenliinea/inicio
9 https://docs.google.com/presentation/d/e/2PACX-1vTHL2-Mw87tPrkwefdS6w1B3JeZ7-XWcWtqyGRcyn64TQNO_8TMI8ASwP3qj6x9I3acMBuom0Vq_Q5z/pub?start=false&loop=false&delayms=3000&slide=id.p
10 https://dsan97.webwave.dev/
11 http

In [42]:
#converting the list to dataframe
# feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection', 
#                       'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Web_Traffic', 
#                       'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over','Right_Click', 'Web_Forwards', 'Label']

feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection', 
                      'https_Domain', 'TinyURL', 'Prefix/Suffix', 'iFrame', 'Mouse_Over','Right_Click', 'Web_Forwards', 'Label']

phishing = pd.DataFrame(phish_features, columns= feature_names)
phishing.head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
0,docs.google.com,0,0,1,5,0,0,0,0,0,1,1,0,1
1,s-teamg.com,0,0,0,3,0,0,0,1,1,1,1,1,1
2,cloudflare-ipfs.com,0,0,1,2,0,0,0,1,0,1,1,0,1
3,bafybeictw2hplh3akxmcbsiibnh6lrp7xgw7msarpxov2...,0,0,1,0,0,0,0,1,0,1,1,0,1
4,brwtihfyemdrt.web.app,0,0,0,0,0,0,0,0,1,1,1,0,1


In [43]:
# Storing the extracted legitimate URLs fatures to csv file
phishing.to_csv('phishing.csv', index= False)

In [44]:
#Concatenating the dataframes into one 
urldata = pd.concat([legitimate, phishing]).reset_index(drop=True)
urldata.head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
0,graphicriver.net,0,0,1,1,0,0,0,0,0,1,1,0,0
1,ecnavi.jp,0,0,1,1,1,0,0,0,0,1,1,0,0
2,hubpages.com,0,0,1,1,0,0,0,0,0,1,1,0,0
3,extratorrent.cc,0,0,1,3,0,0,0,0,1,1,1,1,0
4,icicibank.com,0,0,1,3,0,0,0,0,1,1,1,0,0


In [45]:
urldata.tail()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
195,connectdappfix.pages.dev,0,0,0,0,0,0,0,0,0,1,1,0,1
196,security-page-community-standards.blogspot.com,0,0,1,0,0,0,1,1,0,1,1,0,1
197,wesley23-102961.weeblysite.com,0,0,0,0,0,0,0,1,1,1,1,0,1
198,site9615272.92.webydo.com,0,0,0,0,0,0,0,0,1,1,1,0,1
199,valid-pichincha.webcindario.com,0,0,1,2,0,0,0,1,0,1,1,0,1


In [46]:
urldata.shape

(200, 14)

In [47]:
# Storing the data in CSV file
urldata.to_csv('urldata.csv', index=False)