In [1]:
import pandas as pd
# from bs4 import BeautifulSoup
import requests
import re
import ipaddress
from urllib.parse import urlparse


#### Load phishing URL csv file

In [None]:
data0 = pd.read_csv("./Datafiles/online-valid.csv")
data0.head()

Unnamed: 0,url
0,http://u1047531.cp.regruhosting.ru/acces-inges...
1,http://hoysalacreations.com/wp-content/plugins...
2,http://www.accsystemprblemhelp.site/checkpoint...
3,http://www.accsystemprblemhelp.site/login_atte...
4,https://firebasestorage.googleapis.com/v0/b/so...


In [3]:
data0.shape

(14858, 1)

In [None]:
#collect random 5000 phishing URLs 
phishurl = data0.sample(n = 5000, random_state = 12).copy()
phishurl = phishurl.reset_index(drop=True)
phishurl.head()

Unnamed: 0,url
0,http://confirmprofileaccount.com/
1,http://www.marreme.com/MasterAdmin/04mop.html
2,http://modsecpaststudents.com/review/
3,https://docs.google.com/forms/d/e/1FAIpQLScL6L...
4,https://oportunidadedasemana.com/americanas//?...


In [5]:
phishurl.columns

Index(['url'], dtype='object')

In [6]:
phishurl.shape

(5000, 1)

#### Load legitimate url file csv

In [None]:
data1 = pd.read_csv("Datafiles/Benign_list_big_final.csv")
data1.columns = ['url']
data1.head()

Unnamed: 0,url
0,http://1337x.to/torrent/1048648/American-Snipe...
1,http://1337x.to/torrent/1110018/Blackhat-2015-...
2,http://1337x.to/torrent/1122940/Blackhat-2015-...
3,http://1337x.to/torrent/1124395/Fast-and-Furio...
4,http://1337x.to/torrent/1145504/Avengers-Age-o...


In [None]:
# collect random 5000 Legitimate URLs 
legiurl = data1.sample(n = 5000, random_state = 12).copy()
legiurl = legiurl.reset_index(drop=True)
legiurl.columns
legiurl.head()


Unnamed: 0,url
0,http://graphicriver.net/search?date=this-month...
1,https://foursquare.com/v/%D1%81%D0%B0%D0%B2%D0...
2,http://shop-pro.jp/magazine/wp-content/themes/...
3,http://motthegioi.vn/the-gioi-xe/o-to-chat-luo...
4,http://tobogo.net/cdsb/board.php?board=dailyst...


In [9]:
legiurl.columns

Index(['url'], dtype='object')

In [10]:
legiurl.shape

(5000, 1)

### **Feature extraction**

#### **Address Bar Based Features:**
*   Domain of URL
*   IP Address in URL
*   "@" Symbol in URL
*   Length of URL
*   Depth of URL
*   Redirection "//" in URL
*   "http/https" in Domain name
*   Using URL Shortening Services
*   Prefix or Suffix "-" in Domain

#### **HTML/JS Based features**
*   IFrame Redirection
*   Status Bar Customization
*   Disabling Right Click
*   Website Forwarding


In [None]:
'''
ADDRESS BAR BASED FEATURES
'''
def getDomain(url):  
  domain = urlparse(url).netloc
  if re.match(r"^www.",domain):
    domain = domain.replace("www.","")
  return domain

def havingIP(url):
  try:
    ipaddress.ip_address(url)
    ip = 1
  except:
    ip = 0
  return ip

def haveAtSign(url):
  if "@" in url:
    at = 1    
  else:
    at = 0    
  return at

def getLength(url):
  if len(url) < 54:
    length = 0            
  else:
    length = 1            
  return length

def getDepth(url):
  s = urlparse(url).path.split('/')
  depth = 0
  for j in range(len(s)):
    if len(s[j]) != 0:
      depth = depth+1
  return depth

def redirection(url):
  pos = url.rfind('//')
  if pos > 6:
    if pos > 7:
      return 1
    else:
      return 0
  else:
    return 0
  
def httpDomain(url):
  domain = urlparse(url).netloc
  if 'https' in domain:
    return 1
  else:
    return 0

# shortening services
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"

def tinyURL(url):
    match=re.search(shortening_services,url)
    if match:
        return 1
    else:
        return 0
    
def prefixSuffix(url):
    if '-' in urlparse(url).netloc:
        return 1            # phishing
    else:
        return 0            # legitimate

'''
HTML-JS Based features
'''
# Feature: IFrame Redirection
def iframe(response):
  if response == "":
      return 1
  else:
    if re.findall(r"[<iframe>|<frameBorder>]", response.text):
        return 0
    else:
        return 1

# Feature: Status Bar Customization
def mouseOver(response): 
  if response == "" :
    return 1
  else:
    if re.findall("<script>.+onmouseover.+</script>", response.text):
      return 1
    else:
      return 0

# Feature disable right click
def rightClick(response):
  if response == "":
    return 1
  else:
    if re.findall(r"event.button ?== ?2", response.text):
      return 0
    else:
      return 1

# Feature website forwarding
def forwarding(response):
  if response == "":
    return 1
  else:
    if len(response.history) <= 2:
      return 0
    else:
      return 1

### Feature Extraction

In [12]:
#version1

# import os
# import csv

# #Function to extract features
# def featureExtraction(url,label, output_file='features.csv'):
#   features = []
#   #Address bar based features (10)
#   features.append(getDomain(url))
#   features.append(havingIP(url))
#   features.append(haveAtSign(url))
#   features.append(getLength(url))
#   features.append(getDepth(url))
#   features.append(redirection(url))
#   features.append(httpDomain(url))
#   features.append(tinyURL(url))
#   features.append(prefixSuffix(url))
  
#   # HTML & Javascript based features (4)
#   try:
#     response = requests.get(url)
#   except:
#     response = ""
#   features.append(iframe(response))
#   features.append(mouseOver(response))
#   features.append(rightClick(response))
#   features.append(forwarding(response))
#   features.append(label)
  
#   return features

#   # file_exists = os.path.isfile(output_file)
#   # # Column names for the CSV
#   # headers = ["Domain", "Having_IP", "Have_At_Sign", "URL_Length", "URL_Depth", 
#   #   "Redirection", "HTTP_Domain", "Tiny_URL", "Prefix_Suffix",
#   #   "Iframe", "Mouse_Over", "Right_Click", "Forwarding", "Label"
#   # ]
    
  

#   # Write features to CSV
#   # with open(output_file, mode='a', newline='') as file:
#   #   writer = csv.writer(file)
#   #   if not file_exists:  # Write headers if file is new
#   #     writer.writerow(headers)
#   #   writer.writerow(features)

#   # return features

In [None]:
import csv
import os

def featureExtraction(url, label, output_file='features.csv'):
    features = []
    
    # Address bar-based features (10)
    features.append(getDomain(url))
    features.append(havingIP(url))
    features.append(haveAtSign(url))
    features.append(getLength(url))
    features.append(getDepth(url))
    features.append(redirection(url))
    features.append(httpDomain(url))
    features.append(tinyURL(url))
    features.append(prefixSuffix(url))
    
    # HTML & Javascript-based features (4)
    try:
        # timeout of 3 seconds
        response = requests.get(url, timeout=3)
    except requests.exceptions.RequestException:
        response = ""  # empty response if there's an error or timeout
    
    features.append(iframe(response))
    features.append(mouseOver(response))
    features.append(rightClick(response))
    features.append(forwarding(response))
    features.append(label)
    
    # if the file already exists
    file_exists = os.path.isfile(output_file)
    
    # column names for the CSV
    headers = ["Domain", "Having_IP", "Have_At_Sign", "URL_Length", "URL_Depth", 
               "Redirection", "HTTP_Domain", "Tiny_URL", "Prefix_Suffix",
               "Iframe", "Mouse_Over", "Right_Click", "Forwarding", "Label"]
    
    # write to CSV
    with open(output_file, 'a', newline='') as file:
        writer = csv.writer(file)
        if not file_exists:
            writer.writerow(headers)  # Write headers if the file doesn't exist
        writer.writerow(features)  # Write the extracted features
    
    return features


In [None]:
from tqdm import tqdm 

def extract_features(data, label, output_file):
    for url in tqdm(data['url'], desc=f"Extracting {label} features"):
        featureExtraction(url, label, output_file)

# extract features for legitimate URLs
extract_features(legiurl, "benign", "legitimate_features.csv")

# extract features for phishing URLs
extract_features(phishurl, "phishing", "phishing_features.csv")

legi_features = pd.read_csv("legitimate_features.csv")
phish_features = pd.read_csv("phishing_features.csv")

# combine legitimate and phishing features
combined_features = pd.concat([legi_features, phish_features], ignore_index=True)

# save combined_features as a CSV
combined_features.to_csv("combined_features.csv", index=False)

print("Feature extraction and combination completed. Saved to 'combined_features.csv'.")

Extracting benign features: 100%|██████████| 5000/5000 [33:42<00:00,  2.47it/s]  
Extracting phishing features: 100%|██████████| 5000/5000 [1:04:52<00:00,  1.28it/s]

Feature extraction and combination completed. Saved to 'combined_features.csv'.





####  Final feature extracted file

In [15]:
featureExt_file = pd.read_csv('combined_features.csv')
featureExt_file.head(10)
featureExt_file.shape


(15166, 14)