In [1]:
import pandas as pd
from urllib.parse import urlparse,parse_qs
import math
from collections import Counter
import string
import tldextract
from collections import Counter
import re
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import ipaddress



In [2]:
url_df = pd.read_csv('url_dataset.csv')

In [3]:
# Remove any duplicates in the combined dataset, although there shouldn't be any at this point
url_df = url_df.drop_duplicates()

# Save the combined dataframe to a new CSV file
url_df.to_csv('url_dataset.csv', index=False)

# Print the number of lines in the final CSV
print(f"Total number of URLs: {len(url_df)}")

# Print the number of samples for each class
print("Number of samples per class:")
print(url_df['label'].value_counts())

Total number of URLs: 972019
Number of samples per class:
label
benign      773818
phishing    140608
malware      54470
spam          3123
Name: count, dtype: int64


In [4]:
# Define the desired final sizes for the training set for each class
desired_train_sizes = {'benign': 58132, 'phishing': 14374, 'malware': 31851}

# Preserve all the 'spam' samples
spam_subset = url_df[url_df['label'] == 'spam']

# Initialize an empty DataFrame for the training set
train_df = pd.DataFrame()

# For each class in the desired train sizes, randomly sample the desired number of instances for the training set
for label, size in desired_train_sizes.items():
    # Filter the DataFrame for the current class
    class_subset = url_df[url_df['label'] == label]
    
    # Sample the desired number of instances for the training set
    train_subset = class_subset.sample(n=size, random_state=42)
    
    # Append the sampled subset to the training DataFrame
    train_df = pd.concat([train_df, train_subset])

# Append the 'spam' samples to the training DataFrame
train_df = pd.concat([train_df, spam_subset])

# Reset the index of the training DataFrame
train_df.reset_index(drop=True, inplace=True)

# Optionally, save the training dataframe to a new CSV file
train_df.to_csv('smaller_url_dataset.csv', index=False)

# Print the number of samples for each class in the training set
print("Number of samples per class in the dataset:")
print(train_df['label'].value_counts())

Number of samples per class in the training set:
label
benign      58132
malware     31851
phishing    14374
spam         3123
Name: count, dtype: int64


In [5]:
url_df = pd.DataFrame()
url_df = pd.read_csv('smaller_url_dataset.csv')

In [6]:
# Define a regular expression pattern for characters you consider safe
# This example allows alphanumeric characters, some special characters, and Unicode characters in the specified range
safe_pattern = re.compile(r'^[a-zA-Z0-9\-._~:/?#\[\]@!$&\'()*+,;=%]+$')

# Filter the DataFrame to keep only rows with URLs matching the pattern
# Note: Adjust the pattern as needed based on the characters you want to include
url_df = url_df[url_df['url'].apply(lambda x: bool(safe_pattern.match(x)) if pd.notnull(x) else False)]

# Print the number of lines in the final CSV
print(f"Total number of URLs: {len(url_df)}")

# Print the number of samples for each class
print("Number of samples per class:")
print(url_df['label'].value_counts())

Total number of URLs: 107114
Number of samples per class:
label
benign      57821
malware     31850
phishing    14320
spam         3123
Name: count, dtype: int64


In [7]:
# If url_df is derived from another DataFrame and might be a view, create an independent copy
url_df = url_df.copy()
def extract_url_components(url):
    try:
        # Preprocessing step to clean up common URL scheme errors
        url = url.replace('http://http://', 'http://').replace('http://https://', 'https://')
        url = url.replace('http://htttt://', 'http://').replace('http://ttp://', 'http://').replace('http://ttps://', 'https://')
        
        # Ensure the URL has a scheme for proper parsing
        if not urlparse(url).scheme and not url.startswith(('http://', 'https://')):
            url = 'http://' + url
        
        # Extract domain components using tldextract
        extracted = tldextract.extract(url)
        full_domain = f"{extracted.subdomain + '.' if extracted.subdomain else ''}{extracted.domain}.{extracted.suffix}" if extracted.domain and extracted.suffix else "no_domain"
        
        # Parse the URL to extract path and query
        parsed_url = urlparse(url)
        path = parsed_url.path if parsed_url.path else "/"
        query = parsed_url.query if parsed_url.query else "no_query"
        
        return full_domain, path, query
    except Exception as e:
        print(f"Error parsing URL {url}: {e}")
        return "no_domain", "/", "no_query"

# Assuming url_df is your DataFrame and it contains a 'url' column
# Apply the function and store the result in a temporary variable
temp_results = url_df['url'].apply(lambda x: pd.Series(extract_url_components(x)))

# Explicitly set the new columns using .loc to avoid SettingWithCopyWarning
url_df.loc[:, 'domain'] = temp_results[0]
url_df.loc[:, 'path_url'] = temp_results[1]
url_df.loc[:, 'query_url'] = temp_results[2]


In [8]:
url_df

Unnamed: 0,url,label,domain,path_url,query_url
0,https://www.amazon.com/Punch-Vincent-Gale/dp/B...,benign,www.amazon.com,/Punch-Vincent-Gale/dp/B000A2X47O,no_query
1,https://www.startreklinks.net/series-movies/vo...,benign,www.startreklinks.net,/series-movies/voyager.html,no_query
2,http://torcache.net/torrent/00611B9CA7EDC70114...,benign,torcache.net,/torrent/00611B9CA7EDC70114C4D6F445EC980380748...,title=[kickass.to]arma.iii.brick
3,https://www.thirdworldtraveler.com/American_Em...,benign,www.thirdworldtraveler.com,/American_Empire/American_Empire_page.html,no_query
4,syndicalist.org/archives/llr14-24/14f.shtml,benign,syndicalist.org,/archives/llr14-24/14f.shtml,no_query
...,...,...,...,...,...
107475,jfm-www.damtp.cam.ac.uk,spam,jfm-www.damtp.cam.ac.uk,/,no_query
107476,jic-bioinfo.bbsrc.ac.uk,spam,jic-bioinfo.bbsrc.ac.uk,/,no_query
107477,jobs.gbdirect.co.uk,spam,jobs.gbdirect.co.uk,/,no_query
107478,kiddyleague.co.uk,spam,kiddyleague.co.uk,/,no_query


In [9]:
import pandas as pd

# Assuming 'url_df' is your DataFrame
# Creating a mask for rows with complex empty conditions
complex_empty_mask = (url_df.isnull() | url_df.eq("") | url_df.eq("None") | url_df.eq("invalid_url") | url_df.eq("N/A")).any(axis=1)

# Using the mask to filter rows and then getting their indices
indices_of_complex_empty_rows = url_df[complex_empty_mask].index

# Displaying the indices
print(indices_of_complex_empty_rows)


Index([], dtype='int64')


In [10]:
url_df['url_length'] = url_df['url'].apply(lambda url: len(url))
url_df['domain_length'] = url_df['domain'].apply(lambda domain: len(domain) if domain else 0)

In [11]:
def character_distribution(text):
    if not text:
        return {}
    # Consider all printable characters, removing spaces
    text_cleaned = ''.join(filter(lambda x: x in string.printable and x != ' ', text))
    counter = Counter(text_cleaned)
    total = sum(counter.values())
    distribution = {char: count / total for char, count in counter.items()}
    return distribution


def kl_divergence(p, q):
    epsilon = 1e-10
    divergence = sum(p[char] * np.log2(p[char] / (q.get(char, epsilon))) for char in p)
    return divergence

# Example: Uniform distribution across all printable characters, excluding spaces
all_chars = ''.join(filter(lambda x: x in string.printable and x != ' ', string.printable))
reference_distribution = {char: 1/len(all_chars) for char in all_chars}


def calculate_url_kl_divergence(url):
    url_distribution = character_distribution(url)
    return kl_divergence(url_distribution, reference_distribution)

def calculate_domain_kl_divergence(url):
    domain = tldextract.extract(url).registered_domain
    domain_distribution = character_distribution(domain)
    return kl_divergence(domain_distribution, reference_distribution)

def calculate_path_kl_divergence(url):
    path = urlparse(url).path
    path_distribution = character_distribution(path)
    return kl_divergence(path_distribution, reference_distribution)

def calculate_query_kl_divergence(url):
    query = urlparse(url).query
    query_distribution = character_distribution(query)
    return kl_divergence(query_distribution, reference_distribution)

def calculate_query_path_kl_divergence(url):
    path = urlparse(url).path
    query = urlparse(url).query
    path_distribution = character_distribution(path)
    query_distribution = character_distribution(query)
    return kl_divergence(path_distribution, query_distribution)


In [12]:
url_df['url_kl_divergence'] = url_df['url'].apply(calculate_url_kl_divergence)
url_df['domain_kl_divergence'] = url_df['url'].apply(calculate_domain_kl_divergence)
url_df['path_kl_divergence'] = url_df['url'].apply(calculate_path_kl_divergence)
url_df['query_kl_divergence'] = url_df['url'].apply(calculate_query_kl_divergence)
url_df['query_path_kl_divergence'] = url_df['url'].apply(calculate_query_path_kl_divergence)


In [13]:
# Number of special symbols
def count_special_symbols(code):
    special_symbols = set('@#$%^&*()_-+={}[]|\:;"<>,.?/~`')
    return sum(1 for char in code if char in special_symbols)

url_df['num_special_symbols'] = url_df['url'].apply(count_special_symbols)

In [14]:
# # Function to check for IP address presence
def contains_ip_address(url):
    # Regular expression to match IP addresses within a string
    ip_pattern = r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
    
    # Find all occurrences of potential IP addresses in the URL
    potential_ips = re.findall(ip_pattern, url)
    
    for potential_ip in potential_ips:
        try:
            # Check if the extracted string is a valid IP address
            ipaddress.ip_address(potential_ip)
            return 1  # Valid IP address found
        except ValueError:
            continue  # Not a valid IP address, move to the next match

    # No valid IP addresses found in the URL
    return 0

    
url_df['contains_ip'] = url_df['url'].apply(contains_ip_address)


In [15]:
# Need implementation of - brand name domains - merge with the csv.

In [16]:
# Function to preprocess and tokenize URLs
def tokenize_url(url):
    tokens = []
    url_parts = urlparse(url)

    # Tokenize domain
    domain_tokens = url_parts.netloc.split('.')
    tokens.extend(domain_tokens)

    # Tokenize path
    path_tokens = re.split('/|-|_', url_parts.path)
    tokens.extend(filter(None, path_tokens))  # filter removes empty strings

    # Tokenize query parameters
    query_tokens = re.split('=|&', url_parts.query)
    tokens.extend(filter(None, query_tokens))

    return ' '.join(tokens)

# Apply tokenization to each URL
url_df['tokens'] = url_df['url'].apply(tokenize_url)


In [17]:
def check_file_extensions(url):
    suspicious_file_extensions = [
        "exe", "scr", "vbs", "js", "xml", "docm", "xps", "iso", "img", "doc",
        "rtf", "xls", "pdf", "pub", "arj", "lzh", "r01", "r14", "r18", "r25",
        "tar", "ace", "zip", "jar", "bat", "cmd", "moz", "vb", "vbs", "js",
        "wsc", "wsh", "ps1", "ps1xml", "ps2", "ps2xml", "psc1", "psc2"
    ]

    # Extract the file extension from the URL
    match = re.search(r'\.([a-zA-Z0-9]+)$', url)
    if match:
        extension = match.group(1).lower()
        if extension in suspicious_file_extensions:
            return 1
    return 0

url_df['presence_of_suspicious_file_extensions'] = url_df['url'].apply(check_file_extensions)


In [18]:
# Function to calculate the frequency of digits
def digit_frequency(url):
    if not url:
        return 0
    digits = sum(c.isdigit() for c in url)
    return digits / len(url)

def count_subdomains(url):
    subdomain = tldextract.extract(url).subdomain
    if subdomain:
        return subdomain.count('.') + 1  # Adding 1 because subdomains are separated by dots
    return 0

url_df['sub_domains_count'] = url_df['url'].apply(count_subdomains)
url_df['digit_frequency'] = url_df['url'].apply(digit_frequency)


In [19]:
# Function to count top-level domains
def count_tlds(url):
    if len(tldextract.extract(url).suffix.split(".")):
        return len(tldextract.extract(url).suffix.split("."))
    else:
        return 0

url_df['count_tlds'] = url_df['url'].apply(count_tlds)


In [20]:
def is_short_url(url):
    # List of known short URL services
    known_shorteners = {
        "bit.ly", "goo.gl", "tinyurl.com", "is.gd", "cli.gs", "pic.gd", "tweetphoto",
        "DwarfURL.com", "ow.ly", "yfrog.com", "migre.me", "ff.im", "tiny.cc", "url4.eu",
        "tr.im", "twit.ac", "su.pr", "twurl.nl", "snipurl.com", "BudURL.com", "short.to",
        "ping.fm", "Digg.com", "post.ly", "Just.as", ".tk", "bkite.com", "snipr.com",
        "flic.kr", "loopt.us", "doiop.com", "twitthis.com", "htxt.it", "AltURL.com",
        "RedirX.com", "DigBig.com", "short.ie", "u.mavrev.com", "kl.am", "wp.me", "u.nu",
        "rubyurl.com", "om.ly", "linkbee.com", "Yep.it", "posted.at", "xrl.us", "metamark.net",
        "sn.im", "hurl.ws", "eepurl.com", "idek.net", "urlpire.com", "chilp.it", "moourl.com",
        "snurl.com", "xr.com", "lin.cr", "EasyURI.com", "zz.gd", "ur1.ca", "URL.ie", "adjix.com",
        "twurl.cc", "s7y.us", "shrinkify", "EasyURL.net", "atu.ca", "sp2.ro", "Profile.to", "ub0.cc",
        "minurl.fr", "cort.as", "fire.to", "2tu.us", "twiturl.de", "to.ly", "BurnURL.com", "nn.nf", "clck.ru",
        "notlong.com", "thrdl.es", "spedr.com", "vl.am", "miniurl.com", "virl.com", "PiURL.com", "1url.com",
        "gri.ms", "tr.my", "Sharein.com", "urlzen.com", "fon.gs", "Shrinkify.com", "ri.ms", "b23.ru", "Fly2.ws",
        "xrl.in", "Fhurl.com", "wipi.es", "korta.nu", "shortna.me", "fa.b", "WapURL.co.uk", "urlcut.com", "6url.com",
        "abbrr.com", "SimURL.com", "klck.me", "x.se", "2big.at", "url.co.uk", "ewerl.com", "inreply.to", "TightURL.com",
        "a.gg", "tinytw.it", "zi.pe", "riz.gd", "hex.io", "fwd4.me", "bacn.me", "shrt.st", "ln-s.ru", "tiny.pl", "o-x.fr",
        "StartURL.com", "jijr.com", "shorl.com", "icanhaz.com", "updating.me", "kissa.be", "hellotxt.com", "pnt.me", "nsfw.in",
        "xurl.jp", "yweb.com", "urlkiss.com", "QLNK.net", "w3t.org", "lt.tl", "twirl.at", "zipmyurl.com", "urlot.com", "a.nf",
        "hurl.me", "URLHawk.com", "Tnij.org", "4url.cc", "firsturl.de", "Hurl.it", "sturly.com", "shrinkster.com", "ln-s.net",
        "go2cut.com", "liip.to", "shw.me", "XeeURL.com", "liltext.com", "lnk.gd", "xzb.cc", "linkbun.ch", "href.in", "urlbrief.com",
        "2ya.com", "safe.mn", "shrunkin.com", "bloat.me", "krunchd.com", "minilien.com", "ShortLinks.co.uk", "qicute.com", "rb6.me",
        "urlx.ie", "pd.am", "go2.me", "tinyarro.ws", "tinyvid.io", "lurl.no", "ru.ly", "lru.jp", "rickroll.it", "togoto.us", "ClickMeter.com",
        "hugeurl.com", "tinyuri.ca", "shrten.com", "shorturl.com", "Quip-Art.com", "urlao.com", "a2a.me", "tcrn.ch", "goshrink.com",
        "DecentURL.com", "decenturl.com", "zi.ma", "1link.in", "sharetabs.com", "shoturl.us", "fff.to", "hover.com", "lnk.in", "jmp2.net",
        "dy.fi", "urlcover.com", "2pl.us", "tweetburner.com", "u6e.de", "xaddr.com", "gl.am", "dfl8.me", "go.9nl.com", "gurl.es", "C-O.IN",
        "TraceURL.com", "liurl.cn", "MyURL.in", "urlenco.de", "ne1.net", "buk.me", "rsmonkey.com", "cuturl.com", "turo.us", "sqrl.it", "iterasi.net",
        "tiny123.com", "EsyURL.com", "urlx.org", "IsCool.net", "twitterpan.com", "GoWat.ch", "poprl.com", "njx.me"
    }

    extracted = tldextract.extract(url)
    domain = f"{extracted.domain}.{extracted.suffix}"

    # Normalize domain to handle cases like 'www.bit.ly'
    normalized_domain = domain.lower().replace('www.', '')

    return 1 if normalized_domain in known_shorteners else 0


url_df['is_short_url'] = url_df['url'].apply(is_short_url)


In [21]:
url_df

Unnamed: 0,url,label,domain,path_url,query_url,url_length,domain_length,url_kl_divergence,domain_kl_divergence,path_kl_divergence,query_kl_divergence,query_path_kl_divergence,num_special_symbols,contains_ip,tokens,presence_of_suspicious_file_extensions,sub_domains_count,digit_frequency,count_tlds,is_short_url
0,https://www.amazon.com/Punch-Vincent-Gale/dp/B...,benign,www.amazon.com,/Punch-Vincent-Gale/dp/B000A2X47O,no_query,55,14,1.891344,3.907429,2.199043,0.000000,28.788968,10,0,www amazon com Punch Vincent Gale dp B000A2X47O,0,1,0.109091,1,0
1,https://www.startreklinks.net/series-movies/vo...,benign,www.startreklinks.net,/series-movies/voyager.html,no_query,56,21,2.479037,3.409828,2.791317,0.000000,29.381241,9,0,www startreklinks net series movies voyager.html,0,1,0.000000,1,0
2,http://torcache.net/torrent/00611B9CA7EDC70114...,benign,torcache.net,/torrent/00611B9CA7EDC70114C4D6F445EC980380748...,title=[kickass.to]arma.iii.brick,109,12,1.752758,3.544394,2.344516,2.895898,22.289257,14,0,torcache net torrent 00611B9CA7EDC70114C4D6F44...,0,0,0.266055,1,0
3,https://www.thirdworldtraveler.com/American_Em...,benign,www.thirdworldtraveler.com,/American_Empire/American_Empire_page.html,no_query,76,26,2.330837,2.988107,2.748562,0.000000,29.338486,11,0,www thirdworldtraveler com American Empire Ame...,0,1,0.000000,1,0
4,syndicalist.org/archives/llr14-24/14f.shtml,benign,syndicalist.org,/archives/llr14-24/14f.shtml,no_query,43,15,2.296570,2.989133,2.296570,0.000000,28.886494,6,0,syndicalist.org archives llr14 24 14f.shtml,0,0,0.139535,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107475,jfm-www.damtp.cam.ac.uk,spam,jfm-www.damtp.cam.ac.uk,/,no_query,23,23,3.160780,4.126098,3.160780,0.000000,29.750704,5,0,jfm www.damtp.cam.ac.uk,0,2,0.000000,2,0
107476,jic-bioinfo.bbsrc.ac.uk,spam,jic-bioinfo.bbsrc.ac.uk,/,no_query,23,23,3.019688,3.715380,3.019688,0.000000,29.609612,4,0,jic bioinfo.bbsrc.ac.uk,0,1,0.000000,2,0
107477,jobs.gbdirect.co.uk,spam,jobs.gbdirect.co.uk,/,no_query,19,19,2.947476,3.107716,2.947476,0.000000,29.537400,3,0,jobs.gbdirect.co.uk,0,1,0.000000,2,0
107478,kiddyleague.co.uk,spam,kiddyleague.co.uk,/,no_query,17,17,3.130129,3.130129,3.130129,0.000000,29.720053,2,0,kiddyleague.co.uk,0,0,0.000000,2,0


In [22]:
print(url_df.isnull().sum())


url                                       0
label                                     0
domain                                    0
path_url                                  0
query_url                                 0
url_length                                0
domain_length                             0
url_kl_divergence                         0
domain_kl_divergence                      0
path_kl_divergence                        0
query_kl_divergence                       0
query_path_kl_divergence                  0
num_special_symbols                       0
contains_ip                               0
tokens                                    0
presence_of_suspicious_file_extensions    0
sub_domains_count                         0
digit_frequency                           0
count_tlds                                0
is_short_url                              0
dtype: int64


In [23]:
# Filter rows where 'domain' is 'no_domain'
no_domain_rows = url_df[url_df['domain'] == 'no_domain']

# Check if all these filtered rows have 'contains_ip' set to 1
all_have_ip = no_domain_rows['contains_ip'].all()

print("All 'no_domain' rows contain IP:", all_have_ip)



All 'no_domain' rows contain IP: True


In [24]:
url_df.to_csv('smaller_url_dataset_features.csv', index=False)
