In [1]:
# import dns.resolver
from urllib.parse import urlparse
from datetime import date
import ipaddress
import re
import pandas as pd
# import whois
# import dns


### 1. Uploading the Datasets into Dataframes

In [18]:
df1 = pd.read_csv("../Raw Data/Dataset #1.csv")
df2 = pd.read_csv("../Raw Data/Dataset #2.csv")
df3 = pd.read_csv("../Raw Data/Dataset #3.csv")

In [4]:
df1.head() #Reading in the first 5 entries of the first dataset

Unnamed: 0,url,type
0,https://www.google.com,legitimate
1,https://www.youtube.com,legitimate
2,https://www.facebook.com,legitimate
3,https://www.baidu.com,legitimate
4,https://www.wikipedia.org,legitimate


In [19]:
df2.head() #Reading in the first 5 entries of the second dataset

Unnamed: 0,url,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_and,nb_or,...,domain_in_title,domain_with_copyright,whois_registered_domain,domain_registration_length,domain_age,web_traffic,dns_record,google_index,page_rank,status
0,http://www.crestonwood.com/router.php,37,19,0,3,0,0,0,0,0,...,0,1,0,45,-1,0,1,1,4,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,77,23,1,1,0,0,0,0,0,...,1,0,0,77,5767,0,0,1,2,phishing
2,https://support-appleld.com.secureupdate.duila...,126,50,1,4,1,0,1,2,0,...,1,0,0,14,4004,5828815,0,1,0,phishing
3,http://rgipt.ac.in,18,11,0,2,0,0,0,0,0,...,1,0,0,62,-1,107721,0,0,3,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,55,15,0,2,2,0,0,0,0,...,0,1,0,224,8175,8725,0,0,6,legitimate


In [20]:
#The third dataset includes urls that fall under the "defacement" or "malware" category. 
# To keep it more consise, we will remove the entries that are labeled as "defacement" or "malware" .

df3 = df3.loc[df3['type'] != 'defacement'] 
df3 = df3.loc[df3['type'] != 'malware'] 

In [21]:
df3.head() #Reading in the first 5 entries of the third dataset

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
5,http://buzzfil.net/m/show-art/ils-etaient-loin...,benign
6,espn.go.com/nba/player/_/id/3457/brandon-rush,benign


In [6]:
#Creating a default Dataframe that contains the features of each dataset

df =pd.DataFrame(columns=['URL', 'Domain', 'IP_Address', '@_Symbol', 'URL_Length', 
                            'Http/https_in_Domain', 'Prefix/Suffix_in_Domain', 
                            'Tiny_URL', 'Depth_Of_URL', "Redirection", 'Num-of_Dots', 'Num_of_Hyphens', 'Num_of_Underscore' 'Label'])
df.head()


Unnamed: 0,URL,Domain,IP_Address,@_Symbol,URL_Length,Http/https_in_Domain,Prefix/Suffix_in_Domain,Tiny_URL,Depth_Of_URL,Redirection,Num-of_Dots,Num_of_Hyphens,Num_of_UnderscoreLabel


### 2. Creating Functions to Extract Features

In [None]:
def get_domain_name(url):
    """
    This fucntion gets the domain name of the URL.

    Parameters:
    -----------
        URL (string)

    Returns:
    --------
        domain (string): The domain part of the URL.
    """

    domain = urlparse(url).netloc

    if isinstance(domain, bytes):
        domain = domain.decode('utf-8')

    if re.match(r'^www\.', domain):
        domain = domain.removeprefix("www.")

    return domain


In [None]:
def is_ip_in_domain(url):
    """
    This function checks whether the given URL contains an IP address in the domain part.

    Parameters:
    -----------
        URL (string)

    Returns:
    --------
        int: Returns 1 if the domain part of the URL is an IP address,
             otherwise returns 0.
    """
    domain = urlparse(url).netloc

    try:
        if ipaddress.ip_address(domain):
            return 1
    except ValueError:
        return 0


In [None]:
def contains_at_symbol(url):
    """
    This fucntion checks whether the given URL contains an '@' symbol.

    Parameters:
    -----------
        URL (str): The URL to check.

    Returns:
    --------
        int: Returns 1 if the '@' symbol is present in the URL,
             otherwise returns 0.
    """
    if '@' in url:
        return 1
    else:
        return 0

In [None]:
def get_url_length(url):
    """
    This fucntion checks how long a URL is.

    Parameters:
    -----------
        URL (str): The URL to check.

    Returns:
    --------
        int: Returns 0 if the length of a URL is less than 54,
             otherwise returns 1.
    """
    if len(url) < 54:
        return 0
    else:
        return 1
        

In [None]:
def is_protocol_in_domain(url):
    """
    This fucntion checks if an "HTTP" or "HTTPS" token is used in the domain part of the URL.

    Parameters:
    -----------
        URL (str): The URL to check.

    Returns:
    --------
        int: Returns 1 if "HTTP" or "HTTPS" is in the domain,
             otherwise returns 0.
    """
    domain = urlparse(url).netloc
    if "http" in domain:
        return 1
    elif "https" in domain:
        return 1
    else:
        return 0

In [None]:
def is_prefix_in_domain(url):
    """
    This fucntion checks if the domain part of a URL has a dash symbol (-).

    Parameters:
    -----------
        URL (str): The URL to check.

    Returns:
    --------
        int: Returns 1 if the dash symbol is in the domain,
             otherwise returns 0.
    """
    domain = urlparse(url).netloc
    if "-" in domain:
        return 1
    else:
        return 0

In [None]:
def is_tiny_url(url):
    """
    This fucntion checks if a URL shortening service is being used.

    Parameters:
    -----------
        URL (str): The URL to check.

    Returns:
    --------
        int: Returns 1 if the domain is part of a URL shortening service,
             otherwise returns 0.
    """
    with open("shortening_services.txt", 'r') as f:
        shortening_services = set(line.strip().lower() for line in f if line.strip())

    domain = urlparse(url).netloc
    if domain in shortening_services:
        return 1
    else:
        return 0
    

In [14]:
def redirection(url):
    """
    This function checks if the URL contains '//' beyond the protocol part.

    Parameters:
    -----------
        URL (str): The URL to check.

    Returns:
    --------
        int: Returns 1 if an additional '//' is found beyond the protocol,
         otherwise returns 0.
    """
    first_idx = url.find("//")
    last_idx = url.rfind("//")
    
    if first_idx != last_idx:
        return 1
    return 0

In [None]:
def depth_of_url(url):
    """
    This function checks the depth of a URL based on the number of "/".

    Parameters:
    -----------
        URL (str): The URL to check.

    Returns:
    --------
        int: Returns the number of "/"-separated segments in the path.
    """
    segment = urlparse(url).path.split('/')
    depth = 0
    for i in range(len(segment)):
        if len(segment[i]) != 0:
            depth = depth+1
    return depth

### 3. Extracting Features and exporting the Dataframe

In [None]:
def extract_features(df):
    """
    This function extracts the features of a URLs that are in a Dataframe.

    Parameters:
    -----------
        df (Dataframe): Dataframe with a 'url' named column which contains URLs.

    Returns:
    --------
        pd.DataFrame (Dataframe): Returns a new Dataframe that contains the features of the processed Dataframe that was passed.
    """
    
    return pd.DataFrame({
        'URL': df['url'],
        'Domain': df['url'].apply(get_domain_name),
        'IP_Address': df['url'].apply(is_ip_in_domain).astype(int),
        'Prefix/Suffix_in_Domain': df['url'].apply(is_prefix_in_domain).astype(int),
        'Tiny_URL': df['url'].apply(is_tiny_url).astype(int),
        '@_Symbol': df['url'].apply(contains_at_symbol).astype(int),
        'URL_Length': df['url'].apply(get_url_length),
        'Http/https_in_Domain': df['url'].apply(is_protocol_in_domain).astype(int),
        'Depth_Of_URL': df['url'].apply(depth_of_url),
        'Redirection': df['url'].apply(redirection).astype(int),
        'Num_of_Dots': df['url'].str.count('\.'),
        'Num_of_Hyphens': df['url'].str.count('-'),
        'Num_of_Underscore': df['url'].str.count('_'),
        'Label': df['type']
    })

In [33]:
df11 = extract_features(df1) #Extracting the featurs of the first dataset

In [34]:
df11.tail() #Dsiplaying the last five entries of the dataset that contains the features of Datatset #1

Unnamed: 0,URL,Domain,IP_Address,Prefix/Suffix_in_Domain,Tiny_URL,@_Symbol,URL_Length,Http/https_in_Domain,Depth_Of_URL,Redirection,Num_of_Dots,Num_of_Hyphens,Num_of_Underscore,Label
450171,http://ecct-it.com/docmmmnn/aptgd/index.php,ecct-it.com,0,1,0,0,0,0,3,0,2,1,0,phishing
450172,http://faboleena.com/js/infortis/jquery/plugin...,faboleena.com,0,0,0,0,1,0,12,0,2,0,2,phishing
450173,http://faboleena.com/js/infortis/jquery/plugin...,faboleena.com,0,0,0,0,1,0,11,0,1,0,1,phishing
450174,http://atualizapj.com/,atualizapj.com,0,0,0,0,0,0,0,0,1,0,0,phishing
450175,http://writeassociate.com/test/Portal/inicio/I...,writeassociate.com,0,0,0,0,1,0,6,0,4,1,1,phishing


In [35]:
df11.shape

(450176, 14)

In [36]:
df33 = extract_features(df3)  #Extracting the featurs of the third dataset

In [37]:
df33.head() #Dsiplaying the first five entries of the dataset that contains the features of Datatset #3

Unnamed: 0,URL,Domain,IP_Address,Prefix/Suffix_in_Domain,Tiny_URL,@_Symbol,URL_Length,Http/https_in_Domain,Depth_Of_URL,Redirection,Num_of_Dots,Num_of_Hyphens,Num_of_Underscore,Label
0,br-icloud.com.br,,0,0,0,0,0,0,1,0,2,1,0,phishing
1,mp3raid.com/music/krizz_kaliko.html,,0,0,0,0,0,0,3,0,2,0,1,benign
2,bopsecrets.org/rexroth/cr/1.htm,,0,0,0,0,0,0,4,0,2,0,0,benign
5,http://buzzfil.net/m/show-art/ils-etaient-loin...,buzzfil.net,0,0,0,0,1,0,3,0,2,16,0,benign
6,espn.go.com/nba/player/_/id/3457/brandon-rush,,0,0,0,0,0,0,7,0,2,1,1,benign


In [38]:
df33.shape

(522214, 14)

In [None]:
def extract_features2(df2):
    """
    This function extracts the features of a URLs that are in a Dataframe. 
    This is in the case that the Dataframe that is passed already contains pre-existing features which
    do not need to be extracted. 

    Parameters:
    -----------
        df (Dataframe): Dataframe with a 'url' named column which contains URLs.

    Returns:
    --------
        pd.DataFrame (Dataframe): Returns a new Dataframe that contains the features of the processed Dataframe that was passed.
    """
    return pd.DataFrame({
        'URL': df2['url'],
        'Domain': df2['url'].apply(get_domain_name),
        'IP_Address': df2['ip'],
        'Prefix/Suffix_in_Domain': df2['prefix_suffix'],
        'Tiny_URL': df2['shortening_service'],
        '@_Symbol': df2['url'].apply(contains_at_symbol).astype(int),
        'URL_Length': df2['url'].apply(get_url_length),
        'Http/https_in_Domain': df2['url'].apply(is_protocol_in_domain).astype(int),
        'Depth_Of_URL': df2['url'].apply(depth_of_url),
        'Redirection': df2['url'].apply(redirection).astype(int),
        'Google_Index': df2['google_index'],
        'WHOIS_Domain': df2['whois_registered_domain'],
        'DNS_Record': df2['dns_record'],
        'Registration_Length': df2['domain_registration_length'],
        'Num_of_Dots': df2['url'].str.count('\.'),
        'Num_of_Hyphens': df2['url'].str.count('-'),
        'Num_of_Underscore': df2['url'].str.count('_'),
        'Label': df2['status']
    })
    

In [27]:
df22 = extract_features2(df2) #Extracting the featurs of the third dataset

In [28]:
df22.head() #Dsiplaying the first five entries of the dataset that contains the features of Datatset #2

Unnamed: 0,URL,Domain,IP_Address,Prefix/Suffix_in_Domain,Tiny_URL,@_Symbol,URL_Length,Http/https_in_Domain,Depth_Of_URL,Redirection,Google_Index,WHOIS_Domain,DNS_Record,Registration_Length,Num_of_Dots,Num_of_Hyphens,Num_of_Underscore,Label
0,http://www.crestonwood.com/router.php,crestonwood.com,0,0,0,0,0,0,1,0,1,0,1,45,3,0,0,legitimate
1,http://shadetreetechnology.com/V4/validation/a...,shadetreetechnology.com,1,0,0,0,1,0,3,0,1,0,0,77,1,0,0,phishing
2,https://support-appleld.com.secureupdate.duila...,support-appleld.com.secureupdate.duilawyeryork...,1,1,0,0,1,0,2,0,1,0,0,14,4,1,2,phishing
3,http://rgipt.ac.in,rgipt.ac.in,0,0,0,0,0,0,0,0,0,0,0,62,2,0,0,legitimate
4,http://www.iracing.com/tracks/gateway-motorspo...,iracing.com,0,0,0,0,1,0,2,0,0,0,0,224,2,2,0,legitimate


In [29]:
df22.shape

(11430, 18)

In [41]:
#Converting the status of a URL to binary 
df11['Label'] = df11['Label'].replace({
    'phishing': 1,
    'legitimate': 0
})

df11.head()

Unnamed: 0,URL,Domain,IP_Address,Prefix/Suffix_in_Domain,Tiny_URL,@_Symbol,URL_Length,Http/https_in_Domain,Depth_Of_URL,Redirection,Num_of_Dots,Num_of_Hyphens,Num_of_Underscore,Label
0,https://www.google.com,google.com,0,0,0,0,0,0,0,0,2,0,0,0
1,https://www.youtube.com,youtube.com,0,0,0,0,0,0,0,0,2,0,0,0
2,https://www.facebook.com,facebook.com,0,0,0,0,0,0,0,0,2,0,0,0
3,https://www.baidu.com,baidu.com,0,0,0,0,0,0,0,0,2,0,0,0
4,https://www.wikipedia.org,wikipedia.org,0,0,0,0,0,0,0,0,2,0,0,0


In [42]:
#Converting the status of a URL to binary 
df22['Label'] = df22['Label'].replace({
    'phishing': 1,
    'legitimate': 0
})

df22.head()

Unnamed: 0,URL,Domain,IP_Address,Prefix/Suffix_in_Domain,Tiny_URL,@_Symbol,URL_Length,Http/https_in_Domain,Depth_Of_URL,Redirection,Google_Index,WHOIS_Domain,DNS_Record,Registration_Length,Num_of_Dots,Num_of_Hyphens,Num_of_Underscore,Label
0,http://www.crestonwood.com/router.php,crestonwood.com,0,0,0,0,0,0,1,0,1,0,1,45,3,0,0,0
1,http://shadetreetechnology.com/V4/validation/a...,shadetreetechnology.com,1,0,0,0,1,0,3,0,1,0,0,77,1,0,0,1
2,https://support-appleld.com.secureupdate.duila...,support-appleld.com.secureupdate.duilawyeryork...,1,1,0,0,1,0,2,0,1,0,0,14,4,1,2,1
3,http://rgipt.ac.in,rgipt.ac.in,0,0,0,0,0,0,0,0,0,0,0,62,2,0,0,0
4,http://www.iracing.com/tracks/gateway-motorspo...,iracing.com,0,0,0,0,1,0,2,0,0,0,0,224,2,2,0,0


In [43]:
#Converting the status of a URL to binary 
df33['Label'] = df33['Label'].replace({
    'phishing': 1,
    'benign': 0
})

df33.head()

Unnamed: 0,URL,Domain,IP_Address,Prefix/Suffix_in_Domain,Tiny_URL,@_Symbol,URL_Length,Http/https_in_Domain,Depth_Of_URL,Redirection,Num_of_Dots,Num_of_Hyphens,Num_of_Underscore,Label
0,br-icloud.com.br,,0,0,0,0,0,0,1,0,2,1,0,1
1,mp3raid.com/music/krizz_kaliko.html,,0,0,0,0,0,0,3,0,2,0,1,0
2,bopsecrets.org/rexroth/cr/1.htm,,0,0,0,0,0,0,4,0,2,0,0,0
5,http://buzzfil.net/m/show-art/ils-etaient-loin...,buzzfil.net,0,0,0,0,1,0,3,0,2,16,0,0
6,espn.go.com/nba/player/_/id/3457/brandon-rush,,0,0,0,0,0,0,7,0,2,1,1,0


In [44]:
df11.to_csv('Dataset #1 After Feature Extraction.csv', index=False) #Exporting the dataframe to CSV file format
df22.to_csv('Dataset#2 After Feature Extraction.csv', index=False) #Exporting the dataframe to CSV file format
df33.to_csv('Dataset#3 After Feature Extraction.csv', index=False) #Exporting the dataframe to CSV file format
