# Feature extraction

Now, let's extract features for our model. We will extract the following features from the URLs:

- **Domain**: Domain name extracted from the URL.
- **DomainCharContinuationRate**: Sum of the lengths of the longest unbroken runs of letters, of digits, and of other characters in the domain, divided by the domain's length.
- **DomainLength**: Number of characters in the domain name.
- **IsIP**: Indicates if the hostname is an IP address.
- **TLD**: TLD (Top Level Domain) is the last part of the domain name, such as .com or .edu.
- **TLDLength**: Number of characters in the TLD.
- **NoOfSubDomain**: Number of subdomains in the URL.
- **NoOfLetters**: Number of letters in the domain.
- **LetterRatio**: Ratio of letters in the domain.
- **NoOfDigits**: Number of digits in the domain.
- **DigitRatio**: Ratio of digits in the domain.
- **IsHTTPS**: Indicates whether the URL uses secured HTTPS (1) rather than unsecured HTTP, the plain hypertext transfer protocol (0).

**Notes**:

- We need to differentiate between a URL and a domain. For example, in the URL `https://www.google.com/search?q=python`, the domain is `www.google.com`; this is also called the hostname. The term "domain" is also commonly used to refer to the base/root/apex domain (`google.com`).
- Due to disparities in the dataset, some features may not be applicable to all URLs. For example, most URLs in the dataset do not contain query parameters, which makes the `NoOfEqualsInURL`, `NoOfQMarkInURL`, and `NoOfAmpersandInURL` features less useful.
- Boolean features are converted to numerical values (0=False; 1=True).

## Utility functions

In [None]:
import re
from urllib.parse import urlparse

In [None]:
def is_ip(domain):
    """
    Tell whether a hostname has the shape of a dotted-quad IPv4 address.

    The whole string must consist of four dot-separated groups of one to
    three digits, optionally followed by `:port`. A hostname that merely
    contains such a sequence (e.g. `1.2.3.4.example.com`) is a domain name,
    not an IP address, and yields 0.

    Octet ranges are deliberately not validated, so `999.999.999.999` still
    yields 1 even though it is not a valid IP address.

    Parameters:
    - domain (str): Hostname to test, optionally with a trailing `:port`.

    Returns:
    - int: 1 if the hostname looks like an IPv4 address, 0 otherwise.
    """
    # fullmatch anchors the pattern to the entire hostname; an unanchored
    # search would also accept domains that only embed four numeric labels.
    ip_pattern = r'\d{1,3}(?:\.\d{1,3}){3}(?::\d+)?'
    return int(re.fullmatch(ip_pattern, domain) is not None)


# Expected output: 0, 1, 1. The last host is not a valid IP address, but it
# still has the dotted-quad shape that is_ip looks for.
for sample_host in ('www.google.com', '192.168.1.1', '999.999.999.999'):
    print(is_ip(sample_host))

0
1
1


In [None]:
def no_of_subdomain(domain):
    """
    Count the subdomain labels of a hostname.

    Everything to the left of the apex domain and the TLD is counted, so
    `docs.python.org` has 1 subdomain and `google.com` has 0.

    Parameters:
    - domain (str): Hostname to inspect.

    Returns:
    - int: Number of subdomain labels; never negative. 0 for IP addresses.
    """
    # IP addresses are not domain names, thus they don't have subdomains.
    # Subdomains are part of the DNS hierarchy and are only used in domain names.
    if is_ip(domain):
        return 0

    labels = domain.split('.')
    # Subtract apex domain and TLD. Clamp at 0 so a single-label host such as
    # 'localhost' reports no subdomains instead of -1.
    # NOTE(review): exactly two labels are subtracted, so a multi-label public
    # suffix (e.g. 'co.uk') is counted as one subdomain.
    return max(0, len(labels) - 2)


# Expected output: 1, 0, 0 (an IP address never has subdomains).
for sample_host in ('docs.python.org', 'google.com', '192.168.1.1'):
    print(no_of_subdomain(sample_host))

1
0
0


In [None]:
# https://github.com/arvindbitm/PhiUSIIL/blob/main/CharConRate.ipynb

def char_con_rate(url):
    """
    Compute the character continuation rate of a string.

    Characters are split into three classes: letters (`str.isalpha`),
    digits (`str.isdigit`) and everything else. The rate is the sum of the
    longest unbroken run of each class divided by the string length.

    Parameters:
    - url (str): The string to analyse.

    Returns:
    - float: The continuation rate; 0.0 for an empty string.

    Example:
    >>> char_con_rate("ab1cd-") == (2 + 1 + 1) / 6
    True
    """
    # An empty string has no runs; return 0.0 instead of dividing by zero.
    if not url:
        return 0.0

    longest = {'letter': 0, 'digit': 0, 'other': 0}
    run_kind, run_len = None, 0

    for ch in url:
        # isalpha is tested before isdigit, so letters take precedence.
        if ch.isalpha():
            kind = 'letter'
        elif ch.isdigit():
            kind = 'digit'
        else:
            kind = 'other'

        # Extend the current run, or start a new one when the class changes.
        if kind == run_kind:
            run_len += 1
        else:
            run_kind, run_len = kind, 1

        if run_len > longest[kind]:
            longest[kind] = run_len

    return sum(longest.values()) / len(url)

In [None]:
def no_of_foo_in(string, pattern):
    """
    Count the non-overlapping matches of a regular expression in a string.

    Parameters:
    - string (str): The text that is scanned.
    - pattern (str): The regular expression to look for.

    Returns:
    - int: How many times the pattern matched.

    Example:
    >>> no_of_foo_in("https://facebook.com?param=value", r"=")
    1
    """
    # Walk the matches lazily and tally them instead of building a list.
    return sum(1 for _ in re.finditer(pattern, string))


# Expected output: 0, then 1 (only the second URL contains a '?').
for sample_url in ('https://facebook.com', 'https://facebook.com?param=value'):
    print(no_of_foo_in(sample_url, r'\?'))

0
1


In [None]:
def extract_domain(url):
    """
    Extract the hostname from a URL, with or without a scheme.

    Credentials (`user:password@`) and the port (`:8080`) are removed in
    every case, so `http://example.com:8080/x` and `example.com:8080/x` both
    give `example.com`. The case of the hostname is preserved.

    Parameters:
    - url (str): The URL to parse.

    Returns:
    - str: The hostname, or an empty string when none can be found.
    """
    netloc = urlparse(url).netloc

    # Handle URLs without schemes: urlparse only fills netloc after '//', so
    # take everything before the first path, query or fragment delimiter.
    if not netloc:
        netloc = url
        for delimiter in '/?#':
            netloc = netloc.partition(delimiter)[0]

    # Drop credentials; the host is whatever follows the last '@'.
    host = netloc.rpartition('@')[2]

    # A bracketed IPv6 literal contains ':' itself, so keep the bracketed part
    # whole instead of cutting at the first colon.
    if host.startswith('['):
        literal, closing, _ = host.partition(']')
        return literal + closing

    # Handle URLs with ports
    return host.partition(':')[0]

In [None]:
def extract_feature(url):
    """
    Build the dictionary of lexical features for a single URL.

    For IP hosts the letter and digit statistics are computed over the whole
    URL, and the TLD-related features are None. For domain names they are
    computed over the root label (the label just before the TLD). A host
    without any dot is treated as its own root label with no TLD.

    Parameters:
    - url (str): The URL to analyse, with or without a scheme.

    Returns:
    - dict: Keys `domain`, `domain_char_continuation_rate`, `domain_length`,
      `is_ip`, `tld`, `tld_length`, `no_of_subdomain`, `no_of_letters`,
      `letter_ratio`, `no_of_digits`, `digit_ratio` and `is_https`.
      Boolean features are encoded as 0/1.

    Raises:
    - ValueError: If no hostname can be extracted from the URL, or if the
      label before the TLD is empty (e.g. `.com`).
    """
    domain = extract_domain(url)
    if not domain:
        raise ValueError(f"Could not extract a hostname from URL: {url!r}")

    is_hostname_ip = is_ip(domain)
    # Scheme names are case-insensitive, so compare on a lowered copy.
    is_https = url.lower().startswith('https://')

    if is_hostname_ip:
        # NOTE(review): this branch counts letters/digits over the full URL,
        # while the domain-name branch below only looks at the root label.
        letter_count = no_of_foo_in(url, r'[a-zA-Z]')
        digit_count = no_of_foo_in(url, r'\d')
        return {
            'domain': domain,
            'domain_char_continuation_rate': None,
            'domain_length': len(domain),
            'is_ip': int(is_hostname_ip),
            'tld': None,
            'tld_length': None,
            'no_of_subdomain': 0,
            'no_of_letters': letter_count,
            'letter_ratio': letter_count / len(url),
            'no_of_digits': digit_count,
            'digit_ratio': digit_count / len(url),
            # An IP host can be served over HTTPS too, so use the real scheme.
            'is_https': int(is_https),
        }

    labels = domain.split('.')
    if len(labels) >= 2:
        # Extract the root domain name without the TLD
        root_domain, tld = labels[-2], labels[-1]
    else:
        # Single-label host (e.g. 'localhost'): there is no TLD to report.
        root_domain, tld = labels[0], None

    if not root_domain:
        raise ValueError(f"Empty root domain label in hostname: {domain!r}")

    letter_count = no_of_foo_in(root_domain, r'[a-zA-Z]')
    digit_count = no_of_foo_in(root_domain, r'\d')

    return {
        'domain': domain,
        'domain_char_continuation_rate': char_con_rate(root_domain),
        'domain_length': len(domain),
        'is_ip': int(is_hostname_ip),
        'tld': tld,
        'tld_length': len(tld) if tld is not None else None,
        # Never report a negative count for hosts with fewer than two labels.
        'no_of_subdomain': max(0, no_of_subdomain(domain)),
        'no_of_letters': letter_count,
        'letter_ratio': letter_count / len(root_domain),
        'no_of_digits': digit_count,
        'digit_ratio': digit_count / len(root_domain),
        'is_https': int(is_https),
    }

In [None]:
# Scheme-less URL with an explicit port and a path.
extract_feature('digitool.amherst.edu:8881/dtl_publish/119/377562.html')

8881


IndexError: list index out of range

In [None]:
# Full HTTPS URL with a `www` subdomain, a path and a query string.
extract_feature('https://www.google.com/search?q=alan+turing')

{'domain': 'www.google.com',
 'domain_char_continuation_rate': 1.0,
 'domain_length': 14,
 'is_ip': 0,
 'tld': 'com',
 'tld_length': 3,
 'no_of_subdomain': 1,
 'no_of_letters': 6,
 'letter_ratio': 1.0,
 'no_of_digits': 0,
 'digit_ratio': 0.0,
 'is_https': 1}

In [None]:
# Bare domain: no scheme, port or path.
extract_feature('example.com')

{'domain': 'example.com',
 'domain_char_continuation_rate': 1.0,
 'domain_length': 11,
 'is_ip': 0,
 'tld': 'com',
 'tld_length': 3,
 'no_of_subdomain': 0,
 'no_of_letters': 7,
 'letter_ratio': 1.0,
 'no_of_digits': 0,
 'digit_ratio': 0.0,
 'is_https': 0}

## Load the dataset

In [None]:
import pandas as pd
import glob

In [None]:
# Directory (relative path) that is searched for the raw CSV chunks.
folder_path = "../data/raw"

In [None]:
# Paths of all files in folder_path whose name matches is_url_live_chunk_*.csv.
csv_files = glob.glob(folder_path + "/is_url_live_chunk_*.csv")

In [None]:
# glob returns paths in arbitrary order; sort them so that the row order of
# the combined frame is reproducible from one run to the next.
df_list = [pd.read_csv(file, index_col='id') for file in sorted(csv_files)]

# Fail with an explicit message instead of pd.concat's generic error.
if not df_list:
    raise FileNotFoundError("No input CSV files were found (csv_files is empty)")

# Keep the `id` index read from each chunk rather than renumbering the rows.
df = pd.concat(df_list, ignore_index=False)

In [None]:
# Preview the first rows of the combined dataset.
df.head()

Unnamed: 0_level_0,source_id,url,is_phishing,is_online,created_at,updated_at
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
11,1,https://www.religionenlibertad.com,False,True,2024-11-15 12:44:55.549064,2024-11-15 12:49:03.015624
12,1,http://www.teramill.com,True,True,2024-11-15 12:44:55.549064,2024-11-15 12:49:03.570896
13,1,https://www.socialpolicy.org,False,True,2024-11-15 12:44:55.549064,2024-11-15 12:49:04.900914
14,1,https://www.aoh61.com,False,False,2024-11-15 12:44:55.549064,2024-11-15 12:49:12.032484
15,1,https://www.bulgariaski.com,False,True,2024-11-15 12:44:55.549064,2024-11-15 12:49:13.439163


## Extract features

In [None]:
def apply_extract_feature(row):
    """
    Augment one dataset row with the features extracted from its URL.

    Parameters:
    - row (pd.Series): A row holding at least a `url` entry.

    Returns:
    - pd.Series: The original row values plus the extracted features. When
      extraction fails, the error is printed and only the original values
      are returned.
    """
    # Start from the untouched row so a failure still yields usable output.
    new_row = row.to_dict()
    try:
        new_row.update(extract_feature(row['url']))
    except Exception as e:
        # Best effort: report the problematic URL and keep processing.
        print(f"Error processing URL: {row['url']}, {e}")
    return pd.Series(new_row)


# Run the feature extraction row by row (axis=1) over the whole dataset.
processed_df = df.apply(apply_extract_feature, axis=1)

In [None]:
# Remove the bookkeeping columns that are not wanted in the processed dataset
processed_df.drop(
    columns=['source_id', 'is_online', 'created_at', 'updated_at'],
    inplace=True,
)

In [None]:
# Convert boolean columns to integers (0=False; 1=True), matching the
# encoding used for the extracted boolean features.
processed_df['is_phishing'] = processed_df['is_phishing'].astype(int)

In [None]:
# Preview the first rows of the processed dataset.
processed_df.head()

Unnamed: 0_level_0,url,is_phishing,domain,domain_char_continuation_rate,domain_length,is_ip,tld,tld_length,no_of_subdomain,no_of_letters,letter_ratio,no_of_digits,digit_ratio,is_https
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
11,https://www.religionenlibertad.com,0,www.religionenlibertad.com,1.0,26,0,com,3.0,1,18,1.0,0,0.0,1
12,http://www.teramill.com,1,www.teramill.com,1.0,16,0,com,3.0,1,8,1.0,0,0.0,0
13,https://www.socialpolicy.org,0,www.socialpolicy.org,1.0,20,0,org,3.0,1,12,1.0,0,0.0,1
14,https://www.aoh61.com,0,www.aoh61.com,1.0,13,0,com,3.0,1,3,0.6,2,0.4,1
15,https://www.bulgariaski.com,0,www.bulgariaski.com,1.0,19,0,com,3.0,1,11,1.0,0,0.0,1


In [None]:
from datetime import datetime

# Build a timestamped file name. The time part uses '-' rather than ':'
# because ':' is not allowed in file names on Windows.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_file = f'../data/data_{timestamp}.csv'

# NOTE(review): index=False leaves the `id` index out of the CSV; confirm
# that dropping it is intended.
processed_df.to_csv(output_file, index=False)