<a href="https://www.kaggle.com/code/shallykandoi/cs361-eda?scriptVersionId=168159087" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# **URL Feature Extraction**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.options.mode.chained_assignment = None

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the phishing dataset from the Kaggle input directory
dataset_path = "/kaggle/input/phishing-dataset/Phishing_dataset_02.csv"
data = pd.read_csv(dataset_path)

# Show how many rows carry each 'status' value (phishing vs. legitimate)
status_counts = data['status'].value_counts()
print(status_counts)

# Show the names of all columns in the loaded frame
print(data.columns)

data.head()

In [None]:
# Display summary statistics for the freshly loaded dataset
data.describe()

In [None]:
# Display the per-column overview report for the freshly loaded dataset
data.info()

In [None]:
# Number of missing values in each column of the raw dataset
data.isna().sum()

In [None]:
# For every column, list its distinct values when there are at most ten of
# them; otherwise only report how many distinct values it has.
for column_name in data.columns:
    unique_value_list = data[column_name].unique()
    if len(unique_value_list) <= 10:
        print(f'{column_name} contains:\t\t\t{unique_value_list}')
    else:
        print(f'{column_name} has {data[column_name].nunique()} unique values')


In [None]:
# Drop unnecessary columns from the dataframe, selecting them by position.
# NOTE: every slice below is evaluated against `data.columns` as it stands
# *after* the preceding drop, so the four statements are order-dependent and
# the indices do not refer to positions in the originally loaded file.
data = data.drop(data.columns[1:66], axis=1)
data = data.drop(data.columns[2:3], axis=1)
data = data.drop(data.columns[4:6], axis=1)
data = data.drop(data.columns[10:12], axis=1)

# Print the number of rows per 'status' value (phishing vs. legitimate)
print(data['status'].value_counts())

# Print the names of the columns that remain
print(data.columns)

data.head()

In [None]:
# Remove the 'domain_with_copyright' column from the working frame
data = data.drop(columns='domain_with_copyright')

## Feature Extraction Functions

In [None]:
# Import headers

import re
from urllib.parse import *

In [None]:
# Embedded Domain: flags hostnames in which a dot-separated label looks like a
# misspelling of a well-known site name (e.g. "facehook" imitating "facebook").

# Site names that look-alike hostnames are compared against.
_WELL_KNOWN_NAMES = (
    'google', 'facebook', 'twitter', 'linkedin', 'youtube', 'instagram',
    'pinterest', 'amazon', 'snapchat', 'reddit', 'flickr', 'whatsapp',
    'quora', 'vimeo', 'periscope', 'vine', 'meetup', 'tagged', 'askfm',
    'meetme', 'myspace', 'stumbleupon', 'delicious', 'digg', 'slashdot',
    'fark', 'newsvine', 'foursquare', 'yelp', 'tripadvisor', 'zomato',
    'opentable',
)


def _is_one_edit_away(candidate, target):
    """Return True if the strings differ by exactly one insert, delete or substitution."""
    if candidate == target:
        return False
    len_c, len_t = len(candidate), len(target)
    if abs(len_c - len_t) > 1:
        return False

    # Skip the shared prefix; `i` ends on the first differing position.
    i = 0
    while i < min(len_c, len_t) and candidate[i] == target[i]:
        i += 1

    if len_c == len_t:
        # Substitution: everything after the differing character must agree.
        return candidate[i + 1:] == target[i + 1:]
    if len_c > len_t:
        # Candidate has one extra character at position i.
        return candidate[i + 1:] == target[i:]
    # Candidate is missing the character at position i.
    return candidate[i:] == target[i + 1:]


def embedded_domain(url):
    """Detect a look-alike of a well-known site name in the URL's hostname.

    The network location of ``url`` is lower-cased and split on '.'; each
    label is compared with every entry of ``_WELL_KNOWN_NAMES``.

    The previous check compared the number of *distinct shared characters*
    with ``len(name) - 1``. That missed the documented "facehook" example and
    flagged unrelated hosts such as "apple" (three letters shared with "yelp").
    A label now counts as a look-alike only when it is exactly one edit
    (insert, delete or substitute one character) away from a known name.
    Genuine one-letter neighbours of short names (e.g. "wine" vs "vine") are
    still flagged; that is inherent to the heuristic.

    Args:
        url: URL string. The scheme is required, otherwise ``urlparse``
            yields an empty network location and the result is -1.

    Returns:
        1 if some hostname label is one edit away from a well-known name,
        otherwise -1 (an exact match of the name also yields -1).
    """
    domain_parts = urlparse(url).netloc.lower().split('.')

    for domain_part in domain_parts:
        for well_known_name in _WELL_KNOWN_NAMES:
            if _is_one_edit_away(domain_part, well_known_name):
                return 1
    return -1


# Example usage
url1 = "http://www.google.com"
url2 = "http://www.facehook.com"

print(embedded_domain(url1))  # -1: exact name, not a look-alike
print(embedded_domain(url2))  # 1: "facehook" is one substitution from "facebook"

In [None]:
# IP Address: Attackers often employ IP address in the URL
# to disguise a webpage’s malicious nature, while legitimate
# websites almost always use domain names instead of IP
# addresses due to their easy memorability.

def having_ip_address(url):
    """Report whether ``url`` contains a dotted-quad IPv4 address.

    The whole URL string is scanned for four dot-separated groups of one to
    three digits. A candidate only counts as an IP address when every group
    is at most 255, so strings such as "999.999.999.999" are not mistaken
    for an address.

    Args:
        url: URL (or bare host) string to inspect.

    Returns:
        -1 if a valid dotted-quad IPv4 address is present, otherwise 1.
    """
    # Capture each octet so that its numeric range can be validated
    ip_address_pattern = r'\b(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})\b'

    for match in re.finditer(ip_address_pattern, url):
        if all(int(octet) <= 255 for octet in match.groups()):
            return -1
    return 1


# Example usage
url1 = "https://www.google.com"
url2 = "172.0.0.1"

print(having_ip_address(url1))
print(having_ip_address(url2))

In [None]:
# Number of dots in URL: Phishing pages tend to use more
# dots in their URLs than the legitimate sites.
def no_of_dots(url):
    """Return how many '.' characters occur in ``url``."""
    # Splitting on '.' yields one more piece than there are dots
    pieces = url.split('.')
    return len(pieces) - 1


# Example usage
url = "https://www.google.com"

print(no_of_dots(url))

In [None]:
# Count the total number of special characters in the URL
def count_special_chars(url):
    """Return the number of characters in ``url`` that are special characters.

    The special characters are the punctuation symbols listed in
    ``special_chars`` below; letters, digits and whitespace are not counted.
    """
    special_chars = frozenset(';!@#$%^&*()_+-={}[]|\\:"\'<>,.?/')
    return sum(character in special_chars for character in url)


# Example usage
url = 'http://www.google.com'

print(count_special_chars(url))

In [None]:
# Lexical features: The URL string is broken down into
# multiple tokens. Each token constitutes a binary feature.
# The delimiters to obtain the tokens are ‘/’, ‘?’, ‘.’, ‘=’, ‘ ’,
# ‘&’, and ‘-’

def extract_lexical_features(url):
    """Tokenise ``url`` and return a ``{token: 1}`` presence dictionary.

    The URL is round-tripped through ``urlparse(...).geturl()`` and then split
    on the delimiters '/', '?', '.', '=', ' ', '&' and '-'. Empty tokens are
    discarded; every remaining token becomes a key whose value is 1.
    """
    delimiters = ['/', '?', '.', '=', ' ', '&', '-']
    split_pattern = '|'.join(re.escape(delimiter) for delimiter in delimiters)

    # Use the URL text as re-assembled by urlparse
    reassembled_url = urlparse(url).geturl()

    return {
        token: 1
        for token in re.split(split_pattern, reassembled_url)
        if token
    }


# Example usage

url = 'https://www.google.com/search?q=feature+extraction+from+url&oq=feature+extraction+from+url&aqs=chrome..69i57j0l7.10257j0j7&sourceid=chrome&ie=UTF-8'

print(extract_lexical_features(url))

In [None]:
# Number of sensitive words in URL: In (Garera et al., 2007),
# Garera et al summarized a set of eight sensitive words that
# frequently appear in phishing URLs. This is a numeric feature with a range of 0 to 8.

def no_of_sensitive_words(url):
    """Count how many of the eight sensitive words occur in ``url``.

    Each word contributes at most 1 regardless of how often it appears, so
    the result lies in the range 0..8. Matching is case-insensitive, so
    "Login" or "CONFIRM" are counted as well as their lower-case forms.

    Args:
        url: URL string to inspect.

    Returns:
        Number of distinct sensitive words found in the URL.
    """
    sensitive_words = ('confirm', 'account', 'banking', 'secure',
                       'ebayisapi', 'webscr', 'login', 'signin')
    # The word list is lower-case, so compare against a lower-cased URL
    lowered_url = url.lower()
    return sum(word in lowered_url for word in sensitive_words)


# Example usage
url1 = "https://www.google.com"
url2 = "http://www.abc.com/confirm"

print(no_of_sensitive_words(url1))
print(no_of_sensitive_words(url2))

In [None]:
# Out-of-Position Top Level Domain (TLD): Checks for
# unusual positioning of TLDs in the URL.

def out_of_position_tld(url):
    """Flag a TLD that appears anywhere but in the last dot-separated token.

    The full URL string is split on '.'; the final token is ignored and every
    earlier token is compared (exact match) with the known TLD list.

    Returns -1 when such a token is found, otherwise 1.
    """
    tld = ['com', 'org', 'net', 'edu', 'gov', 'in']

    # Everything except the final token is "out of position" for a TLD
    *leading_tokens, _final_token = url.split('.')
    if any(token in tld for token in leading_tokens):
        return -1
    return 1


# Example usage
url1 = 'http://www.google.com'
url2 = 'http://www.google.com.in'

print(out_of_position_tld(url1))
print(out_of_position_tld(url2))

In [None]:
# Check if the website is using HTTPS
def https_token(url):
    """Return 1 when the text before the first '//' is exactly 'https:', else -1."""
    leading_part, _separator, _remainder = url.partition('//')
    return 1 if leading_part == 'https:' else -1


# Example usage
url1 = 'http://www.google.com'
url2 = 'https://www.google.com'

print(https_token(url1))
print(https_token(url2))

In [None]:
# Get the length of the URL
def url_length(url):
    """Return the number of characters in ``url``."""
    character_count = len(url)
    return character_count

In [None]:
# If the URL is using Shortening Services, the value assigned to this feature is 1 (phishing) or else -1 (legitimate).

# listing shortening services
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"

# The bare alternation above matches *inside* ordinary host names: `t\.co`
# hits "pinterest.com" and "microsoft.com", `x\.co` hits "dropbox.com".
# Anchor every alternative so that it must be a complete host-name suffix:
#   - not preceded by a letter, digit or hyphen (start of a label), and
#   - not followed by a letter, digit, dot or hyphen (end of the host name).
# IGNORECASE lets mixed-case entries such as "BudURL.com" match lower-case URLs.
_shortening_services_pattern = re.compile(
    r"(?<![A-Za-z0-9-])(?:" + shortening_services + r")(?![A-Za-z0-9.-])",
    re.IGNORECASE,
)


# Checking for Shortening Services in URL (Tiny_URL)
def tinyURL(url):
    """Report whether ``url`` refers to a known URL-shortening service.

    Args:
        url: URL string to inspect.

    Returns:
        1 (phishing indicator) if a listed shortening-service domain occurs
        in the URL as a whole host name or host-name suffix, otherwise -1.
    """
    if _shortening_services_pattern.search(url):
        return 1
    return -1


# Example usage
url1 = 'http://www.google.com'
url2 = 'https://goo.gl'

print(tinyURL(url1))
print(tinyURL(url2))

In [None]:
# Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)
def prefixSuffix(url):
    """Return 1 (phishing) if the URL's network location contains '-', else -1 (legitimate)."""
    network_location = urlparse(url).netloc
    hyphen_count = network_location.count('-')
    return 1 if hyphen_count else -1


# Example usage
url1 = 'http://www.google.com'
url2 = 'http://www.go-ogle.com'

print(prefixSuffix(url1))
print(prefixSuffix(url2))

In [None]:
# Derive one new column per extractor from the 'url' column.
# Dict order is the order in which the columns are appended to the frame.
url_feature_extractors = {
    'embedded_domain': embedded_domain,
    'having_ip_address': having_ip_address,
    'no_of_dots': no_of_dots,
    'lexical_features': extract_lexical_features,
    'no_of_sensitive_words': no_of_sensitive_words,
    'out_of_position_tld': out_of_position_tld,
    'https_token': https_token,
    'url_length': url_length,
    'tinyURL': tinyURL,
    'prefixSuffix': prefixSuffix,
    'count_special_chars': count_special_chars,
}

for feature_name, extractor in url_feature_extractors.items():
    data.loc[:, feature_name] = data.loc[:, 'url'].apply(extractor)

data.head()

In [None]:
# Move 'status' (the actual category of each URL: legitimate or phishing) to
# the end of the frame: pop() removes the column, re-assigning appends it last.
data['status'] = data.pop('status')
data.head()

In [None]:
# Remove the per-URL token dictionaries from the working frame
data = data.drop(columns='lexical_features')

## Exploratory Data Analysis

In [None]:
# # Printing the value counts of each feature
# for col in data.columns[1:]:
#     print(data[col].value_counts())

In [None]:
# Printing the information related to each column - its data type, count, etc.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() emitted a stray "None" line.
data.info()

In [None]:
# Number of missing values in each column after feature extraction
data.isna().sum()

#### We observe that no column has null values

In [None]:
# If we had encountered columns with missing values

# # Drop rows with missing values
# data.dropna(inplace=True)
# # OR
# # impute missing values
# # data.fillna(method='ffill', inplace=True)

In [None]:
# Compute the summary statistics of each column, then print them
summary_statistics = data.describe()
print(summary_statistics)

#### We observe that the columns 'submit_email' and 'sfh' carry no information here, because every value in both columns is 0

In [None]:
# Remove the 'submit_email' and 'sfh' columns from the working frame
constant_columns = ['submit_email', 'sfh']
data = data.drop(columns=constant_columns)

In [None]:
# Display the (rows, columns) dimensions of the frame at this point
data.shape

### Data Visualization

In [None]:
# Importing the necessary libraries for data visualization
import matplotlib.pyplot as plt  # Library for creating plots and visualizations
import seaborn as sns  # Library for statistical data visualization

# Specifying that plots should be displayed inline in the notebook
%matplotlib inline

In [None]:
import warnings  # Library for handling warnings

# Silence warnings of category FutureWarning for the rest of the session
warnings.filterwarnings(action="ignore", category=FutureWarning)

In [None]:
# Show the current column names as a plain Python list
data.columns.tolist()

In [None]:
# Remove the 'url' column from the working frame
data = data.drop(columns='url')

In [None]:
# Numeric code assigned to each 'status' value
status_mapping = dict(legitimate=-1, phishing=1)

# Add a 'label' column holding the numeric code of each row's 'status'
status_series = data['status']
data['label'] = status_series.map(status_mapping)

#### Assigned a label of '1' to phishing urls as our goal is to correctly detect phishing urls

In [None]:
# Display the first few rows to see the current structure of the dataset,
# which at this point includes the newly added 'label' column
data.head()

### Categories of urls

In [None]:
# Tally how many rows carry each 'status' value and print the tally
label_counts = data['status'].value_counts()
print(label_counts)

# Draw the same tally as a count plot on its own 5x3-inch figure
fig, ax = plt.subplots(figsize=(5, 3))
sns.countplot(x='status', data=data, ax=ax)
ax.set_title('Types Of URLs')
ax.set_xlabel('Category Of URLs')
ax.set_ylabel('Count')
fig.tight_layout()  # keep the labels from overlapping
plt.show()


#### We observe that the dataset is already balanced

### Histograms of data distribution for each numerical column

In [None]:
# One 50-bin histogram per numerical column, laid out on a 15x15-inch figure
data.hist(figsize=(15, 15), bins=50)
plt.tight_layout()  # keep the subplot labels from overlapping
plt.show()

#### Observing that some columns have only discrete values whereas others have a continuous distribution

In [None]:
# One boolean per column of `data`, in column order: True if the column is categorical
is_categorical = []

# Names of the categorical and non-categorical columns
categorical_columns = []
non_categorical_columns = []

for col in data.columns:
    # A column with exactly two distinct values is treated as categorical
    if data[col].nunique() == 2:
        # Print the values of *this* column. The previous code printed
        # `unique_value_list`, a stale variable left over from an earlier
        # cell, so every line showed the same unrelated values.
        print(f'{col} contains:\t{data[col].unique()}')
        is_categorical.append(True)
        categorical_columns.append(col)
    else:
        # Any other number of distinct values: treated as non-categorical
        is_categorical.append(False)
        non_categorical_columns.append(col)

# print(is_categorical)

# Display the list of categorical columns
print("Categorical columns: ", categorical_columns)

# Display the list of non-categorical columns
print("Non-categorical columns: ", non_categorical_columns)


### Histogram for each non-categorical feature

In [None]:
# Histograms of the non-categorical features, two per row
num_plots = len(non_categorical_columns)
num_rows = (num_plots + 1) // 2  # ceiling of num_plots / 2

fig, axes = plt.subplots(num_rows, 2, figsize=(15, 3*num_rows))
all_axes = list(axes.flat)

# Fill the first `num_plots` axes, one feature each
for position, (column, ax) in enumerate(zip(non_categorical_columns, all_axes)):
    sns.histplot(data=data, x=column, bins=50, edgecolor='black', ax=ax)

    ax.set_xlabel(column, fontsize=18)
    ax.set_ylim(0, 1000)  # cap the y-axis at 1000
    ax.grid(True)

    # Only the left-hand plot of each row carries a y-axis label
    if position % 2 == 0:
        ax.set_ylabel("Number Of URLs", fontsize=18)
    else:
        ax.set_ylabel("")

# Discard any axes left over when the grid has more cells than features
for spare_ax in all_axes[num_plots:]:
    spare_ax.remove()

plt.tight_layout()
plt.show()

### Distribution of urls in each categorical feature

In [None]:
# Plotting distribution of urls in each categorical feature, four per row.
# 'label' and 'status' are the targets rather than features, so they are
# filtered out up front. The previous code skipped them inside the loop and
# then deleted axes[3][2] / axes[3][3] by hand, which only worked for a
# 4-row grid with the two targets in the last two positions; it also indexed
# categorical_columns[i] before checking that i was in range.
plot_columns = [col for col in categorical_columns if col not in ("label", "status")]

# Calculate the number of rows needed for subplots (at least one)
num_plots = len(plot_columns)
num_rows = max(1, (num_plots + 3) // 4)

# squeeze=False keeps `axes` two-dimensional even for a single row
fig, axes = plt.subplots(num_rows, 4, figsize=(20, 3*num_rows), squeeze=False)

for i, ax in enumerate(axes.flat):
    # Remove grid cells that have no feature to show
    if i >= num_plots:
        ax.remove()
        continue

    # Plot countplot for the current column
    sns.countplot(x=plot_columns[i], data=data, ax=ax)

    # Only the left-most plot of each row carries a y-axis label
    ax.set_xlabel(plot_columns[i], fontsize=18)
    if i % 4 == 0:
        ax.set_ylabel("Number Of Urls", fontsize=18)
    else:
        ax.set_ylabel("")

plt.tight_layout()
plt.show()

#### We observe that 'iframe', 'popup_window', 'onmouseover' and 'right_clic' are unlikely to be useful, because almost all URLs share the same value for each of these features.

### Heatmap

In [None]:
# Correlation heatmap of the feature columns.
# 'status' and 'label' are left out; the reduced frame is kept as `data2`.
data2 = data.drop(columns=['status', 'label'])
correlation_matrix = data2.corr()

# Draw on a dedicated 10x10-inch figure
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(correlation_matrix, square=True, linewidths=.5, ax=ax)

# Label both axes
ax.set_xlabel("Features", fontsize=18)
ax.set_ylabel("Features", fontsize=18)

plt.show()

### Box plots for the non-categorical features

In [None]:
# Creating box plots for the non-categorical features, split by 'status'.
# The grid used to be hard-coded as 4x3, which raises an error as soon as
# there are more than 12 features. The row count now grows with the feature
# count but never drops below 4, so the layout for up to 12 features is
# unchanged; the figure height scales with the number of rows.
boxplot_cols = 3
boxplot_rows = max(4, (len(non_categorical_columns) + boxplot_cols - 1) // boxplot_cols)

plt.figure(figsize=(15, 15 * boxplot_rows / 4))

for i, feature in enumerate(non_categorical_columns, 1):
    plt.subplot(boxplot_rows, boxplot_cols, i)

    sns.boxplot(x='status', y=feature, data=data)

    # Set title, x-label, and y-label for the subplot
    plt.title(feature.capitalize())  # Title with feature name
    plt.ylabel(feature)  # y-axis label
    plt.xlabel("")

plt.tight_layout()
plt.show()

### Bar plots for categorical features divided into subgroups

In [None]:
# Categorical columns:  ['login_form', 'iframe', 'popup_window', 'onmouseover', 'right_clic', 'whois_registered_domain', 'dns_record', 'google_index', 'embedded_domain', 'having_ip_address', 'out_of_position_tld', 'https_token', 'tinyURL', 'prefixSuffix']

# Categorical features, arranged into named subgroups
lexical_features = [
    'having_ip_address',
    'embedded_domain',
    'out_of_position_tld',
    'https_token',
    'tinyURL',
    'prefixSuffix',
]
page_based_features = ['dns_record', 'google_index']
domain_based_features = ['whois_registered_domain']
html_javascript_based_features = [
    'iframe',
    'popup_window',
    'onmouseover',
    'right_clic',
    'login_form',
]

# Group title -> features shown in that group's figure
feature_groups = {
    'Lexical': lexical_features,
    'Page-based': page_based_features,
    'Domain-based': domain_based_features,
    'HTML/JavaScript-based': html_javascript_based_features,
}

# One figure per group; each feature gets a count plot split by 'status'
# in the next free cell of a 2x3 grid
for group_name, group_features in feature_groups.items():
    plt.figure(figsize=(15, 8))

    for slot, feature in enumerate(group_features):
        plt.subplot(2, 3, slot + 1)

        sns.countplot(data=data, x=feature, hue='status')

        plt.title(feature.capitalize())
        plt.xlabel(feature)
        plt.ylabel('Count')
        plt.legend(title='Status')

    plt.suptitle(f'{group_name} Features', fontsize=16)
    # Leave room at the top of the figure for the group title
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

In [None]:
# Remove the numeric 'label' column again, then preview the frame
data = data.drop(columns='label')
data.head()

In [None]:
# Write the preprocessed dataset to a new CSV file, without the index column
output_path = '/kaggle/working/Preprocessed_data.csv'
data.to_csv(output_path, index=False)