# Exploratory Data Analysis

In this notebook, we perform exploratory data analysis (EDA) on the phishing URL dataset. The goal is to understand the data before modelling by visualizing the balance of the target label, the distribution of URL lengths, and the correlations between features.

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style: apply seaborn's white-grid theme to all figures.
# set_theme() is the documented entry point; sns.set() is only an alias for it.
sns.set_theme(style='whitegrid')

In [None]:
# Location of the processed phishing dataset, relative to this notebook
DATA_PATH = '../data/processed/phishing_data.csv'

# Read the CSV into the DataFrame used by the rest of the notebook
data = pd.read_csv(DATA_PATH)

# Preview the first rows as this cell's output
data.head()

In [None]:
# Bar chart of how many rows carry each value of the target column `label`
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='label', data=data, ax=ax)
ax.set_title('Distribution of Phishing vs Legitimate URLs')
ax.set_xlabel('Label')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Visualize the length of URLs
# Series.str.len() is vectorized and yields NaN for a missing URL, whereas
# .apply(len) raises TypeError as soon as it meets a NaN entry.
# The column is still stored on `data`, so any later use of `data`
# (e.g. a correlation over its columns) continues to see `url_length`.
data['url_length'] = data['url'].str.len()

fig, ax = plt.subplots(figsize=(10, 6))
# Plot only rows that actually have a URL length.
sns.histplot(data['url_length'].dropna(), bins=30, kde=True, ax=ax)
ax.set_title('Distribution of URL Lengths')
ax.set_xlabel('Length of URL')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Correlation heatmap
# Correlation is only defined for numeric data. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# on text columns instead (presumably `url` here holds raw strings), so the
# numeric and boolean columns are selected explicitly. select_dtypes() works
# on older pandas versions as well.
numeric_features = data.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_features.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()