# 01 - Exploratory Data Analysis

PhiUSIIL phishing dataset (tabular engineered URL features).

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Location of the raw CSV and the name of the target (class) column.
DATA_PATH = 'data/raw/dataset.csv'
LABEL_COL = 'label'

# Read the CSV into a DataFrame, then preview its first five rows
# (the trailing expression is what the notebook renders as cell output).
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(n=5)

In [None]:
# Class distribution: bar chart of per-class row counts, followed by the
# raw counts as the cell's displayed value.

# Fail fast with a clear message if the expected target column is absent.
if LABEL_COL not in df.columns:
    raise ValueError(f"Label column '{LABEL_COL}' not found in {DATA_PATH}")

counts = df[LABEL_COL].value_counts()

# Object-oriented matplotlib API: one figure holding a single axes.
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(counts.index.astype(str), counts.values)  # labels as strings -> categorical x axis
ax.set_title('Class distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

counts

In [None]:
# Tabular overview: dimensions, per-column dtypes, per-column null counts,
# and descriptive statistics for the numeric columns.
print('Shape:', df.shape)
print('\nDtypes:', df.dtypes, sep='\n')

missing_per_column = df.isna().sum()
print('\nMissing values:', missing_per_column, sep='\n')

# The header is printed before describe() runs so the output order matches
# even if describe() raises (e.g. when there are no numeric columns).
numeric_columns = df.select_dtypes(include='number')
print('\nNumeric describe():')
print(numeric_columns.describe())

In [None]:
# Optional URL samples (if column exists)
# Jupyter auto-displays only the last *top-level* expression of a cell; a bare
# expression nested inside an `if` body is evaluated and then discarded, so the
# sample must be printed explicitly to show up in the cell output.
if 'URL' in df.columns:
    print(df['URL'].dropna().head())

**Insights**

Class balance should be checked early because heavy skew can inflate apparent accuracy for the majority class.
Very strong baseline scores on engineered URL features can indicate a curated dataset with highly separable patterns.
Real-world phishing detection can still perform worse due to drift, adversarial adaptation, and domain shift.