# Exploratory Data Analysis
Load the dataset and show concise summaries for classification tasks.

In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer

# Location of the raw CSV consumed by every cell below.
DATA_PATH = 'data/raw/dataset.csv'

# Read the whole file into a DataFrame, then preview its first five rows
# (the bare expression is rendered as the cell's output).
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(n=5)


In [ ]:
# Class distribution plot
# Counts the rows per class in `label_col`, draws the counts as a bar chart,
# and finally displays the raw counts table as the cell's output.
label_col = 'label'
if label_col not in df.columns:
    # Fail early, naming both the missing column and the file it was expected in.
    raise ValueError(f"Label column '{label_col}' not found in {DATA_PATH}")

# value_counts() orders classes by descending frequency; with pandas' default
# dropna=True, rows whose label is missing are not counted.
counts = df[label_col].value_counts()

plt.figure(figsize=(6, 4))
# Labels are cast to str before plotting — presumably so numeric class ids are
# drawn as discrete categories rather than as a numeric axis.
sns.barplot(x=counts.index.astype(str), y=counts.values, palette='muted')
plt.title('Class distribution')
plt.xlabel('Class')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

# Bare expression: rendered by the notebook as the per-class counts table.
counts


In [ ]:
# Feature summaries: text or tabular
# If the dataset has a `text_col` column, report character-length statistics
# and the most frequent non-stop-word tokens; otherwise print generic tabular
# summaries (describe() plus per-column missing-value counts).
text_col = 'text'
if text_col in df.columns:
    texts = df[text_col].dropna().astype(str)
    if texts.empty:
        # With no documents, mean/median are NaN (int(NaN) raises) and the
        # vectorizer has nothing to fit, so report and skip the summary.
        print(f"Column '{text_col}' has no non-missing values to summarize.")
    else:
        lens = texts.str.len()  # length in characters, not tokens
        print('Text length — mean:', round(lens.mean(), 1), 'median:', int(lens.median()))

        # Keep only the 20 most frequent tokens after English stop-word removal.
        vec = CountVectorizer(stop_words='english', max_features=20)
        try:
            X = vec.fit_transform(texts)
        except ValueError:
            # scikit-learn raises ValueError when the vocabulary ends up empty
            # (e.g. every document is blank or consists only of stop words).
            print('\nTop tokens:')
            print('(no tokens left after stop-word removal)')
        else:
            toks = vec.get_feature_names_out()
            freqs = X.sum(axis=0).A1  # total occurrences of each token
            # The vocabulary comes back in alphabetical order, so sort by count
            # (descending) to make the listing an actual "top tokens" ranking.
            # The sort is stable, so ties stay in alphabetical order.
            top = sorted(
                ((tok, int(freq)) for tok, freq in zip(toks, freqs)),
                key=lambda pair: pair[1],
                reverse=True,
            )
            print('\nTop tokens:')
            print(top)
else:
    print(df.describe(include='all'))
    print('\nMissing values:')
    print(df.isna().sum())


## Insights

- The class distribution plot indicates whether classes are imbalanced. Strong imbalance means accuracy can be misleading, so precision, recall, and F1 should also be emphasized.
- If the dataset is imbalanced, use stratified splits and consider class weighting or resampling. For cybersecurity tasks, recall and F1 are often especially important for detecting minority attack/phishing classes.