# Exploratory Data Analysis (EDA)

This notebook explores the spam dataset to understand its structure, class distribution, and text characteristics.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Make the directory one level above the current working directory importable,
# so that project code living there (e.g. a `src` package) can be imported.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)

%matplotlib inline

## 1. Load Data

In [None]:
data_path = '../data/raw/spam.csv'

# Try strict UTF-8 first: a file stored in another encoding then fails loudly
# instead of being silently mis-decoded. Latin-1 maps every possible byte to a
# character and so never raises a decode error, which is why it has to be the
# fallback rather than the first attempt.
# Only UnicodeDecodeError is caught, so a missing file or a malformed CSV
# surfaces with its own traceback instead of being masked by a blind retry.
try:
    df = pd.read_csv(data_path, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(data_path, encoding='latin-1')

df.head()

## 2. Data Cleaning (Quick Look)

In [None]:
# Map the raw column headers onto descriptive names, then keep only those two
# columns; any other column present in the frame is dropped.
column_names = {'v1': 'label', 'v2': 'text'}
df = df.rename(columns=column_names)[list(column_names.values())]
df.head()

## 3. Class Distribution

In [None]:
# Bar chart of how many messages carry each label.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='label', ax=ax)
ax.set_title('Distribution of Spam vs Ham')
plt.show()

# Share of each label as a fraction of all rows (normalize=True).
class_shares = df['label'].value_counts(normalize=True)
print(class_shares)

## 4. Text Length Analysis

In [None]:
# Length of each message in characters. The vectorised `.str.len()` accessor
# yields NaN for a missing message, whereas `.apply(len)` raises TypeError on
# one; for ordinary strings both give the same value.
df['length'] = df['text'].str.len()

# Overlaid histograms (with KDE curves) of message length, one per label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='length', hue='label', bins=50, kde=True, ax=ax)
ax.set_title('Message Length Distribution by Class')
ax.set_xlabel('Message length (characters)')
plt.show()