# 01 - Exploratory Data Analysis

Load the dataset and produce concise summaries of the class balance and input features ahead of a classification task.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

# Column names and dataset location used by the cells below.
TEXT_COL = 'text'
LABEL_COL = 'label'
DATA_PATH = 'data/raw/dataset.csv'

# Read the whole CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(n=5)

In [None]:
# Class distribution plot
# Fail fast with a clear message when the configured label column is absent.
if LABEL_COL not in df.columns:
    raise ValueError(f"Label column '{LABEL_COL}' not found in {DATA_PATH}")

# Number of rows per class.
counts = df[LABEL_COL].value_counts()
# Cast the class labels to strings before using them as bar positions.
class_names = counts.index.astype(str)

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(class_names, counts.values)
ax.set_title('Class distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
# Tilt the tick labels so longer class names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

# Keep the raw counts as the cell's final expression so the notebook displays them.
counts

In [None]:
# Feature summaries: text or tabular
# With a text column present, report character-length statistics and the most
# frequent non-stop-word tokens; otherwise print a generic per-column summary
# together with missing-value counts.
if TEXT_COL in df.columns:
    # Drop missing entries before casting so NaN is not turned into the string 'nan'.
    texts = df[TEXT_COL].dropna().astype(str)
    if texts.empty:
        # With no rows the median is NaN and int(NaN) raises ValueError, so
        # report the situation instead of crashing the cell.
        print(f"Text column '{TEXT_COL}' has no non-missing values; skipping text summary.")
    else:
        lengths = texts.str.len()
        print('Text length - mean:', round(lengths.mean(), 1), 'median:', int(lengths.median()))

        # Keep only the 20 most frequent terms after English stop-word removal.
        vec = CountVectorizer(stop_words='english', max_features=20)
        try:
            X = vec.fit_transform(texts)
        except ValueError:
            # CountVectorizer raises ValueError on an empty vocabulary, e.g.
            # when every document is blank or consists solely of stop words.
            print('\nTop tokens: none (no tokens left after stop-word removal)')
        else:
            tokens = vec.get_feature_names_out()
            # Sum each term's count over all documents and flatten to 1-D.
            freqs = X.sum(axis=0).A1
            top = sorted(zip(tokens, freqs), key=lambda x: x[1], reverse=True)
            print('\nTop tokens:')
            for tok, freq in top:
                print(f'{tok}: {int(freq)}')
else:
    print(df.describe(include='all'))
    print('\nMissing values:')
    print(df.isna().sum())

**Insights**

If the class distribution is skewed, accuracy can overstate performance because the majority class dominates.
Use macro-averaged precision, recall, and F1 (and inspect per-class results) to understand behavior on minority classes.
Stratified splits help the evaluation sets reflect the original class distribution, but they do not address imbalance during training.