# Data Validation Notebook

This notebook runs sanity checks on the raw spam dataset (missing values, duplicate rows, and label validity) to ensure data quality before training.

In [None]:
import pandas as pd
import numpy as np

# Location of the raw CSV, relative to the directory the notebook runs from.
data_path = "../data/raw/spam.csv"

## 1. Load Data

In [None]:
# Read the raw CSV into `df`, decoding the file as latin-1.
# On failure the error is reported and then re-raised: every later cell
# depends on `df`, so swallowing the exception here would only resurface as
# an unrelated NameError (or as checks silently run against a stale `df`
# left over from an earlier execution).
try:
    df = pd.read_csv(data_path, encoding='latin-1')
except Exception as e:
    print(f"Error loading data: {e}")
    raise
else:
    # Only reached when read_csv returned without raising.
    print("Data loaded successfully.")

## 2. Missing Values Check

In [None]:
# Per-column count of null entries; only columns with at least one null
# are shown in the report.
missing = df.isna().sum()
print("Missing values per column:")
print(missing.loc[missing.gt(0)])

# 'v1' and 'v2' are the columns treated as critical: the check passes only
# when neither of them contains a null.
if missing['v1'] == 0 and missing['v2'] == 0:
    print("PASSED: No missing values in critical columns.")
else:
    print("WARNING: Critical columns have missing values!")

## 3. Duplicate Check

In [None]:
# Boolean flag per row marking exact repeats of an earlier row; summing the
# flags gives the number of duplicate rows.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"Number of duplicate rows: {duplicates}")

# Informational only: duplicates are reported, not removed.
if duplicates:
    print("Note: Duplicates found. Consider whether to remove them.")

## 4. Label Validity Check

In [None]:
# The only label values accepted in column 'v1'.
valid_labels = {'ham', 'spam'}
unique_labels = set(df['v1'].unique())

print(f"Found labels: {unique_labels}")

# Anything present in the data but not in the accepted set is unexpected.
unexpected_labels = unique_labels - valid_labels
if unexpected_labels:
    print(f"WARNING: Unexpected labels found: {unexpected_labels}")
else:
    print("PASSED: All labels are valid.")