# 01 Ingest & Validation
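This notebook loads a raw CSV dataset from `./data`, runs basic validation checks (shape, missing values, data types), applies simple cleaning (de-duplication and missing-value imputation), and saves the cleaned result for downstream processing.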

In [None]:
import pandas as pd
import numpy as np
import os

# Folder holding the raw input files (relative to the notebook's working dir).
DATA_PATH = "./data"

# os.listdir returns entries in arbitrary order; sort them so the listing --
# and anything later indexed out of it (e.g. files[0]) -- is reproducible.
files = sorted(os.listdir(DATA_PATH))
files

## Load a Sample Dataset

In [None]:
# Load one raw CSV file for validation.
# The directory listing may contain non-CSV entries, and this notebook later
# writes "cleaned.csv" into the same folder, so restrict the candidates to raw
# CSV files; otherwise a re-run could end up validating its own output.
# Sorting makes the choice independent of the listing order.
csv_files = sorted(
    name
    for name in files
    if name.lower().endswith(".csv") and name != "cleaned.csv"
)
if not csv_files:
    # Fail with an explicit message instead of a bare IndexError on files[0].
    raise FileNotFoundError(f"No raw CSV files found in {DATA_PATH!r}")

df = pd.read_csv(os.path.join(DATA_PATH, csv_files[0]))
df.head()

## Basic Validation Checks

In [None]:
# Report the frame's dimensions, the null count of every column, and the
# dtype pandas assigned to each column.
for label, value in (
    ("Shape:", df.shape),
    ("Missing values per column:\n", df.isnull().sum()),
    ("Data types:\n", df.dtypes),
):
    print(label, value)

In [None]:
# Summary statistics restricted to the numeric columns; the trailing bare
# expression makes the notebook render the resulting table.
numeric_summary = df.describe(include=[np.number])
numeric_summary

## Data Cleaning / Fixes

In [None]:
# Drop exact duplicate rows
df = df.drop_duplicates()

# Fill missing numeric values with the column median
for col in df.select_dtypes(include=[np.number]):
    df[col] = df[col].fillna(df[col].median())

# Fill missing non-numeric values with the column's most frequent value.
# mode() returns an empty Series when a column holds no non-null values, so
# indexing it with [0] would raise; such columns have nothing to impute from
# and are left as they are.
for col in df.select_dtypes(exclude=[np.number]):
    modes = df[col].mode()
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

df.head()

## Save Cleaned Data

In [None]:
# Write the cleaned frame into the data folder as "cleaned.csv".
# index=False keeps the DataFrame's row index out of the file.
clean_path = os.path.join(DATA_PATH, "cleaned.csv")
df.to_csv(path_or_buf=clean_path, index=False)

# Confirm where the output landed.
print(f"Saved cleaned dataset to {clean_path}")