# 01 – Ingest + EDA (Notebook)

Purpose: fast, reproducible first pass for newsroom data tests.
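- Loads the first CSV in `data/raw/` (or, if there is no CSV, the first XLSX). There is no fallback sample: the load cell raises `FileNotFoundError` when `data/raw/` contains neither.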
- Prints basic EDA (shape, dtypes, NA counts, head).
- Exports: `output/tables/summary.csv` and one chart to `output/charts/`.

Keep methods in `docs/METHODS.md`, sources in `docs/SOURCES.md`, and progress in `docs/RUN_LOG.md`.

In [None]:
from __future__ import annotations
import os
from pathlib import Path
import sys
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib
# Use headless backend to avoid Tk on Windows shells
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import seaborn as sns

# Resolve the project root: when the kernel was started inside notebooks/,
# the project root is one directory up; otherwise it is the working directory.
_cwd = Path.cwd()
PROJECT_ROOT = _cwd.parent if _cwd.name == 'notebooks' else _cwd

# Standard project folders, all relative to the project root.
DATA_RAW = PROJECT_ROOT / 'data' / 'raw'
DATA_PROCESSED = PROJECT_ROOT / 'data' / 'processed'
OUT_CHARTS = PROJECT_ROOT / 'output' / 'charts'
OUT_TABLES = PROJECT_ROOT / 'output' / 'tables'

# Create any folder that does not exist yet so later cells can write into it.
for folder in (DATA_RAW, DATA_PROCESSED, OUT_CHARTS, OUT_TABLES):
    folder.mkdir(parents=True, exist_ok=True)

print(f'Project root: {PROJECT_ROOT}')
print(f'Python: {sys.version}')

Project root: c:\Users\MUHAMMAD SHUMAIL\Downloads\Houston
Python: 3.13.0 (tags/v3.13.0:60403a5, Oct  7 2024, 09:38:07) [MSC v.1941 64 bit (AMD64)]
c:\Users\MUHAMMAD SHUMAIL\Downloads\Houston


In [None]:
# Pick the input dataset from data/raw. CSV files take priority over XLSX,
# candidates are ordered by sorted path, and there is no fallback: an empty
# folder is an error.
from pathlib import Path

csvs, xlsx = (sorted(DATA_RAW.glob(f'*.{ext}')) for ext in ('csv', 'xlsx'))
files = csvs if csvs else xlsx
if not files:
    raise FileNotFoundError(
        'No CSV/XLSX found in data/raw. Please place your dataset into data/raw and rerun this cell.'
    )

# The first candidate in sorted order is the one that gets loaded.
src_path = files[0]
print('Loading:', src_path)


def read_table_robust(path: Path) -> pd.DataFrame:
    """Read a CSV or Excel file into a DataFrame, tolerating common encodings.

    CSV files are tried as UTF-8 first, then Windows-1252, then Latin-1.
    Only decoding failures move on to the next encoding; other errors
    (missing file, empty file, ...) propagate immediately instead of being
    retried and masked.

    Args:
        path: Location of the file. The suffix (case-insensitive) selects
            the reader: ``.csv`` uses ``pd.read_csv``; ``.xlsx``/``.xls``
            use ``pd.read_excel``.

    Returns:
        The parsed table.

    Raises:
        ValueError: If the suffix is not one of the supported types.
    """
    suffix = path.suffix.lower()
    if suffix == '.csv':
        # cp1252 must come before latin-1: latin-1 maps every possible byte,
        # so it never fails and anything listed after it would never be tried.
        for enc in ('utf-8', 'cp1252', 'latin-1'):
            try:
                return pd.read_csv(path, encoding=enc)
            except UnicodeDecodeError:
                continue
            except pd.errors.ParserError:
                # Decoding worked but the default parser rejected the layout;
                # make one best-effort attempt with the Python engine.
                return pd.read_csv(path, encoding=enc, engine='python')
        # Not expected to be reached (latin-1 decodes any byte sequence);
        # kept as the same last-resort read the notebook always had.
        return pd.read_csv(path, engine='python')
    elif suffix in ('.xlsx', '.xls'):
        return pd.read_excel(path)
    else:
        raise ValueError(f'Unsupported file type: {suffix}')


# Read the selected file into the working DataFrame and report its dimensions.
df = read_table_robust(path=src_path)
print('Loaded rows x cols:', df.shape)

Loading: C:\Users\MUHAMMAD SHUMAIL\Downloads\Houston\data\raw\sample_tips.csv
Loaded rows x cols: (244, 7)


In [7]:
# Basic EDA: column dtypes, the ten columns with the most missing values,
# and a preview of the first rows.
na_counts = df.isna().sum()
print('\nDtypes:\n', df.dtypes)
print('\nMissing values (top 10):\n', na_counts.sort_values(ascending=False).head(10))
print('\nHead:\n', df.head())

# Per-column summary (name, dtype, missing count), exported as a CSV table.
summary_path = OUT_TABLES / 'summary.csv'
summary = pd.DataFrame({
    'column': df.columns,
    'dtype': list(map(str, df.dtypes)),
    'n_missing': na_counts.values,
})
summary.to_csv(summary_path, index=False)
print('Saved table:', summary_path)


Dtypes:
 total_bill    float64
tip           float64
sex            object
smoker         object
day            object
time           object
size            int64
dtype: object

Missing values (top 10):
 total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

Head:
    total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4
5       25.29  4.71    Male     No  Sun  Dinner     4
6        8.77  2.00    Male     No  Sun  Dinner     2
7       26.88  3.12    Male     No  Sun  Dinner     4
8       15.04  1.96    Male     No  Sun  Dinner     2
9       14.78  3.23    Male     No  Sun  Dinner     2
Saved table: C:\Users\MUHAMMAD SHUMAIL\Downloads\Houston\output\tables\summary.csv

In [5]:
# Try to infer a date column and find numeric columns.
#
# A column is a candidate when its name contains one of the hints below.
# It is only converted in place when at least half of its non-null values
# parse as datetimes: converting on a single lucky parse would replace every
# other value with NaT and silently destroy the column (e.g. a `time` column
# holding labels such as "Dinner").
_DATE_NAME_HINTS = ('date', 'time', 'period')
_MIN_PARSE_SHARE = 0.5

date_col = None
for c in df.columns:
    lc = str(c).lower()  # column labels are not guaranteed to be strings
    if not any(k in lc for k in _DATE_NAME_HINTS):
        continue
    # Numeric values would be read as nanoseconds since the epoch, which
    # turns counts or years into meaningless 1970 timestamps.
    if pd.api.types.is_numeric_dtype(df[c]):
        continue
    parsed = pd.to_datetime(df[c], errors='coerce', utc=False)
    n_present = int(df[c].notna().sum())
    n_parsed = int(parsed.notna().sum())
    if n_present > 0 and n_parsed / n_present >= _MIN_PARSE_SHARE:
        df[c] = parsed
        date_col = c
        break

num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print('Inferred date_col:', date_col)
print('Numeric columns (first 5):', num_cols[:5])

Inferred date_col: None
Numeric columns (first 5): ['total_bill', 'tip', 'size']


In [6]:
# Visualization logic: draw one chart for the first numeric column and save
# it as a PNG in the charts folder.
#   - If a date column was inferred, plot that numeric column over time
#     (monthly means when the index is datetime-like).
#   - Otherwise, or if the time-series plot fails, plot a histogram.
# Every figure is closed once it is no longer needed so that re-running the
# cell does not accumulate open matplotlib figures.
sns.set_theme(style='whitegrid')
fig = None
chart_path = None

if date_col and num_cols:
    first_num = num_cols[0]
    try:
        df_sorted = df.dropna(subset=[date_col]).sort_values(date_col)
        ts = df_sorted.set_index(date_col)[first_num]
        if ts.index.inferred_type in ('datetime64', 'datetime64tz'):
            # 'MS' = month-start frequency: one averaged point per month.
            ts = ts.resample('MS').mean()
        fig, ax = plt.subplots(figsize=(8, 4.5), dpi=150)
        ts.plot(ax=ax, color='#1f77b4')
        ax.set_title(f'{first_num} over time')
        ax.set_xlabel('Date')
        ax.set_ylabel(first_num)
        fig.tight_layout()
        chart_path = OUT_CHARTS / f'timeseries_{first_num}.png'
        fig.savefig(chart_path)
    except Exception as e:
        # Deliberate best-effort: any failure here falls back to the histogram.
        print('Time-series plot failed, fallback to histogram.', e)
        if fig is not None:
            # Release the half-built figure before it is replaced below.
            plt.close(fig)
        fig = None
        chart_path = None

if fig is None and num_cols:
    first_num = num_cols[0]
    fig, ax = plt.subplots(figsize=(8, 4.5), dpi=150)
    sns.histplot(df[first_num].dropna(), ax=ax, bins=30, color='#1f77b4')
    ax.set_title(f'Distribution of {first_num}')
    ax.set_xlabel(first_num)
    ax.set_ylabel('Count')
    fig.tight_layout()
    chart_path = OUT_CHARTS / f'hist_{first_num}.png'
    fig.savefig(chart_path)

if fig is not None and chart_path is not None:
    print('Saved chart:', chart_path)
    # The PNG is on disk; the in-memory figure is no longer needed.
    plt.close(fig)
else:
    print('No numeric columns to chart.')

Saved chart: c:\Users\MUHAMMAD SHUMAIL\Downloads\Houston\output\charts\hist_total_bill.png


## Next steps
- Update `docs/SOURCES.md` with your dataset links and access dates.
- Write cleaning rules in `docs/METHODS.md`.
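- Replace the sample dataset (`sample_tips.csv`) by putting your own CSV/XLSX file(s) into `data/raw/` and rerun. The notebook loads the first file in sorted order, preferring CSV over XLSX.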