# HW03 — Python Fundamentals
_Use NumPy, load a CSV with pandas, compute summary stats, and save outputs._

# 1. NumPy Operations


In [None]:
# Put the repository root (the parent of the current working directory) at the
# front of sys.path so `from src...` imports resolve when the notebook is run
# from /notebooks.
import pathlib
import sys

repo_root = pathlib.Path.cwd().parent
sys.path.insert(0, str(repo_root))
print('Repo root on sys.path:', sys.path[0])

# 2. Data Loading & Inspection


In [None]:
# Third-party stack used by the cells below.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Project-local configuration helpers (resolved through the repo root on sys.path).
from src.config import get, load_env

# NOTE(review): presumably populates the environment/config that `get` reads from
# -- confirm against src/config.py.
load_env()

# Report only whether the key is set; the value itself is never printed.
api_key_present = get('API_KEY') is not None
print('API_KEY present:', api_key_present)
print('numpy:', np.__version__, '| pandas:', pd.__version__)

## Load Dataset


In [None]:
from pathlib import Path
import pandas as pd

# The repo root is the parent of the working directory; the starter CSV lives
# under <root>/data.
ROOT = Path.cwd().parent
DATA_PATH = ROOT.joinpath("data", "starter_data.csv")

# Fail early with the resolved path in the message if the file is not there.
assert DATA_PATH.exists(), f"Missing {DATA_PATH}"

df = pd.read_csv(DATA_PATH)

# Last expression of the cell: show the first rows as rich output.
df.head()


## Inspect Data


In [None]:
# Structural overview of the loaded frame via pandas' DataFrame.info()
# (column names, non-null counts, dtypes, memory usage).
df.info()

# 3. Summary Statistics


In [None]:
# `os` is imported here because this cell is the first to use it; without this
# import a fresh-kernel "Run All" fails with NameError on os.makedirs.
import os

from src.utils import get_summary_stats, save_summary

# NOTE(review): these output paths are relative to the kernel's working
# directory, whereas the input CSV is read from ROOT / "data" (ROOT is the
# parent of the working directory). Confirm which location is intended.
os.makedirs('data/processed', exist_ok=True)  # exist_ok: safe to re-run the cell

# Compute the summary table from the loaded frame and persist it as CSV.
summary = get_summary_stats(df)
save_summary(summary, 'data/processed/summary.csv')

# Last expression of the cell: preview the summary as rich output.
summary.head()

# 4. GroupBy Aggregation


In [None]:
# Split the columns by dtype: numeric ones are candidate values to aggregate,
# object-dtype ones are candidate grouping keys.
dtype_checks = pd.api.types
num_cols = [name for name in df.columns if dtype_checks.is_numeric_dtype(df[name])]
cat_cols = [name for name in df.columns if dtype_checks.is_object_dtype(df[name])]

if not (cat_cols and num_cols):
    print('No obvious categorical/numeric pair for groupby; skipping demo.')
else:
    # Demo aggregation: mean of the first numeric column per value of the
    # first object-dtype column.
    group_key, value_col = cat_cols[0], num_cols[0]
    display(df.groupby(group_key)[value_col].mean())

## Save a histogram of the first numeric column (if any)

In [None]:
if not num_cols:
    print('No numeric column to plot.')
else:
    col = num_cols[0]

    # Histogram of the non-missing values of the first numeric column, 30 bins.
    values = df[col].dropna()
    ax = values.plot.hist(bins=30)
    fig = ax.get_figure()

    # Write the figure to disk, then close it to release the figure's resources.
    out = f'data/processed/hist_{col}.png'
    fig.savefig(out, bbox_inches='tight')
    plt.close(fig)
    print('Saved histogram to', out)

## NumPy vectorization speed demo

In [None]:
# Compare one vectorized NumPy expression against the equivalent pure-Python
# list comprehension over the same n values.
import time

n = 5_000_000
arr = np.arange(n, dtype=np.float64)

# time.perf_counter() is the clock intended for measuring short intervals
# (monotonic, highest available resolution); time.time() can jump with the
# wall clock.
t0 = time.perf_counter()
out_vec = arr * 2.0 + 1.0
t1 = time.perf_counter()

# Build the input list outside the timed region so only the loop is measured.
lst = list(range(n))
s0 = time.perf_counter()
out_loop = [x*2.0 + 1.0 for x in lst]
s1 = time.perf_counter()

vec_elapsed = t1 - t0
loop_elapsed = s1 - s0
# Speedup = loop time / vectorized time. The previous expression used
# (s1 - t0), which also counted the vectorized run and the list construction
# as "loop" time and therefore overstated the ratio.
speedup = loop_elapsed / vec_elapsed if vec_elapsed > 0 else float("inf")
print(f'Vectorized: {vec_elapsed:.3f}s  |  Loop: {loop_elapsed:.3f}s  |  speedup ~{speedup:.1f}x')

## Done

In [None]:
import os

# Final sanity check: report whether the summary CSV exists at the relative
# output path (resolved against the kernel's working directory).
summary_csv = 'data/processed/summary.csv'
summary_written = os.path.exists(summary_csv)
print('Wrote: data/processed/summary.csv →', summary_written)