# HW03 — Python Fundamentals (Fixed Imports)
This version loads its helper functions from the `hw03_utils` package and falls back to the `utils` shim in `src/` when the package is not installed.

In [None]:

from pathlib import Path
import sys, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Resolve the project root: when the kernel was started inside notebooks/,
# the project lives one directory up; otherwise the working directory is used.
PROJECT_ROOT = Path.cwd().parent if (Path.cwd().name == "notebooks") else Path.cwd()
DATA_PATH = PROJECT_ROOT / "data" / "starter_data.csv"
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"
FIGS_DIR = PROJECT_ROOT / "figs"
SRC = PROJECT_ROOT / "src"

# Put src/ on the import path exactly once. The membership check keeps the
# cell idempotent: re-running it no longer appends duplicate entries.
if SRC.exists() and str(SRC) not in sys.path:
    sys.path.append(str(SRC))

# Preferred: package import (after `pip install -e .`)
try:
    from hw03_utils import (
        time_loop_vs_vectorized,
        get_summary_stats,
        groupby_aggregate,
        basic_histogram,
        save_plot,
    )
except ImportError:
    # Fallback: use the back-compat utils shim (src/ was already made
    # importable above, so no second sys.path edit is needed here).
    from utils import (
        time_loop_vs_vectorized,
        get_summary_stats,
        groupby_aggregate,
        basic_histogram,
        save_plot,
    )

# Create the output directories up front so later cells can write into them.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
FIGS_DIR.mkdir(parents=True, exist_ok=True)
print(f"Project root: {PROJECT_ROOT}\nData path: {DATA_PATH}")


Project root: /Users/aogunlowo19/Downloads/stage03_python_fundamentals
Data path: /Users/aogunlowo19/Downloads/stage03_python_fundamentals/data/starter_data.csv


## 1) NumPy Operations — elementwise ops and loop vs vectorized timing

In [2]:
# Run the loop-vs-vectorized benchmark on 300,000 elements; the bare name on
# the last line makes the notebook render the result.
bench = time_loop_vs_vectorized(n=300_000)
bench

{'n': 300000,
 'loop_seconds': 0.044578083000033075,
 'vectorized_seconds': 0.00027266699999017874,
 'speedup': 163.4891020975723}

## 2) Dataset Loading — `pandas` `.info()` and `.head()`

In [3]:

# Fall back to the bundled sample file when the full dataset is absent.
if not DATA_PATH.exists():
    print("WARNING: data/starter_data.csv not found. Using the bundled sample.")
    DATA_PATH = PROJECT_ROOT / "data" / "starter_data_SAMPLE.csv"

df = pd.read_csv(DATA_PATH)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only added a stray "None" to the output.
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   category  40 non-null     object 
 1   value     40 non-null     float64
 2   other     40 non-null     int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 1.1+ KB


None

Unnamed: 0,category,value,other
0,B,0.807,51
1,C,0.311,53
2,A,0.419,98
3,B,0.781,56
4,C,0.102,43


## 3) Summary Statistics — numeric `.describe()`

In [4]:
# Build the summary-statistics table for df and preview at most ten rows.
summary = get_summary_stats(df)
summary.head(10)

Unnamed: 0,column,count,mean,std,min,25%,50%,75%,max
0,value,40.0,0.147425,0.990242,-2.649,-0.4755,0.2915,0.8715,1.948
1,other,40.0,47.45,30.305242,2.0,21.0,47.0,72.0,98.0


## 4) Groupby aggregation — pick a categorical column or auto-create one

In [5]:
# Run the group-by aggregation helper on df and preview at most ten rows.
grouped = groupby_aggregate(df)
grouped.head(10)

Unnamed: 0,category,value,other
0,A,-0.019786,60.0
1,B,0.296778,60.0
2,C,0.206059,30.470588


## 5) Save outputs — CSV and JSON

In [6]:

# Destination files inside the processed-data directory.
out_csv = PROCESSED_DIR / "summary.csv"
out_json = PROCESSED_DIR / "summary.json"

# CSV export, written without the row index.
summary.to_csv(path_or_buf=out_csv, index=False)
print(f"Saved: {out_csv}")

# JSON export: a list of row records, indented by two spaces.
summary.to_json(path_or_buf=out_json, orient="records", indent=2)
print(f"Saved: {out_json}")


Saved: /Users/aogunlowo19/Downloads/stage03_python_fundamentals/data/processed/summary.csv
Saved: /Users/aogunlowo19/Downloads/stage03_python_fundamentals/data/processed/summary.json


## Bonus) Basic plot — histogram of first numeric column

In [7]:

# Collect the names of all numeric columns; only the first one is plotted.
num_cols = df.select_dtypes(include="number").columns.tolist()
if not num_cols:
    print("No numeric columns found for plotting.")
else:
    hist_path = FIGS_DIR / "basic_hist.png"
    saved = basic_histogram(
        df[num_cols[0]],
        title=f"Histogram of {num_cols[0]}",
        save_path=hist_path,
    )
    print(f"Saved plot: {saved}")


Saved plot: /Users/aogunlowo19/Downloads/stage03_python_fundamentals/figs/basic_hist.png


## 6) Reusable functions

In [8]:
from __future__ import annotations
import time
from pathlib import Path
from typing import Dict, Iterable, Optional, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# ---------------------------
# 1) Performance benchmark
# ---------------------------
def time_loop_vs_vectorized(n: int = 300_000) -> pd.DataFrame:
    """
    Time squaring the integers 0..n-1 with a Python-level loop versus one
    NumPy array expression.

    Parameters
    ----------
    n : int
        Number of integers to square (length of the ``np.arange`` input).

    Returns
    -------
    pd.DataFrame
        Three rows with columns ``method``, ``seconds`` and ``n``. The rows
        are "loop", "vectorized" and "speedup(loop/vec)"; for the last row the
        ``seconds`` column holds the loop/vectorized ratio (NaN when the
        vectorized time measured as zero), not a duration.
    """
    values = np.arange(n)

    # Python-level iteration: one multiplication per element.
    loop_start = time.perf_counter()
    _squares_loop = [v * v for v in values]
    loop_elapsed = time.perf_counter() - loop_start

    # Single array expression evaluated inside NumPy.
    vec_start = time.perf_counter()
    _squares_vec = values * values
    vec_elapsed = time.perf_counter() - vec_start

    # Guard the division: a zero reading would otherwise raise.
    if vec_elapsed > 0:
        ratio = loop_elapsed / vec_elapsed
    else:
        ratio = np.nan

    rows = [
        {"method": "loop", "seconds": loop_elapsed, "n": n},
        {"method": "vectorized", "seconds": vec_elapsed, "n": n},
        {"method": "speedup(loop/vec)", "seconds": ratio, "n": n},
    ]
    return pd.DataFrame(rows, columns=["method", "seconds", "n"])


# ---------------------------
# 2) Summary statistics
# ---------------------------
def get_summary_stats(
    df: pd.DataFrame,
    decimals: int = 3,
) -> pd.DataFrame:
    """
    Rounded summary statistics for every numeric column of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; non-numeric columns are ignored.
    decimals : int
        Number of decimal places every statistic is rounded to.

    Returns
    -------
    pd.DataFrame
        One row per numeric column (the column names form the index; call
        ``reset_index()`` if they are needed as a regular column) with the
        columns count, mean, std, min, 25%, 50%, 75%, max and missing.
        An empty DataFrame is returned when ``df`` is empty or contains no
        numeric columns.

    Notes
    -----
    NaNs are skipped by the statistics (pandas default); ``missing`` reports
    how many rows lacked a value in each column.
    """
    if df.empty:
        return pd.DataFrame()

    # describe(include=[np.number]) raises ValueError when nothing matches,
    # so treat "no numeric columns" like empty input instead of crashing.
    if df.select_dtypes(include=[np.number]).shape[1] == 0:
        return pd.DataFrame()

    # Numeric-only describe, transposed so each numeric column becomes a row.
    stats = df.describe(include=[np.number]).T

    # Missing values per column = total rows minus non-null count.
    n = len(df)
    stats["missing"] = n - stats["count"]

    # Round for presentation
    stats = stats.round(decimals)

    # Consistent column order when available
    cols = [
        c
        for c in ["count", "mean", "std", "min", "25%", "50%", "75%", "max", "missing"]
        if c in stats.columns
    ]
    return stats[cols]