# HW03 — Python Fundamentals
_Use NumPy, load a CSV with pandas, compute summary stats, and save outputs._

In [2]:
# make repo root importable so `from src...` works when notebook runs from /notebooks
import sys, pathlib
sys.path.insert(0, str(pathlib.Path.cwd().parent))
print('Repo root on sys.path:', sys.path[0])

Repo root on sys.path: /Users/rajpawar/bootcamp_Rajvardhan_Pawar


In [3]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from src.config import load_env, get
load_env()
print('API_KEY present:', get('API_KEY') is not None)
print('numpy:', np.__version__, '| pandas:', pd.__version__)

API_KEY present: True
numpy: 2.3.2 | pandas: 2.3.1


## Load dataset

In [13]:
from pathlib import Path
import pandas as pd

ROOT = Path.cwd().parent          # repo root
DATA_PATH = ROOT / "data" / "starter_data.csv"

assert DATA_PATH.exists(), f"Missing {DATA_PATH}"
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,category,value
0,A,10
1,A,12
2,B,20
3,B,22
4,C,30


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  6 non-null      object
 1   value     6 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 228.0+ bytes


## Summary statistics → save to data/processed/summary.csv

In [17]:
from src.utils import get_summary_stats, save_summary
os.makedirs('data/processed', exist_ok=True)
summary = get_summary_stats(df)
save_summary(summary, 'data/processed/summary.csv')
summary.head()

Unnamed: 0,category,value
count,6,6.0
unique,3,
top,A,
freq,2,
mean,,20.333333


## Simple groupby (auto-picks one categorical & one numeric column)

In [20]:
num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
cat_cols = [c for c in df.columns if pd.api.types.is_object_dtype(df[c])]
if cat_cols and num_cols:
    display(df.groupby(cat_cols[0])[num_cols[0]].mean())
else:
    print('No obvious categorical/numeric pair for groupby; skipping demo.')

category
A    11.0
B    21.0
C    29.0
Name: value, dtype: float64

## Save a histogram of the first numeric column (if any)

In [23]:
if num_cols:
    col = num_cols[0]
    ax = df[col].dropna().plot(kind='hist', bins=30)
    fig = ax.get_figure()
    out = f'data/processed/hist_{col}.png'
    fig.savefig(out, bbox_inches='tight')
    plt.close(fig)
    print('Saved histogram to', out)
else:
    print('No numeric column to plot.')

Saved histogram to data/processed/hist_value.png


## NumPy vectorization speed demo

In [26]:
import time
n = 5_000_000
arr = np.arange(n, dtype=np.float64)
t0 = time.time(); out_vec = arr * 2.0 + 1.0; t1 = time.time()
lst = list(range(n))
s0 = time.time(); out_loop = [x*2.0 + 1.0 for x in lst]; s1 = time.time()
print(f'Vectorized: {(t1-t0):.3f}s  |  Loop: {(s1-s0):.3f}s  |  speedup ~{(s1-t0)/(t1-t0) if (t1-t0)>0 else float("inf"):.1f}x')

Vectorized: 0.016s  |  Loop: 0.228s  |  speedup ~18.0x


## Done

In [29]:
import os
print('Wrote: data/processed/summary.csv →', os.path.exists('data/processed/summary.csv'))

Wrote: data/processed/summary.csv → True
