1. API Pull

In [1]:
import yfinance as yf

SYMBOL = "NVDA"
df_raw = yf.download(SYMBOL, period="5d", interval="1m")

print(df_raw)

  df_raw = yf.download(SYMBOL, period="5d", interval="1m")
[*********************100%***********************]  1 of 1 completed

Price                           Close        High         Low        Open  \
Ticker                           NVDA        NVDA        NVDA        NVDA   
Datetime                                                                    
2025-08-14 13:30:00+00:00  179.756699  180.570007  179.740005  179.794998   
2025-08-14 13:31:00+00:00  180.549194  180.819000  179.460007  179.756500   
2025-08-14 13:32:00+00:00  180.892502  180.960007  180.479996  180.559998   
2025-08-14 13:33:00+00:00  180.070007  180.649994  180.050003  180.610001   
2025-08-14 13:34:00+00:00  180.520004  180.580002  179.727005  180.085007   
...                               ...         ...         ...         ...   
2025-08-20 19:55:00+00:00  175.384995  175.529907  175.050003  175.100006   
2025-08-20 19:56:00+00:00  175.279999  175.419998  175.264999  175.380005   
2025-08-20 19:57:00+00:00  175.449997  175.490005  175.270004  175.274994   
2025-08-20 19:58:00+00:00  175.365005  175.460007  175.339996  175.449997   




In [2]:
import pandas as pd
from typing import Dict, List

def validate_df(df: pd.DataFrame, required_cols: List[str], dtypes_map: Dict[str, str]) -> Dict[str, str]:
    msgs = {}
    required_cols = [col.lower() for col in required_cols] # make required columns lower case
    df_cols_lower = [str(col).lower() for col in df.columns] # does not modify the df columns, only to make it case insensitive for validation
    missing = [
        col for col in required_cols
        if not any(col.lower() in str(item).lower() for tup in df.columns for item in tup) # since the column names are in tuples, eg: "('open', 'nvda')", I can not simply setting equal signs to see if the column exist
    ]
    if missing:
        msgs['missing_cols'] = f"Missing columns: {missing}"
    for col, dtype in dtypes_map.items():
        if col in df.columns:
            try:
                if dtype == 'datetime64[ns]':
                    pd.to_datetime(df[col])
                elif dtype == 'float':
                    pd.to_numeric(df[col])
            except Exception as e:
                msgs[f'dtype_{col}'] = f"Failed to coerce {col} to {dtype}: {e}"
    na_counts = df.isna().sum().sum()
    msgs['na_total'] = f"Total NA values: {na_counts}"
    return msgs

required_cols = list(["open", "close", "high", "low", "ticker"])
validate_df(df_raw, required_cols, {})

{'missing_cols': "Missing columns: ['ticker']",
 'na_total': 'Total NA values: 0'}

2. Scrape a Small Table (Needs Fix)

In [12]:
import requests, datetime, os
from bs4 import BeautifulSoup
from pathlib import Path
from dotenv import load_dotenv

def safe_stamp():
    return datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

def safe_filename(prefix: str, meta: Dict[str, str]) -> str:
    mid = "_".join([f"{k}-{str(v).replace(' ', '-')[:20]}" for k, v in meta.items()])
    return f"{prefix}_{mid}_{safe_stamp()}.csv"

load_dotenv()
rootPath = os.getenv("PROJECT_ROOT")
SCRAPE_URL = "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001045810/b2bd9f31-ce6e-4f50-bb51-8d433e958103.html#"
headers = {"User-Agent": "AFE-Course-Notebook/1.0 (contact: instructor@example.edu)"}
DATA_RAW = Path(rootPath) / "data/raw"

try:
    resp = requests.get(SCRAPE_URL, headers=headers, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'lxml')
    tables = soup.find_all('table')
    df_scrape = pd.DataFrame()
    for idx, table in enumerate(tables, start=1):
        rows = []
        for tr in table.find_all("tr"):
            cells = []
            for td in tr.find_all(["td", "th"]):
                # get_text joins text from all nested tags with spaces
                text = td.get_text(" ", strip=True)
                if text or td.find():  # ignore truly empty cells
                    cells.append(text)
            if cells:
                rows.append(cells)

        if rows:
            header = [f"col_{i}" for i in range(len(rows[0]))]
            data = [r for r in rows[1:] if len(r) == len(header)] # check header and data are of same columns sizes
            df = pd.DataFrame(data, columns=header)
            df_scrape = pd.concat([df_scrape, df], ignore_index=True)
    else:
        print("no table found")

except Exception as e:
    print("Scrape failed: ", e)

# if 'Price' in df_scrape.columns:
#     df_scrape['Price'] = pd.to_numeric(df_scrape['Price'], errors='coerce')

msgs2 = validate_df(df_scrape, required_cols=list([]), dtypes_map={})
print(msgs2)

print(df_scrape)
if not df_scrape.empty:
    fname2 = safe_filename(prefix="scrape", meta={"site": "example", "table": "markets"})
    out_path2 = DATA_RAW / fname2
    df_scrape.to_csv(out_path2, index=False)
    print("Saved to data")

no table found
{'na_total': 'Total NA values: 19259'}
                  col_0                  col_1 col_2 col_3 col_4 col_5 col_6
0     Namespace Prefix:                   dei_   NaN   NaN   NaN   NaN   NaN
1            Data Type:  xbrli:booleanItemType   NaN   NaN   NaN   NaN   NaN
2         Balance Type:                     na   NaN   NaN   NaN   NaN   NaN
3          Period Type:               duration   NaN   NaN   NaN   NaN   NaN
4     Namespace Prefix:                   dei_   NaN   NaN   NaN   NaN   NaN
...                 ...                    ...   ...   ...   ...   ...   ...
3896         Data Type:                     na   NaN   NaN   NaN   NaN   NaN
3897  Namespace Prefix:                    NaN   NaN   NaN   NaN   NaN   NaN
3898      Balance Type:                    NaN   NaN   NaN   NaN   NaN   NaN
3899       Period Type:                    NaN   NaN   NaN   NaN   NaN   NaN
3900         Data Type:                     na   NaN   NaN   NaN   NaN   NaN

[3901 rows x 7 column

3. Documentation

 - For validation logic and output, please see the second step and helper functions in the first step.

 - source: Nvidia 10-K report, Fed 26 2025, link: https://www.sec.gov/Archives/edgar/data/1045810/000104581025000023/nvda-20250126.htm
 
  - .env.example will be committed, but .env should not 
  - Counted missing values
  - Confirmed row matches column dim

Assumptions % Risks

 - the data is assumed to be accurate
 - the parsed content might be incorrect and requires manual checking
 - sec.gov might not be accessible in the future