Goal: fetch 5Y standardized statements for peers via OpenDART.

In [None]:
import os,io,zipfile,xml.etree.ElementTree as ET 
from datetime import datetime
import requests,pandas as pd 

# Base URL of the English-language OpenDART API.
API = "https://engopendart.fss.or.kr/engapi"
# API key is read from the environment so it never lands in the notebook.
KEY = os.getenv("DART_API_KEY")

# Fail fast with an actionable message instead of letting every later request
# fail. An explicit raise is used because asserts are stripped under `python -O`.
if not KEY or len(KEY) < 20:
    raise RuntimeError("Missing DART_API_KEY. Add it to your environment and re-run.")

PEER_TICKERS = ["278470", "090430", "051900"]  # APR, Amore Pacific, LG H&H
# Last five calendar years, current year included; read the clock once so the
# range cannot straddle a year boundary.
_this_year = datetime.now().year
YEARS = list(range(_this_year - 4, _this_year + 1))
REPRT = {"A": "11011"}  # Annual report code
STATEMENTS = ["BS", "IS", "CF"]  # Balance sheet, Income Statement, Cashflow Statement

In [None]:
def get_corp_table():
    """Download corpCode XML (zipped) and return DataFrame of corp_code/stock_code/corp_name.

    The endpoint is called with the module-level ``KEY``; the first member of
    the returned ZIP archive is parsed and every ``<list>`` element becomes one
    row.

    Returns:
        pandas.DataFrame with string columns ``corp_code``, ``corp_name`` and
        ``stock_code`` (always present, even when no rows were parsed). Values
        are whitespace-stripped; a missing child element yields ``None``.

    Raises:
        requests.HTTPError: on a non-2xx HTTP response.
        zipfile.BadZipFile: when the body is not a ZIP archive (for example
            when the service answers with an error document); the message
            includes the start of the body to make the cause visible.
    """
    url = f"{API}/corpCode.xml"
    r = requests.get(url, params={"crtfc_key": KEY}, timeout=60)
    r.raise_for_status()
    try:
        with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
            xml_bytes = zf.read(zf.namelist()[0])
    except zipfile.BadZipFile as err:
        # Same exception type as before, but with the response preview attached.
        raise zipfile.BadZipFile(
            "corpCode.xml did not return a ZIP archive; "
            f"response starts with: {r.text[:200]!r}"
        ) from err
    root = ET.fromstring(xml_bytes)

    def _text(el, tag):
        # Strip padding so blank/space-only codes cannot break exact matching.
        value = el.findtext(tag)
        return value.strip() if value is not None else None

    columns = ["corp_code", "corp_name", "stock_code"]
    rows = [{c: _text(el, c) for c in columns} for el in root.findall(".//list")]
    # Passing `columns` keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

def resolve_by_stock(df, stock_codes):
    """Return the rows of ``df`` whose ``stock_code`` is one of ``stock_codes``.

    Each requested code is converted to ``str`` and left-padded with zeros to
    six characters before matching. The result has the columns ``stock_code``,
    ``corp_name``, ``corp_code`` (in that order), keeps the row order of ``df``
    and carries a fresh 0..n-1 index.
    """
    wanted = set()
    for code in stock_codes:
        wanted.add(str(code).zfill(6))
    is_peer = df["stock_code"].isin(wanted)
    matched = df.loc[is_peer, ["stock_code", "corp_name", "corp_code"]]
    return matched.reset_index(drop=True)

def fetch_fnltt_singl_all(corp_code, bsns_year, reprt_code, fs_div="CFS", sj_div=None):
    """Call fnlttSinglAcntAll for one company-year-report. Returns a tidy DataFrame.

    Args:
        corp_code: OpenDART corporation code.
        bsns_year: Business year; sent as a string.
        reprt_code: Report code (e.g. the annual-report code).
        fs_div: "CFS" or "OFS".
        sj_div: Optional statement filter (BS/IS/CF). It is forwarded to the
            API as before and additionally applied to the response rows, so
            the result is restricted to that statement even if the server
            ignores the argument.

    Returns:
        pandas.DataFrame restricted to the known columns that are present,
        with amount columns converted to numbers (unparseable values become
        NaN) and a fresh 0..n-1 index. An empty DataFrame is returned when the
        API status is not "000" or no rows are available.

    Raises:
        requests.HTTPError: on a non-2xx HTTP response.

    Warns:
        RuntimeWarning: when the API reports a status other than "000"
            (success) or "013" (no data), e.g. an invalid key or an exceeded
            request quota, so such failures are not mistaken for missing data.
    """
    import warnings

    params = {
        "crtfc_key": KEY,
        "corp_code": corp_code,
        "bsns_year": str(bsns_year),
        "reprt_code": reprt_code,
        "fs_div": fs_div,                   # CFS or OFS
    }
    if sj_div:
        params["sj_div"] = sj_div          # BS/IS/CF
    url = f"{API}/fnlttSinglAcntAll.json"
    r = requests.get(url, params=params, timeout=60)
    r.raise_for_status()
    data = r.json()
    status = data.get("status")
    if status != "000":
        # "013" means no data for this query and stays silent; anything else
        # is surfaced as a warning. Still return an empty frame so existing
        # best-effort callers keep working.
        if status != "013":
            warnings.warn(
                f"OpenDART status {status!r} ({data.get('message')!r}) for "
                f"corp_code={corp_code}, bsns_year={bsns_year}, "
                f"reprt_code={reprt_code}, fs_div={fs_div}",
                RuntimeWarning,
                stacklevel=2,
            )
        return pd.DataFrame()
    df = pd.DataFrame(data.get("list", []))
    if df.empty:
        return df

    # Enforce the statement filter locally; do not rely on the server honoring it.
    if sj_div and "sj_div" in df.columns:
        df = df[df["sj_div"] == sj_div]

    # Keep relevant columns if present
    keep = ["corp_code","bsns_year","reprt_code","fs_div","sj_div","sj_nm",
            "account_id","account_nm","thstrm_amount","thstrm_add_amount",
            "frmtrm_amount","frmtrm_add_amount","bfefrmtrm_amount","currency"]
    df = df[[c for c in keep if c in df.columns]].copy()

    # Numeric clean (amounts come as strings with commas)
    amount_cols = ["thstrm_amount","thstrm_add_amount","frmtrm_amount",
                   "frmtrm_add_amount","bfefrmtrm_amount"]
    for c in amount_cols:
        if c in df.columns:
            cleaned = df[c].astype(str).str.replace(",", "", regex=False)
            df[c] = pd.to_numeric(cleaned, errors="coerce")
    return df.reset_index(drop=True)


In [None]:
corp_df = get_corp_table()
peer_map = resolve_by_stock(corp_df, PEER_TICKERS)
display(peer_map)  # expect 3 rows with corp_code

# One request per company-year: the response rows are tagged with `sj_div`, so
# the wanted statements are selected locally. This avoids three calls per
# company-year and prevents duplicated rows if the server ignores an `sj_div`
# request argument.
frames = []
for code in peer_map["corp_code"]:
    for y in YEARS:
        df = fetch_fnltt_singl_all(code, y, REPRT["A"], fs_div="CFS")
        if df.empty:
            continue
        if "sj_div" in df.columns:
            df = df[df["sj_div"].isin(STATEMENTS)]
        if not df.empty:
            frames.append(df)

# Empty frame when nothing was fetched, so the cell still runs end to end.
peer_raw = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(peer_raw.shape)
peer_raw.head(10)


In [None]:
# Persist the long-format frame so later steps can reload it
out_dir = "Projects/kbeauty-device-brief/data/processed"
os.makedirs(out_dir, exist_ok=True)  # no error if the directory already exists
parquet_path = f"{out_dir}/peer_raw.parquet"
peer_raw.to_parquet(parquet_path, index=False)

# Build a compact "wide" view by company-year
# Prefer account_id if present (stable IFRS tags); else fallback to account_nm.
def _best_account_col(df):
    if "account_id" in df and df["account_id"].notna().any():
        return "account_id"
    return "account_nm"

# Nothing to pivot when no statements were fetched; stop with a clear message
# instead of a KeyError from the missing columns.
if peer_raw.empty:
    raise ValueError("peer_raw is empty: no statements were fetched, nothing to pivot.")

acct_col = _best_account_col(peer_raw)

# Use 'thstrm_amount' (current period) as the annual reported figure.
# aggfunc="first" keeps the first value when an account label repeats within
# the same company-year-statement.
pivot = (peer_raw
         .pivot_table(index=["corp_code","bsns_year","sj_div","currency"],
                      columns=acct_col, values="thstrm_amount", aggfunc="first")
         .reset_index())

# Optional: collapse BS/IS/CF into one row per company-year (columns keep prefixes)
pivot["sj_div"] = pivot["sj_div"].fillna("NA")
wide = (pivot
        .assign(idx=pivot["corp_code"].astype(str)+"_"+pivot["bsns_year"].astype(str))
        .set_index(["idx","sj_div"])
        .sort_index())

# Join BS/IS/CF blocks side-by-side
wide_blocks = []
for sj in ["BS","IS","CF"]:
    if (wide.index.get_level_values("sj_div") == sj).any():
        # .xs selects on the named index level and drops it. A bare
        # `.loc[(slice(None), sj)]` on a DataFrame is read as (rows, column)
        # and looks for a column called `sj`.
        block = wide.xs(sj, level="sj_div")
        # corp_code/bsns_year are recovered from the key below, and accounts
        # that belong to other statements are all-NaN here; drop both so the
        # wide view stays compact.
        block = block.drop(columns=["corp_code", "bsns_year"]).dropna(axis=1, how="all")
        # add prefix to avoid duplicate column names across statements
        block.columns = [f"{sj}:{c}" for c in block.columns]
        wide_blocks.append(block)

if not wide_blocks:
    raise ValueError("peer_raw contains no BS/IS/CF rows; cannot build the wide view.")

peer_wide = pd.concat(wide_blocks, axis=1).reset_index().rename(columns={"idx":"_key"})
# Recover corp_code and year from _key (split on the last underscore only)
peer_wide[["corp_code","bsns_year"]] = peer_wide["_key"].str.rsplit("_", n=1, expand=True)
peer_wide.drop(columns=["_key"], inplace=True)
peer_wide = peer_wide.sort_values(["corp_code","bsns_year"])

peer_wide.to_csv(f"{out_dir}/peer_wide.csv", index=False)
peer_wide.head(5)
