Goal: fetch 5Y standardized statements for peers via OpenDART.

In [6]:
# 01_dart_pull.ipynb — Cell 1: Config & env
import os, io, zipfile, xml.etree.ElementTree as ET
from datetime import datetime
import requests, pandas as pd

# Load .env (repo root) for DART_API_KEY
try:
    from dotenv import load_dotenv
    load_dotenv()
except Exception:
    pass

API = "https://engopendart.fss.or.kr/engapi"   # OpenDART (English) base; works for corpCode & financials
KEY = os.getenv("DART_API_KEY")
assert KEY and len(KEY) == 40, "Set DART_API_KEY (40 chars) in your .env at the repo root."

# Peers (tickers): APR, Amorepacific, LG H&H
PEER_TICKERS = ["278470", "090430", "051900"]

# Last 5 fiscal years (incl. current; empty years will be skipped automatically)
current_year = datetime.now().year
YEARS = list(range(current_year - 4, current_year + 1))

# Report & statement types
REPRT = {"A": "11011"}           # Annual
STATEMENTS = ["BS", "IS", "CF"]  # Balance Sheet, Income Statement, Cash Flow


In [7]:
def get_corp_table():
    """
    Download corpCode.xml (zipped) and return DataFrame:
    ['corp_code','corp_name','stock_code'].
    """
    url = f"{API}/corpCode.xml"
    r = requests.get(url, params={"crtfc_key": KEY}, timeout=60)
    r.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
        xml_name = zf.namelist()[0]
        xml_bytes = zf.read(xml_name)
    root = ET.fromstring(xml_bytes)
    rows = []
    for el in root.findall(".//list"):
        rows.append({
            "corp_code": el.findtext("corp_code"),
            "corp_name": el.findtext("corp_name"),
            "stock_code": el.findtext("stock_code"),
        })
    return pd.DataFrame(rows)

def resolve_by_stock(df, stock_codes):
    stock_codes = {str(s).zfill(6) for s in stock_codes}
    out = df[df["stock_code"].isin(stock_codes)].copy()
    return out[["stock_code","corp_name","corp_code"]].reset_index(drop=True)

def fetch_fnltt_singl_all(corp_code, bsns_year, reprt_code, fs_div="CFS", sj_div=None):
    """
    Call fnlttSinglAcntAll for one company-year-report.
    Params:
      corp_code (8d), bsns_year (YYYY), reprt_code ('11011' Annual), fs_div ('CFS' or 'OFS'), sj_div ('BS','IS','CF' or None)
    Returns tidy DataFrame with 'thstrm_amount' etc. Already numeric.
    """
    params = {
        "crtfc_key": KEY,
        "corp_code": corp_code,
        "bsns_year": str(bsns_year),
        "reprt_code": reprt_code,
        "fs_div": fs_div
    }
    if sj_div:
        params["sj_div"] = sj_div
    url = f"{API}/fnlttSinglAcntAll.json"
    r = requests.get(url, params=params, timeout=60)
    r.raise_for_status()
    data = r.json()
    if data.get("status") != "000":
        return pd.DataFrame()
    df = pd.DataFrame(data.get("list", []))
    if df.empty:
        return df

    keep = ["corp_code","bsns_year","reprt_code","fs_div","sj_div","sj_nm",
            "account_id","account_nm",
            "thstrm_amount","thstrm_add_amount",
            "frmtrm_amount","frmtrm_add_amount",
            "bfefrmtrm_amount","currency"]
    df = df[[c for c in keep if c in df.columns]].copy()

    # Numeric coercion (remove commas)
    for c in ["thstrm_amount","thstrm_add_amount","frmtrm_amount","frmtrm_add_amount","bfefrmtrm_amount"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c].astype(str).str.replace(",", ""), errors="coerce")
    return df


In [8]:
# Resolve corp codes
corp_df = get_corp_table()
peer_map = resolve_by_stock(corp_df, PEER_TICKERS)
display(peer_map)  # expect 3 rows (APR, Amorepacific, LG H&H)

# Pull data
frames = []
for _, row in peer_map.iterrows():
    code = row.corp_code
    for y in YEARS:
        for sj in STATEMENTS:
            df = fetch_fnltt_singl_all(code, y, REPRT["A"], fs_div="CFS", sj_div=sj)
            if not df.empty:
                frames.append(df)

peer_raw = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print("peer_raw shape:", peer_raw.shape)
display(peer_raw.head(10))

Unnamed: 0,stock_code,corp_name,corp_code
0,90430,아모레퍼시픽,583424
1,51900,LG생활건강,356370
2,278470,에이피알,1190568


peer_raw shape: (7389, 12)


Unnamed: 0,corp_code,bsns_year,reprt_code,sj_div,sj_nm,account_id,account_nm,thstrm_amount,thstrm_add_amount,frmtrm_amount,bfefrmtrm_amount,currency
0,583424,2021,11011,BS,Statement of financial position,ifrs-full_CurrentAssets,유동자산,2018527000000.0,,1825732000000.0,1803064000000.0,KRW
1,583424,2021,11011,BS,Statement of financial position,ifrs-full_CashAndCashEquivalents,현금및현금성자산,537693600000.0,,793728300000.0,673580400000.0,KRW
2,583424,2021,11011,BS,Statement of financial position,dart_ShortTermDepositsNotClassifiedAsCashEquiv...,금융기관예치금,101825500000.0,,57882300000.0,70600000000.0,KRW
3,583424,2021,11011,BS,Statement of financial position,ifrs-full_CurrentFinancialAssetsAtFairValueThr...,당기손익-공정가치금융자산,490839300000.0,,166463200000.0,137522900000.0,KRW
4,583424,2021,11011,BS,Statement of financial position,dart_ShortTermTradeReceivable,매출채권,295769500000.0,,283770100000.0,365884400000.0,KRW
5,583424,2021,11011,BS,Statement of financial position,ifrs-full_TradeAndOtherCurrentReceivables,기타수취채권,41647240000.0,,24723690000.0,32485370000.0,KRW
6,583424,2021,11011,BS,Statement of financial position,ifrs-full_CurrentTaxAssets,당기법인세자산,3330887000.0,,19235960000.0,4014570000.0,KRW
7,583424,2021,11011,BS,Statement of financial position,dart_OtherCurrentAssets,기타유동자산,50130930000.0,,65445880000.0,66386910000.0,KRW
8,583424,2021,11011,BS,Statement of financial position,ifrs-full_Inventories,재고자산,493223600000.0,,414483000000.0,452589100000.0,KRW
9,583424,2021,11011,BS,Statement of financial position,ifrs-full_NoncurrentAssetsOrDisposalGroupsClas...,매각예정자산,4066226000.0,,,,KRW


In [None]:
# build wide table + save CSVs
from pathlib import Path

out_dir = Path("Projects/kbeauty-device-brief/data/processed")
out_dir.mkdir(parents=True, exist_ok=True)

# Add corp_name for convenience (join by corp_code)
peer_raw_named = peer_raw.merge(peer_map[["corp_code","corp_name","stock_code"]],
                                on="corp_code", how="left")

# 1) Save the long-format data
peer_raw_named.to_csv(out_dir / "peer_raw.csv", index=False)
print("Saved:", out_dir / "peer_raw.csv")

# 2) Build a wide table: one row per (company, year); columns prefixed by statement (BS/IS/CF)
def choose_account_col(df):
    if "account_id" in df and df["account_id"].notna().any():
        return "account_id"
    return "account_nm"

acct_col = choose_account_col(peer_raw_named)

pivot = (
    peer_raw_named
      .pivot_table(index=["corp_code","corp_name","stock_code","bsns_year","sj_div","currency"],
                   columns=acct_col, values="thstrm_amount", aggfunc="first")
      .reset_index()
)

# Join BS/IS/CF side-by-side with prefixes to avoid duplicate names across statements
blocks = []
for sj in ["BS","IS","CF"]:
    block = pivot[pivot["sj_div"] == sj].drop(columns=["sj_div"])
    if not block.empty:
        block = block.set_index(["corp_code","corp_name","stock_code","bsns_year","currency"])
        block.columns = [f"{sj}:{c}" for c in block.columns]
        blocks.append(block)

peer_wide = pd.concat(blocks, axis=1).reset_index().sort_values(["corp_name","bsns_year"])
peer_wide.to_csv(out_dir / "peer_wide.csv", index=False)

print("Saved:", out_dir / "peer_wide.csv")
display(peer_wide.head(8))


Saved: Projects/kbeauty-device-brief/data/processed/peer_raw.csv
Saved: Projects/kbeauty-device-brief/data/processed/peer_wide.csv


Unnamed: 0,corp_code,corp_name,stock_code,bsns_year,currency,BS:-Standard account code not used-,BS:dart_AcquisitionOfTreasuryShares,BS:dart_AdjustmentsForAmortisationExpense,BS:dart_AdjustmentsForAssetsLiabilitiesOfOperatingActivities,BS:dart_AdjustmentsForBadDebtExpenses,...,CF:ifrs-full_SellingGeneralAndAdministrativeExpense,CF:ifrs-full_ShareOfOtherComprehensiveIncomeOfAssociatesAndJointVenturesAccountedForUsingEquityMethodThatWillBeReclassifiedToProfitOrLossNetOfTax,CF:ifrs-full_ShareOfProfitLossOfAssociatesAccountedForUsingEquityMethod,CF:ifrs-full_ShareOfProfitLossOfAssociatesAndJointVenturesAccountedForUsingEquityMethod,CF:ifrs-full_SharePremium,CF:ifrs-full_ShorttermBorrowings,CF:ifrs-full_ShorttermDepositsNotClassifiedAsCashEquivalents,CF:ifrs-full_TradeAndOtherCurrentPayables,CF:ifrs-full_TradeAndOtherCurrentPayablesToTradeSuppliers,CF:ifrs-full_TradeAndOtherCurrentReceivables
0,356370,LG생활건강,51900,2021,KRW,38107740000.0,,,,,...,,,,,,,,,,
1,356370,LG생활건강,51900,2022,KRW,52024000000.0,,,,,...,,,,,,,,,,
2,356370,LG생활건강,51900,2023,KRW,138182500000.0,,,,,...,,,,7730730000.0,,,,,,
3,356370,LG생활건강,51900,2024,KRW,,,,,,...,,,,,,,,,,
4,583424,아모레퍼시픽,90430,2021,KRW,94729520000.0,,,,,...,,,,,,,,,,
5,583424,아모레퍼시픽,90430,2022,KRW,181406900000.0,,,,,...,,,,,,,,,,
6,583424,아모레퍼시픽,90430,2023,KRW,,,,,,...,,,,,,,,,,
7,583424,아모레퍼시픽,90430,2024,KRW,,,,,,...,,,,,,,,,,


In [10]:
#Resolve Jion Meditech corp_code
import re

def normalize_korean(s: str) -> str:
    return re.sub(r"\s+", "", (s or "")).lower()

target_names = ["지온메디텍", "지온 메디텍"]

corp_df_norm = corp_df.assign(_norm=corp_df["corp_name"].map(normalize_korean))
hits = corp_df_norm[corp_df_norm["_norm"].isin(map(normalize_korean, target_names))]

if hits.empty:
    # fallback: partial match
    hits = corp_df_norm[corp_df_norm["_norm"].str.contains("지온", na=False)]

display(hits)

assert not hits.empty, "지온메디텍 corp_code not found in corpCode.xml. Try another variant of the name."
jion_corp_code = hits.iloc[0]["corp_code"]
jion_corp_name = hits.iloc[0]["corp_name"]
print("Resolved:", jion_corp_name, jion_corp_code)


Unnamed: 0,corp_code,corp_name,stock_code,_norm
97808,1844678,지온메디텍,,지온메디텍


Resolved: 지온메디텍 01844678


In [13]:
#List filings
from datetime import date
API_KR = "https://opendart.fss.or.kr/api"  # list.json is on the KR base

def list_filings(corp_code, bgn_de="20200101", end_de=None, page_no=1, page_count=100):
    if end_de is None:
        end_de = date.today().strftime("%Y%m%d")
    params = {
        "crtfc_key": KEY,
        "corp_code": corp_code,
        "bgn_de": bgn_de,
        "end_de": end_de,
        "page_no": page_no,
        "page_count": page_count,
    }
    r = requests.get(f"{API_KR}/list.json", params=params, timeout=30)
    r.raise_for_status()
    data = r.json()
    if data.get("status") != "000":
        print("DART returned:", data)
        return pd.DataFrame()
    return pd.DataFrame(data.get("list", []))

jion_filings = list_filings(jion_corp_code, bgn_de="20200101")
# filter to 감사보고서 (Audit Report)
jion_audits = jion_filings[jion_filings["report_nm"].str.contains("감사보고서", na=False)]
jion_audits = jion_audits.sort_values("rcept_dt", ascending=False)
display(jion_audits[["rcept_dt","report_nm","rcept_no"]].head(5))

assert not jion_audits.empty, "No audit report found for 지온메디텍 in the period."
# take the most recent
jion_rcept_no = jion_audits.iloc[0]["rcept_no"]
jion_rcept_dt = jion_audits.iloc[0]["rcept_dt"]
print("Using rcept_no:", jion_rcept_no, "date:", jion_rcept_dt)


Unnamed: 0,rcept_dt,report_nm,rcept_no
0,20250403,감사보고서 (2024.12),20250403003319


Using rcept_no: 20250403003319 date: 20250403


In [14]:
#Download the audit PDF
import urllib.parse
from bs4 import BeautifulSoup

raw_dir = Path("Projects/kbeauty-device-brief/data/raw") / "jion" / jion_rcept_dt
raw_dir.mkdir(parents=True, exist_ok=True)
pdf_path = raw_dir / f"{jion_rcept_no}.pdf"

def try_download_audit_pdf(rcept_no, out_pdf_path):
    viewer = f"https://dart.fss.or.kr/dsaf001/main.do?rcpNo={rcept_no}"
    html = requests.get(viewer, headers={"User-Agent":"Mozilla/5.0"}).text
    soup = BeautifulSoup(html, "html.parser")
    # Heuristic: look for any 'download' link that ends with .pdf
    pdf_url = None
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if "pdf" in href and "download" in href:
            pdf_url = urllib.parse.urljoin("https://dart.fss.or.kr", href)
            break

    if not pdf_url:
        # Fallback: show viewer URL for manual download
        print("Couldn't auto-find the PDF link. Open and download manually:", viewer)
        return None

    with requests.get(pdf_url, stream=True) as r:
        r.raise_for_status()
        with open(out_pdf_path, "wb") as f:
            for chunk in r.iter_content(8192):
                f.write(chunk)
    return str(out_pdf_path)

pdf_file = try_download_audit_pdf(jion_rcept_no, pdf_path)
print("Saved PDF:", pdf_file or "(manual download required)")


Couldn't auto-find the PDF link. Open and download manually: https://dart.fss.or.kr/dsaf001/main.do?rcpNo=20250403003319
Saved PDF: (manual download required)


In [16]:
#Parse Jion audit PDF → 2-year stack (당기/전기) ===
from pathlib import Path
import sys, subprocess, re
import pandas as pd

# 0) Ensure parser dependency
try:
    import pdfplumber
except Exception:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pdfplumber"])
    import pdfplumber

JION_RCEPT_DT  = "20250403"
JION_RCEPT_NO  = "20250403003319"

NB_DIR   = Path.cwd()
PROJ_DIR = NB_DIR.parent                           
pdf_path = PROJ_DIR / "data" / "raw" / "jion" / JION_RCEPT_DT / f"{JION_RCEPT_NO}.pdf"

print("CWD:      ", NB_DIR)
print("Project:  ", PROJ_DIR)
print("PDF path: ", pdf_path.resolve(), "| exists:", pdf_path.exists())
assert pdf_path.exists(), f"Audit PDF not found at {pdf_path}"

# --- B) Detect statement pages & parse tables ---
KEYWORDS = {
    "BS": ["재무상태표", "대차대조표"],
    "IS": ["손익계산서", "포괄손익계산서"],
    "CF": ["현금흐름표"],
}

def page_label(page):
    txt = (page.extract_text() or "").replace(" ", "")
    return txt

def table_to_df(tbl):
    df = pd.DataFrame(tbl).dropna(axis=1, how="all")

    # Find a header row that contains both '당기' and '전기'
    header_row_idx = None
    for i in range(min(5, len(df))):
        row = "".join(map(str, df.iloc[i].tolist()))
        if ("당기" in row) and ("전기" in row):
            header_row_idx = i
            break
    if header_row_idx is None:
        header_row_idx = 0

    df.columns = [str(x).strip() for x in df.iloc[header_row_idx].tolist()]
    df = df.iloc[header_row_idx+1:].reset_index(drop=True)

    # Normalize headers → account/current/prior
    rename_map = {}
    for c in list(df.columns):
        c0 = str(c)
        if any(k in c0 for k in ["계정", "과목", "항목", "주요", "세목"]):
            rename_map[c] = "account_nm"
        elif "당기" in c0:
            rename_map[c] = "thstrm_amount"
        elif "전기" in c0:
            rename_map[c] = "frmtrm_amount"
    df = df.rename(columns=rename_map)

    keep = [c for c in ["account_nm","thstrm_amount","frmtrm_amount"] if c in df.columns]
    if not keep:
        return pd.DataFrame()
    df = df[keep].copy()

    # Clean numeric strings
    for col in ["thstrm_amount","frmtrm_amount"]:
        if col in df.columns:
            df[col] = (df[col].astype(str)
                                .str.replace(",", "", regex=False)
                                .str.replace("(", "-", regex=False)
                                .str.replace(")", "", regex=False)
                                .str.replace("\u2212", "-", regex=False))
            df[col] = pd.to_numeric(df[col], errors="coerce")

    df["account_nm"] = df["account_nm"].astype(str).str.strip()
    df = df[df["account_nm"].notna() & (df["account_nm"]!="")]
    return df

def extract_statements(pdf_path: Path):
    results = {}
    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            txt = page_label(page)
            for sj, kws in KEYWORDS.items():
                if any(kw in txt for kw in kws):
                    tables = page.extract_tables()
                    for tbl in tables:
                        df = table_to_df(tbl)
                        if not df.empty:
                            results.setdefault(sj, []).append(df)
                            break
    for sj in list(results.keys()):
        results[sj] = (pd.concat(results[sj], ignore_index=True)
                         .drop_duplicates(subset=["account_nm"]))
    return results

stmts = extract_statements(pdf_path)
for sj in ["BS","IS","CF"]:
    print(sj, "rows:", 0 if sj not in stmts else len(stmts[sj]))

# --- C) Stack into 2 years using rcept_dt for '당기' year ---
cur_year = int(JION_RCEPT_DT[:4])  # '2025' from '20250403'
frames = []
for sj, df in stmts.items():
    if df.empty:
        continue
    if "thstrm_amount" in df:
        cur = df[["account_nm","thstrm_amount"]].rename(columns={"thstrm_amount":"amount"})
        cur["year"] = cur_year
        cur["sj_div"] = sj
        frames.append(cur)
    if "frmtrm_amount" in df:
        prior = df[["account_nm","frmtrm_amount"]].rename(columns={"frmtrm_amount":"amount"})
        prior["year"] = cur_year - 1
        prior["sj_div"] = sj
        frames.append(prior)

jion_long = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print("jion_long shape:", jion_long.shape)
display(jion_long.head(10))

# --- D) Save outputs ---
proc_dir = PROJ_DIR / "data" / "processed"
proc_dir.mkdir(parents=True, exist_ok=True)
jion_long.to_csv(proc_dir / "jion_long.csv", index=False)

if not jion_long.empty:
    jion_wide = (jion_long
                 .assign(col=lambda d: d["sj_div"] + ":" + d["account_nm"].astype(str))
                 .pivot_table(index="year", columns="col", values="amount", aggfunc="first")
                 .reset_index()
                 .sort_values("year"))
    jion_wide.to_csv(proc_dir / "jion_wide.csv", index=False)
    print("Saved:", proc_dir / "jion_long.csv")
    print("Saved:", proc_dir / "jion_wide.csv")
    display(jion_wide)
else:
    print("No tables parsed from the audit PDF. We can tighten the extractor or key a few critical lines if needed.")


CWD:       /Users/seunghyunhong/simonhong/Projects/kbeauty-device-brief/notebooks
Project:   /Users/seunghyunhong/simonhong/Projects/kbeauty-device-brief
PDF path:  /Users/seunghyunhong/simonhong/Projects/kbeauty-device-brief/data/raw/jion/20250403/20250403003319.pdf | exists: True
BS rows: 0
IS rows: 0
CF rows: 0
jion_long shape: (0, 0)


No tables parsed from the audit PDF. We can tighten the extractor or key a few critical lines if needed.
