In [None]:
pip install pyarrow

In [None]:
from pathlib import Path
import pandas as pd

# Working-directory layout: raw .mst inputs are read from mst_raw/ and the
# converted CSV/Parquet outputs are written to mst_fixed/.
PROJECT_ROOT = Path(".").resolve()
RAW_MST_DIR = PROJECT_ROOT.joinpath("mst_raw")
OUT_DIR = PROJECT_ROOT.joinpath("mst_fixed")

# Create both folders up front; existing folders are left untouched.
OUT_DIR.mkdir(parents=True, exist_ok=True)
RAW_MST_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
def read_mst_lines(path: Path) -> list[bytes]:
    """Read a file in binary mode and return its non-empty records.

    Each line has its trailing CR/LF bytes removed; lines that are empty
    after that are dropped. Other whitespace is preserved as-is.

    Args:
        path: File to read.

    Returns:
        The remaining lines as raw ``bytes``, in file order.

    Raises:
        ValueError: If no non-empty line is left.
    """
    with path.open("rb") as handle:
        stripped = (chunk.rstrip(b"\r\n") for chunk in handle)
        records = [record for record in stripped if record]
    if records:
        return records
    raise ValueError(f"{path} is empty")

# Shared field widths. Widths are counted in bytes, because the parser slices
# the raw byte lines before decoding them.
SZ_SHRNCODE, SZ_STNDCODE = 9, 12
SZ_KORNAME, SZ_KORNAME20 = 40, 20

# Per-file record layouts. Each layout is an ordered list of
# (field_name, width) pairs; a field starts where the previous one ends.
EQUITY_SCHEMA = [
    ("short_code", SZ_SHRNCODE),
    ("std_code", SZ_STNDCODE),
    ("name", SZ_KORNAME),
]

# Every equity master file uses the very same layout object.
KOSPI_SCHEMA = KOSDAQ_SCHEMA = KONEX_SCHEMA = EQUITY_SCHEMA
NXT_KOSPI_SCHEMA = NXT_KOSDAQ_SCHEMA = EQUITY_SCHEMA

# ELW has the same fields as the equity layout but is kept as its own list.
ELW_SCHEMA = [*EQUITY_SCHEMA]

IDX_SCHEMA = [("idx_div", 1), ("idx_code", 4), ("idx_name", 40)]

THEME_SCHEMA = [("theme_code", 3), ("theme_name", 40), ("short_code", 6)]

MEM_SCHEMA = [("member_code", 5), ("member_name", SZ_KORNAME20), ("is_global", 1)]

BOND_SCHEMA = [("type", 2), ("bond_cls_code", 2), ("std_code", 12), ("name", 40)]

# Futures/options layouts: the equity fields preceded by one-byte flags.
FO_IDX_SCHEMA = [("info_type", 1), *EQUITY_SCHEMA]
FO_STK_SCHEMA = FO_IDX_SCHEMA
FO_COM_SCHEMA = [("com_type", 1), *FO_IDX_SCHEMA]

# File name -> layout lookup. Files whose name is not listed here have no
# layout and cannot be parsed.
SCHEMA_MAP = {
    # equities
    "kospi_code.mst": KOSPI_SCHEMA,
    "nxt_kospi_code.mst": NXT_KOSPI_SCHEMA,
    "kosdaq_code.mst": KOSDAQ_SCHEMA,
    "nxt_kosdaq_code.mst": NXT_KOSDAQ_SCHEMA,
    "konex_code.mst": KONEX_SCHEMA,
    # ELW, indices, themes, members, bonds
    "elw_code.mst": ELW_SCHEMA,
    "idxcode.mst": IDX_SCHEMA,
    "theme_code.mst": THEME_SCHEMA,
    "memcode.mst": MEM_SCHEMA,
    "bond_code.mst": BOND_SCHEMA,
    # futures / options
    "fo_idx_code_mts.mst": FO_IDX_SCHEMA,
    "fo_eurex_code.mst": FO_IDX_SCHEMA,
    "fo_cme_code.mst": FO_IDX_SCHEMA,
    "fo_stk_code_mts.mst": FO_STK_SCHEMA,
    "fo_com_code.mst": FO_COM_SCHEMA,
    "fo_cmu_code.mst": FO_COM_SCHEMA,
}

In [None]:
def parse_fixed_width_lines(lines: list[bytes], schema, *, encoding="cp949") -> pd.DataFrame:
    """Split fixed-width byte records into a DataFrame of decoded strings.

    Fields are cut from each record by byte offset *before* decoding, so
    schema widths are byte counts, not character counts. Bytes beyond the
    last schema field are ignored.

    Args:
        lines: Raw records as ``bytes``. May be empty.
        schema: Ordered ``(field_name, width)`` pairs; each field starts where
            the previous one ends.
        encoding: Codec used to decode every field.

    Returns:
        A DataFrame with one column per schema field and one row per record.
        Values are strings with trailing whitespace removed. An empty
        ``lines`` yields an empty frame that still has the schema's columns.
    """
    # Precompute the byte span of every field once.
    fields = []
    start = 0
    for name, length in schema:
        fields.append((name, slice(start, start + length)))
        start += length

    columns = {name: [] for name, _ in fields}
    for line in lines:
        for name, span in fields:
            # Slicing past the end of a short record yields fewer (or zero)
            # bytes, which after rstrip() equals space-padding the record,
            # so no explicit padding pass is needed.
            # errors="ignore" silently drops undecodable bytes.
            columns[name].append(line[span].decode(encoding, errors="ignore").rstrip())
    return pd.DataFrame(columns)

In [None]:
def save_df(df: pd.DataFrame, base_name: str):
    """Write ``df`` to ``OUT_DIR`` as CSV and, best-effort, as Parquet.

    The CSV is written as ``<base_name>.csv`` with a UTF-8 BOM
    (``utf-8-sig``) and no index column. The Parquet file
    ``<base_name>.parquet`` is optional: if writing it fails (for example
    because no Parquet engine is installed) a ``RuntimeWarning`` is emitted
    and the function returns normally.

    Args:
        df: Frame to persist.
        base_name: Output file name without extension.
    """
    import warnings

    df.to_csv(OUT_DIR / f"{base_name}.csv", index=False, encoding="utf-8-sig")
    try:
        df.to_parquet(OUT_DIR / f"{base_name}.parquet", index=False)
    except Exception as exc:
        # Keep Parquet optional, but report the failure instead of hiding it.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        warnings.warn(
            f"Parquet export skipped for {base_name!r}: {exc}",
            RuntimeWarning,
            stacklevel=2,
        )

def build_all_mst():
    """Convert every recognised ``*.mst`` file in ``RAW_MST_DIR``.

    Files are visited in sorted path order. A file whose name has no entry in
    ``SCHEMA_MAP`` is skipped. Each recognised file is read, parsed with its
    layout, saved via ``save_df`` under its stem, and collected.

    When the KOSPI, KOSDAQ and KONEX frames are all present they are also
    concatenated (in that order, with a fresh index) and saved as
    ``equity_master``.

    Returns:
        dict mapping base name (file stem, plus ``"equity_master"`` when
        built) to the corresponding DataFrame.
    """
    frames = {}
    for mst_path in sorted(RAW_MST_DIR.glob("*.mst")):
        layout = SCHEMA_MAP.get(mst_path.name)
        if layout:
            parsed = parse_fixed_width_lines(read_mst_lines(mst_path), layout)
            stem = mst_path.stem
            save_df(parsed, stem)
            frames[stem] = parsed

    # Combined equity master
    equity_keys = ("kospi_code", "kosdaq_code", "konex_code")
    missing = [key for key in equity_keys if key not in frames]
    if not missing:
        equity_master = pd.concat([frames[key] for key in equity_keys], ignore_index=True)
        save_df(equity_master, "equity_master")
        frames["equity_master"] = equity_master

    return frames

In [None]:
# Run the whole conversion; the result maps each base name to its DataFrame.
all_dfs = build_all_mst()

# Preview the first rows of every table under a "### <name>" heading.
for name in all_dfs:
    df = all_dfs[name]
    print(f"\n### {name}")
    display(df.head())