# 1. 기존 DB자료 "Tab" → "," 변환

In [None]:
import pandas as pd

# Read the tab-separated export "ygs.csv", loading every column as text
# (dtype=str) so no type inference is applied to the values.
df = pd.read_csv("ygs.csv", sep="\t", dtype=str, encoding="utf-8")
# Write the same table back out as "ygs2.csv" with pandas' default comma
# separator, without the row index, using the "utf-8-sig" codec.
df.to_csv("ygs2.csv", index=False, encoding="utf-8-sig")


# 2. 토큰화 - ① 단어사전 생성

In [None]:
from __future__ import annotations
import argparse
import json
import math
import sys
from collections import Counter
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import Iterable, List, Dict, Any, Optional, Tuple, Hashable

# Optional imports guarded so script works without SQL deps if not used
try:
    import pandas as pd
except Exception:
    pd = None

# Reserved tokens used when the caller supplies none. They are placed at the
# front of a vocabulary's id2token list, so their list positions are their ids.
RESERVED_DEFAULT = ["<PAD>", "<UNK>"]  # ids 0 and 1

@dataclass
class Vocab:
    """Token/id table in which a token's position in ``id2token`` is its id.

    Reserved tokens come first in ``id2token`` and are followed by the data
    tokens.
    """

    name: str
    reserved: List[str]  # special tokens, listed in id order
    id2token: List[Any]  # reserved entries first, then the data tokens
    counts: Dict[str, int]  # str(token) -> frequency, kept for reference
    created_at: str
    meta: Dict[str, Any]

    @property
    def size(self) -> int:
        """Return the number of entries in ``id2token``."""
        return len(self.id2token)

    def token_to_id(self, token: Any) -> int:
        """Return the id of *token*, falling back to the unknown-token id.

        Lookup order: position in ``reserved``, then position in
        ``id2token``, then the position of ``"<UNK>"`` in ``reserved``.
        When ``"<UNK>"`` is not reserved either, ``1`` is returned.
        """
        # Reserved tokens are matched first, by equality with the token.
        reserved_hit = next(
            (pos for pos, special in enumerate(self.reserved) if token == special),
            None,
        )
        if reserved_hit is not None:
            return reserved_hit

        try:
            return self.id2token.index(token)
        except ValueError:
            pass  # not a known token; resolve the unknown id below

        if "<UNK>" in self.reserved:
            return self.reserved.index("<UNK>")
        # No "<UNK>" among the reserved tokens: use the conventional id 1.
        return 1

    def to_json(self) -> str:
        """Serialize all six fields to an indented, non-ASCII-escaped JSON string."""
        field_order = ("name", "reserved", "id2token", "counts", "created_at", "meta")
        document = {key: getattr(self, key) for key in field_order}
        return json.dumps(document, ensure_ascii=False, indent=2)

    @staticmethod
    def from_json(s: str) -> "Vocab":
        """Rebuild a ``Vocab`` from a JSON string.

        ``name``, ``reserved`` and ``id2token`` are required keys; ``counts``
        and ``meta`` default to empty dicts and ``created_at`` to ``""``.
        """
        data = json.loads(s)
        counts = dict(data.get("counts", {}))
        meta = dict(data.get("meta", {}))
        return Vocab(
            name=data["name"],
            reserved=list(data["reserved"]),
            id2token=list(data["id2token"]),
            counts=counts,
            created_at=data.get("created_at", ""),
            meta=meta,
        )

def _ensure_pandas():
    if pd is None:
        raise RuntimeError("pandas is required for CSV mode. Please `pip install pandas`.")

def _iter_series_values(series) -> Iterable[Any]:
    # Drop NaN and convert numpy types to Python scalars
    for v in series:
        if v is None:
            continue
        # Handle pandas NaN
        try:
            if pd is not None and pd.isna(v):
                continue
        except Exception:
            pass
        # Convert numpy scalar to python scalar
        try:
            import numpy as np
            if isinstance(v, (np.generic,)):
                v = v.item()
        except Exception:
            pass
        yield v

def _make_counts(values: Iterable[Any]) -> Counter:
    c = Counter()
    for v in values:
        c[v] += 1
    return c

def _sorted_tokens_from_counts(cnt: Counter, min_freq: int, max_size: Optional[int]) -> List[Any]:
    # Filter by min_freq
    items = [(tok, n) for tok, n in cnt.items() if n >= min_freq]
    # Deterministic order: freq desc, then token asc by str()
    items.sort(key=lambda x: (-x[1], str(x[0])))
    tokens = [tok for tok, _ in items]
    if max_size is not None and max_size > 0:
        tokens = tokens[:max_size]
    return tokens

def build_vocab_from_series(
    name: str,
    series,
    min_freq: int = 1,
    max_size: Optional[int] = None,
    reserved: Optional[List[str]] = None,
    meta: Optional[Dict[str, Any]] = None,
) -> Vocab:
    """Build a ``Vocab`` from the values of a one-dimensional series.

    Args:
        name: Name stored on the vocabulary.
        series: Iterable of raw tokens; missing values are ignored.
        min_freq: Minimum count a token needs to receive an id.
        max_size: Maximum number of data tokens; ``None`` means unlimited.
        reserved: Reserved tokens placed at the lowest ids. Defaults to a
            copy of ``RESERVED_DEFAULT``.
        meta: Free-form metadata stored on the vocabulary.

    Returns:
        A ``Vocab`` whose ``id2token`` is the reserved tokens followed by the
        selected data tokens. ``counts`` holds the count of every observed
        token, including those filtered out by *min_freq* or *max_size*.
    """
    # Local import: the file-level datetime import only brings in `datetime`.
    from datetime import timezone

    if reserved is None:
        reserved = list(RESERVED_DEFAULT)
    cnt = _make_counts(_iter_series_values(series))
    tokens = _sorted_tokens_from_counts(cnt, min_freq=min_freq, max_size=max_size)

    # Ids follow list position: reserved tokens first, then data tokens.
    id2token = list(reserved) + tokens
    # JSON object keys must be strings, so counts are keyed by str(token).
    counts_str = {str(k): int(v) for k, v in cnt.items()}
    # datetime.utcnow() is deprecated; an aware UTC clock formatted this way
    # yields the same "YYYY-MM-DDTHH:MM:SSZ" text.
    created_at = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    return Vocab(
        name=name,
        reserved=list(reserved),
        id2token=id2token,
        counts=counts_str,
        created_at=created_at,
        meta=meta or {},
    )

def build_joint_vocab_from_df(
    name: str,
    df, cols: List[str],
    sep: str = "||",
    min_freq: int = 1,
    max_size: Optional[int] = None,
    reserved: Optional[List[str]] = None,
    meta: Optional[Dict[str, Any]] = None,
) -> Vocab:
    """Build a vocabulary whose tokens are row-wise combinations of *cols*.

    The selected columns are cast to ``str`` and each row is joined with
    *sep* into a single string token. The column list and separator are
    recorded in the vocabulary's ``meta`` under ``"joint_cols"`` and
    ``"sep"``; keys from the caller's *meta* are merged in afterwards and
    take precedence on a clash.
    """
    # One string per row, so the tokens are JSON-safe.
    as_text = df[cols].astype(str)
    joined_rows = as_text.agg(sep.join, axis=1)

    joint_meta = {"joint_cols": cols, "sep": sep}
    joint_meta.update(meta or {})

    return build_vocab_from_series(
        name=name,
        series=joined_rows,
        min_freq=min_freq,
        max_size=max_size,
        reserved=reserved,
        meta=joint_meta,
    )

def encode_series_to_ids(series, vocab: Vocab) -> List[int]:
    """Map the values of a one-dimensional series to vocabulary ids.

    Missing values (``None``/NaN) are skipped, so the result can be shorter
    than *series*. Values not present in the vocabulary map to the position
    of ``"<UNK>"`` in ``vocab.reserved``, or to ``1`` when it is not reserved.

    Args:
        series: Iterable of raw tokens.
        vocab: Vocabulary providing ``id2token`` and ``reserved``.

    Returns:
        One id per non-missing value, in input order.
    """
    # First occurrence wins, matching Vocab.token_to_id (which searches from
    # the front), so a data token equal to a reserved token gets the same id
    # through both code paths.
    token2id: Dict[Any, int] = {}
    for i, tok in enumerate(vocab.id2token):
        token2id.setdefault(tok, i)
    unk_id = vocab.reserved.index("<UNK>") if "<UNK>" in vocab.reserved else 1
    return [token2id.get(v, unk_id) for v in _iter_series_values(series)]

def load_csv_select_column(csv_path: str, col: str, where_col: Optional[str]=None, where_val: Optional[str]=None):
    """Load one column of a CSV file, optionally restricted to matching rows.

    Args:
        csv_path: Path of the CSV file to read.
        col: Name of the column to return.
        where_col: Column to filter on; applied only together with *where_val*.
        where_val: Value the filter column must equal. A row matches when its
            value equals *where_val* directly or when the value's text form
            equals ``str(where_val)``.

    Returns:
        The selected column of the (possibly filtered) table.

    Raises:
        RuntimeError: If pandas is not installed.
        ValueError: If *col* is not a column of the CSV.
    """
    _ensure_pandas()
    df = pd.read_csv(csv_path)
    if where_col is not None and where_val is not None:
        column = df[where_col]
        exact = column == where_val
        # read_csv infers numeric/bool dtypes while filter values from the
        # command line are text; also compare text forms so such columns can
        # still be filtered. Missing cells never match through this path.
        as_text = column.notna() & (column.astype(str) == str(where_val))
        df = df[exact | as_text]
    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in CSV.")
    return df[col]

def load_csv_dataframe(csv_path: str, usecols: Optional[List[str]]=None, where_col: Optional[str]=None, where_val: Optional[str]=None):
    """Load a CSV file as a DataFrame, optionally restricted to matching rows.

    Args:
        csv_path: Path of the CSV file to read.
        usecols: Columns to load; ``None`` loads every column.
        where_col: Column to filter on; applied only together with *where_val*.
        where_val: Value the filter column must equal. A row matches when its
            value equals *where_val* directly or when the value's text form
            equals ``str(where_val)``.

    Returns:
        The (possibly filtered) DataFrame.

    Raises:
        RuntimeError: If pandas is not installed.
    """
    _ensure_pandas()
    df = pd.read_csv(csv_path, usecols=usecols)
    if where_col is not None and where_val is not None:
        column = df[where_col]
        exact = column == where_val
        # read_csv infers numeric/bool dtypes while filter values from the
        # command line are text; also compare text forms so such columns can
        # still be filtered. Missing cells never match through this path.
        as_text = column.notna() & (column.astype(str) == str(where_val))
        df = df[exact | as_text]
    return df

def main():
    """Command-line entry point: build a vocabulary and write it as JSON.

    The source is either a CSV file (``--csv``) or a SQL query (``--sql``
    together with ``--query``). The vocabulary is built from one column
    (``--col``) or from the row-wise combination of several
    (``--joint-cols``). Usage errors are printed to stderr and exit with
    status 2. The ``--where-col``/``--where-val`` filter is applied in CSV
    mode only.
    """
    parser = argparse.ArgumentParser(description="Build vocabulary JSON from CSV or SQL query.")
    source = parser.add_mutually_exclusive_group(required=True)
    source.add_argument("--csv", type=str, help="CSV file path")
    source.add_argument("--sql", type=str, help="SQLAlchemy URL, e.g., mysql+pymysql://user:pwd@host:3306/db")

    parser.add_argument("--query", type=str, help="SQL query when using --sql")
    parser.add_argument("--col", type=str, help="Single column name to build vocab from")
    parser.add_argument("--joint-cols", nargs="+", help="Build joint vocab from multiple columns")
    parser.add_argument("--where-col", type=str, help="Column name for filtering (e.g., split)")
    parser.add_argument("--where-val", type=str, help="Value for filtering (e.g., train)")
    parser.add_argument("--min-freq", type=int, default=1, help="Minimum frequency to keep a token")
    parser.add_argument("--max-size", type=int, default=0, help="Max vocab size (0 = unlimited)")
    parser.add_argument("--out", type=str, required=True, help="Output JSON path")
    parser.add_argument("--name", type=str, default="", help="Optional name override for vocab")
    parser.add_argument("--reserved", nargs="*", default=None, help="Reserved tokens in order (default: <PAD> <UNK>)")

    args = parser.parse_args()

    # Keyword arguments shared by every vocabulary builder call below.
    # A --max-size of 0 is passed on as None, i.e. unlimited.
    shared = {
        "min_freq": args.min_freq,
        "max_size": (args.max_size or None),
        "reserved": args.reserved,
    }

    if args.csv:
        csv_meta = {
            "source": "csv",
            "csv": args.csv,
            "filter": {args.where_col: args.where_val} if args.where_col else None,
        }
        if not args.joint_cols:
            if not args.col:
                print("Error: --col is required when not using --joint-cols", file=sys.stderr)
                sys.exit(2)
            series = load_csv_select_column(
                args.csv, args.col, where_col=args.where_col, where_val=args.where_val
            )
            vocab = build_vocab_from_series(
                name=args.name or args.col,
                series=series,
                meta=csv_meta,
                **shared,
            )
        else:
            # The filter column must be loaded too, or it cannot be filtered on.
            wanted = args.joint_cols + ([args.where_col] if args.where_col else [])
            df = load_csv_dataframe(
                args.csv, usecols=wanted, where_col=args.where_col, where_val=args.where_val
            )
            vocab = build_joint_vocab_from_df(
                name=args.name or ("joint:" + "+".join(args.joint_cols)),
                df=df,
                cols=args.joint_cols,
                meta=csv_meta,
                **shared,
            )
    else:
        # SQL mode
        if not args.query:
            print("Error: --query is required with --sql", file=sys.stderr)
            sys.exit(2)
        try:
            import pandas as pd
            from sqlalchemy import create_engine
        except Exception:
            print("Error: SQL mode requires pandas and sqlalchemy. pip install pandas sqlalchemy [driver]", file=sys.stderr)
            sys.exit(2)
        engine = create_engine(args.sql)
        df = pd.read_sql(args.query, engine)
        sql_meta = {"source": "sql", "query": args.query}

        if args.joint_cols:
            # Report the first requested column missing from the result.
            absent = next((c for c in args.joint_cols if c not in df.columns), None)
            if absent is not None:
                raise ValueError(f"Column '{absent}' not found in SQL result.")
            vocab = build_joint_vocab_from_df(
                name=args.name or ("joint:" + "+".join(args.joint_cols)),
                df=df,
                cols=args.joint_cols,
                meta=sql_meta,
                **shared,
            )
        else:
            if not args.col:
                print("Error: --col is required when not using --joint-cols", file=sys.stderr)
                sys.exit(2)
            if args.col not in df.columns:
                raise ValueError(f"Column '{args.col}' not found in SQL result.")
            vocab = build_vocab_from_series(
                name=args.name or args.col,
                series=df[args.col],
                meta=sql_meta,
                **shared,
            )

    with open(args.out, "w", encoding="utf-8") as out_file:
        out_file.write(vocab.to_json())

    print(f"Wrote vocab: {args.out} (size={vocab.size})")

# Run the command-line interface only when this code is executed as the main
# program, not when it is imported as a module.
if __name__ == "__main__":
    main()
