# 기초통계 기초데이터 변환 TEST

Load Libraries

In [1]:
from __future__ import annotations
import pandas as pd
import numpy as np
from dataclasses import dataclass, asdict
from typing import Optional, Tuple
from abc import ABC, abstractmethod

Read Sol Data

In [2]:
# Location of the input file read by this cell.
file_path = "input_data.txt"

# Parse the file as tab-delimited text into the working DataFrame.
df = pd.read_csv(file_path, sep="\t")
print(df.shape)
# Trailing expression: the notebook displays the first rows as the cell output.
df.head()

(501, 25)


Unnamed: 0,_container_id,_container_name,_container_codes,id,_coverage_id,coverage_name,coverage_features,benefit_nth,benefit_type,benefit_risk,...,term_condition_2,term_2,period,period_condition_1,period_1,period_condition_2,period_2,base,step,cycle
0,c1296fad-52a3-40e7-acef-0aede41a21f3,무배당 알파Plus보장보험2511,6AELM 6AELN,b1ef2a6e-3494-4d9b-8412-7005cbb369d4,1,기본계약(일반상해80%이상후유장해)_x000D_\n,[],1,SimpleBenefit,#MR000091,...,,,,,,,,,,
1,c1296fad-52a3-40e7-acef-0aede41a21f3,무배당 알파Plus보장보험2511,6AELM 6AELN,ab858b92-3c1c-49da-abb5-373d11804fe5,2,골절진단비Ⅱ_x000D_\n,[],1,SimpleBenefit,#MR001589,...,,,,,,,,,,
2,c1296fad-52a3-40e7-acef-0aede41a21f3,무배당 알파Plus보장보험2511,6AELM 6AELN,14d5d232-7bd6-4627-b84d-f89b495d5f1c,3,일반상해80%이상후유장해생활자금_x000D_\n,[],1,SimpleBenefit,#MR000091,...,,,,,,,,,,1M
3,c1296fad-52a3-40e7-acef-0aede41a21f3,무배당 알파Plus보장보험2511,6AELM 6AELN,2c6536c3-17fe-411e-97af-ad931c2adf3b,4,골절(치아파절 제외)진단비Ⅱ_x000D_\n,[],1,SimpleBenefit,#MR001591,...,,,,,,,,,,
4,c1296fad-52a3-40e7-acef-0aede41a21f3,무배당 알파Plus보장보험2511,6AELM 6AELN,2300b8ab-81f8-4a2b-9e36-fe4d4655b957,5,신화상치료비(화상진단비)_x000D_\n,[],1,SimpleBenefit,#MR001166,...,,,,,,,,,,


Preprocessing
- 컬럼 삭제 : _container_id, id
- 컬럼명 변경 
    - _container_codes -> prod_code
    - _coverage_id -> cov_seq

In [3]:
# Step 1: give the product-code and coverage-sequence columns their working names.
df = df.rename(columns={"_container_codes": "prod_code", "_coverage_id": "cov_seq"})
# Step 2: drop the identifier columns; errors="ignore" skips any label that is
# not present instead of raising.
df = df.drop(columns=["_container_id", "id"], errors="ignore")

print(df.shape)
df.head()

(501, 23)


Unnamed: 0,_container_name,prod_code,cov_seq,coverage_name,coverage_features,benefit_nth,benefit_type,benefit_risk,benefit_feature_type,rate,...,term_condition_2,term_2,period,period_condition_1,period_1,period_condition_2,period_2,base,step,cycle
0,무배당 알파Plus보장보험2511,6AELM 6AELN,1,기본계약(일반상해80%이상후유장해)_x000D_\n,[],1,SimpleBenefit,#MR000091,,,...,,,,,,,,,,
1,무배당 알파Plus보장보험2511,6AELM 6AELN,2,골절진단비Ⅱ_x000D_\n,[],1,SimpleBenefit,#MR001589,,,...,,,,,,,,,,
2,무배당 알파Plus보장보험2511,6AELM 6AELN,3,일반상해80%이상후유장해생활자금_x000D_\n,[],1,SimpleBenefit,#MR000091,SplitPaymentBenefitFeature,,...,,,,,,,,,,1M
3,무배당 알파Plus보장보험2511,6AELM 6AELN,4,골절(치아파절 제외)진단비Ⅱ_x000D_\n,[],1,SimpleBenefit,#MR001591,,,...,,,,,,,,,,
4,무배당 알파Plus보장보험2511,6AELM 6AELN,5,신화상치료비(화상진단비)_x000D_\n,[],1,SimpleBenefit,#MR001166,,,...,,,,,,,,,,


Helper 함수

In [4]:
class ParsingHelperMixIn:
    """Class-level helpers that convert raw period/rate cell values to floats.

    Each public parser returns ``None`` instead of raising when the value is
    missing (``pd.isna``), blank after stripping, or not numeric.
    """

    @staticmethod
    def _clean(value: object) -> Optional[str]:
        """Return *value* as a stripped string, or ``None`` if missing/blank."""
        if pd.isna(value):
            return None
        text = str(value).strip()
        return text or None

    @staticmethod
    def _to_float(text: str) -> Optional[float]:
        """Return ``float(text)``, or ``None`` when *text* is not numeric."""
        try:
            return float(text)
        except ValueError:
            return None

    @classmethod
    def parse_year(cls, value: object) -> Optional[float]:
        """
        Convert a period string into a number of years
        (e.g. 10Y -> 10, 1M -> 1/12).

        A trailing ``Y`` means years, a trailing ``M`` means months (divided
        by 12); a string without suffix is read as a plain number.

        Returns:
            The period in years, or ``None`` if the value cannot be parsed.
        """
        text = cls._clean(value)
        if text is None:
            return None
        if text.endswith("Y"):
            return cls._to_float(text[:-1])
        if text.endswith("M"):
            months = cls._to_float(text[:-1])
            return None if months is None else months / 12
        return cls._to_float(text)

    @classmethod
    def parse_rate(cls, value: object) -> Optional[float]:
        """
        Convert a rate string into a number
        (e.g. 50% -> 0.5, 1/3 -> 0.333...).

        Returns:
            The rate as a float, or ``None`` if the value cannot be parsed
            (including a fraction with a zero denominator).
        """
        text = cls._clean(value)
        if text is None:
            return None
        if text.endswith("%"):
            percent = cls._to_float(text[:-1])
            return None if percent is None else percent / 100
        if "/" in text:
            # Split on the first slash only; anything malformed after it
            # fails the float conversion below.
            numerator, _, denominator = text.partition("/")
            try:
                return float(numerator) / float(denominator)
            except (ValueError, ZeroDivisionError):
                return None
        return cls._to_float(text)

    @classmethod
    def parse_periods(cls, value: object) -> Tuple[Optional[float], Optional[float]]:
        """
        Convert a period range into a (start, end) pair.

        ``"A~B"`` yields ``(parse_year(A), parse_year(B))``; a value without
        ``~`` is treated as the end only and yields ``(None, parse_year(value))``.
        """
        text = cls._clean(value)
        if text is None:
            return (None, None)
        if "~" in text:
            start, _, end = text.partition("~")
            return cls.parse_year(start), cls.parse_year(end)
        return (None, cls.parse_year(text))


BenefitFeatureType 정의
- ExclusionFeature
- PaymentRatioByPeriodBenefitFeature
- PayoutRatioFeature
- SplitPaymentBenefitFeature

In [5]:
@dataclass
class BenefitFeatureType(ABC, ParsingHelperMixIn):
    """Abstract base for benefit feature records built from a DataFrame row.

    Inherits the parsing helpers of ``ParsingHelperMixIn``. Concrete
    subclasses declare their own fields and implement ``from_series``.
    """

    @classmethod
    @abstractmethod
    def from_series(cls, row: pd.Series) -> "BenefitFeatureType":
        """Build an instance from one row; every subclass must override this."""
        raise NotImplementedError


@dataclass
class ExclusionFeature(BenefitFeatureType):
    """
    Exclusion period
    - term : exclusion period (Y)
    - is_over_15 : whether it applies to age 15 and over
    """
    term: Optional[float]
    is_over_15: bool

    @classmethod
    def from_series(cls, row: pd.Series) -> "ExclusionFeature":
        """Build an ``ExclusionFeature`` from one row.

        Reads ``term_condition_1``, ``term_1`` and ``term`` from *row*; the
        chosen term value is converted with ``parse_year``.
        """
        # The row counts as age-conditioned only when term_condition_1 is
        # exactly "x>=15" after stripping whitespace.
        condition = str(row.get("term_condition_1", "")).strip()
        is_over_15 = condition == "x>=15"
        # Age-conditioned rows take the period from term_1, all others from term.
        term_value = row.get("term_1") if is_over_15 else row.get("term")
        return cls(term=cls.parse_year(term_value), is_over_15=is_over_15)


@dataclass
class PaymentRatioByPeriodBenefitFeature(BenefitFeatureType):
    """
    Payment ratio by period
    - start : start of the reduction period (Y)
    - end : end of the reduction period (Y)
    - rate : reduction rate
    """
    start: Optional[float]
    end: Optional[float]
    rate: Optional[float]

    @classmethod
    def from_series(cls, row: pd.Series) -> "PaymentRatioByPeriodBenefitFeature":
        """Build the feature from one row.

        ``period`` is split into (start, end) with ``parse_periods`` and
        ``rate`` is converted with ``parse_rate``.
        """
        start, end = cls.parse_periods(row.get("period"))
        return cls(start=start, end=end, rate=cls.parse_rate(row.get("rate")))


@dataclass
class PayoutRatioFeature(BenefitFeatureType):
    """
    Payout ratio
    - rate : payout ratio
    """
    rate: Optional[float]

    @classmethod
    def from_series(cls, row: pd.Series) -> "PayoutRatioFeature":
        """Build the feature from one row, converting ``rate`` with ``parse_rate``."""
        return cls(rate=cls.parse_rate(row.get("rate")))


@dataclass
class SplitPaymentBenefitFeature(BenefitFeatureType):
    """
    Split payment
    - term : split payment period (Y)
    - cycle : payment cycle (Y)
    """
    term: Optional[float]
    cycle: Optional[float]

    @classmethod
    def from_series(cls, row: pd.Series) -> "SplitPaymentBenefitFeature":
        """Build the feature from one row.

        Both ``term`` and ``cycle`` are converted to years with ``parse_year``.
        """
        return cls(
            term=cls.parse_year(row.get("term")),
            cycle=cls.parse_year(row.get("cycle")),
        )

Add Feature Object Column

In [12]:
# Registry of feature classes keyed by class name, so each key always matches
# the class it points to.
FEATURE_CLASS_MAP = {
    feature_cls.__name__: feature_cls
    for feature_cls in (
        ExclusionFeature,
        PaymentRatioByPeriodBenefitFeature,
        PayoutRatioFeature,
        SplitPaymentBenefitFeature,
    )
}

In [13]:
def build_feature(row: pd.Series) -> Optional[BenefitFeatureType]:
    """Build the feature object described by one row.

    The row's ``benefit_feature_type`` value is looked up in
    ``FEATURE_CLASS_MAP``. Returns ``None`` when the value has no entry
    there; otherwise returns the mapped class's ``from_series(row)``.
    """
    feature_cls = FEATURE_CLASS_MAP.get(row.get("benefit_feature_type"))
    if not feature_cls:
        return None
    return feature_cls.from_series(row)


# Row-wise apply: one feature object (or None) per DataFrame row.
df["feature_obj"] = df.apply(build_feature, axis=1)

Simple Benefit Type

In [14]:
# Keep only the rows whose benefit_type is "SimpleBenefit"; .copy() makes the
# subset an independent DataFrame.
df_simple_benefit = df.loc[df["benefit_type"].eq("SimpleBenefit")].copy()
print(df_simple_benefit.shape)
df_simple_benefit.head()

(448, 24)


Unnamed: 0,_container_name,prod_code,cov_seq,coverage_name,coverage_features,benefit_nth,benefit_type,benefit_risk,benefit_feature_type,rate,...,term_2,period,period_condition_1,period_1,period_condition_2,period_2,base,step,cycle,feature_obj
0,무배당 알파Plus보장보험2511,6AELM 6AELN,1,기본계약(일반상해80%이상후유장해)_x000D_\n,[],1,SimpleBenefit,#MR000091,,,...,,,,,,,,,,
1,무배당 알파Plus보장보험2511,6AELM 6AELN,2,골절진단비Ⅱ_x000D_\n,[],1,SimpleBenefit,#MR001589,,,...,,,,,,,,,,
2,무배당 알파Plus보장보험2511,6AELM 6AELN,3,일반상해80%이상후유장해생활자금_x000D_\n,[],1,SimpleBenefit,#MR000091,SplitPaymentBenefitFeature,,...,,,,,,,,,1M,"SplitPaymentBenefitFeature(term=10.0, cycle=0...."
3,무배당 알파Plus보장보험2511,6AELM 6AELN,4,골절(치아파절 제외)진단비Ⅱ_x000D_\n,[],1,SimpleBenefit,#MR001591,,,...,,,,,,,,,,
4,무배당 알파Plus보장보험2511,6AELM 6AELN,5,신화상치료비(화상진단비)_x000D_\n,[],1,SimpleBenefit,#MR001166,,,...,,,,,,,,,,


Summary

In [16]:
def collect_features(group: pd.DataFrame) -> pd.Series:
    """Summarise the feature objects of one group, one entry per feature class.

    For every name in ``FEATURE_CLASS_MAP`` the result holds a tuple with the
    ``str()`` of each matching object in ``group["feature_obj"]``, or ``None``
    when the group has no object of that class.
    """
    summary = {}
    for label, feature_cls in FEATURE_CLASS_MAP.items():
        rendered = tuple(
            str(candidate)
            for candidate in group["feature_obj"]
            if candidate is not None and isinstance(candidate, feature_cls)
        )
        # An empty tuple means "no such feature" and is reported as None.
        summary[label] = rendered or None
    return pd.Series(summary)

In [18]:
# Collapse the SimpleBenefit rows to one row per
# (prod_code, cov_seq, benefit_nth, benefit_risk) key, with one column per
# feature class produced by collect_features.
df_features_by_key = df_simple_benefit.groupby(
    ["prod_code", "cov_seq", "benefit_nth", "benefit_risk"],
    group_keys=False,
).apply(collect_features, include_groups=False)
# Move the group keys out of the index so they become ordinary columns.
df_features_by_key = df_features_by_key.reset_index()

print(df_features_by_key.shape)
df_features_by_key.head()


(388, 8)


Unnamed: 0,prod_code,cov_seq,benefit_nth,benefit_risk,ExclusionFeature,PaymentRatioByPeriodBenefitFeature,PayoutRatioFeature,SplitPaymentBenefitFeature
0,6AELM 6AELN,1,1,#MR000091,,,,
1,6AELM 6AELN,2,1,#MR001589,,,,
2,6AELM 6AELN,3,1,#MR000091,,,,"(SplitPaymentBenefitFeature(term=10.0, cycle=0..."
3,6AELM 6AELN,4,1,#MR001591,,,,
4,6AELM 6AELN,5,1,#MR001166,,,,
