# 1. 원본 데이터 중 EPC 코드 기준 샘플 추출

In [None]:
import pandas as pd

# 1. 파일 로드 (탭 구분, 모두 문자열)
df = pd.read_csv("icn.csv", sep="\t", dtype=str)

# 2. epc_code 기준 정렬
df = df.sort_values("epc_code")

# 3. epc_code별 그룹화, epc_code 유니크값에서 n개 랜덤 추출
epc_codes = df["epc_code"].unique()
sample_epc_codes = pd.Series(epc_codes).sample(n=1000, random_state=42).tolist()

# 4. 해당 n개 epc_code에 속한 모든 이력 row 추출
sample_df = df[df["epc_code"].isin(sample_epc_codes)].copy()

# 5. 결과 저장
sample_df.to_csv("icn500개.csv", sep="\t", index=False)

# 2. 오류 데이터 생성

In [4]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# 1. 데이터 로딩
df = pd.read_csv("icn3000.csv", sep="\t")

# 2. 'Aggregation' row는 건드리지 않고, 나머지 row 중 10%를 오류 대상으로 선정
mask_agg = df["event_type"] == "Aggregation"
df_agg = df[mask_agg]
df_rest = df[~mask_agg]

# 3. 오류 넣을 row 샘플링
np.random.seed(42)
n_err = int(len(df_rest) * 0.10)
err_idx = np.random.choice(df_rest.index, n_err, replace=False)
df_error_target = df_rest.loc[err_idx].copy()

# 4. 오류 생성 함수 매핑
def corrupt_event_time(val):
    rand = random.choice([1, 2, 3])
    if rand == 1:
        return "2025-06-30 21:00:11"
    elif rand == 2:
        return "2025-05-30 00:00:00"
    else:
        return "2025-04-30 23:59:59"

def corrupt_event_type(val):
    return "POS_Sell"

def corrupt_business_step(val):
    
    return "R_Stock_Inbound"

def corrupt_epc_company(val):
    return str(random.randint(9995843, 9999999))

def corrupt_epc_product(val):
    return str(random.randint(9999000, 9999999))

def corrupt_epc_lot(val):
    return "^" + str(random.randint(300000, 999999))

def corrupt_epc_serial(val):
    return str(random.randint(000000000, 000000000))

corrupt_funcs = {
    "event_time": corrupt_event_time,
    "event_type": corrupt_event_type,
    "business_step": corrupt_business_step,
    "epc_company": corrupt_epc_company,
    "epc_product": corrupt_epc_product,
    "epc_lot": corrupt_epc_lot,
    "epc_serial": corrupt_epc_serial,
}
error_cols = list(corrupt_funcs.keys())

orig_columns = list(df.columns)

random.seed(42)
df_error_target["corrupted_col"] = None
df_error_target["orig_value"] = None
df_error_target["corrupted_value"] = None

for idx in df_error_target.index:
    col = random.choice(error_cols)
    orig_val = df_error_target.at[idx, col]
    new_val = corrupt_funcs[col](orig_val)
    df_error_target.at[idx, col] = new_val
    df_error_target.at[idx, "corrupted_col"] = col
    df_error_target.at[idx, "orig_value"] = orig_val
    df_error_target.at[idx, "corrupted_value"] = new_val

# ❶ **여기서 모든 문자열 컬럼 null/NaN 제거 (오류 row만)**
string_cols = [
    "event_time", "event_type", "business_step",
    "epc_company", "epc_product", "epc_lot", "epc_serial"
]
for col in string_cols:
    df_error_target[col] = df_error_target[col].fillna("").astype(str)

# 6. 오류가 삽입된 row를 원래 위치에 넣기
df_rest_updated = df_rest.copy()
df_rest_updated.loc[err_idx] = df_error_target[orig_columns]

# 7. 전체 데이터(오류 삽입)
df_with_error = pd.concat([df_agg, df_rest_updated], ignore_index=True)[orig_columns]

# ❷ **전체 파일도 한 번 더 null 제거(안전!)**
for col in string_cols:
    df_with_error[col] = df_with_error[col].fillna("").astype(str)
    df_error_target[col] = df_error_target[col].fillna("").astype(str)

# 8. 오류 row만 추출 (칼럼 강제)
df_error_only = df_error_target[orig_columns]

df = df.astype(str)

df_with_error = df_with_error.astype(str)
df_error_only = df_error_only.astype(str)

# 9. 결과 저장
df_with_error.to_csv("icn_with_error.csv", sep="\t", index=False)
df_error_only.to_csv("icn_only_error.csv", sep="\t", index=False)


  df_error_target.at[idx, col] = new_val
  df_error_target.at[idx, col] = new_val
  df_error_target.at[idx, col] = new_val
  df_error_target.at[idx, col] = new_val
 '8804823' '8804823' '9996103' '8805843' '8809437' '8805843' '8805843'
 '8805843' '8804823' '8809437' '8804823' '9997648' '9998121' '8805843'
 '8804823' '8809437' '8804823' '8809437' '8804823' '8809437' '8804823'
 '8804823' '9996635' '8805843' '8809437' '8805843' '8809437' '8809437'
 '8809437' '9996488' '8805843' '8805843' '8805843' '8804823' '8805843'
 '8804823' '8809437' '8804823' '8805843' '9998120' '9998831' '8804823'
 '8804823' '8809437' '8809437' '8809437' '8804823' '8805843' '8805843'
 '8804823' '8805843' '8809437' '9998951' '8805843' '8805843' '8804823'
 '8805843' '8804823' '8804823' '8809437' '8804823' '9998036' '8804823'
 '8805843' '8805843' '8805843' '8809437' '9999601' '8805843' '8809437'
 '8804823' '8809437' '8804823' '8804823' '8804823' '9999114' '8804823'
 '8809437' '8805843' '8805843' '8809437' '8809437' '880

# 3. 생성 오류 데이터 중 정상 데이터 소거

In [2]:
import pandas as pd

# 1. 정상 조합 목록 (문자열로 통일)
valid_pairs = {
    ('0102313', '8805843'),
    ('1059560', '8809437'),
    ('1203199', '8809437'),
    ('1232127', '8809437'),
    ('1239213', '8809437'),
    ('1240424', '8809437'),
    ('1293291', '8804823'),
    ('1424444', '8804823'),
    ('2031921', '8804823'),
    ('2031932', '8804823'),
    ('2190354', '8805843'),
    ('2932031', '8805843'),
    ('3812382', '8805843'),
    ('3842332', '8805843'),
    ('4282032', '8804823'),
}

# 2. EpcSerialValidatorService 규칙을 파이썬으로 구현
def get_factory_by_hub_type(hub_type):
    if pd.isna(hub_type):
        return None
    if "화성" in hub_type or "HWS" in hub_type:
        return "화성"
    if "ICN" in hub_type or "인천" in hub_type:
        return "인천"
    if "구미" in hub_type or "GUM" in hub_type:
        return "구미"
    if "양산" in hub_type or "YAS" in hub_type:
        return "양산"
    return None

def init_factory_lot_serials():
    rules = {}
    # (공장, 시작Lot, lot개수, chunk, reset)
    param_list = [
        ("화성", 50001, 26, 2000, 16),
        ("인천", 10001, 51, 2000, 16),
        ("구미", 150001, 11, 2000, 16),
        ("양산", 100001, 32, 2000, 16),
    ]
    for factory, sl, bs, cpl, ri in param_list:
        lot_map = {}
        ss = 1
        for i in range(bs):
            lot = str(sl + i)
            if i % ri == 0:
                se = ss
            elif (i+1) % ri == 0:
                se = ss + 1998
            else:
                se = ss + cpl - 1
            lot_map[lot] = (ss, se)
            if (i+1) % ri == 0:
                ss = 1
            else:
                ss = se + 1
        rules[factory] = lot_map
    return rules

factory_lot_serials = init_factory_lot_serials()

def is_valid_serial(factory, lot, serial):
    lot_map = factory_lot_serials.get(factory)
    if not lot_map or lot not in lot_map:
        return False
    try:
        serial = int(serial)
    except Exception:
        return False
    ss, se = lot_map[lot]
    return ss <= serial <= se

# 3. 데이터 불러오기 (파일명은 실제 파일 경로로 변경)
df = pd.read_csv('icn_only_error.csv', sep="\t", dtype=str)

# 4. 필터: 정상 조합 + epc_lot/serial 규칙
normal_rows = []
for idx, row in df.iterrows():
    epc_product = row["epc_product"]
    epc_company = row["epc_company"]
    pair = (epc_product, epc_company)
    if pair not in valid_pairs:
        continue

    # hub_type(공장) 파싱
    hub_type = row.get("hub_type", "")
    factory = get_factory_by_hub_type(hub_type)
    epc_lot = row.get("epc_lot", "")
    epc_serial = row.get("epc_serial", "")

    if is_valid_serial(factory, epc_lot, epc_serial):
        normal_rows.append(row)

# 5. 출력
result_df = pd.DataFrame(normal_rows)
print(result_df[["epc_code", "epc_company", "epc_product", "epc_lot", "epc_serial", "hub_type", "event_time"]])
print(f"\n정상 row 개수: {len(result_df)}")

# 6. 정상 row 삭제
# epc_code와 event_time이 동일한 기준으로 제거 (필요 시 key 컬럼 조정 가능)
df_filtered = df[~df.apply(lambda row: (
    (row["epc_code"], row["event_time"]) 
    in zip(result_df["epc_code"], result_df["event_time"])
), axis=1)]

print(f"삭제 후 row 개수: {len(df_filtered)}")
df_filtered.to_csv("icn_only_error_filtered.csv", sep="\t", index=False)

                                          epc_code epc_company epc_product  \
3    001.8805843.2190354.010008.20250701.000013527     8805843     2190354   
44   001.8804823.2031932.010011.20250701.000018294     8804823     2031932   
62   001.8804823.1293291.010004.20250701.000005432     8804823     1293291   
65   001.8804823.1293291.010004.20250701.000004587     8804823     1293291   
170  001.8804823.1293291.010004.20250701.000005968     8804823     1293291   
173  001.8809437.1240424.010005.20250701.000006132     8809437     1240424   
185  001.8804823.4282032.010007.20250701.000010305     8804823     4282032   
197  001.8805843.2932031.010016.20250701.000029557     8805843     2932031   
221  001.8805843.2190354.010008.20250701.000012130     8805843     2190354   
232  001.8809437.1240424.010005.20250701.000007623     8809437     1240424   
238  001.8809437.1059560.010009.20250701.000014151     8809437     1059560   
240  001.8804823.2031932.010011.20250701.000018429     8804823  