<a href="https://colab.research.google.com/github/nanafish/ORS/blob/main/%E3%83%9E%E3%83%83%E3%83%94%E3%83%B3%E3%82%B0%E9%AB%98%E9%80%9F%E5%8C%96.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os, re, traceback
import pandas as pd
from google.colab import files

# =========================
# Groups of column-name variants for the columns to keep.
# Within each group, the first name that exists in the data is used
# as-is (the column itself is not renamed).
# =========================
KEEP_COL_GROUPS = [
    ["メールアドレス"],
    ["郵便番号"],
    ["性別"],
    ["職業"],
    ["年齢"],
    ["既婚・未婚"],
    ["ご希望の予約特典"],
    ["弊社イベントのご来展回数", "イベントへのご来展回数"],  # ★ this is the corrected entry (two spellings of one column)
    ["イベントを知ったきっかけ"],
    ["イベントを知ったきっかけ（その他の情報）"],
    ["出展作家を知ったきっかけ"],
    ["出展作家を知ったきっかけ（以前から知っていた）"],
    ["来場日"],
]

# Column names that process_csv looks up for de-duplication (COL_EMAIL)
# and for row filtering (COL_TARGET, COL_BUYEXP, COL_VISIT).
COL_EMAIL   = "メールアドレス"
COL_TARGET  = "接客対象"
COL_BUYEXP  = "当社での絵画購入経験"
COL_VISIT   = "来場状況"

def norm_colname(s: str) -> str:
    """Normalize a column header so spelling variants compare equal.

    Full-width spaces (U+3000) become ASCII spaces, zero-width characters
    (U+200B-U+200D) and the BOM (U+FEFF) are removed, and surrounding
    whitespace is stripped.

    Args:
        s: Raw column label. ``None`` is returned unchanged; any other
            value is converted with ``str()`` before cleaning.

    Returns:
        The cleaned label, or ``None`` when ``s`` is ``None``.
    """
    if s is None:
        return s
    invisible = r"[\u200b-\u200d\ufeff]"  # zero-width characters and BOM
    spaced = str(s).replace("\u3000", " ")
    return re.sub(invisible, "", spaced).strip()

def norm_email(s) -> str:
    """Return a comparison key for an e-mail address.

    Args:
        s: Raw cell value; may be a string, a missing value, or any
            object convertible with ``str()``.

    Returns:
        ``""`` for missing values (as judged by ``pd.isna``); otherwise
        the text with every whitespace character removed and lower-cased.
    """
    if pd.isna(s):
        return ""
    # For str patterns \s is Unicode-aware, so this one pass removes
    # leading/trailing blanks, inner blanks and full-width spaces (U+3000).
    without_spaces = re.sub(r"\s+", "", str(s))
    return without_spaces.lower()

def to_str_series(df, col):
    """Return column ``col`` of ``df`` as cleaned text, or ``None`` if absent.

    The column is cast to pandas' ``"string"`` dtype, full-width spaces
    (U+3000) are replaced with ASCII spaces, and each value is stripped.

    Args:
        df: DataFrame to read from.
        col: Column label to look up.

    Returns:
        The cleaned Series, or ``None`` when ``col`` is not in ``df.columns``.
    """
    if col in df.columns:
        as_text = df[col].astype("string")
        half_width = as_text.str.replace("\u3000", " ", regex=False)
        return half_width.str.strip()
    return None

def read_csv_robust(path: str) -> pd.DataFrame:
    """Read a delimited text file, trying several encodings and separators.

    A comma-separated parse is tried first with each encoding in turn
    (utf-8-sig, cp932, utf-8), then a tab-separated parse with the same
    encodings. All columns are read as ``object`` so values stay as text.

    A tab-separated file usually parses "successfully" as comma-separated
    into a single column whose header still contains tab characters. That
    result is treated as a sign of TSV input and the tab-separated parse is
    tried before giving up on it; without this check the tab pass would
    only run when the comma pass raised.

    Args:
        path: Path of the file to read.

    Returns:
        The parsed DataFrame.

    Raises:
        Exception: The last error raised by ``pd.read_csv`` when no
            combination of separator and encoding could parse the file.
    """
    encodings = ["utf-8-sig", "cp932", "utf-8"]
    last_err = None
    comma_fallback = None  # single-column comma parse that looked like TSV

    for sep in (",", "\t"):
        for enc in encodings:
            try:
                df = pd.read_csv(path, dtype="object", encoding=enc, sep=sep)
            except Exception as e:
                last_err = e
                continue
            looks_like_tsv = (
                sep == ","
                and df.shape[1] == 1
                and "\t" in str(df.columns[0])
            )
            if looks_like_tsv:
                # Keep it in case the tab-separated parse fails, then move
                # on to the tab pass.
                comma_fallback = df
                break
            return df

    if comma_fallback is not None:
        return comma_fallback
    raise last_err

def pick_keep_cols(df):
    """Choose one existing column name per group in KEEP_COL_GROUPS.

    Groups are visited in order; within a group the first name present in
    ``df.columns`` is taken and the rest are ignored. Groups with no
    matching column contribute nothing.

    Args:
        df: DataFrame whose columns are inspected.

    Returns:
        The chosen column names in group order, without repeats.
    """
    available = df.columns
    # A dict keeps first-insertion order, so it also drops any repeated
    # name while preserving the group order.
    chosen = {}
    for candidates in KEEP_COL_GROUPS:
        hit = next((name for name in candidates if name in available), None)
        if hit is not None:
            chosen.setdefault(hit, None)
    return list(chosen)

def process_csv(in_path: str):
    """Filter one CSV, keep the selected columns, and write the result.

    Steps, in order:
      1. Read the file and normalize its column names.
      2. If COL_TARGET exists, keep rows containing "対象" but not "非対象".
      3. If COL_BUYEXP exists, drop rows containing あり / 有り, or equal
         to 有; other values and blanks are kept.
      4. If COL_VISIT exists, keep only rows containing "来場済".
      5. If COL_EMAIL exists, drop later rows whose normalized e-mail
         repeats an earlier one; rows with a blank e-mail are never dropped.
      6. Keep only the columns chosen by pick_keep_cols.
      7. Write "<input base name>_新規対象来場<x>／<y>.csv" next to the
         input file, encoded as UTF-8 with BOM.

    Args:
        in_path: Path of the input CSV.

    Returns:
        A tuple ``(out_path, x, y, keep_cols)``: the output path, the number
        of rows written, the number of rows read, and the kept column names.
    """
    df = read_csv_robust(in_path)
    y = len(df)  # row count before any filtering

    df.columns = [norm_colname(c) for c in df.columns]

    # Target filter: keep "target" rows, excluding "non-target" ones
    # (the non-target label contains the target label as a substring).
    ser = to_str_series(df, COL_TARGET)
    if ser is not None:
        s = ser.fillna("")
        is_target = s.str.contains("対象", regex=False)
        is_non_target = s.str.contains("非対象", regex=False)
        df = df[is_target & ~is_non_target].copy()

    # Purchase-experience filter: drop "yes" rows, keep "no" and blank rows.
    ser = to_str_series(df, COL_BUYEXP)
    if ser is not None:
        s = ser.fillna("")
        # No capture group here: with one, pandas warns on every call that
        # the pattern "has match groups".
        df = df[~s.str.contains(r"あり|有り|^有$")].copy()

    # Visit-status filter: keep only rows marked as visited; blanks drop out.
    ser = to_str_series(df, COL_VISIT)
    if ser is not None:
        s = ser.fillna("")
        df = df[s.str.contains("来場済", regex=False)].copy()

    # De-duplicate by e-mail (only when the e-mail column exists).
    if COL_EMAIL in df.columns:
        email_norm = df[COL_EMAIL].apply(norm_email)
        # Only non-blank e-mails can be duplicates. Masking blanks directly
        # (instead of substituting the row index as a key) means a blank row
        # can never collide with a real e-mail value.
        is_repeat = (email_norm != "") & email_norm.duplicated(keep="first")
        df = df.loc[~is_repeat].copy()

    # Column selection, tolerant of column-name variants.
    keep_cols = pick_keep_cols(df)
    df_out = df[keep_cols].copy()

    x = len(df_out)  # row count after all filtering

    # Output name: original base name + _新規対象来場x／y + .csv
    base = os.path.splitext(os.path.basename(in_path))[0]
    out_name = f"{base}_新規対象来場{x}／{y}.csv"
    out_path = os.path.join(os.path.dirname(in_path), out_name)

    df_out.to_csv(out_path, index=False, encoding="utf-8-sig")
    return out_path, x, y, keep_cols

# ===== Run: process each uploaded CSV on its own so one failure does not stop the rest =====
uploaded = files.upload()

results = []
errors = []
for fn in uploaded:  # each key is passed to process_csv as a file path
    try:
        out_path, x, y, keep_cols = process_csv(fn)
    except Exception as e:
        # Record the failure (message and traceback) and continue.
        errors.append((fn, str(e), traceback.format_exc()))
    else:
        results.append((fn, out_path, x, y, keep_cols))

# Per-file summary of the successful runs.
print("==== 処理結果 ====")
for fn, out_path, x, y, keep_cols in results:
    print(
        "----",
        f"入力: {fn}",
        f"出力: {os.path.basename(out_path)}",
        f"行数: {x}／{y}",
        f"残した列: {keep_cols}",
        sep="\n",
    )

# Failed files are listed with their error message only.
if errors:
    print("\n==== エラー（失敗ファイル） ====")
    for fn, msg, _tb in errors:
        print(f"- {fn}: {msg}")

# Hand every generated file back to the browser.
for record in results:
    files.download(record[1])
