## Task 3: Parts T16 in Registered Vehicles (Adelshofen)

In [None]:
# === Task 3 · Cell 1: auto-locate the project and find which Typ uses T16 (works with or without a Data directory) ===
from pathlib import Path
import sys, re
import pandas as pd

# 0) Auto-detect the project root
def detect_project_root() -> Path:
    """Return the directory to treat as the project root.

    The current working directory and then each of its parents are checked in
    order; the first one that contains an ``Einzelteil``, ``Fahrzeug`` or
    ``Data`` entry is returned. If none qualifies and the ``google.colab``
    module is loaded, the fixed Google Drive project folder is returned when
    it exists. Otherwise the current working directory is returned.
    """
    marker_names = ("Einzelteil", "Fahrzeug", "Data")
    start = Path.cwd()

    for candidate in (start, *start.parents):
        if any((candidate / name).exists() for name in marker_names):
            return candidate

    # No marker found on the way up: only Colab has a second place to look.
    if "google.colab" not in sys.modules:
        return start
    drive_copy = Path("/content/drive/MyDrive/Case_Study_IDA_Group11")
    return drive_copy if drive_copy.exists() else start

PROJECT_ROOT = detect_project_root()
# Search inside Data/ when that directory exists; otherwise search the project root itself.
SEARCH_ROOT = PROJECT_ROOT
if (PROJECT_ROOT / "Data").exists():
    SEARCH_ROOT = PROJECT_ROOT / "Data"
print("PROJECT_ROOT =", PROJECT_ROOT)
print("SEARCH_ROOT  =", SEARCH_ROOT)

# 1) Helper utilities
def find_one(root: Path, patterns):
    """Return the best-matching file below ``root`` for any of ``patterns``.

    Args:
        root: Directory that is searched recursively.
        patterns: Iterable of glob patterns, each passed to ``root.rglob``.

    Returns:
        The matching file with the fewest path components; ties are broken in
        favour of the most recently modified file.

    Raises:
        FileNotFoundError: If no regular file matches any pattern.
    """
    # A dict keyed by path drops duplicates (overlapping patterns hit the same
    # file more than once) while keeping discovery order.
    found = {}
    for pat in patterns:
        for hit in root.rglob(pat):
            # rglob also yields directories; a directory can never be read as
            # a data file, so it must not win the ranking below.
            if hit.is_file():
                found[hit] = None
    if not found:
        raise FileNotFoundError(f"在 {root} 下没找到：{patterns}")
    # Shorter path first, then newer modification time first.
    return min(found, key=lambda p: (len(p.parts), -p.stat().st_mtime))

def read_any(path: Path, nrows=None, dtype=None):
    """Load a table from an Excel or delimited text file into a DataFrame.

    Args:
        path: File to read. ``.xls``/``.xlsx`` go through ``pd.read_excel``;
            every other suffix is read with ``pd.read_csv``.
        nrows: Optional maximum number of rows to read.
        dtype: Optional dtype specification forwarded to pandas.

    Returns:
        The loaded ``pandas.DataFrame``.

    Text files are read with delimiter sniffing (``sep=None`` with the python
    engine). Encodings are tried from strictest to most permissive: the pandas
    default, ``utf-8-sig``, ``cp1252`` and finally ``latin-1``.
    """
    if path.suffix.lower() in (".xls", ".xlsx"):
        return pd.read_excel(path, nrows=nrows, dtype=dtype)

    # cp1252 must be tried before latin-1: latin-1 accepts every byte, so any
    # encoding listed after it would never be reached.
    for enc in (None, "utf-8-sig", "cp1252"):
        try:
            return pd.read_csv(path, sep=None, engine="python", encoding=enc, nrows=nrows, dtype=dtype)
        except UnicodeDecodeError:
            continue
    # Last resort: latin-1 cannot fail on decoding, so any error raised here
    # is a genuine parsing or I/O problem and is allowed to propagate.
    return pd.read_csv(path, sep=None, engine="python", encoding="latin-1", nrows=nrows, dtype=dtype)

# 2) Locate the T16 file (csv/txt/xls/xlsx are all supported)
t16_path = find_one(
    SEARCH_ROOT,
    ["Einzelteil/Einzelteil_T16.*", "**/Einzelteil_T16.*"],
)
print("✅ T16 file:", t16_path)

t16 = read_any(t16_path)
# Identify the part-number column: the first column whose label contains
# "teil" (case-insensitive). Labels go through str() because headers read
# from Excel may not be strings, and re.search rejects non-strings.
teil_cols = [c for c in t16.columns if re.search("teil", str(c), re.I)]
if not teil_cols:
    raise ValueError(f"在 {t16_path.name} 里没找到包含 'teil' 的列；实际列名：{list(t16.columns)[:12]}")
teil_col = teil_cols[0]
# Drop missing values before converting to text; otherwise NaN becomes the
# string "nan" and would count as a shared ID in the comparison below.
t16_ids = set(t16[teil_col].dropna().astype(str).unique())
print("零件列:", teil_col, "| T16 示例:", list(t16_ids)[:5])

# 3) Scan the Fahrzeug directory for the part-to-vehicle mapping tables
fahrzeug_dir = (SEARCH_ROOT / "Fahrzeug")
if not fahrzeug_dir.exists():
    raise FileNotFoundError(f"未找到目录：{fahrzeug_dir}")

maps = sorted(fahrzeug_dir.glob("Bestandteile_Fahrzeuge_*.*"))
if not maps:
    raise FileNotFoundError(f"在 {fahrzeug_dir} 下没找到 'Bestandteile_Fahrzeuge_*' 文件")

matches = []
for f in maps:
    try:
        df = read_any(f, nrows=100000)  # for speed, only the first 100,000 rows are read
        cols = [c for c in df.columns if re.search("teil", str(c), re.I)]
        if not cols:
            continue
        col = cols[0]
        vals = set(df[col].dropna().astype(str).unique())
        # A file matches as soon as it shares at least one ID with T16.
        if not t16_ids.isdisjoint(vals):
            m = re.search(r"Typ(\d+)", f.name)
            typ = m.group(1) if m else "?"
            matches.append((f.name, typ))
            print(f"✅ Found in: {f.name} (Typ {typ})")
    except Exception as e:
        # Best-effort scan: report the unreadable file and keep going.
        print("跳过", f.name, "→", e)

if not matches:
    print("❌ 没在任何 Bestandteile_Fahrzeuge_* 里找到 T16；请确认 T16 的零件列名/值。")
else:
    print("T16 出现于这些文件：", matches)
    # In the next cell, pick the matching Fahrzeuge_* file based on typ and continue the merge analysis


PROJECT_ROOT = D:\IDA\Case_Study_IDA_Group11
SEARCH_ROOT  = D:\IDA\Case_Study_IDA_Group11\Data
✅ T16 file: D:\IDA\Case_Study_IDA_Group11\Data\Einzelteil\Einzelteil_T16.txt


In [None]:
# === Task 3 · Chunked counting: process Zulassungen_alle_Fahrzeuge chunk by chunk to avoid running out of memory ===
import re
from collections import defaultdict
from pathlib import Path
import pandas as pd

# ---------- Helper utilities ----------
def detect_project_root() -> Path:
    """Return the nearest directory that looks like the project root.

    Starting at the current working directory and moving up through its
    parents, the first directory containing a ``Zulassungen`` or ``Data``
    entry is returned. If there is none, the current working directory is
    returned.
    """
    cwd = Path.cwd()
    lineage = (cwd, *cwd.parents)
    return next(
        (
            folder
            for folder in lineage
            if (folder / "Zulassungen").exists() or (folder / "Data").exists()
        ),
        cwd,
    )

def norm_id_series(s: pd.Series) -> pd.Series:
    """Return ``s`` as normalised ID strings.

    Each value is converted to ``str``, surrounding whitespace is stripped,
    and a trailing ``.0`` is removed.
    """
    as_text = s.astype(str)
    trimmed = as_text.str.strip()
    return trimmed.str.replace(r"\.0$", "", regex=True)

def open_zulassungen_path(search_root: Path) -> Path:
    """Locate the Zulassungen CSV file below ``search_root``.

    The exact name ``Zulassungen_alle_Fahrzeuge.*`` is searched first; only if
    nothing matches is the looser ``Zulassungen*Fahrzeuge.*`` pattern tried.
    Among the hits a ``.csv`` file is preferred, so a sibling with another
    suffix cannot mask an existing CSV.

    Args:
        search_root: Directory that is searched recursively.

    Returns:
        Path of the CSV file (the first one in sorted order if several match).

    Raises:
        FileNotFoundError: If neither pattern matches anything.
        ValueError: If matches exist but none of them is a ``.csv`` file.
    """
    hits = []
    for pattern in ("**/Zulassungen_alle_Fahrzeuge.*", "**/Zulassungen*Fahrzeuge.*"):
        # Sorting makes the choice independent of directory traversal order.
        hits = sorted(search_root.rglob(pattern))
        if hits:
            break
    if not hits:
        raise FileNotFoundError("未找到 Zulassungen_alle_Fahrzeuge.*")

    csv_hits = [h for h in hits if h.suffix.lower() == ".csv"]
    if csv_hits:
        return csv_hits[0]

    p = hits[0]
    raise ValueError(f"找到的是 {p.name}（{p.suffix}）。分批读取建议先把它另存为 CSV 再跑。")

# ---------- Pre-flight check ----------
# veh_ids (the IDs of the vehicles that use T16) must already exist in the
# notebook namespace; this cell does not compute it.
if "veh_ids" not in globals() or len(veh_ids) == 0:
    raise RuntimeError("veh_ids 未定义或为空。请先用前面的 Cell 计算出使用 T16 的车辆 ID 集合（veh_ids）。")

PROJECT_ROOT = detect_project_root()
SEARCH_ROOT  = PROJECT_ROOT / "Data" if (PROJECT_ROOT / "Data").exists() else PROJECT_ROOT
zul_path = open_zulassungen_path(SEARCH_ROOT)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("Zulassungen CSV:", zul_path)
print("veh_ids size:", len(veh_ids))

# Normalise veh_ids with the same rule norm_id_series applies to the CSV
# column (strip whitespace, drop a trailing ".0") so both sides compare equal.
veh_ids_n = {re.sub(r"\.0$", "", str(x).strip()) for x in veh_ids}

# ---------- Chunked counting ----------
chunksize = 500_000  # rows per chunk; raise or lower it to fit the machine's memory
vehicles_all = set()       # unique vehicles using T16 (all regions)
vehicles_adel = set()      # unique vehicles registered in Adelshofen
rows_adel = 0              # number of matching rows for Adelshofen
city_veh = defaultdict(set)  # Gemeinde -> set of unique vehicles (for the Top 10)

# The chunk reader is used as a context manager so the file handle is closed
# even when processing a chunk raises (needs pandas >= 1.2).
with pd.read_csv(
    zul_path,
    sep=None, engine="python", encoding="utf-8",
    dtype={"IDNummer": "string"},
    chunksize=chunksize,
) as reader:
    for i, chunk in enumerate(reader, start=1):
        # Tolerate stray whitespace in the header, then fail with a clear
        # message if a required column is still missing.
        if "IDNummer" not in chunk.columns or "Gemeinden" not in chunk.columns:
            chunk.columns = [str(c).strip() for c in chunk.columns]
            missing = [c for c in ("IDNummer", "Gemeinden") if c not in chunk.columns]
            if missing:
                raise KeyError(
                    f"Required column(s) {missing} not found in {zul_path.name}; "
                    f"available columns: {list(chunk.columns)}"
                )

        chunk["IDNummer_n"] = norm_id_series(chunk["IDNummer"])

        # Keep only the rows whose vehicle ID is in veh_ids
        part = chunk[chunk["IDNummer_n"].isin(veh_ids_n)].copy()
        if not part.empty:
            # The upper-cased Gemeinde is only needed for matched rows, so it
            # is computed on the filtered part rather than the whole chunk.
            part["Gemeinden_u"] = part["Gemeinden"].astype(str).str.upper().str.strip()

            # Unique vehicles across all regions
            vehicles_all.update(part["IDNummer_n"].unique())

            # Adelshofen
            mask_adel = part["Gemeinden_u"] == "ADELSHOFEN"
            rows_adel += int(mask_adel.sum())
            if mask_adel.any():
                vehicles_adel.update(part.loc[mask_adel, "IDNummer_n"].unique())

            # Unique vehicles per Gemeinde. Rows with a missing Gemeinde are
            # left out, otherwise astype(str) would create a pseudo-city "NAN".
            # De-duplicating per chunk keeps the per-city updates small.
            known_city = part[part["Gemeinden"].notna()]
            dedup = known_city.drop_duplicates(subset=["Gemeinden_u", "IDNummer_n"])
            for city, ids in dedup.groupby("Gemeinden_u")["IDNummer_n"]:
                city_veh[city].update(ids.values)

        print(f"Processed chunk {i:>2d}: rows={len(chunk):,} | matched={len(part):,} | "
              f"vehicles_all={len(vehicles_all):,} | adel_rows={rows_adel:,}")

# ---------- Summary ----------
vehicles_all_n = len(vehicles_all)
vehicles_adel_n = len(vehicles_adel)

# Top 10 Gemeinden, ranked by number of unique vehicles
top10 = sorted(((city, len(ids)) for city, ids in city_veh.items()),
               key=lambda x: x[1], reverse=True)[:10]

print("\n===== FINAL RESULTS =====")
print("Vehicles using T16 (all regions, unique):", vehicles_all_n)
print("Vehicles registered in Adelshofen with T16 (unique):", vehicles_adel_n)
print("Number of rows in Adelshofen:", rows_adel)

print("\nTop 10 Gemeinden by vehicles with T16 (unique vehicles):")
for city, cnt in top10:
    print(f"{city:20s}  {cnt}")
