In [None]:
import pandas as pd
import jieba
import re
from rapidfuzz import fuzz

# ========== Step 1. 基础信息提取 ==========
# Known brand names. extract_brand returns the first entry found in a product
# title, so the order of this list is significant.
brand_dict = [
    "康师傅",
    "汤达人",
    "上好佳",
    "海氏海诺",
    "爱斐堡",
    "统一",
    "金龙鱼",
    "旺旺",
    "农夫山泉",
]

def extract_brand(name):
    """Return the first entry of ``brand_dict`` that occurs in ``name``.

    Entries are tried in list order; an empty string is returned when none
    of them is a substring of ``name``.
    """
    hits = (candidate for candidate in brand_dict if candidate in name)
    return next(hits, "")

# A number (integer or decimal, a leading "." is allowed only when digits
# follow) and any run of unit tokens directly after it. Multi-character
# units are listed before the single-character class so they are consumed
# as a whole.
_SPEC_PATTERN = re.compile(
    r'(?:\d+(?:\.\d+)?|\.\d+)'
    r'(?:千克|公斤|毫升|[gGkK克斤mMlL升包袋盒瓶罐卷片只])*'
)


def extract_spec(name):
    """Return the first quantity/spec token found in ``name``, or "".

    A token is a number optionally followed by unit characters, e.g.
    ``"100g"``, ``"1.5L"``, ``"5kg"`` or ``"500毫升"``. A number without a
    unit is still returned, as before.

    Args:
        name: Product title to scan.

    Returns:
        The matched substring exactly as it appears in ``name``; an empty
        string when the title contains no digit.
    """
    # The number part must contain at least one digit, so a stray "." in the
    # title is no longer reported as a spec.
    found = _SPEC_PATTERN.search(name)
    return found.group(0) if found else ""

def clean_text(text):
    """Strip every character that is not 0-9, a-z, A-Z or in U+4E00..U+9FA5.

    The surviving characters are returned in their original order.
    """
    # Splitting on each disallowed character and re-joining the pieces drops
    # exactly those characters.
    kept_pieces = re.split(r'[^0-9a-zA-Z\u4e00-\u9fa5]', text)
    return ''.join(kept_pieces)

def parse_product_info(name):
    """Split a product title into ``(brand, spec, core_name)``.

    Args:
        name: Raw product title. A missing value (None/NaN, as produced for
            an empty CSV cell) is treated as an empty title; any other
            non-string value is converted with ``str``.

    Returns:
        A 3-tuple of strings:
        brand - result of ``extract_brand`` ("" if none was found),
        spec - result of ``extract_spec`` ("" if none was found),
        core_name - the title with brand and spec removed, passed through
        ``clean_text``.
    """
    if not isinstance(name, str):
        # Without this, a missing title would raise inside the helpers,
        # which all assume a string.
        name = "" if pd.isna(name) else str(name)

    brand = extract_brand(name)
    spec = extract_spec(name)

    remainder = name.replace(brand, '') if brand else name
    if spec:
        # Remove only the first occurrence, i.e. the span extract_spec
        # matched. Removing every occurrence would also eat the same digits
        # elsewhere in the title when the spec is a short bare number.
        remainder = remainder.replace(spec, '', 1)

    return brand, spec, clean_text(remainder)

# ========== Step 2. 相似度计算 ==========
# Text forms that stand for "no barcode" once a value has been stringified.
_MISSING_BARCODE_TEXT = {"", "nan", "none", "null", "<na>"}


def _normalize_barcode(value):
    """Return ``value`` as a comparable barcode string, or "" when missing."""
    if value is None:
        return ""
    if isinstance(value, float):
        if value != value:  # NaN marks an empty cell
            return ""
        if value.is_integer():
            # A numeric column with gaps is loaded as float, so the same code
            # can appear as 123.0 on one side and 123 on the other.
            return str(int(value))
    text = str(value).strip()
    if text.lower() in _MISSING_BARCODE_TEXT:
        return ""
    if text.endswith(".0") and text[:-2].isdigit():
        text = text[:-2]
    return text


def calc_similarity(m, e):
    """Score how closely two product rows match.

    Args:
        m: Row (anything indexable by key) providing ``upc码``, ``品牌``,
            ``核心名`` and ``规格``.
        e: Row providing ``条码``, ``品牌``, ``核心名`` and ``规格``.

    Returns:
        1.0 when both rows carry the same non-empty barcode. Otherwise
        ``0.5 * name_sim + 0.3 * spec_sim + 0.2 * brand_sim``, where the
        name and spec similarities are rapidfuzz scores divided by 100.
    """
    # Identical barcodes are a definite match. Both sides are normalized
    # first so that two missing barcodes never count as equal.
    m_code = _normalize_barcode(m['upc码'])
    e_code = _normalize_barcode(e['条码'])
    if m_code and m_code == e_code:
        return 1.0

    m_brand, e_brand = m['品牌'], e['品牌']
    if not m_brand or not e_brand:
        # An unknown brand is no evidence of a match. The substring test
        # below would otherwise succeed, because '' is contained in
        # every string.
        brand_sim = 0.0
    elif m_brand == e_brand:
        brand_sim = 1.0
    elif m_brand in e_brand or e_brand in m_brand:
        brand_sim = 0.8
    else:
        brand_sim = 0.0

    name_sim = fuzz.token_set_ratio(m['核心名'], e['核心名']) / 100
    spec_sim = fuzz.ratio(m['规格'], e['规格']) / 100

    return 0.5 * name_sim + 0.3 * spec_sim + 0.2 * brand_sim

# ========== Step 3. 主流程 ==========
def match_sku(meituan_path, elme_path, output_path):
    """Match each Meituan SKU to its three most similar Ele.me SKUs.

    Both CSV files are loaded with ``pd.read_csv``. Their ``商品名称`` column
    is parsed with ``parse_product_info`` into new ``品牌``, ``规格`` and
    ``核心名`` columns. Every Meituan row is then scored against every
    Ele.me row with ``calc_similarity``.

    Args:
        meituan_path: CSV with at least ``商品ID`` and ``商品名称`` columns,
            plus whatever ``calc_similarity`` reads from its first argument.
        elme_path: CSV with at least ``商品id`` and ``商品名称`` columns,
            plus whatever ``calc_similarity`` reads from its second argument.
            NOTE(review): the ID column is spelled ``商品ID`` on one side and
            ``商品id`` on the other; kept as in the original, confirm
            against the real exports.
        output_path: Destination CSV, written as UTF-8 with BOM and no index
            column.

    Returns:
        None. Writes the CSV and prints a confirmation line.
    """
    mt = pd.read_csv(meituan_path)
    el = pd.read_csv(elme_path)

    # Derive brand / spec / core-name columns on both frames.
    for df in (mt, el):
        # Empty name cells are read as NaN; parse them as empty titles
        # instead of crashing.
        names = df['商品名称'].fillna('').astype(str)
        parsed = names.apply(parse_product_info)
        for position, column in enumerate(('品牌', '规格', '核心名')):
            df[column] = parsed.map(lambda info, i=position: info[i])

    ranks = (1, 2, 3)
    output_columns = ["美团商品ID", "美团名称"]
    for rank in ranks:
        output_columns += [
            f"饿了么Top{rank}_ID",
            f"饿了么Top{rank}_名称",
            f"相似度{rank}",
        ]

    # Materialize the Ele.me rows once; calling iterrows() inside the loop
    # rebuilt a Series per Ele.me row for every Meituan row.
    el_records = el.to_dict("records")

    result_rows = []
    for mrow in mt.to_dict("records"):
        scored = [
            (erow['商品id'], erow['商品名称'], calc_similarity(mrow, erow))
            for erow in el_records
        ]
        best = sorted(scored, key=lambda cand: cand[2], reverse=True)[:len(ranks)]

        row = {"美团商品ID": mrow["商品ID"], "美团名称": mrow["商品名称"]}
        for rank in ranks:
            if rank <= len(best):
                cand_id, cand_name, score = best[rank - 1]
                score = round(score, 3)
            else:
                # Fewer candidates than ranks (including an empty Ele.me
                # file): leave the slot blank instead of raising IndexError.
                cand_id = cand_name = score = ""
            row[f"饿了么Top{rank}_ID"] = cand_id
            row[f"饿了么Top{rank}_名称"] = cand_name
            row[f"相似度{rank}"] = score
        result_rows.append(row)

    # Explicit columns keep the header row even when there are no results.
    result = pd.DataFrame(result_rows, columns=output_columns)
    result.to_csv(output_path, index=False, encoding="utf-8-sig")
    print(f"✅ 匹配完成，结果已保存到: {output_path}")

# Example run
match_sku(
    meituan_path="meituan_sku.csv",
    elme_path="elme_sku_small.csv",
    output_path="sku_match_result.csv",
)
