In [1]:
import pandas as pd
import jieba
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
def clean_text(text):
    """
    清洗文本，去除特殊字符和多余空格
    """
    if not isinstance(text, str):
        return ""
    # 移除非中文字符、字母和数字
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', ' ', text)
    # 将多个空格合并为一个
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

In [6]:
text="元气森林 霸气汽水（冰红茶可乐味） 600ml_瓶"

In [7]:
clean_text("元气森林 霸气汽水（冰红茶可乐味） 600ml_瓶")

'元气森林 霸气汽水 冰红茶可乐味 600ml 瓶'

In [8]:
def chinese_tokenizer(text):
    """
    使用jieba进行中文分词
    """
    return " ".join(jieba.cut(text))

In [10]:
chinese_tokenizer(clean_text(text))

'元气 森林   霸气 汽水   冰红茶 可乐 味   600ml   瓶'

In [11]:
def find_top_n_similar_products(meituan_df, elme_df, n=3):
    """
    为meituan_sku中的每个商品匹配elme_sku中相似度前n的商品

    参数:
    - meituan_df: 美团商品DataFrame
    - elme_df: 饿了么商品DataFrame
    - n: 需要返回的相似商品数量

    返回:
    - 一个字典，键为美团商品ID，值为一个包含相似饿了么商品信息的列表
    """

    # 1. 数据预处理
    # 为了避免修改原始数据，我们创建副本
    meituan_df_copy = meituan_df.copy()
    elme_df_copy = elme_df.copy()

    # 统一列名，便于处理
    meituan_df_copy.rename(columns={'商品ID': 'id', '商品名称': 'name', '规格': 'spec', 'upc码': 'upc'}, inplace=True)
    elme_df_copy.rename(columns={'商品id': 'id', '商品名称': 'name', '规格': 'spec', '条码': 'upc'}, inplace=True)

    # 创建用于匹配的文本字段
    meituan_df_copy['match_text'] = meituan_df_copy['name'].astype(str) + " " + meituan_df_copy['spec'].astype(str)
    elme_df_copy['match_text'] = elme_df_copy['name'].astype(str) + " " + elme_df_copy['spec'].astype(str)

    # 清洗和分词
    meituan_df_copy['cleaned_text'] = meituan_df_copy['match_text'].apply(clean_text)
    elme_df_copy['cleaned_text'] = elme_df_copy['match_text'].apply(clean_text)

    meituan_df_copy['tokenized_text'] = meituan_df_copy['cleaned_text'].apply(chinese_tokenizer)
    elme_df_copy['tokenized_text'] = elme_df_copy['cleaned_text'].apply(chinese_tokenizer)

    # 2. TF-IDF向量化
    vectorizer = TfidfVectorizer()
    elme_tfidf_matrix = vectorizer.fit_transform(elme_df_copy['tokenized_text'])

    results = {}

    # 3. 遍历美团商品进行匹配
    for index, m_row in meituan_df_copy.iterrows():
        m_product_id = m_row['id']
        m_upc = m_row['upc']

        similarity_scores = {}

        # # 优先进行UPC码匹配
        # if pd.notna(m_upc) and m_upc != '':
        #     upc_matches = elme_df_copy[elme_df_copy['upc'] == m_upc]
        #     for _, e_row in upc_matches.iterrows():
        #         # UPC码匹配上的商品给予最高分1.0
        #         similarity_scores[e_row['id']] = 1.0

        # 进行文本相似度匹配
        m_tfidf_vector = vectorizer.transform([m_row['tokenized_text']])
        cosine_sims = cosine_similarity(m_tfidf_vector, elme_tfidf_matrix).flatten()

        # 将文本相似度分数存入字典
        for i, score in enumerate(cosine_sims):
            e_product_id = elme_df_copy.iloc[i]['id']
            # 如果该商品不是通过UPC码匹配上的，或者文本相似度更高，则更新分数
            if e_product_id not in similarity_scores or score > similarity_scores[e_product_id]:
                similarity_scores[e_product_id] = score

        # 排序并选出前N个
        sorted_scores = sorted(similarity_scores.items(), key=lambda item: item[1], reverse=True)
        top_n_matches = sorted_scores[:n]

        # 整理输出结果
        top_n_details = []
        for e_id, score in top_n_matches:
            matched_product = elme_df_copy[elme_df_copy['id'] == e_id].iloc[0]
            top_n_details.append({
                'matched_elme_id': matched_product['id'],
                'matched_elme_name': matched_product['name'],
                'matched_elme_spec': matched_product['spec'],
                'similarity_score': round(score, 4)
            })

        results[m_product_id] = {
            'meituan_name': m_row['name'],
            'meituan_spec': m_row['spec'],
            'top_matches': top_n_details
        }

    return results

In [12]:
# 加载csv文件
meituan_df = pd.read_csv('./meituan_sku_small.csv')
elme_df = pd.read_csv('./elme_sku_small.csv')

In [13]:
meituan_df.head()

Unnamed: 0,商品ID,商品名称,规格,upc码,折扣价,原价,skuid
0,20318304358,（规格自选强力布基胶带网格双面胶高粘度透明无痕地毯地垫沙发垫固定贴 一卷,5cm*10m#2cm*10m,6902051000000.0,12.45,15.0,35112245909
1,20593235769,海氏海诺 一次性医用外科口罩 10只_盒,10只/盒,6925924000000.0,12.96,15.8,35816960188
2,20317711803,爱斐堡 牛奶味生吐司 散装 约70g_袋,70g*1袋,6971985000000.0,4.58,5.2,35111173396
3,20318114339,【1条】雀巢 脆脆鲨巧克力味威化饼干 零食营养能量棒,1根,6901884000000.0,3.76,2.0,35113206574
4,20316956858,天明 桉叶糖 22克_盒,22克*1盒,6901311000000.0,2.64,3.0,35110532938


In [14]:
elme_df.head()

Unnamed: 0,商品id,商品名称,规格,条码,折扣价,原价,skuid
0,950657120093,汤达人 日式豚骨拉面 83g(面饼55g+配料28g)/桶,,6925303770563,3.49,8.3,950657120093
1,950103769331,康师傅 BIG大食桶老坛酸菜牛肉面 159g/桶,,6920152424285,4.9,6.5,950103769331
2,949473714726,上好佳 鲜虾片膨化食品 80克/袋,,6926265313386,5.2,8.0,949473714726
3,949474714550,百威 9.7°P啤酒 500ml/听,,6948960100078,0.01,9.5,949474714550
4,949480010428,统一茄皇 茄皇牛肉面 128g/桶,,6925303796426,5.5,7.5,949480010428


In [15]:
products = find_top_n_similar_products(meituan_df, elme_df)
products


{20318304358: {'meituan_name': '（规格自选强力布基胶带网格双面胶高粘度透明无痕地毯地垫沙发垫固定贴 一卷',
  'meituan_spec': '5cm*10m#2cm*10m',
  'top_matches': [{'matched_elme_id': 950657120093,
    'matched_elme_name': '汤达人 日式豚骨拉面 83g(面饼55g+配料28g)/桶',
    'matched_elme_spec': nan,
    'similarity_score': 0.0},
   {'matched_elme_id': 950103769331,
    'matched_elme_name': '康师傅 BIG大食桶老坛酸菜牛肉面 159g/桶',
    'matched_elme_spec': nan,
    'similarity_score': 0.0},
   {'matched_elme_id': 949473714726,
    'matched_elme_name': '上好佳 鲜虾片膨化食品 80克/袋',
    'matched_elme_spec': nan,
    'similarity_score': 0.0}]},
 20593235769: {'meituan_name': '海氏海诺 一次性医用外科口罩 10只_盒',
  'meituan_spec': '10只/盒',
  'top_matches': [{'matched_elme_id': 949021019485,
    'matched_elme_name': '心相印 杀菌卫生湿巾 10片/袋',
    'matched_elme_spec': nan,
    'similarity_score': 0.3454},
   {'matched_elme_id': 950129193493,
    'matched_elme_name': '青岛啤酒 10°P经典啤酒 500ml/听',
    'matched_elme_spec': nan,
    'similarity_score': 0.3406},
   {'matched_elme_id': 949023655620

In [16]:
# 结果存储为csv,第一列为待匹配列，后面为匹配列
pd.DataFrame(products).T.to_csv("./meituan_elme_match_result.csv", encoding="utf-8-sig")