In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from keybert import KeyBERT
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from tqdm import tqdm
import time
from concurrent.futures import ThreadPoolExecutor
from collections import Counter
from rapidfuzz import fuzz , process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import requests
from PIL import Image
from io import BytesIO
from torchvision import models, transforms
import torch
import ast

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
maxmara = pd.read_csv('maxmara_cleaned.csv')
netaporter = pd.read_csv('netaporter_cleaned.csv')
luisaviaroma = pd.read_csv('luisaviaroma_cleaned.csv')

In [3]:
maxmara.head(2)

Unnamed: 0,url,product_id,ref,main_title,description,full_path,color,current_price,previous_price,available_sizes,all_sizes,images,outlet_color,excluweb_color,material,brand,discounted
0,['https://us.maxmara.com/p-1081065106001-ogger...,1081065106001,1081065106,cropped teddy bear icon coat in alpaca and wool,cropped jacket in the iconic teddy fabric of a...,"[['Homepage', 'Coats and Jackets', 'Coats and ...",white,2890.0,2890.0,"['4', '6', '12', '0', '8', '2', '10']","['4', '6', '12', '0', '16', '8', '10', '2', '14']",['https://b2c-media.maxmara.com/sys-master/m0/...,False,False,"Jersey fabric 62% alpaca, 26% virgin wool, 12%...",max mara,False
1,['https://us.maxmara.com/p-1336024206001-zigot...,1336024206001,1336024206,wide-leg wool yarn trousers,long trousers made from soft pure wool yarn - ...,"[['Homepage', 'Clothing', 'Clothing', 'Trouser...",white,612.0,875.0,"['XS', 'S', 'XL', 'M']","['XS', 'S', 'L', 'M', '2XL', 'XL']",['https://b2c-media.maxmara.com/sys-master/m0/...,False,False,Knitted fabric 100% virgin wool; - exclusive o...,max mara,True


In [4]:
maxmara['synthetic_id'] = maxmara['main_title'] + '_' + maxmara['color'] + '_' + maxmara['material'] + maxmara['brand']
netaporter['synthetic_id'] = netaporter['main_title'] + '_' + netaporter['color'] + '_' + netaporter['material'] + netaporter['brand']
luisaviaroma['synthetic_id'] = luisaviaroma['main_title'] + '_' + luisaviaroma['color'] + '_' + luisaviaroma['material'] + luisaviaroma['brand']

In [5]:
def compute_cosine_similarity(series1, series2):

    vectorizer = TfidfVectorizer()
    tfidf_matrix_1 = vectorizer.fit_transform(series1)
    tfidf_matrix_2 = vectorizer.transform(series2)

    similarity_matrix = cosine_similarity(tfidf_matrix_1, tfidf_matrix_2)
    return similarity_matrix

def compute_rapidfuzz_similarity(series1, series2):
    series1 = series1.fillna("").astype(str)
    series2 = series2.fillna("").astype(str)
    similarity_matrix = []

    for val1 in series1:
        row_scores = []
        for val2 in series2:
            score = fuzz.ratio(val1, val2) / 100  # Normalize to 0-1
            row_scores.append(score)
        similarity_matrix.append(row_scores)
    
    return similarity_matrix

In [6]:
def text_similarity(method, maxmara, netaporter):
    if method == 'cosine':
        title_similarity = compute_cosine_similarity(maxmara['main_title'], netaporter['main_title'])
        description_similarity = compute_cosine_similarity(maxmara['description'], netaporter['description'])
        material_similarity = compute_cosine_similarity(maxmara['material'], netaporter['material'])
        color_similarity = compute_cosine_similarity(maxmara['color'], netaporter['color'])
        synthetic_similarity = compute_cosine_similarity(maxmara['synthetic_id'], netaporter['synthetic_id'])
    elif method == 'fuzzy':
        title_similarity = compute_rapidfuzz_similarity(maxmara['main_title'], netaporter['main_title'])
        description_similarity = compute_rapidfuzz_similarity(maxmara['description'], netaporter['description'])
        material_similarity = compute_rapidfuzz_similarity(maxmara['material'], netaporter['material'])
        color_similarity = compute_rapidfuzz_similarity(maxmara['color'], netaporter['color'])
        synthetic_similarity = compute_rapidfuzz_similarity(maxmara['synthetic_id'], netaporter['synthetic_id'])
    return title_similarity, description_similarity, material_similarity, color_similarity, synthetic_similarity

In [7]:
cosine_title_similarity, cosine_description_similarity, cosine_material_similarity, cosine_color_similarity, cosine_synthetic_similarity = text_similarity('cosine', maxmara, netaporter)
fuzzy_title_similarity, fuzzy_description_similarity, fuzzy_material_similarity, fuzzy_color_similarity, fuzzy_synthetic_similarity = text_similarity('fuzzy', maxmara, netaporter)
results = []
for i, maxmara_row in tqdm(maxmara.iterrows(), total=len(maxmara)):
    for j, netaporter_row in netaporter.iterrows():
        results.append({
            'maxmara_index': i,
            'retailer_index': j,
            'retailer_name': 'netaporter',
            'title_cosine_score': cosine_title_similarity[i, j],
            'title_rapidfuzz_score': fuzzy_title_similarity[i][j],
            'description_cosine_score': cosine_description_similarity[i, j],
            'description_rapidfuzz_score': fuzzy_description_similarity[i][j],
            'material_cosine_score': cosine_material_similarity[i, j],
            'material_rapidfuzz_score': fuzzy_material_similarity[i][j],
            'color_cosine_score': cosine_color_similarity[i, j],
            'color_rapidfuzz_score': fuzzy_color_similarity[i][j],
            'synthetic_cosine_score': cosine_synthetic_similarity[i, j],
            'synthetic_rapidfuzz_score': fuzzy_synthetic_similarity[i][j]
        })
netaporter_similarity_df = pd.DataFrame(results)

100%|██████████| 2510/2510 [00:38<00:00, 66.03it/s]


In [8]:
cosine_title_similarity, cosine_description_similarity, cosine_material_similarity, cosine_color_similarity, cosine_synthetic_similarity = text_similarity('cosine', maxmara, luisaviaroma)
fuzzy_title_similarity, fuzzy_description_similarity, fuzzy_material_similarity, fuzzy_color_similarity, fuzzy_synthetic_similarity = text_similarity('fuzzy', maxmara, luisaviaroma)
results = []
for i, maxmara_row in tqdm(maxmara.iterrows(), total=len(maxmara)):
    for j, netaporter_row in luisaviaroma.iterrows():
        results.append({
            'maxmara_index': i,
            'retailer_index': j,
            'retailer_name': 'luisaviaroma',
            'title_cosine_score': cosine_title_similarity[i, j],
            'title_rapidfuzz_score': fuzzy_title_similarity[i][j],
            'description_cosine_score': cosine_description_similarity[i, j],
            'description_rapidfuzz_score': fuzzy_description_similarity[i][j],
            'material_cosine_score': cosine_material_similarity[i, j],
            'material_rapidfuzz_score': fuzzy_material_similarity[i][j],
            'color_cosine_score': cosine_color_similarity[i, j],
            'color_rapidfuzz_score': fuzzy_color_similarity[i][j],
            'synthetic_cosine_score': cosine_synthetic_similarity[i, j],
            'synthetic_rapidfuzz_score': fuzzy_synthetic_similarity[i][j]
        })
luisaviaroma_similarity_df = pd.DataFrame(results)

100%|██████████| 2510/2510 [01:18<00:00, 31.89it/s]


In [9]:
text_similarity_score = pd.concat([netaporter_similarity_df, luisaviaroma_similarity_df], ignore_index=True)

In [10]:
text_similarity_score['title_similarity'] = text_similarity_score[['title_cosine_score', 'title_rapidfuzz_score']].mean(axis=1)
text_similarity_score['color_similarity'] = text_similarity_score[['color_cosine_score', 'color_rapidfuzz_score']].mean(axis=1)
text_similarity_score['material_similarity'] = text_similarity_score[['material_cosine_score', 'material_rapidfuzz_score']].mean(axis=1)
text_similarity_score['cosine_avg_score'] = text_similarity_score[['title_cosine_score', 'description_cosine_score', 'material_cosine_score', 'color_cosine_score', 'synthetic_cosine_score']].mean(axis=1)
text_similarity_score['rapidfuzz_avg_score'] = text_similarity_score[['title_rapidfuzz_score', 'description_rapidfuzz_score', 'material_rapidfuzz_score', 'color_rapidfuzz_score', 'synthetic_rapidfuzz_score']].mean(axis=1)
text_similarity_score['overall_similarity'] = text_similarity_score[['cosine_avg_score', 'rapidfuzz_avg_score']].mean(axis=1)

In [11]:
text_similarity_score.to_csv('text_similarity_score.csv',index=False)