In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm



In [16]:
# Read the product catalogue from its snappy-compressed parquet file into `df`.
# The location is an absolute path on the author's machine.
parquet_file = r'B:\Python\Veridion Project\Project 4\veridion_product_deduplication_challenge.snappy.parquet'
df = pd.read_parquet(path=parquet_file, engine="auto")
# Show the (truncated) text representation of the loaded frame.
print(df)



                                                  unspsc  \
0      Sewing and stitchery and weaving equipment and...   
1                 Electric alternating current AC motors   
2                     Vehicle trim and exterior covering   
3                                        Pipe connectors   
4                                                  Doors   
...                                                  ...   
21941                                              Other   
21942                     Processed and synthetic rubber   
21943                            Fresh cut rose bouquets   
21944  Vision correction or cosmetic eyewear and rela...   
21945       Chocolate and sugars and sweetening products   

                         root_domain  \
0                  studio-atcoat.com   
1                     worm-gears.net   
2               customcarcoverco.com   
3                    plumbmaster.com   
4                           sogno.in   
...                              ...   

In [20]:
# Step 1: Select the products whose description, summary, or title is missing.
# A text field counts as missing when it is null or contains only whitespace.
missing_description = df['description'].isnull() | df['description'].str.strip().eq('')
missing_summary = df['product_summary'].isnull() | df['product_summary'].str.strip().eq('')
missing_title = df['product_title'].isnull() | df['product_title'].str.strip().eq('')

# A product is set aside as soon as any one of the three fields is missing.
empty_products = df[missing_description | missing_summary | missing_title]



In [22]:
# Step 2: Remove products with empty descriptions, summaries, or titles from the main DataFrame.
# The rows are excluded by index label rather than by 'product_identifier': matching on the
# identifier would also discard complete rows that merely share an identifier value
# (including a missing one) with an incomplete row.
# NOTE(review): assumes the index labels of `df` are unique — confirm.
df_clean = df.loc[~df.index.isin(empty_products.index)]

# Step 3: Group by 'unspsc' so that product titles are only compared within the same category.
# NOTE(review): groupby leaves out rows whose 'unspsc' is null by default, so such rows would
# never reach the de-duplicated result — confirm none exist, or pass dropna=False.
grouped = df_clean.groupby('unspsc')

print(grouped)



<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002A24A3F4F80>


In [24]:
# Function to check similarity within each group and remove duplicates based on product_title similarity
def remove_duplicates_within_group(group, threshold=0.8):
    """Drop near-duplicate products from one group, keeping the first of each set.

    Product titles are vectorised with TF-IDF (English stop words removed) and
    compared pairwise with cosine similarity. Rows are visited in their existing
    order: a row is kept unless an earlier *kept* row has a title similarity
    strictly greater than ``threshold`` with it.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to de-duplicate. Must contain a 'product_title' column; missing
        titles are replaced with empty strings before vectorising.
    threshold : float, default 0.8
        Cosine-similarity value above which a later row is treated as a
        duplicate of an earlier kept row.

    Returns
    -------
    pandas.DataFrame
        The kept rows of ``group``, in their original order. ``group`` itself is
        returned when it has fewer than two rows or when no title yields a
        usable token.
    """
    n_rows = len(group)
    if n_rows < 2:
        # Nothing to compare against.
        return group

    titles = group['product_title'].fillna('')
    try:
        tfidf_matrix = TfidfVectorizer(stop_words='english').fit_transform(titles)
    except ValueError:
        # TfidfVectorizer raises ValueError on an empty vocabulary (e.g. every title
        # is blank or made of stop words only); without features no row can be
        # called a duplicate, so keep them all.
        return group

    # Calculate the cosine similarity matrix for product_title
    cosine_sim = cosine_similarity(tfidf_matrix)

    # Greedy pass: the first row of a similar set survives, later similar rows are flagged.
    is_duplicate = np.zeros(n_rows, dtype=bool)
    for i in range(n_rows):
        if is_duplicate[i]:
            # Already removed as a duplicate of an earlier kept row.
            continue
        is_duplicate[i + 1:] |= cosine_sim[i, i + 1:] > threshold

    # Boolean positional selection preserves the original row order.
    return group.iloc[~is_duplicate]

# Apply the per-group duplicate removal to every 'unspsc' group, with a progress bar,
# then stitch the per-group results back into a single DataFrame.
group_progress = tqdm(grouped, desc="Removing duplicates based on product_title")
deduplicated_groups = [remove_duplicates_within_group(members) for _, members in group_progress]
cleaned_df = pd.concat(deduplicated_groups)

# Step 4: Show the final DataFrame produced by the steps above
print("Cleaned DataFrame with duplicates removed and empty descriptions/summaries/titles eliminated:")
print(cleaned_df)




Removing duplicates based on product_title: 100%|█████████████████████████████████| 1492/1492 [00:05<00:00, 295.52it/s]


Cleaned DataFrame with duplicates removed and empty descriptions/summaries/titles eliminated:
                unspsc               root_domain  \
741    Abrasive wheels                 refima.eu   
6649   Abrasive wheels          diamond-tool.net   
6845   Abrasive wheels          diamond-tool.net   
7742   Abrasive wheels          diamond-tool.net   
9137   Abrasive wheels          diamond-tool.net   
...                ...                       ...   
21381            Yarns       sallyridgway.com.au   
21529            Yarns  meanmotherscreations.com   
21634            Yarns      yarnswithalix.com.au   
21707            Yarns    edencottageyarns.co.uk   
21872            Yarns     wattlebirdyarn.com.au   

                                                page_url  \
741             https://www.refima.eu/technical-support/   
6649   http://www.diamond-tool.net/product/cup-wheel.htm   
6845   http://www.diamond-tool.net/product/diamond-bl...   
7742   http://www.diamond-tool.net/produc

In [26]:
# Optional export: write the cleaned products to a CSV file, leaving out the index column.
cleaned_df.to_csv(
    path_or_buf=r'B:\Python\Veridion Project\Project 4\cleaned_products_2.csv',
    index=False,
)