### Import Libraries

In [2]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import re
import uuid

  from .autonotebook import tqdm as notebook_tqdm


### Download Dataset From Hugging Face

In [3]:
ds = load_dataset("ckandemir/amazon-products")

In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['Product Name', 'Category', 'Description', 'Selling Price', 'Product Specification', 'Image'],
        num_rows: 23993
    })
    test: Dataset({
        features: ['Product Name', 'Category', 'Description', 'Selling Price', 'Product Specification', 'Image'],
        num_rows: 6665
    })
    eval: Dataset({
        features: ['Product Name', 'Category', 'Description', 'Selling Price', 'Product Specification', 'Image'],
        num_rows: 2666
    })
})

### Merge Train, Test and Eval

In [5]:
df_train = pd.DataFrame(ds["train"])
df_test = pd.DataFrame(ds["test"])
df_eval = pd.DataFrame(ds["eval"])

In [6]:
df = pd.concat([df_train, df_test, df_eval], ignore_index=True)

In [7]:
df.head(1)

Unnamed: 0,Product Name,Category,Description,Selling Price,Product Specification,Image
0,Craft-tastic – Empower Poster – Craft Kit – De...,Toys & Games | Arts & Crafts | Craft Kits | Pa...,PERFECT GIFT FOR AGES 8 AND ABOVE: Make this f...,$14.47,ProductDimensions:3x10x15inches|ItemWeight:15....,https://images-na.ssl-images-amazon.com/images...


In [8]:
len(df)

33324

### Clean Text From The Data Frame

In [9]:
def clean_text(text):
    """
    Lowercase, remove special characters, and strip whitespace.
    """
    
    if pd.isna(text):
        return ""

    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)

    return text.strip()

In [10]:
df["Product Name"] = df["Product Name"].apply(clean_text)
df["Description"] = df["Description"].apply(clean_text)

### Process Categories

In [11]:
def process_list(field):
    """
    Split a ' | ' separated string into a structured list.
    """

    if isinstance(field, str) and field.strip():
        return field.split(" | ")

    if isinstance(field, (np.ndarray, list)):
        field = " | ".join(map(str, field))
        return field.split(" | ")

    if pd.isna(field) or field == "":
        return []

    return []

In [12]:
df["Category"] = df["Category"].apply(process_list)

### Turn Price Into Numbers

In [13]:
def turn_price(price):
    if pd.isna(price):
        return None

    price_str = str(price)

    match = re.search(r"[\d]+(\.\d+)?", price_str)

    if not match:
        return None;

    return float(match.group())

In [14]:
df["Selling Price"] = df["Selling Price"].apply(turn_price)

### Process Specification

In [15]:
def process_specification(raw_specifications):
    if pd.isna(raw_specifications):
        return None

    clean_specifications = re.sub(r"\(.*\)", "", raw_specifications)
    specifications = clean_specifications.split("|")

    data = {}

    for spec in specifications:
        if ":" not in spec:
            continue  

        key, value = spec.split(":", 1)

        if key == "ProductDimensions":
            data["Dimensions"] = value.replace("inches", "").strip()

        elif key == "ItemWeight" or key == "ShippingWeight":
            match = re.match(r"([0-9\.]+)", value)
            if match:
                numeric_value = match.group(1)
                if numeric_value and numeric_value != ".":
                    data[key] = float(
                        numeric_value
                    ) 
                else:
                    data[key] = None 

        elif key == "ASIN":
            data["ASIN"] = value.strip()
        elif key == "Itemmodelnumber":
            data["ItemModelNumber"] = value.strip()

        elif key == "Manufacturerrecommendedage":
            data["RecommendedAge"] = value.strip()

    return data

In [16]:
processed_data = df["Product Specification"].apply(process_specification)
processed_df = pd.json_normalize(processed_data)

df = pd.concat([df, processed_df], axis=1)

df.drop(columns=["Product Specification"], inplace=True)

### Add Unique UUID ( V4 ) On The Dataset

In [17]:
df["id"] = [str(uuid.uuid4()) for _ in range(len(df))]

df = df[["id"] + [col for col in df.columns if col != "id"]]

### Visualized Preprocessed Data

In [18]:
df.head()

Unnamed: 0,id,Product Name,Category,Description,Selling Price,Image,Dimensions,ItemWeight,ShippingWeight,ASIN,ItemModelNumber,RecommendedAge
0,5afd0fbc-0d93-4081-bf00-e5cedf815769,crafttastic empower poster craft kit design...,"[Toys & Games, Arts & Crafts, Craft Kits, Pape...",perfect gift for ages 8 and above make this fe...,14.47,https://images-na.ssl-images-amazon.com/images...,3x10x15,15.2,15.2,B01D52Q1UC,CT1688,8-15years
1,3e67ae8c-83d2-4e1a-abc5-636bce6616f0,melissa doug dottodot letter coloring pad 3 ...,"[Toys & Games, Games & Accessories, Board Games]",3 jumbo connectthedots coloring pads abc farm ...,12.74,https://images-na.ssl-images-amazon.com/images...,11x0.8x14,3.25,3.25,B07ZG7QYR5,97076,4-6years
2,74a8b7cd-3425-421b-ad26-ae31db665b51,rpm rear shock tower for the nitro slash nitro...,"[Toys & Games, Hobbies, Remote & App Controlle...",great condition,9.06,https://images-na.ssl-images-amazon.com/images...,5.9x4x0.4,0.32,0.32,B000VQML1O,RPM80862,16yearsandup
3,dd29b153-935e-4d99-abca-5367c6f44d2e,disney pixar cars mini racers crank crash der...,"[Toys & Games, Play Vehicles, Toy Vehicles]",disneypixar cars 3 new crazy 8 track,27.85,https://images-na.ssl-images-amazon.com/images...,2.9x14x10,1.57,1.76,B076FLF7CC,FLG71,4-8years
4,d613ccda-6c0b-4cf3-b658-89d88cea7776,areaware cubebot small,"[Toys & Games, Puzzles, Brain Teasers, Assembl...",great condition,28.92,https://images-na.ssl-images-amazon.com/images...,,,,,,


In [19]:
df.tail()

Unnamed: 0,id,Product Name,Category,Description,Selling Price,Image,Dimensions,ItemWeight,ShippingWeight,ASIN,ItemModelNumber,RecommendedAge
33319,2b778ce9-c3d3-439b-9eaf-c8c308dca99f,neato classics jacobs ladder retro wooden puzz...,"[Toys & Games, Novelty & Gag Toys, Magic Kits ...",the jacobs ladder makes use of a timeless opti...,5.99,https://images-na.ssl-images-amazon.com/images...,2.2x2x3.8,3.2,3.2,B000RAEBL2,6195,5-15years
33320,c7544ad0-636b-4a4e-8801-9b09554080ef,sushi roll the sushi go dice game,"[Toys & Games, Games & Accessories, Game Acces...",rice and dice roll with your favorite sushi go...,18.78,https://images-na.ssl-images-amazon.com/images...,11.4x11.4x16.3,1.91,1.95,B07PLFLR54,SUSHI_ROLL,8-15years
33321,94f79bad-9122-4ab2-8067-5c9f293cfaf5,white mountain puzzles craft room 1000 piece ...,"[Toys & Games, Puzzles, Jigsaw Puzzles]",more to puzzle building art jigsaw puzzles are...,15.99,https://images-na.ssl-images-amazon.com/images...,10x12x2,1.68,1.69,B07BS1W9R6,1372,12months-8years
33322,76513353-64dc-4cf3-8c14-d58955309a87,entertainment earth thor chair capes,"[Toys & Games, Party Supplies]",unique patented chair cape that celebrates thor,11.6,https://images-na.ssl-images-amazon.com/images...,24x30x0,7.2,7.4,B073FXDN4N,EE24503,14yearsandup
33323,08f7d3cd-57da-4fd7-92d8-1aa1ccd1d905,mega construx call of duty simon ghost riley,"[Toys & Games, Toy Figures & Playsets, Action ...",buildable superposeable simon ghost riley micr...,5.99,https://images-na.ssl-images-amazon.com/images...,6x1.1x3.9,0.96,0.96,B07MZG4MX4,GFW74,10-15years


### Save Preprocess Data Frame

In [20]:
df.to_csv("../datasets/products.csv")