### Import Libraries

In [164]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import re

### Download Dataset From Hugging Face

In [165]:
ds = load_dataset("ckandemir/amazon-products")

In [166]:
ds

DatasetDict({
    train: Dataset({
        features: ['Product Name', 'Category', 'Description', 'Selling Price', 'Product Specification', 'Image'],
        num_rows: 23993
    })
    test: Dataset({
        features: ['Product Name', 'Category', 'Description', 'Selling Price', 'Product Specification', 'Image'],
        num_rows: 6665
    })
    eval: Dataset({
        features: ['Product Name', 'Category', 'Description', 'Selling Price', 'Product Specification', 'Image'],
        num_rows: 2666
    })
})

### Split Into Train, Test and Eval

In [167]:
df_train = pd.DataFrame(ds["train"])
df_test = pd.DataFrame(ds["test"])
df_eval = pd.DataFrame(ds["eval"])

In [168]:
df_train.head(1)

Unnamed: 0,Product Name,Category,Description,Selling Price,Product Specification,Image
0,Craft-tastic – Empower Poster – Craft Kit – De...,Toys & Games | Arts & Crafts | Craft Kits | Pa...,PERFECT GIFT FOR AGES 8 AND ABOVE: Make this f...,$14.47,ProductDimensions:3x10x15inches|ItemWeight:15....,https://images-na.ssl-images-amazon.com/images...


In [169]:
df_test.head(1)

Unnamed: 0,Product Name,Category,Description,Selling Price,Product Specification,Image
0,amscan Festive Fall Thanksgiving Party Game Ac...,Toys & Games | Party Supplies,It is the perfect activity for your guests on ...,$4.80,ProductDimensions:11.3x8.5x0.7inches|ItemWeigh...,https://images-na.ssl-images-amazon.com/images...


In [170]:
df_eval.head(1)

Unnamed: 0,Product Name,Category,Description,Selling Price,Product Specification,Image
0,Sweet Jojo Designs Vintage Floral Boho Baby Ki...,Baby Products | Nursery | Furniture | Storage ...,Great Condition.,$39.99,,https://images-na.ssl-images-amazon.com/images...


### Clean Text From The Data Frame

In [171]:
def clean_text(text):
    """
    Lowercase, remove special characters, and strip whitespace.
    """
    
    if pd.isna(text):
        return ""

    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)

    return text.strip()

In [172]:
df_train["Product Name"] = df_train["Product Name"].apply(clean_text)
df_train["Description"] = df_train["Description"].apply(clean_text)

In [173]:
df_test["Product Name"] = df_test["Product Name"].apply(clean_text)
df_test["Description"] = df_train["Description"].apply(clean_text)

In [174]:
df_eval["Product Name"] = df_eval["Product Name"].apply(clean_text)
df_eval["Description"] = df_eval["Description"].apply(clean_text)

### Process Categories

In [175]:
def process_list(field):
    """
    Split a ' | ' separated string into a structured list.
    """

    if isinstance(field, str) and field.strip():
        return field.split(" | ")

    if isinstance(field, (np.ndarray, list)):
        field = " | ".join(map(str, field))
        return field.split(" | ")

    if pd.isna(field) or field == "":
        return []

    return []

In [176]:
df_train["Category"] = df_train["Category"].apply(process_list)

In [177]:
df_test["Category"] = df_test["Category"].apply(process_list)

In [178]:
df_eval["Category"] = df_eval["Category"].apply(process_list)

### Turn Price Into Numbers

In [179]:
def turn_price(price):
    if pd.isna(price):
        return None

    price_str = str(price)

    match = re.search(r"[\d]+(\.\d+)?", price_str)

    if not match:
        return None;

    return float(match.group())

In [180]:
df_train["Selling Price"] = df_train["Selling Price"].apply(turn_price)

In [181]:
df_test["Selling Price"] = df_test["Selling Price"].apply(turn_price)

In [182]:
df_eval["Selling Price"] = df_eval["Selling Price"].apply(turn_price)

### Process Specification

In [183]:
def process_specification(input_string):
    if pd.isna(input_string):
        return None
    
    array_of_strings = input_string.split("|")

    filtered_array = [item for item in array_of_strings if item and item.strip()]

    key_value_pairs = []
    for item in filtered_array:
        if ":" in item:
            key_value_pairs.append(item.split(":"))
        else:
            key_value_pairs.append(
                [item, None]
            )

    return key_value_pairs

In [184]:
df_train["Product Specification"] = df_train["Product Specification"].apply(
    process_specification
)

In [185]:
df_test["Product Specification"] = df_test["Product Specification"].apply(
    process_specification
)

In [186]:
df_eval["Product Specification"] = df_eval["Product Specification"].apply(
    process_specification
)

### Visualized Preprocessed Data

In [187]:
df_train.head()

Unnamed: 0,Product Name,Category,Description,Selling Price,Product Specification,Image
0,crafttastic empower poster craft kit design...,"[Toys & Games, Arts & Crafts, Craft Kits, Pape...",perfect gift for ages 8 and above make this fe...,14.47,"[[ProductDimensions, 3x10x15inches], [ItemWeig...",https://images-na.ssl-images-amazon.com/images...
1,melissa doug dottodot letter coloring pad 3 ...,"[Toys & Games, Games & Accessories, Board Games]",3 jumbo connectthedots coloring pads abc farm ...,12.74,"[[ProductDimensions, 11x0.8x14inches], [ItemWe...",https://images-na.ssl-images-amazon.com/images...
2,rpm rear shock tower for the nitro slash nitro...,"[Toys & Games, Hobbies, Remote & App Controlle...",great condition,9.06,"[[ProductDimensions, 5.9x4x0.4inches], [ItemWe...",https://images-na.ssl-images-amazon.com/images...
3,disney pixar cars mini racers crank crash der...,"[Toys & Games, Play Vehicles, Toy Vehicles]",disneypixar cars 3 new crazy 8 track,27.85,"[[ProductDimensions, 2.9x14x10inches], [ItemWe...",https://images-na.ssl-images-amazon.com/images...
4,areaware cubebot small,"[Toys & Games, Puzzles, Brain Teasers, Assembl...",great condition,28.92,,https://images-na.ssl-images-amazon.com/images...


In [188]:
df_test.head()

Unnamed: 0,Product Name,Category,Description,Selling Price,Product Specification,Image
0,amscan festive fall thanksgiving party game ac...,"[Toys & Games, Party Supplies]",perfect gift for ages 8 and above make this fe...,4.8,"[[ProductDimensions, 11.3x8.5x0.7inches], [Ite...",https://images-na.ssl-images-amazon.com/images...
1,gmp 118 home improvement 199199 tv series 199...,"[Toys & Games, Dress Up & Pretend Play, Preten...",3 jumbo connectthedots coloring pads abc farm ...,139.95,"[[ProductDimensions, 9.5x4x2.5inches], [ItemWe...",https://images-na.ssl-images-amazon.com/images...
2,manhattan toy wimmerferguson double sided 3in1...,"[Toys & Games, Baby & Toddler Toys, Car Seat &...",great condition,15.39,"[[ProductDimensions, 12.2x12x3inches], [ItemWe...",https://images-na.ssl-images-amazon.com/images...
3,whamo slip n slide wave rider 16,"[Toys & Games, Sports & Outdoor Play, Pools & ...",disneypixar cars 3 new crazy 8 track,16.19,"[[ProductDimensions, 11.8x10.5x2inches], [Item...",https://images-na.ssl-images-amazon.com/images...
4,wow stuff collection harry potter wingardium l...,"[Toys & Games, Novelty & Gag Toys]",great condition,15.51,"[[ProductDimensions, 7x0.2x0.2inches], [ItemWe...",https://images-na.ssl-images-amazon.com/images...


In [189]:
df_eval.head()

Unnamed: 0,Product Name,Category,Description,Selling Price,Product Specification,Image
0,sweet jojo designs vintage floral boho baby ki...,"[Baby Products, Nursery, Furniture, Storage & ...",great condition,39.99,,https://images-na.ssl-images-amazon.com/images...
1,ultimate guard deck box sidewinder 80 chromias...,"[Toys & Games, Collectible Toys, Collectible D...",additional visual protection you will clearly ...,19.14,"[[ProductDimensions, 3x3.8x3inches], [ItemWeig...",https://images-na.ssl-images-amazon.com/images...
2,xshot chaos orbit dart ball blaster 100 rounds...,"[Toys & Games, Sports & Outdoor Play, Blasters...",chaos orbit ultimate blaster pack the xshot ch...,39.99,"[[ProductDimensions, 9.8x4.3x23.4inches], [Ite...",https://images-na.ssl-images-amazon.com/images...
3,benjonah throw blankets perfect for the fall ...,"[Home & Kitchen, Bedding, Kids' Bedding, Blank...",celebrate the fall and winter season with the ...,24.45,"[[ProductDimensions, 60x50x0.5inches], [ItemWe...",https://images-na.ssl-images-amazon.com/images...
4,banzai battle blast adventure inflatable water...,"[Toys & Games, Sports & Outdoor Play, Pools & ...",15l x 1110w x 8h inflatable water park,349.99,"[[ProductDimensions, 180x142x96inches], [ItemW...",https://images-na.ssl-images-amazon.com/images...


### Save Preprocess Data Frame

In [190]:
df_train.to_csv("../datasets/train.csv")

In [191]:
df_test.to_csv("../datasets/test.csv")

In [192]:
df_eval.to_csv("../datasets/eval.csv")