### Import Libraries

In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import re

  from .autonotebook import tqdm as notebook_tqdm


### Download Dataset From Hugging Face

In [2]:
ds = load_dataset("ckandemir/amazon-products")

In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['Product Name', 'Category', 'Description', 'Selling Price', 'Product Specification', 'Image'],
        num_rows: 23993
    })
    test: Dataset({
        features: ['Product Name', 'Category', 'Description', 'Selling Price', 'Product Specification', 'Image'],
        num_rows: 6665
    })
    eval: Dataset({
        features: ['Product Name', 'Category', 'Description', 'Selling Price', 'Product Specification', 'Image'],
        num_rows: 2666
    })
})

### Split Into Train, Test and Eval

In [4]:
df_train = pd.DataFrame(ds["train"])
df_test = pd.DataFrame(ds["test"])
df_eval = pd.DataFrame(ds["eval"])

In [5]:
df_train.head(1)

Unnamed: 0,Product Name,Category,Description,Selling Price,Product Specification,Image
0,Craft-tastic – Empower Poster – Craft Kit – De...,Toys & Games | Arts & Crafts | Craft Kits | Pa...,PERFECT GIFT FOR AGES 8 AND ABOVE: Make this f...,$14.47,ProductDimensions:3x10x15inches|ItemWeight:15....,https://images-na.ssl-images-amazon.com/images...


In [6]:
df_test.head(1)

Unnamed: 0,Product Name,Category,Description,Selling Price,Product Specification,Image
0,amscan Festive Fall Thanksgiving Party Game Ac...,Toys & Games | Party Supplies,It is the perfect activity for your guests on ...,$4.80,ProductDimensions:11.3x8.5x0.7inches|ItemWeigh...,https://images-na.ssl-images-amazon.com/images...


In [7]:
df_eval.head(1)

Unnamed: 0,Product Name,Category,Description,Selling Price,Product Specification,Image
0,Sweet Jojo Designs Vintage Floral Boho Baby Ki...,Baby Products | Nursery | Furniture | Storage ...,Great Condition.,$39.99,,https://images-na.ssl-images-amazon.com/images...


### Clean Text From The Data Frame

In [8]:
def clean_text(text):
    """
    Lowercase, remove special characters, and strip whitespace.
    """
    
    if pd.isna(text):
        return ""

    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)

    return text.strip()

In [9]:
df_train["Product Name"] = df_train["Product Name"].apply(clean_text)
df_train["Description"] = df_train["Description"].apply(clean_text)

In [10]:
df_test["Product Name"] = df_test["Product Name"].apply(clean_text)
df_test["Description"] = df_train["Description"].apply(clean_text)

In [11]:
df_eval["Product Name"] = df_eval["Product Name"].apply(clean_text)
df_eval["Description"] = df_eval["Description"].apply(clean_text)

### Process Categories

In [12]:
def process_list(field):
    """
    Split a ' | ' separated string into a structured list.
    """

    if isinstance(field, str) and field.strip():
        return field.split(" | ")

    if isinstance(field, (np.ndarray, list)):
        field = " | ".join(map(str, field))
        return field.split(" | ")

    if pd.isna(field) or field == "":
        return []

    return []

In [13]:
df_train["Category"] = df_train["Category"].apply(process_list)

In [14]:
df_test["Category"] = df_test["Category"].apply(process_list)

In [15]:
df_eval["Category"] = df_eval["Category"].apply(process_list)

### Turn Price Into Numbers

In [16]:
def turn_price(price):
    if pd.isna(price):
        return None

    price_str = str(price)

    match = re.search(r"[\d]+(\.\d+)?", price_str)

    if not match:
        return None;

    return float(match.group())

In [17]:
df_train["Selling Price"] = df_train["Selling Price"].apply(turn_price)

In [18]:
df_test["Selling Price"] = df_test["Selling Price"].apply(turn_price)

In [19]:
df_eval["Selling Price"] = df_eval["Selling Price"].apply(turn_price)

### Process Specification

In [20]:
def process_specification(raw_specifications):
    if pd.isna(raw_specifications):
        return None

    clean_specifications = re.sub(r"\(.*\)", "", raw_specifications)
    specifications = clean_specifications.split("|")

    data = {}

    for spec in specifications:
        if ":" not in spec:
            continue  

        key, value = spec.split(":", 1)

        if key == "ProductDimensions":
            data["Dimensions"] = value.replace("inches", "").strip()

        elif key == "ItemWeight" or key == "ShippingWeight":
            match = re.match(r"([0-9\.]+)", value)
            if match:
                numeric_value = match.group(1)
                if numeric_value and numeric_value != ".":
                    data[key] = float(
                        numeric_value
                    ) 
                else:
                    data[key] = None 

        elif key == "ASIN":
            data["ASIN"] = value.strip()
        elif key == "Itemmodelnumber":
            data["ItemModelNumber"] = value.strip()

        elif key == "Manufacturerrecommendedage":
            data["RecommendedAge"] = value.strip()

    return data

In [21]:
processed_data = df_train["Product Specification"].apply(process_specification)
processed_df = pd.json_normalize(processed_data)

df_train = pd.concat([df_train, processed_df], axis=1)

df_train.drop(columns=["Product Specification"], inplace=True)

In [22]:
processed_data = df_test["Product Specification"].apply(process_specification)
processed_df = pd.json_normalize(processed_data)

df_test = pd.concat([df_test, processed_df], axis=1)

df_test.drop(columns=["Product Specification"], inplace=True)

In [23]:
processed_data = df_eval["Product Specification"].apply(process_specification)
processed_df = pd.json_normalize(processed_data)

df_eval = pd.concat([df_eval, processed_df], axis=1)

df_eval.drop(columns=["Product Specification"], inplace=True)

### Visualized Preprocessed Data

In [24]:
df_train.head()

Unnamed: 0,Product Name,Category,Description,Selling Price,Image,Dimensions,ItemWeight,ShippingWeight,ASIN,ItemModelNumber,RecommendedAge
0,crafttastic empower poster craft kit design...,"[Toys & Games, Arts & Crafts, Craft Kits, Pape...",perfect gift for ages 8 and above make this fe...,14.47,https://images-na.ssl-images-amazon.com/images...,3x10x15,15.2,15.2,B01D52Q1UC,CT1688,8-15years
1,melissa doug dottodot letter coloring pad 3 ...,"[Toys & Games, Games & Accessories, Board Games]",3 jumbo connectthedots coloring pads abc farm ...,12.74,https://images-na.ssl-images-amazon.com/images...,11x0.8x14,3.25,3.25,B07ZG7QYR5,97076,4-6years
2,rpm rear shock tower for the nitro slash nitro...,"[Toys & Games, Hobbies, Remote & App Controlle...",great condition,9.06,https://images-na.ssl-images-amazon.com/images...,5.9x4x0.4,0.32,0.32,B000VQML1O,RPM80862,16yearsandup
3,disney pixar cars mini racers crank crash der...,"[Toys & Games, Play Vehicles, Toy Vehicles]",disneypixar cars 3 new crazy 8 track,27.85,https://images-na.ssl-images-amazon.com/images...,2.9x14x10,1.57,1.76,B076FLF7CC,FLG71,4-8years
4,areaware cubebot small,"[Toys & Games, Puzzles, Brain Teasers, Assembl...",great condition,28.92,https://images-na.ssl-images-amazon.com/images...,,,,,,


In [25]:
df_test.head()

Unnamed: 0,Product Name,Category,Description,Selling Price,Image,Dimensions,ItemWeight,ShippingWeight,ASIN,ItemModelNumber,RecommendedAge
0,amscan festive fall thanksgiving party game ac...,"[Toys & Games, Party Supplies]",perfect gift for ages 8 and above make this fe...,4.8,https://images-na.ssl-images-amazon.com/images...,11.3x8.5x0.7,3.2,3.2,B00G4F6VJO,270072,4-12years
1,gmp 118 home improvement 199199 tv series 199...,"[Toys & Games, Dress Up & Pretend Play, Preten...",3 jumbo connectthedots coloring pads abc farm ...,139.95,https://images-na.ssl-images-amazon.com/images...,9.5x4x2.5,3.2,3.2,B082QSZKKK,GMP-18920,14yearsandup
2,manhattan toy wimmerferguson double sided 3in1...,"[Toys & Games, Baby & Toddler Toys, Car Seat &...",great condition,15.39,https://images-na.ssl-images-amazon.com/images...,12.2x12x3,6.4,6.4,B0043QKB6K,210500,3months-15years
3,whamo slip n slide wave rider 16,"[Toys & Games, Sports & Outdoor Play, Pools & ...",disneypixar cars 3 new crazy 8 track,16.19,https://images-na.ssl-images-amazon.com/images...,11.8x10.5x2,2.0,2.15,B001X6F6HM,WaveRiderwithBoogie,8-15years
4,wow stuff collection harry potter wingardium l...,"[Toys & Games, Novelty & Gag Toys]",great condition,15.51,https://images-na.ssl-images-amazon.com/images...,7x0.2x0.2,3.84,4.8,B07F7P4DP7,WW-1018,8yearsandup


In [26]:
df_eval.head()

Unnamed: 0,Product Name,Category,Description,Selling Price,Image,Dimensions,ItemWeight,ShippingWeight,ASIN,ItemModelNumber,RecommendedAge
0,sweet jojo designs vintage floral boho baby ki...,"[Baby Products, Nursery, Furniture, Storage & ...",great condition,39.99,https://images-na.ssl-images-amazon.com/images...,,,,,,
1,ultimate guard deck box sidewinder 80 chromias...,"[Toys & Games, Collectible Toys, Collectible D...",additional visual protection you will clearly ...,19.14,https://images-na.ssl-images-amazon.com/images...,3x3.8x3,5.0,5.0,B078TTJTG2,UGD010852,8yearsandup
2,xshot chaos orbit dart ball blaster 100 rounds...,"[Toys & Games, Sports & Outdoor Play, Blasters...",chaos orbit ultimate blaster pack the xshot ch...,39.99,https://images-na.ssl-images-amazon.com/images...,9.8x4.3x23.4,4.55,4.55,B07RBBV4P7,B07RBBV4P7,8yearsandup
3,benjonah throw blankets perfect for the fall ...,"[Home & Kitchen, Bedding, Kids' Bedding, Blank...",celebrate the fall and winter season with the ...,24.45,https://images-na.ssl-images-amazon.com/images...,60x50x0.5,1.0,1.0,B07YNXTGR4,Pkidthrows-Unicorn,
4,banzai battle blast adventure inflatable water...,"[Toys & Games, Sports & Outdoor Play, Pools & ...",15l x 1110w x 8h inflatable water park,349.99,https://images-na.ssl-images-amazon.com/images...,180x142x96,55.0,58.9,B072QCKDFR,35547,3yearsandup


### Save Preprocess Data Frame

In [27]:
df_train.to_csv("../datasets/train.csv")

In [28]:
df_test.to_csv("../datasets/test.csv")

In [29]:
df_eval.to_csv("../datasets/eval.csv")