### Import Libraries

In [46]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import re
import uuid

### Download Dataset From Hugging Face

In [47]:
ds = load_dataset("ckandemir/amazon-products")

In [48]:
ds

DatasetDict({
    train: Dataset({
        features: ['Product Name', 'Category', 'Description', 'Selling Price', 'Product Specification', 'Image'],
        num_rows: 23993
    })
    test: Dataset({
        features: ['Product Name', 'Category', 'Description', 'Selling Price', 'Product Specification', 'Image'],
        num_rows: 6665
    })
    eval: Dataset({
        features: ['Product Name', 'Category', 'Description', 'Selling Price', 'Product Specification', 'Image'],
        num_rows: 2666
    })
})

### Merge Train, Test and Eval

In [49]:
df_train = pd.DataFrame(ds["train"])
df_test = pd.DataFrame(ds["test"])
df_eval = pd.DataFrame(ds["eval"])

In [50]:
df = pd.concat([df_train, df_test, df_eval], ignore_index=True)

In [51]:
df.head(1)

Unnamed: 0,Product Name,Category,Description,Selling Price,Product Specification,Image
0,Craft-tastic – Empower Poster – Craft Kit – De...,Toys & Games | Arts & Crafts | Craft Kits | Pa...,PERFECT GIFT FOR AGES 8 AND ABOVE: Make this f...,$14.47,ProductDimensions:3x10x15inches|ItemWeight:15....,https://images-na.ssl-images-amazon.com/images...


In [52]:
len(df)

33324

### Clean Text From The Data Frame

In [53]:
def clean_text(text):
    """
    Lowercase, remove special characters, and strip whitespace.
    """
    
    if pd.isna(text):
        return ""

    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)

    return text.strip()

In [54]:
df["Product Name"] = df["Product Name"].apply(clean_text)
df["Description"] = df["Description"].apply(clean_text)

### Process Categories

In [55]:
def process_list(field):
    """
    Split a ' | ' separated string into a structured list.
    """

    if isinstance(field, str) and field.strip():
        return field.split(" | ")

    if isinstance(field, (np.ndarray, list)):
        field = " | ".join(map(str, field))
        return field.split(" | ")

    if pd.isna(field) or field == "":
        return []

    return []

In [56]:
df["Category"] = df["Category"].apply(process_list)

### Turn Price Into Numbers

In [57]:
def turn_price(price):
    if pd.isna(price):
        return 0.0

    price_str = str(price)

    match = re.search(r"[\d]+(\.\d+)?", price_str)

    if not match:
        return 0.0;

    return float(match.group())

In [58]:
df["Selling Price"] = df["Selling Price"].apply(turn_price)

### Process Specification

In [59]:
def process_specification(raw_specifications):
    if pd.isna(raw_specifications):
        return None

    clean_specifications = re.sub(r"\(.*\)", "", raw_specifications)
    specifications = clean_specifications.split("|")

    data = {}

    for spec in specifications:
        if ":" not in spec:
            continue

        key, value = spec.split(":", 1)

        value = value.strip() if value.strip() else None

        if key == "ProductDimensions":
            data["Dimensions"] = value.replace("inches", "").strip() if value else 0.0

        elif key == "ItemWeight" or key == "ShippingWeight":
            match = re.match(r"([0-9\.]+)", value)
            if match:
                numeric_value = match.group(1)
                if numeric_value and numeric_value != ".":
                    data[key] = float(numeric_value)
                else:
                    data[key] = 0.0
            else:
                data[key] = 0.0

        elif key == "ASIN":
            data["ASIN"] = value if value else "0"

        elif key == "Itemmodelnumber":
            data["ItemModelNumber"] = value if value else "0"

        elif key == "Manufacturerrecommendedage":
            data["RecommendedAge"] = value if value else "0"

    data = {key: (value if value is not None else 0.0) for key, value in data.items()}

    return data

In [60]:
processed_data = df["Product Specification"].apply(process_specification)
processed_df = pd.json_normalize(processed_data)

df = pd.concat([df, processed_df], axis=1)

df.drop(columns=["Product Specification"], inplace=True)

### Add Unique UUID ( V4 ) On The Dataset

In [61]:
df["id"] = [str(uuid.uuid4()) for _ in range(len(df))]

df = df[["id"] + [col for col in df.columns if col != "id"]]

### Visualized Preprocessed Data

In [62]:
df.head()

Unnamed: 0,id,Product Name,Category,Description,Selling Price,Image,Dimensions,ItemWeight,ShippingWeight,ASIN,ItemModelNumber,RecommendedAge
0,374ffda3-cb12-4b02-83a1-df9eb70ad921,crafttastic empower poster craft kit design...,"[Toys & Games, Arts & Crafts, Craft Kits, Pape...",perfect gift for ages 8 and above make this fe...,14.47,https://images-na.ssl-images-amazon.com/images...,3x10x15,15.2,15.2,B01D52Q1UC,CT1688,8-15years
1,dd934ab4-d18c-40b0-b153-3f03d435f441,melissa doug dottodot letter coloring pad 3 ...,"[Toys & Games, Games & Accessories, Board Games]",3 jumbo connectthedots coloring pads abc farm ...,12.74,https://images-na.ssl-images-amazon.com/images...,11x0.8x14,3.25,3.25,B07ZG7QYR5,97076,4-6years
2,bd14a528-1d57-4e3a-b893-0f063c2b456b,rpm rear shock tower for the nitro slash nitro...,"[Toys & Games, Hobbies, Remote & App Controlle...",great condition,9.06,https://images-na.ssl-images-amazon.com/images...,5.9x4x0.4,0.32,0.32,B000VQML1O,RPM80862,16yearsandup
3,3589be8d-962b-4db3-8c80-608e91a1ba18,disney pixar cars mini racers crank crash der...,"[Toys & Games, Play Vehicles, Toy Vehicles]",disneypixar cars 3 new crazy 8 track,27.85,https://images-na.ssl-images-amazon.com/images...,2.9x14x10,1.57,1.76,B076FLF7CC,FLG71,4-8years
4,02e29fb2-ecd9-4e4b-bb3a-f1b4496b7a4e,areaware cubebot small,"[Toys & Games, Puzzles, Brain Teasers, Assembl...",great condition,28.92,https://images-na.ssl-images-amazon.com/images...,,,,,,


In [63]:
df.tail()

Unnamed: 0,id,Product Name,Category,Description,Selling Price,Image,Dimensions,ItemWeight,ShippingWeight,ASIN,ItemModelNumber,RecommendedAge
33319,3cdb6bc8-65b0-425c-9e71-dd8378707a7c,neato classics jacobs ladder retro wooden puzz...,"[Toys & Games, Novelty & Gag Toys, Magic Kits ...",the jacobs ladder makes use of a timeless opti...,5.99,https://images-na.ssl-images-amazon.com/images...,2.2x2x3.8,3.2,3.2,B000RAEBL2,6195,5-15years
33320,87427045-f168-4194-9565-36e8015b87da,sushi roll the sushi go dice game,"[Toys & Games, Games & Accessories, Game Acces...",rice and dice roll with your favorite sushi go...,18.78,https://images-na.ssl-images-amazon.com/images...,11.4x11.4x16.3,1.91,1.95,B07PLFLR54,SUSHI_ROLL,8-15years
33321,2e3277fb-c8d3-4864-8d31-050c4d262167,white mountain puzzles craft room 1000 piece ...,"[Toys & Games, Puzzles, Jigsaw Puzzles]",more to puzzle building art jigsaw puzzles are...,15.99,https://images-na.ssl-images-amazon.com/images...,10x12x2,1.68,1.69,B07BS1W9R6,1372,12months-8years
33322,7b0fb3ed-d2a9-4e9b-8e5b-cb739cbd4422,entertainment earth thor chair capes,"[Toys & Games, Party Supplies]",unique patented chair cape that celebrates thor,11.6,https://images-na.ssl-images-amazon.com/images...,24x30x0,7.2,7.4,B073FXDN4N,EE24503,14yearsandup
33323,8fe924e4-4af3-4bd4-ba47-4c19390e2ee7,mega construx call of duty simon ghost riley,"[Toys & Games, Toy Figures & Playsets, Action ...",buildable superposeable simon ghost riley micr...,5.99,https://images-na.ssl-images-amazon.com/images...,6x1.1x3.9,0.96,0.96,B07MZG4MX4,GFW74,10-15years


In [64]:
df.isna().sum()

id                     0
Product Name           0
Category               0
Description            0
Selling Price          0
Image                  0
Dimensions          8591
ItemWeight          8669
ShippingWeight      7160
ASIN                5953
ItemModelNumber    10040
RecommendedAge     12961
dtype: int64

In [65]:
df.isnull().sum()

id                     0
Product Name           0
Category               0
Description            0
Selling Price          0
Image                  0
Dimensions          8591
ItemWeight          8669
ShippingWeight      7160
ASIN                5953
ItemModelNumber    10040
RecommendedAge     12961
dtype: int64

### Fill All The Invalid Value

In [66]:
df["ItemWeight"].fillna("Unknown", inplace=True)
df["ShippingWeight"].fillna("Unknown", inplace=True)
df["Dimensions"].fillna("Unknown", inplace=True)
df["ASIN"].fillna("Unknown", inplace=True)
df["ItemModelNumber"].fillna("Unknown", inplace=True)
df["RecommendedAge"].fillna("Unknown", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["ItemWeight"].fillna("Unknown", inplace=True)
  df["ItemWeight"].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["ShippingWeight"].fillna("Unknown", inplace=True)
  df["ShippingWeight"].fillna("Unknown", inplace=True)
The behavior will change

In [67]:
df.isna().sum()

id                 0
Product Name       0
Category           0
Description        0
Selling Price      0
Image              0
Dimensions         0
ItemWeight         0
ShippingWeight     0
ASIN               0
ItemModelNumber    0
RecommendedAge     0
dtype: int64

In [68]:
df.isnull().sum()

id                 0
Product Name       0
Category           0
Description        0
Selling Price      0
Image              0
Dimensions         0
ItemWeight         0
ShippingWeight     0
ASIN               0
ItemModelNumber    0
RecommendedAge     0
dtype: int64

In [69]:
len(df)

33324

In [73]:
df = df.map(lambda x: tuple(x) if isinstance(x, list) else x)
df = df.drop_duplicates(subset=["Product Name"], keep="last")

In [74]:
len(df)

9631

In [75]:
df.duplicated().sum()

np.int64(0)

### Save Preprocess Data Frame

In [76]:
df.to_csv("../datasets/products.csv")