### Import Packages

In [117]:
import pandas as pd
import numpy as np
from datasets import load_dataset

### Download Dataset

In [118]:
# Load the "cameras_large" configuration of the "wdc/products-2017" dataset.
ds = load_dataset(path="wdc/products-2017", name="cameras_large")

In [119]:
ds

DatasetDict({
    train: Dataset({
        features: ['pair_id', 'label', 'id_left', 'category_left', 'cluster_id_left', 'brand_left', 'title_left', 'description_left', 'price_left', 'specTableContent_left', 'id_right', 'category_right', 'cluster_id_right', 'brand_right', 'title_right', 'description_right', 'price_right', 'specTableContent_right'],
        num_rows: 16028
    })
    test: Dataset({
        features: ['pair_id', 'label', 'id_left', 'category_left', 'cluster_id_left', 'brand_left', 'title_left', 'description_left', 'price_left', 'specTableContent_left', 'id_right', 'category_right', 'cluster_id_right', 'brand_right', 'title_right', 'description_right', 'price_right', 'specTableContent_right'],
        num_rows: 1100
    })
    validation: Dataset({
        features: ['pair_id', 'label', 'id_left', 'category_left', 'cluster_id_left', 'brand_left', 'title_left', 'description_left', 'price_left', 'specTableContent_left', 'id_right', 'category_right', 'cluster_id_right', 'br

### Manage Missing Values

In [120]:
# Fallback value per column, substituted wherever a row has no usable value.
# Only `label` and the right-hand ("*_right") columns are listed here.
defaults = dict(
    (
        ("label", 0),
        ("id_right", -1),
        ("category_right", "Unknown"),
        ("cluster_id_right", -1),
        ("brand_right", "Unknown"),
        ("title_right", "No Title"),
        ("description_right", "No Description"),
        ("price_right", "$0.00"),
        ("specTableContent_right", "No Specs"),
    )
)

In [121]:
def fill_missing(example):
    """Fill the missing entries of one row with the values from ``defaults``.

    A value counts as missing when it is ``None`` or a float NaN. Only the
    keys present in the module-level ``defaults`` mapping are examined; every
    other entry of the row is left alone.

    Args:
        example: Mapping of column name to value for a single row. It must
            contain every key of ``defaults``.

    Returns:
        The same ``example`` object, updated in place.
    """
    for column in defaults:
        current = example[column]
        # NaN is only checked on floats; strings and ints are never treated as missing.
        is_nan_float = isinstance(current, float) and pd.isna(current)
        if current is None or is_nan_float:
            example[column] = defaults[column]
    return example

In [122]:
# Run the missing-value fill over every row of every split.
ds = ds.map(function=fill_missing)

### Select The Train, Validation And Test Splits

In [123]:
# Pick the "train" split out of the DatasetDict (after the missing-value fill).
train_dataset = ds["train"]

In [124]:
train_dataset

Dataset({
    features: ['pair_id', 'label', 'id_left', 'category_left', 'cluster_id_left', 'brand_left', 'title_left', 'description_left', 'price_left', 'specTableContent_left', 'id_right', 'category_right', 'cluster_id_right', 'brand_right', 'title_right', 'description_right', 'price_right', 'specTableContent_right'],
    num_rows: 16028
})

In [125]:
# Pick the "validation" split out of the DatasetDict (after the missing-value fill).
val_dataset = ds["validation"]

In [126]:
val_dataset

Dataset({
    features: ['pair_id', 'label', 'id_left', 'category_left', 'cluster_id_left', 'brand_left', 'title_left', 'description_left', 'price_left', 'specTableContent_left', 'id_right', 'category_right', 'cluster_id_right', 'brand_right', 'title_right', 'description_right', 'price_right', 'specTableContent_right'],
    num_rows: 4008
})

In [127]:
# Pick the "test" split out of the DatasetDict (after the missing-value fill).
test_dataset = ds["test"]

In [128]:
test_dataset

Dataset({
    features: ['pair_id', 'label', 'id_left', 'category_left', 'cluster_id_left', 'brand_left', 'title_left', 'description_left', 'price_left', 'specTableContent_left', 'id_right', 'category_right', 'cluster_id_right', 'brand_right', 'title_right', 'description_right', 'price_right', 'specTableContent_right'],
    num_rows: 1100
})

### Preprocess The Dataset

In [129]:
# Columns dropped from every split before export: all left-hand ("*_left")
# product attributes, plus the pair identifier.
# NOTE(review): `label` is kept while the left-hand side is dropped; confirm
# the label is still meaningful for the right-hand record on its own.
columns_to_remove = [
    "id_left",
    "category_left",
    "cluster_id_left",
    "brand_left",
    "title_left",
    "description_left",
    "price_left",
    "specTableContent_left",
] + ["pair_id"]

In [130]:
# Derive from the untouched split in `ds` rather than from `train_dataset`
# itself, so re-running this cell does not fail on already-removed columns.
train_dataset = ds["train"].remove_columns(columns_to_remove)

In [131]:
train_dataset

Dataset({
    features: ['label', 'id_right', 'category_right', 'cluster_id_right', 'brand_right', 'title_right', 'description_right', 'price_right', 'specTableContent_right'],
    num_rows: 16028
})

In [132]:
# Derive from the untouched split in `ds` rather than from `val_dataset`
# itself, so re-running this cell does not fail on already-removed columns.
val_dataset = ds["validation"].remove_columns(columns_to_remove)

In [133]:
val_dataset

Dataset({
    features: ['label', 'id_right', 'category_right', 'cluster_id_right', 'brand_right', 'title_right', 'description_right', 'price_right', 'specTableContent_right'],
    num_rows: 4008
})

In [134]:
# Derive from the untouched split in `ds` rather than from `test_dataset`
# itself, so re-running this cell does not fail on already-removed columns.
test_dataset = ds["test"].remove_columns(columns_to_remove)

In [135]:
test_dataset

Dataset({
    features: ['label', 'id_right', 'category_right', 'cluster_id_right', 'brand_right', 'title_right', 'description_right', 'price_right', 'specTableContent_right'],
    num_rows: 1100
})

In [136]:
train_dataset[0]

{'label': 1,
 'id_right': 5931545,
 'category_right': 'Camera_and_Photo',
 'cluster_id_right': 9309675,
 'brand_right': '"Veho"@en-US',
 'title_right': ' "Veho VCC-005-MUVI-NPNG MUVI HD Mini Handsfree ActionCam with Waterproof Case and 8 GB Memory - No Proof Glory Edition"@en-US "Sports & Action Video Cameras Page 7 | Come As You Arts"@en-US',
 'description_right': '"Veho are pleased to announce the partnership with new and exciting lifestyle and action sports media partner No Proof No Glory . As part of this partnership, Veho have released the MUVI HD Special Edition No Proof No Glory\' Bundle. The Special Edition No Proof No Glory MUVI HD includes a Waterproof Case that is capable of depths of 60 Meters underwater for a full 60 minutes, a Helmet Front Mount that allows you to create a true POV angle when filming hands free.The MUVI HD has updated firmware to allow you to record at 960p at 30fps and 720p at 60 & 30fps giving you more versatility with your MUVI HD action camera. The MU

In [139]:
# Export the training split to CSV. `index=True` is the pandas default, spelled
# out because the row index is written as an unnamed first column of the file.
# NOTE(review): pass index=False if downstream readers do not need that column.
train_dataset.to_pandas().to_csv("../dataset/train.csv", index=True)

In [140]:
# Export the validation split to CSV. `index=True` is the pandas default, spelled
# out because the row index is written as an unnamed first column of the file.
# NOTE(review): pass index=False if downstream readers do not need that column.
val_dataset.to_pandas().to_csv("../dataset/val.csv", index=True)

In [141]:
# Export the test split to CSV. `index=True` is the pandas default, spelled
# out because the row index is written as an unnamed first column of the file.
# NOTE(review): pass index=False if downstream readers do not need that column.
test_dataset.to_pandas().to_csv("../dataset/test.csv", index=True)