In [37]:
from pathlib import Path

import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset

## Raw dataset
Original dataset can be downloaded from [this link](https://github.com/amazon-science/esci-data/tree/main/shopping_queries_dataset).

In [14]:
dataset_dir = Path("~/Datasets/amazon_science").expanduser()
assert dataset_dir.exists(), f"{dataset_dir} does not exist"
dataset_sources = dataset_dir / "shopping_queries_dataset_sources.csv"
assert dataset_sources.exists(), f"{dataset_sources} does not exist"
dataset_examples = dataset_dir / "shopping_queries_dataset_examples.parquet"
assert dataset_examples.exists(), f"{dataset_examples} does not exist"
dataset_products = dataset_dir / "shopping_queries_dataset_products.parquet"
assert dataset_products.exists(), f"{dataset_products} does not exist"

---

### Examples/Queries

In [25]:
df_examples = pd.read_parquet(dataset_examples)
print(f"Number of rows in examples: {len(df_examples):_}")

Number of rows in examples: 2_621_288


In [22]:
df_examples.sample(5)

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split
1041802,1041802,ibuprofeno 600 mg,52571,B06Y5TPFRK,es,I,1,1,train
2188721,2188721,wireless router modem,112245,B07L9282JK,us,E,0,1,train
2027552,2027552,toilet bowl brush brushed nickel,103854,B07M71XTZY,us,E,0,1,train
1527707,1527707,paper towels,77744,B07TVKQJSG,us,E,0,1,test
703932,703932,double umbrella stroller,35032,B008TTQ2A6,us,E,0,1,train


---

### Products

In [24]:
df_products = pd.read_parquet(dataset_products)
print(f"Number of rows in products: {len(df_products):_}")

Number of rows in products: 1_814_924


In [34]:
df_products.sample(5)

Unnamed: 0,product_id,product_title,product_description,product_bullet_point,product_brand,product_color,product_locale
1590968,B07SYNPCNH,Chihee Hammock Chair Super Large Hanging Chair...,Warning!<br><br> Do not spin or swing in your ...,Most Relaxing.Hanging chairs are the trend of ...,Chihee,Natural White,us
1129378,B0841N8V7H,montesoro 単品 すき率10％ すきばさみ ヘアカット 美容師 散髪 セニング セル...,世界の最高級と謳われた 錆びにくく、切れ味が継続する素材、ステンレス炭素鉱440Cを採用 ...,簡単♪プロ仕様カット】自宅で手軽にカット出来るプロ仕様\n【素材へのこだわり】錆びにくい高い...,montesoro,,jp
1022068,B087P27YWM,半ズボン 夏 速乾 おしゃれ 男 無地 ゆったり 通気性 gray M,,★ 注意：日本通常サイズより1~2サイズ大きめなので、いつもより1~2つ小さいサイズのご購入...,SWRIKIIEF,グレー-3,jp
926527,B01BUROE6W,Omega-3 Krill Oil 750mg Supplement- Megared Ul...,,"Ultra concentrate 100% pure Krill oil, 35% mor...",Megared,,us
275930,B07PJDBFM9,Marvel Captain America Avengers Shield Waterco...,,"Captain America shirt, Captain America shirt f...",Marvel,Black,us


---

### Merge Products and Queries

In [35]:
df_examples_products = pd.merge(
    df_examples,
    df_products,
    how="left",
    left_on=["product_locale", "product_id"],
    right_on=["product_locale", "product_id"],
)
print(f"Number of rows in examples_products: {len(df_examples_products):_}")

Number of rows in examples_products: 2_621_288


In [36]:
df_examples_products.sample(5)

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split,product_title,product_description,product_bullet_point,product_brand,product_color
1503164,1503164,otterbox oneplus 6,76463,B07DW85TFD,us,E,0,1,train,OTTERBOX Commuter Series Case for OnePlus 6 - ...,,Compatible with OnePlus 6\nDOES NOT come with ...,OtterBox,BLACK
2129218,2129218,vmax charizard,109062,B09JGBLDPN,us,E,0,1,train,kekafu 4Pcs Charizard Pokemon Metal Gold Plate...,<p>100% brand new and high quality</p> <p></p>...,Customized Pokemon Cards Monolithic size: 3.46...,kekafu,
873666,873666,g string for women sexy slutty,43763,B072BSJ4WH,us,E,0,1,train,Oneheekini Women Sexy See Through Mini Micro B...,Hot G-String Bottom:The bottom of this bikini ...,High Quality Cotton:The material of this mini ...,Oneheekini,White
1179758,1179758,lamparas infantiles,59758,B06WXXDD38,es,E,0,1,train,Malovecf - Lámpara de techo para dormitorio do...,Lámpara de techo: plancha de techo. <br/>Mater...,"Montaje sencillo, fácil de instalar.\nEmbalaje...",Malovecf,Rosa.
2579498,2579498,肉,129030,B00H8MWC04,jp,E,0,1,train,【桐箱入り】松阪牛 黄金 特選すき焼き 800g すき焼き 牛肉 肉 お歳暮 ギフト しゃぶ...,名称：松阪牛黄金の特選すき焼き 生産地：三重県 内容量：800g 賞味期限：冷凍にて約30日...,正午までのご注文で即日発送！(※休業日を除く/ご指定日不可)\nヤマト運輸クール冷凍便 (送...,松阪牛,


In [None]:
df_examples_products["split"].unique()

array(['train', 'test'], dtype=object)

In [53]:
print(
    f"Number of train examples: {len(df_examples_products[df_examples_products['split'] == 'train']):_}"
)
print(
    f"Number of test examples: {len(df_examples_products[df_examples_products['split'] == 'test']):_}"
)

Number of train examples: 1_983_272
Number of test examples: 638_016


---

### Upload to HF

In [50]:
dataset_id = "Studeni/amazon-esci-data"

In [54]:
dataset = DatasetDict(
    {
        "train": Dataset.from_pandas(
            df_examples_products[df_examples_products["split"] == "train"]
        ),
        "test": Dataset.from_pandas(
            df_examples_products[df_examples_products["split"] == "test"]
        ),
    }
)

In [55]:
dataset

DatasetDict({
    train: Dataset({
        features: ['example_id', 'query', 'query_id', 'product_id', 'product_locale', 'esci_label', 'small_version', 'large_version', 'split', 'product_title', 'product_description', 'product_bullet_point', 'product_brand', 'product_color', '__index_level_0__'],
        num_rows: 1983272
    })
    test: Dataset({
        features: ['example_id', 'query', 'query_id', 'product_id', 'product_locale', 'esci_label', 'small_version', 'large_version', 'split', 'product_title', 'product_description', 'product_bullet_point', 'product_brand', 'product_color', '__index_level_0__'],
        num_rows: 638016
    })
})

In [None]:
dataset.push_to_hub(repo_id=dataset_id)

---

## Soruces

In [26]:
df_sources = pd.read_csv(dataset_sources)
print(f"Number of rows in sources: {len(df_sources):_}")

Number of rows in sources: 130_652


In [None]:
df_sources["source"].unique()

array(['other', 'negations', 'behavioral', 'parse_pattern', 'nlqec'],
      dtype=object)