In [16]:
import os
from pathlib import Path

import pandas as pd
from utils import create_and_push_dataset

## Raw dataset
The original dataset can be downloaded from the [amazon-science/esci-data repository](https://github.com/amazon-science/esci-data/tree/main/shopping_queries_dataset).

In [2]:
# Local copy of the shopping-queries files; `~` expands to the current user's home.
dataset_dir = Path("~/Datasets/amazon_science").expanduser()
dataset_sources = dataset_dir / "shopping_queries_dataset_sources.csv"
dataset_examples = dataset_dir / "shopping_queries_dataset_examples.parquet"
dataset_products = dataset_dir / "shopping_queries_dataset_products.parquet"

# Fail fast: check the directory first, then each file, so a missing download
# is reported here rather than inside a reader call further down.
for required_path in (dataset_dir, dataset_sources, dataset_examples, dataset_products):
    assert required_path.exists(), f"{required_path} does not exist"
del required_path  # keep the loop variable out of the notebook namespace

In [None]:
# Target repository identifier, passed as `repo_id` to every
# create_and_push_dataset call in this notebook.
# NOTE(review): presumably a Hugging Face Hub dataset repo ("<owner>/<name>"),
# given the HF_TOKEN used alongside it — confirm in utils.
repo_id = "Studeni/amazon-esci-data"

---

### Examples/Queries

In [4]:
# Read the examples parquet file into memory and report its size
# (thousands separated by underscores).
df_examples = pd.read_parquet(dataset_examples)
print(f"Number of rows in examples: {df_examples.shape[0]:_}")

Number of rows in examples: 2_621_288


In [5]:
# Peek at five randomly chosen rows (unseeded, so the selection differs per run).
df_examples.sample(n=5)

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split
179644,179644,alfombras coche mitsubishi,7944,B07S354BMT,es,E,1,1,train
1318162,1318162,medicine measuring cups,66852,B07D5Q12K2,us,I,0,1,train
1768185,1768185,samsung note 8 smartphone,90228,B07Y1D52LV,es,S,0,1,train
1174982,1174982,lacoste shirts for men,59513,B0779NRH1Q,us,S,1,1,test
1636660,1636660,print dress,83397,B071SDSKHF,us,E,0,1,test


In [None]:
# Hand the examples frame to the project helper under the "queries" config.
# NOTE(review): the helper lives in utils; presumably it builds a dataset and
# uploads it to `repo_id` — confirm there.
create_and_push_dataset(
    df=df_examples,
    repo_id=repo_id,
    # Read from the environment so the token never appears in the notebook;
    # raises KeyError if HF_TOKEN is not set.
    hf_token=os.environ["HF_TOKEN"],
    config_name="queries",
)

---

### Products

In [6]:
# Read the products parquet file into memory and report its size
# (thousands separated by underscores).
df_products = pd.read_parquet(dataset_products)
print(f"Number of rows in products: {df_products.shape[0]:_}")

Number of rows in products: 1_814_924


In [7]:
# Peek at five randomly chosen rows (unseeded, so the selection differs per run).
df_products.sample(n=5)

Unnamed: 0,product_id,product_title,product_description,product_bullet_point,product_brand,product_color,product_locale
43480,B07H3Q5XKL,Retro-Bit Official SEGA Saturn Cool Pad [Impor...,Compatible con las consolas SEGA Saturn origin...,,retro-bit,,es
1686552,B00ICYHWX4,Linguistics: A Complete Introduction: Teach Yo...,"<p>Written by David Hornsby, who is a current ...",,,,us
967738,B07MYX1GN7,OLFA 替刃(M厚)20枚入 MTB20K,,全長(mm)：83\n刃厚(mm)：0.45\n適合本体：203B・237B\n刃長(mm)...,オルファ(OLFA),,jp
1058684,B0993979ZZ,YOVEKAT 女性の女の子のレトロな男は、誕生日の結婚式アクセサリーのための真珠の王冠の花...,<p>特徴：</p> <p></p> <p>素晴らしい技量で高品質材料でできている。</p>...,サイズ：直径：12 cm（4.72インチ）高さ：7.5センチメートル（2.95インチ）。\n...,YOVEKAT,,jp
134368,B00BGHBUFU,Gabol - Week | Bolso con Ruedas de Viaje Grand...,,Bolso grande con ruedas idónea para viajes lar...,Gabol,Negro,es


In [8]:
# Attach a `split` column to every product row, taken from the examples frame.
# The join key is (product_id, product_locale) — the pair the upstream dataset
# uses to link examples to products. Joining on product_id alone would let a
# row pick up the split recorded for another locale's listing of the same id.
df_products = pd.merge(
    # Drop any `split` left by a previous run so re-executing this cell does
    # not produce split_x / split_y columns.
    df_products.drop(columns="split", errors="ignore"),
    # One row per key keeps the left join from multiplying product rows.
    # NOTE(review): a product judged under both train and test queries keeps
    # whichever split appears first in df_examples — confirm that is intended.
    df_examples[["product_id", "product_locale", "split"]].drop_duplicates(
        subset=["product_id", "product_locale"]
    ),
    on=["product_id", "product_locale"],
    how="left",  # products absent from the examples keep a missing split
    validate="many_to_one",  # raise if the right-hand keys are ever not unique
)
print(f"Number of rows in products with split: {len(df_products):_}")
df_products.sample(5)

Number of rows in products with split: 1_814_924


Unnamed: 0,product_id,product_title,product_description,product_bullet_point,product_brand,product_color,product_locale,split
1625542,B07ZVRFYHM,Wonderience Neoprene Sauna Suit for Men Waist ...,<p> Wonderience Neoprene Sauna Suit for Men Wa...,READY FOR SAUNA SWEAT:This 2-in-1 vest and wai...,Wonderience,Black,us,train
1057750,4062700735,特別編成 山陽・九州新幹線ライン 全線・全駅・全配線 (【図説】日本の鉄道),,,講談社,,jp,train
546902,B00N9IJP06,Nerf Dog Rubber Ball Dog Toy with Checkered Sq...,,NERF-TOUGH: Nerf-quality materials make our sq...,Nerf Dog,Red,us,train
1163763,8417549749,Batman: Harvest Breed,,,ECC Ediciones,,es,train
660362,B01C65DDSW,2lbDepot Shower Curtain Rings Hooks - Bronze F...,<p>Our premium quality 2LB Depot double glide ...,★ SHOWER CURTAIN RINGS HOOKS ★ 2LB Depot premi...,2 Lb. Depot,Oil Rubbed Bronze,us,train


In [None]:
# Hand the products frame (now carrying `split`) to the project helper under
# the "products" config.
# NOTE(review): the helper lives in utils; presumably it builds a dataset and
# uploads it to `repo_id` — confirm there.
create_and_push_dataset(
    df=df_products,
    repo_id=repo_id,
    # Read from the environment so the token never appears in the notebook;
    # raises KeyError if HF_TOKEN is not set.
    hf_token=os.environ["HF_TOKEN"],
    config_name="products",
)

In [10]:
# Remove the name from the notebook namespace; no later cell shown here refers
# to the products frame.
del df_products

---

## Sources

In [11]:
# Read the sources CSV into memory and report its size
# (thousands separated by underscores).
df_sources = pd.read_csv(dataset_sources)
print(f"Number of rows in sources: {df_sources.shape[0]:_}")

Number of rows in sources: 130_652


In [None]:
# List the distinct values of the `source` column.
df_sources["source"].unique()

array(['other', 'negations', 'behavioral', 'parse_pattern', 'nlqec'],
      dtype=object)

In [12]:
# Show the first five rows of the sources table.
df_sources.head(n=5)

Unnamed: 0,query_id,source
0,0,other
1,1,negations
2,2,negations
3,3,negations
4,4,behavioral


In [13]:
# Attach a `split` column to every source row by looking up its query_id in
# the examples frame.
df_sources = pd.merge(
    # Drop any `split` left by a previous run so re-executing this cell does
    # not produce split_x / split_y columns.
    df_sources.drop(columns="split", errors="ignore"),
    # One row per query_id keeps the left join from multiplying source rows.
    df_examples[["query_id", "split"]].drop_duplicates(subset=["query_id"]),
    on="query_id",
    how="left",  # queries absent from the examples keep a missing split
    validate="many_to_one",  # raise if the right-hand keys are ever not unique
)
# The label names the frame actually being counted (sources, not products).
print(f"Number of rows in sources with split: {len(df_sources):_}")
df_sources.sample(5)

Number of rows in products with split: 130_652


Unnamed: 0,query_id,source,split
118736,118736,other,train
59166,59166,other,train
22648,22648,negations,test
100973,100973,other,train
120314,120314,other,test


In [None]:
# Hand the sources frame (now carrying `split`) to the project helper under
# the "sources" config.
# NOTE(review): the helper lives in utils; presumably it builds a dataset and
# uploads it to `repo_id` — confirm there.
create_and_push_dataset(
    df=df_sources,
    repo_id=repo_id,
    # Read from the environment so the token never appears in the notebook;
    # raises KeyError if HF_TOKEN is not set.
    hf_token=os.environ["HF_TOKEN"],
    config_name="sources",
)