# About

## Setup

In [1]:
!sudo apt-get install git-lfs -y

#!git clone https://github.com/amazon-science/esci-data.git

Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (2.9.2-1).
0 upgraded, 0 newly installed, 0 to remove and 3 not upgraded.


In [2]:
!ls esci-data/shopping_queries_dataset

shopping_queries_dataset_examples.parquet  shopping_queries_dataset_sources.csv
shopping_queries_dataset_products.parquet


In [3]:
!pip install langdetect  pandarallel  chardet --quiet


[0m

## Imports

In [4]:
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
import pandas as pd
from langdetect import detect, detect_langs
from langdetect import detect, LangDetectException
import re
import string
import unicodedata
from unidecode import unidecode
import chardet

In [6]:
!ls -lah esci-data/shopping_queries_dataset/shopping_queries_dataset_examples.parquet

-rw-r--r-- 1 root root 49M Nov 19 15:55 esci-data/shopping_queries_dataset/shopping_queries_dataset_examples.parquet


Load examples, products and sources


## IO

In [7]:
df_examples = pd.read_parquet('esci-data/shopping_queries_dataset/shopping_queries_dataset_examples.parquet')
df_products = pd.read_parquet('esci-data/shopping_queries_dataset/shopping_queries_dataset_products.parquet')
df_sources = pd.read_csv("esci-data/shopping_queries_dataset/shopping_queries_dataset_sources.csv")
# df_dataset.head()

In [8]:
df_examples.head()

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split
0,0,revent 80 cfm,0,B000MOO21W,us,I,0,1,train
1,1,revent 80 cfm,0,B07X3Y6B1V,us,E,0,1,train
2,2,revent 80 cfm,0,B07WDM7MQQ,us,E,0,1,train
3,3,revent 80 cfm,0,B07RH6Z8KW,us,E,0,1,train
4,4,revent 80 cfm,0,B07QJ7WYFQ,us,E,0,1,train


In [9]:
df_sources

Unnamed: 0,query_id,source
0,0,other
1,1,negations
2,2,negations
3,3,negations
4,4,behavioral
...,...,...
130647,130647,other
130648,130648,other
130649,130649,other
130650,130650,other


In [10]:
df_sources['source'].value_counts()

other            113656
negations          6964
parse_pattern      6083
behavioral         3780
nlqec               169
Name: source, dtype: int64

## Amazon Metadata

In [11]:
df_products_metadata = df_products.drop_duplicates(['product_id'])
# only interested in us items
df_products_metadata = df_products_metadata [ df_products_metadata['product_locale']=="us" ]
df_products_metadata = df_products_metadata[['product_id']]

In [12]:
def get_canonical_product_url(product_id):
    
    url = f"https://www.amazon.com/dp/{product_id}"
    
    return url

def get_canonical_product_image_url(product_id):
    # ref
    # https://www.oreilly.com/library/view/amazon-hacks/0596005423/ch01s07.html
    
    # medium image
    url = f"https://www.amazon.com/dp/{product_id}.01._SCMZZZZZZZ_.jpg"
    # large image
    url = f"https://www.amazon.com/dp/{product_id}.01._SCLZZZZZZZ_.jpg"
    
    
    return url

In [13]:
df_products_metadata['url_product'] = df_products_metadata['product_id'].apply(get_canonical_product_url)
df_products_metadata['url_image'] = df_products_metadata['product_id'].apply(get_canonical_product_image_url)



Merge examples with products


In [14]:
df_examples_products = pd.merge(
    df_examples,
    df_products,
    how='left',
    left_on=['product_locale','product_id'],
    right_on=['product_locale', 'product_id']
)



merge examples with product metadata

In [15]:
df_examples_products = pd.merge(
    df_examples_products,
    df_products_metadata,
    how='inner',
    on=['product_id'],
)


Merge examples with query type


In [16]:
# def isEnglish(s):
#     try:
#         s.encode(encoding='utf-8').decode('ascii')
#     except UnicodeDecodeError:
#         return False
#     else:
#         return True
    
    
# def is_english(s):
#     try:
#         return detect(s) == 'en'
#     except Exception as e:
#         print(( s,e) ) 
#         return False
#     return False

def is_english(text):
    # try:
    #     text = text.encode(encoding='utf-8').decode('ascii')
    # except UnicodeDecodeError as e:
    #     print(e)
    #     return False
    
    if re.search(r"[\u3000-\u303F]", text):
        return False
    
        

    allowed_characters = set(["$", "!", '"', "-", '#',",",".", "%", "'", '’', "’", ")" ,"(","”",'’','$', '”'])
    
    try:
        return all(c.isalnum() or c.isspace() or c in allowed_characters or c in string.punctuation for c in text)
    except:
        return False
    
def is_english(text):
    text = unicodedata.normalize('NFKC', text)
    
    lang = None
    try:
        lang = detect(text)
    except Exception as e:
        print (text, e)

    if lang in ['en','ja','es']:
        return lang
    
    text = unidecode(text)
    try:
        res = text.encode(encoding='utf-8').decode('ascii')
        return "en"
        
    except UnicodeDecodeError:
        pass
    
    
    

    
    return "unknown"


def contains_spanish_accents(text):
    spanish_accents = {'á', 'é', 'í', 'ó', 'ú', 'ñ'}
    return any(char in spanish_accents for char in text)


def contains_japanese(text):
    for char in text:
        if '\u3040' <= char <= '\u309F' or '\u30A0' <= char <= '\u30FF' or '\u4E00' <= char <= '\u9FAF':
            return True
    return False

# Example usage
def is_english(text):
    text = unicodedata.normalize('NFC', text)

    is_japanese = contains_japanese(text)

   
    if is_japanese:
        return "jp"
    
    is_spanish = contains_spanish_accents(text)
    
    if is_spanish:
        return "es"
    

    
    try:
        lang = detect(text)

        if lang =="en":
            return "en"

        #print(lang)

        
        allowed_characters = set(["$", "!", '"', "-", '#',",",".", "%", "'", '’', "’", ")" ,"(","”",'’','$', '”'])
    
        if all(c.isalnum() or c.isspace() or c in allowed_characters or c in string.punctuation for c in text):
            return "en"

        
        #text = unidecode(text)
        lang = detect(text)

        #print(text,lang)
        return lang
    except:
        return "unknown"

    
    
def clean_text(text):
    # Normalize Unicode characters
    text = unicodedata.normalize('NFKC', text)
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def detect_language(text):
    
    is_japanese = contains_japanese(text)
    if is_japanese:
        return "jp"
   
    is_spanish = contains_spanish_accents(text)
    
    if is_spanish:
        return "es"
    
    try:
        res = text.encode(encoding='utf-8').decode('ascii')
        return "en"
        
    except UnicodeDecodeError:
        pass
    
    cleaned_text = clean_text(text)
    try:
        lang = detect(cleaned_text)
        if lang == 'en':
            return 'en'
        elif lang == 'ja':
            return 'ja'
        elif lang == 'es':
            return 'es'
        else:
            return 'unknown'
    except LangDetectException:
        return 'unknown'
    
    return 'unknown'

In [17]:
chardet.detect("1” sink stopper and strainer".encode())

{'encoding': 'Windows-1254',
 'confidence': 0.7374656603237749,
 'language': 'Turkish'}

In [18]:
is_english("1” sink stopper and strainer")

'en'

In [19]:
is_english("!solid camiseta sin manga"), detect("!solid camiseta sin manga")

('en', 'et')

In [20]:
is_english( 'alpro soja sin azúcar')

'es'

In [21]:
is_english( '32ミリ コテ')

'jp'

In [22]:
text = "1” sink stopper and strainer"
text = unicodedata.normalize('NFKD', text)
text =unidecode(text)
print(text)
text.encode(encoding='utf-8').decode('ascii')

1" sink stopper and strainer


'1" sink stopper and strainer'

In [23]:
?unidecode

[0;31mSignature:[0m [0munidecode[0m[0;34m([0m[0mstring[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Transliterate an Unicode object into an ASCII string

>>> unidecode(u"北亰")
"Bei Jing "

This function first tries to convert the string using ASCII codec.
If it fails (because of non-ASCII characters), it falls back to
transliteration using the character tables.

This is approx. five times faster if the string only contains ASCII
characters, but slightly slower than using unidecode directly if non-ASCII
chars are present.
[0;31mFile:[0m      /usr/lib/python3/dist-packages/unidecode/__init__.py
[0;31mType:[0m      function


In [24]:
detect("War doesn't show who's right, just who's left.")

'en'

In [25]:
detect_langs("Sink stopper")

[no:0.756904732631223, af:0.2430950268800618]

In [26]:
df_sources

Unnamed: 0,query_id,source
0,0,other
1,1,negations
2,2,negations
3,3,negations
4,4,behavioral
...,...,...
130647,130647,other
130648,130648,other
130649,130649,other
130650,130650,other


In [27]:
#df_examples_products_query['is_english'] =  df_examples_products_query['query'].apply(isEnglish)

In [28]:
#detect("hi

In [29]:
#df_query_lang = df_examples_products[df_examples_products ["product_locale"]=="us"]
df_query_lang = df_examples_products[['query','query_id']].drop_duplicates()



In [30]:
df_query_lang = pd.merge(df_query_lang,df_sources, on=['query_id'])

In [31]:
df_query_lang

Unnamed: 0,query,query_id,source
0,revent 80 cfm,0,other
1,bathroom fan without light,13723,negations
2,bathroom fan with light,13722,other
3,110cfm bathroom exhaust fan without light,1750,negations
4,12 inch bathroomwall mounted fan,1820,other
...,...,...,...
101831,земфира,115950,other
101832,кроссовки,115951,other
101833,‘master/slave mastery’,115955,other
101834,• bradley’s neurology in clinical practice,115962,other


In [32]:
df_query_lang

Unnamed: 0,query,query_id,source
0,revent 80 cfm,0,other
1,bathroom fan without light,13723,negations
2,bathroom fan with light,13722,other
3,110cfm bathroom exhaust fan without light,1750,negations
4,12 inch bathroomwall mounted fan,1820,other
...,...,...,...
101831,земфира,115950,other
101832,кроссовки,115951,other
101833,‘master/slave mastery’,115955,other
101834,• bradley’s neurology in clinical practice,115962,other


In [33]:
df_query_lang['lang_code'] = df_query_lang['query'].parallel_apply(detect_language)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=12730), Label(value='0 / 12730')))…

In [34]:
df_query_lang['lang_code'].value_counts()

en         99604
jp          1845
es           281
unknown      101
ja             5
Name: lang_code, dtype: int64

In [35]:
df_query_lang [ df_query_lang['lang_code'].isin(['en']) ].head(500)

Unnamed: 0,query,query_id,source,lang_code
0,revent 80 cfm,0,other,en
1,bathroom fan without light,13723,negations,en
2,bathroom fan with light,13722,other,en
3,110cfm bathroom exhaust fan without light,1750,negations,en
4,12 inch bathroomwall mounted fan,1820,other,en
...,...,...,...,...
497,"My new laptop would be silver, have the best s...",5742,nlqec,en
498,eat veggies not friends,36529,negations,en
499,eating for 2 i'm not pregnant shirt,36532,negations,en
500,emos not dead,37558,negations,en


In [36]:
df_query_lang [ ~df_query_lang['lang_code'].isin(['en','jp','es','jp']) ]

Unnamed: 0,query,query_id,source,lang_code
142,bench dog® 3-piece safety kit,14901,other,unknown
238,"9 x 6 1⁄2 x 2 3⁄4"" box",5468,other,unknown
834,wide women’s shoes,111540,other,unknown
1324,bronze 1” screw,19920,other,unknown
5307,men’s deodorant,68087,other,unknown
...,...,...,...,...
99798,skate men’s,93929,other,unknown
101800,zerøgrand hiker boot,115669,other,unknown
101831,земфира,115950,other,unknown
101832,кроссовки,115951,other,unknown


In [37]:
df_query_lang_en = df_query_lang[ df_query_lang['lang_code'] =="en" ]

In [38]:
df_examples_products_query = pd.merge(
    df_examples_products,
    df_query_lang_en,
    how='inner',
    on=['query_id','query']
)


In [39]:
df_final = df_examples_products_query\
.pipe(lambda x: x[x.product_locale =="us"]) \
.replace({"esci_label": {'E':'Exact' ,'S':'Substitute', 'C':'Complement', 'I':'Irrelevant' } })\
.rename (columns = {'esci_label':'relevance_label','source':'query_type'} ) \
.drop(columns = ['example_id','query_id','product_locale','small_version','large_version','split','lang_code'] )

df_final

Unnamed: 0,query,product_id,relevance_label,product_title,product_description,product_bullet_point,product_brand,product_color,url_product,url_image,query_type
0,revent 80 cfm,B000MOO21W,Irrelevant,Panasonic FV-20VQ3 WhisperCeiling 190 CFM Ceil...,,WhisperCeiling fans feature a totally enclosed...,Panasonic,White,https://www.amazon.com/dp/B000MOO21W,https://www.amazon.com/dp/B000MOO21W.01._SCLZZ...,other
1,revent 80 cfm,B07X3Y6B1V,Exact,Homewerks 7141-80 Bathroom Fan Integrated LED ...,,OUTSTANDING PERFORMANCE: This Homewerk's bath ...,Homewerks,80 CFM,https://www.amazon.com/dp/B07X3Y6B1V,https://www.amazon.com/dp/B07X3Y6B1V.01._SCLZZ...,other
2,revent 80 cfm,B07WDM7MQQ,Exact,Homewerks 7140-80 Bathroom Fan Ceiling Mount E...,,OUTSTANDING PERFORMANCE: This Homewerk's bath ...,Homewerks,White,https://www.amazon.com/dp/B07WDM7MQQ,https://www.amazon.com/dp/B07WDM7MQQ.01._SCLZZ...,other
3,revent 80 cfm,B07RH6Z8KW,Exact,Delta Electronics RAD80L BreezRadiance 80 CFM ...,This pre-owned or refurbished product has been...,Quiet operation at 1.5 sones\nBuilt-in thermos...,DELTA ELECTRONICS (AMERICAS) LTD.,White,https://www.amazon.com/dp/B07RH6Z8KW,https://www.amazon.com/dp/B07RH6Z8KW.01._SCLZZ...,other
4,revent 80 cfm,B07QJ7WYFQ,Exact,Panasonic FV-08VRE2 Ventilation Fan with Reces...,,The design solution for Fan/light combinations...,Panasonic,White,https://www.amazon.com/dp/B07QJ7WYFQ,https://www.amazon.com/dp/B07QJ7WYFQ.01._SCLZZ...,other
...,...,...,...,...,...,...,...,...,...,...,...
1795033,• bradley’s neurology in clinical practice,0323377092,Exact,Neurology Self-Assessment: A Companion to Brad...,,Elsevier,Elsevier,,https://www.amazon.com/dp/0323377092,https://www.amazon.com/dp/0323377092.01._SCLZZ...,other
1795034,• bradley’s neurology in clinical practice,0323287832,Exact,"Bradley's Neurology in Clinical Practice, 2-Vo...",,Elsevier,Elsevier,,https://www.amazon.com/dp/0323287832,https://www.amazon.com/dp/0323287832.01._SCLZZ...,other
1795035,• bradley’s neurology in clinical practice,0198566344,Exact,Autonomic Failure: A Textbook of Clinical Diso...,,,Oxford University Press,,https://www.amazon.com/dp/0198566344,https://www.amazon.com/dp/0198566344.01._SCLZZ...,other
1795036,• bradley’s neurology in clinical practice,0071845836,Exact,The Hospital Neurology Book,,,McGraw-Hill Education / Medical,,https://www.amazon.com/dp/0071845836,https://www.amazon.com/dp/0071845836.01._SCLZZ...,other


In [40]:
df_final.to_parquet("../data/cleaned_input.parquet")

In [41]:
df_final = pd.read_parquet("../data/cleaned_input.parquet")

In [42]:
df_final[['query','query_type']].drop_duplicates().to_csv("../data/only_queries.csv", index=False )

In [43]:
df_final['query_type'].value_counts()

other            1504918
parse_pattern     150453
negations          87817
behavioral         43606
nlqec               4239
Name: query_type, dtype: int64

## Query types

**Behavioral**:  We use several statistics to sample queries leading to
results or purchases with non-representative click distributions.

**Negations** We use several regular expressions to sample queries
with negations. (for e.g., ‘energy bar without nuts’.)

**Parse Pattern** We use several regular expressions on the parsed
query to sample queries with some linguistic complexity,
such as queries containing quantities, a product type with
an adjective, etc. (for e.g., ’gluten free english biscuits’.)

**Price Pattern** We use several statistics to sample queries leading
to results or purchases with non-representative price distributions.

**Other** We sample queries from a number of random query sampling processes, removing those that result in perfect or near
perfect results.

**NLQEC** Queries from the NLQEC dataset [13] with 30 tokens or
less.

In [44]:
list ( df_final [ df_final['query_type']=="other"].drop_duplicates(['query'])['query'].head(100) )

[' revent 80 cfm',
 'bathroom fan with light',
 '12 inch bathroomwall mounted fan',
 'bathroom fan',
 'bathroom fan quiet',
 'fv relay',
 'heat recovery ventilator',
 'quiet bathroom exhaust fan drop ceiling',
 '4 wheel go kart',
 'lawn mower tires',
 'dirt bike tools',
 '23x10.5 tractor intertubes',
 'lawn mower trailer tires and wheels 5/8 bearings',
 'riding mowers clearance',
 'honda versattach',
 'tires 6in',
 'char broil grill wheel',
 'havana lawn mower',
 'honda lawn mower',
 'honda lawnmower',
 'landmowers',
 'lawn mower',
 'no engine pish mower',
 'reel mower with grass catcher',
 'hoop house garden',
 'plants shade cover',
 'half fence wood',
 'metal gates fencing',
 'out door dog fence',
 'wooden fencing material for yards privacy',
 'wrought iron fence panels',
 'black screen covering',
 'privacy mesh fence',
 'greenhouse dome',
 'propogator',
 'reduce reuse starter kit',
 'starter trey',
 'bamboo fencing',
 'chicken coop tarp',
 'fences',
 'outdoor wall covering',
 'priva

In [45]:
list ( df_final [ df_final['query_type']=="parse_pattern"].drop_duplicates(['query'])['query'].head(100) )

['household ventilation fans',
 'outside screen for patio',
 '19 inch monitor hdmi',
 '8 panel play yard',
 'free standing gates for kids or pets',
 'ware small animal playpen',
 'manilla envelopes 10x13',
 'classroom friendly supplies pencil sharpener',
 'parting crowd black friday shirt',
 'the all american rejects shirt',
 'golden doodles merchandise funny',
 'i got my fauci ouchi shirt',
 'morbid podcast merch',
 'taylor ham pork roll',
 'tony ferguson shirt',
 'wkrp in cincinnati t shirt',
 'work hard stay humble shirt',
 'zootopia tee shirt',
 'dinosaur shirt adult',
 'alkaline trio is this thing cursed',
 'dadorlian shirt',
 'ming clan',
 'seven oceans food',
 'dead foot skin',
 'shaving powder magic shave',
 'zen dry brush',
 'bath aids for disabled and elderly',
 'base border cleaner tool',
 'dremel cleaning tool',
 'esponjas para pulir',
 'beauty products for women',
 'wet skin sunscreen',
 'season blue medicated shampoo',
 'brazilian crush perfume',
 'jergens medium to tan',

In [46]:
list ( df_final [ df_final['query_type']=="negations"].drop_duplicates(['query'])['query'].head(100) )

['bathroom fan without light',
 '110cfm bathroom exhaust fan without light',
 '!awnmower tires without rims',
 'lawn mower without motor',
 '!qscreen fence without holes',
 '10 plastic pot without drain',
 '11 x 21 plant trays for seedlings without holes',
 '$12 label maker that’s not a cheap one',
 "1 20 volt extension cord no it's not what i",
 'do not water plants sign',
 'laughing without an accent by firoozeh dumas',
 '00m vesa mounting pattern mounting bracket not included',
 '100 days without sunlight book',
 '2 be or not 2b',
 'monitor mount without holes',
 '100 inchfoot baby gate without attached',
 '100” foot baby gate without attached',
 '.when you walk thru the fire you will not be burned',
 '(8 pck, one without packaging ) umbra single',
 'outdoor solar lights without motion sensor',
 'solar light without motion sensor',
 'solar lights outdoor without motion sensor',
 '# 10 self-seal envelopes without window',
 '#10 envelopes without security tint',
 '#10 window envelopes

In [47]:
list ( df_final [ df_final['query_type']=="behavioral"].drop_duplicates(['query'])['query'].head(100) )

['broan replacement parts',
 'small wheel barrel type tires',
 'ztr mower lift',
 'zk g1',
 'fear factor games',
 'solar led emergency lights',
 'flower text shirt',
 'as strong as the woman next to me t-shirt',
 'dont sit on my lunch',
 'i choose freedom',
 'turbuhaler',
 'drink your water right meow',
 'oh my my hael',
 "this is marketing you can't be seen until you learn to see",
 'your mothers story',
 'kamea exfoliating foot cream',
 'back exfoliator brush',
 'tecnu wipes',
 'kingslover, barbara',
 'tub stopper and hair catcher',
 'beach must haves for family',
 'me! bath',
 'circ02 oxygen booster and circulation support',
 'supplements to reduce inflammation in the body',
 'slip on walking shoes for men',
 'trending mens fashion',
 'turning shoe',
 'karundul',
 'electolyte drops',
 'frescolita',
 'kalm assure',
 'uti over the counter',
 'flora q probiotic',
 'kyo-dophilus',
 'mccoy pills',
 'megaflora probiotic',
 'phresh probiotics for women',
 'prohormones stack',
 'saccharomyc

In [48]:
list ( df_final [ df_final['query_type']=="nlqec"].drop_duplicates(['query'])['query'].tail(100) )

['i would be buying an asus laptop or another brand of similar quality.',
 'I would buy a vintage jean jacket, with a shearling lined interior. ',
 'I would like a long black asymmetrical hoodie jacket. Totally goth looking and awesome enough to wear over a dress or a pair of jeans ',
 "I'd go with a leather jacket with good protection for use on a motorcycle. Elbow/Shoulder armor is not necessary, but having back protection would be great.",
 'Not sure. Maybe a a standard size (15 or 17??) screen middle of the range laptop that is either black or purple. It might be a Dell or Acer ',
 'i am looking for an olive green fur coat that has a cheetah print on it. it is a long jacket with buttons. ',
 'I would choose a laptop with a good screen, large RAM, great processor, and a good video card.',
 'A laptop that is light weight and with high specifications which I could use for working when at home or when travelling.',
 'I want a black leather jacket with gold zips and pocket buttons',
 'H