In [None]:
!sudo apt-get install git-lfs -y

!git clone https://github.com/amazon-science/esci-data.git

In [None]:
!ls esci-data/shopping_queries_dataset

In [1]:
!pip install langdetect  pandarallel  chardet --quiet


[0m

In [2]:
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
import pandas as pd
from langdetect import detect, detect_langs
from langdetect import detect, LangDetectException
import re
import string
import unicodedata
from unidecode import unidecode
import chardet

In [4]:
!ls -lah esci-data/shopping_queries_dataset/shopping_queries_dataset_examples.parquet

-rw-r--r-- 1 root root 49M Nov 19 15:55 esci-data/shopping_queries_dataset/shopping_queries_dataset_examples.parquet


Load examples, products and sources


In [5]:
df_examples = pd.read_parquet('esci-data/shopping_queries_dataset/shopping_queries_dataset_examples.parquet')
df_products = pd.read_parquet('esci-data/shopping_queries_dataset/shopping_queries_dataset_products.parquet')
df_sources = pd.read_csv("esci-data/shopping_queries_dataset/shopping_queries_dataset_sources.csv")
# df_dataset.head()

In [6]:
df_examples.head()

Unnamed: 0,example_id,query,query_id,product_id,product_locale,esci_label,small_version,large_version,split
0,0,revent 80 cfm,0,B000MOO21W,us,I,0,1,train
1,1,revent 80 cfm,0,B07X3Y6B1V,us,E,0,1,train
2,2,revent 80 cfm,0,B07WDM7MQQ,us,E,0,1,train
3,3,revent 80 cfm,0,B07RH6Z8KW,us,E,0,1,train
4,4,revent 80 cfm,0,B07QJ7WYFQ,us,E,0,1,train


In [7]:
df_sources

Unnamed: 0,query_id,source
0,0,other
1,1,negations
2,2,negations
3,3,negations
4,4,behavioral
...,...,...
130647,130647,other
130648,130648,other
130649,130649,other
130650,130650,other


In [8]:
df_sources['source'].value_counts()

other            113656
negations          6964
parse_pattern      6083
behavioral         3780
nlqec               169
Name: source, dtype: int64

Merge examples with products


In [9]:
df_examples_products = pd.merge(
    df_examples,
    df_products,
    how='left',
    left_on=['product_locale','product_id'],
    right_on=['product_locale', 'product_id']
)



Merge examples with query type


In [10]:
# def isEnglish(s):
#     try:
#         s.encode(encoding='utf-8').decode('ascii')
#     except UnicodeDecodeError:
#         return False
#     else:
#         return True
    
    
# def is_english(s):
#     try:
#         return detect(s) == 'en'
#     except Exception as e:
#         print(( s,e) ) 
#         return False
#     return False

def is_english(text):
    # try:
    #     text = text.encode(encoding='utf-8').decode('ascii')
    # except UnicodeDecodeError as e:
    #     print(e)
    #     return False
    
    if re.search(r"[\u3000-\u303F]", text):
        return False
    
        

    allowed_characters = set(["$", "!", '"', "-", '#',",",".", "%", "'", '’', "’", ")" ,"(","”",'’','$', '”'])
    
    try:
        return all(c.isalnum() or c.isspace() or c in allowed_characters or c in string.punctuation for c in text)
    except:
        return False
    
def is_english(text):
    text = unicodedata.normalize('NFKC', text)
    
    lang = None
    try:
        lang = detect(text)
    except Exception as e:
        print (text, e)

    if lang in ['en','ja','es']:
        return lang
    
    text = unidecode(text)
    try:
        res = text.encode(encoding='utf-8').decode('ascii')
        return "en"
        
    except UnicodeDecodeError:
        pass
    
    
    

    
    return "unknown"


def contains_spanish_accents(text):
    spanish_accents = {'á', 'é', 'í', 'ó', 'ú', 'ñ'}
    return any(char in spanish_accents for char in text)


def contains_japanese(text):
    for char in text:
        if '\u3040' <= char <= '\u309F' or '\u30A0' <= char <= '\u30FF' or '\u4E00' <= char <= '\u9FAF':
            return True
    return False

# Example usage
def is_english(text):
    text = unicodedata.normalize('NFC', text)

    is_japanese = contains_japanese(text)

   
    if is_japanese:
        return "jp"
    
    is_spanish = contains_spanish_accents(text)
    
    if is_spanish:
        return "es"
    

    
    try:
        lang = detect(text)

        if lang =="en":
            return "en"

        #print(lang)

        
        allowed_characters = set(["$", "!", '"', "-", '#',",",".", "%", "'", '’', "’", ")" ,"(","”",'’','$', '”'])
    
        if all(c.isalnum() or c.isspace() or c in allowed_characters or c in string.punctuation for c in text):
            return "en"

        
        #text = unidecode(text)
        lang = detect(text)

        #print(text,lang)
        return lang
    except:
        return "unknown"

    
    
def clean_text(text):
    # Normalize Unicode characters
    text = unicodedata.normalize('NFKC', text)
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def detect_language(text):
    
    is_japanese = contains_japanese(text)
    if is_japanese:
        return "jp"
   
    is_spanish = contains_spanish_accents(text)
    
    if is_spanish:
        return "es"
    
    try:
        res = text.encode(encoding='utf-8').decode('ascii')
        return "en"
        
    except UnicodeDecodeError:
        pass
    
    cleaned_text = clean_text(text)
    try:
        lang = detect(cleaned_text)
        if lang == 'en':
            return 'en'
        elif lang == 'ja':
            return 'ja'
        elif lang == 'es':
            return 'es'
        else:
            return 'unknown'
    except LangDetectException:
        return 'unknown'
    
    return 'unknown'

In [None]:
chardet.detect("1” sink stopper and strainer".encode())

In [None]:
is_english("1” sink stopper and strainer")

In [None]:
is_english("!solid camiseta sin manga"), detect("!solid camiseta sin manga")

In [None]:
is_english( 'alpro soja sin azúcar')

In [None]:
is_english( '32ミリ コテ')

In [None]:
text = "1” sink stopper and strainer"
text = unicodedata.normalize('NFKD', text)
text =unidecode(text)
print(text)
text.encode(encoding='utf-8').decode('ascii')

In [None]:
?unidecode

In [None]:
detect("War doesn't show who's right, just who's left.")

In [None]:
detect_langs("Sink stopper")

In [None]:
df_sources

In [None]:
#df_examples_products_query['is_english'] =  df_examples_products_query['query'].apply(isEnglish)

In [None]:
#detect("hi

In [11]:
#df_query_lang = df_examples_products[df_examples_products ["product_locale"]=="us"]
df_query_lang = df_examples_products[['query','query_id']].drop_duplicates()



In [21]:
df_query_lang = pd.merge(df_query_lang,df_sources, on=['query_id'])

In [22]:
df_query_lang

Unnamed: 0,query,query_id,lang_code,source
0,revent 80 cfm,0,en,other
1,!awnmower tires without rims,1,en,negations
2,!qscreen fence without holes,2,en,negations
3,!solid camiseta sin manga,3,en,negations
4,"""vitamina c""",4,en,behavioral
...,...,...,...,...
130647,ﾎﾙｽﾀｰ,130647,ja,other
130648,ﾏｼﾞｯｸﾘﾝ,130648,ja,other
130649,ﾒｽﾃｨﾝ,130649,ja,other
130650,ﾚﾃﾞｨｰｽ水着,130650,jp,other


In [None]:
df_query_lang

In [12]:
df_query_lang['lang_code'] = df_query_lang['query'].parallel_apply(detect_language)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=16332), Label(value='0 / 16332')))…

In [13]:
df_query_lang['lang_code'].value_counts()

en         112061
jp          16756
es           1676
unknown       145
ja             14
Name: lang_code, dtype: int64

In [14]:
df_query_lang [ df_query_lang['lang_code'].isin(['en']) ].head(500)

Unnamed: 0,query,query_id,lang_code
0,revent 80 cfm,0,en
16,!awnmower tires without rims,1,en
32,!qscreen fence without holes,2,en
70,!solid camiseta sin manga,3,en
109,"""vitamina c""",4,en
...,...,...,...
15163,05 syringe without needle,530,en
15179,05 tahoe side mirror not heated,531,en
15194,05 toyota tacoma grill guard,532,en
15210,05 vw passat window regulator without motor set,533,en


In [15]:
df_query_lang [ ~df_query_lang['lang_code'].isin(['en','jp','es','jp']) ]

Unnamed: 0,query,query_id,lang_code
8441,- it’s not you (sara eckel),281,unknown
17515,1 1/2” wide mens belt without buckle,626,unknown
17906,1 1/4” npt,637,unknown
29474,1/18 ｲｸﾞﾆｯｼｮﾝﾓﾃﾞﾙ ｿｱﾗ,1041,ja
46573,10” jack skellingtom funko,1671,unknown
...,...,...,...
2621136,ﾊﾞﾀｰｹｰｽ,130644,ja
2621208,ﾎﾙｽﾀｰ,130647,ja
2621224,ﾏｼﾞｯｸﾘﾝ,130648,ja
2621240,ﾒｽﾃｨﾝ,130649,ja


In [26]:
df_query_lang_en = df_query_lang[ df_query_lang['lang_code'] =="en" ]

In [27]:
df_examples_products_query = pd.merge(
    df_examples_products,
    df_query_lang_en,
    how='inner',
    on=['query_id','query']
)


In [28]:
df_final = df_examples_products_query\
.pipe(lambda x: x[x.product_locale =="us"]) \
.replace({"esci_label": {'E':'Exact' ,'S':'Substitute', 'C':'Complement', 'I':'Irrelevant' } })\
.rename (columns = {'esci_label':'relevance_label','source':'query_type'} ) \
.drop(columns = ['example_id','query_id','product_locale','small_version','large_version','split','lang_code'] )

df_final

Unnamed: 0,query,product_id,relevance_label,product_title,product_description,product_bullet_point,product_brand,product_color,query_type
0,revent 80 cfm,B000MOO21W,Irrelevant,Panasonic FV-20VQ3 WhisperCeiling 190 CFM Ceil...,,WhisperCeiling fans feature a totally enclosed...,Panasonic,White,other
1,revent 80 cfm,B07X3Y6B1V,Exact,Homewerks 7141-80 Bathroom Fan Integrated LED ...,,OUTSTANDING PERFORMANCE: This Homewerk's bath ...,Homewerks,80 CFM,other
2,revent 80 cfm,B07WDM7MQQ,Exact,Homewerks 7140-80 Bathroom Fan Ceiling Mount E...,,OUTSTANDING PERFORMANCE: This Homewerk's bath ...,Homewerks,White,other
3,revent 80 cfm,B07RH6Z8KW,Exact,Delta Electronics RAD80L BreezRadiance 80 CFM ...,This pre-owned or refurbished product has been...,Quiet operation at 1.5 sones\nBuilt-in thermos...,DELTA ELECTRONICS (AMERICAS) LTD.,White,other
4,revent 80 cfm,B07QJ7WYFQ,Exact,Panasonic FV-08VRE2 Ventilation Fan with Reces...,,The design solution for Fan/light combinations...,Panasonic,White,other
...,...,...,...,...,...,...,...,...,...
2161585,• bradley’s neurology in clinical practice,0323377092,Exact,Neurology Self-Assessment: A Companion to Brad...,,Elsevier,Elsevier,,other
2161586,• bradley’s neurology in clinical practice,0323287832,Exact,"Bradley's Neurology in Clinical Practice, 2-Vo...",,Elsevier,Elsevier,,other
2161587,• bradley’s neurology in clinical practice,0198566344,Exact,Autonomic Failure: A Textbook of Clinical Diso...,,,Oxford University Press,,other
2161588,• bradley’s neurology in clinical practice,0071845836,Exact,The Hospital Neurology Book,,,McGraw-Hill Education / Medical,,other


In [51]:
df_final.to_parquet("../data/cleaned_input.parquet")

In [52]:
df_final = pd.read_parquet("../data/cleaned_input.parquet")

In [53]:
df_final[['query','query_type']].drop_duplicates().to_csv("../data/only_queries.csv", index=False )

In [30]:
df_final['query_type'].value_counts()

other            1527111
parse_pattern     151704
negations          88638
behavioral         44066
nlqec               4355
Name: query_type, dtype: int64

## Query types

**Behavioral**:  We use several statistics to sample queries leading to
results or purchases with non-representative click distributions.

**Negations** We use several regular expressions to sample queries
with negations. (for e.g., ‘energy bar without nuts’.)

**Parse Pattern** We use several regular expressions on the parsed
query to sample queries with some linguistic complexity,
such as queries containing quantities, a product type with
an adjective, etc. (for e.g., ’gluten free english biscuits’.)

**Price Pattern** We use several statistics to sample queries leading
to results or purchases with non-representative price distributions.

**Other** We sample queries from a number of random query sampling processes, removing those that result in perfect or near
perfect results.

**NLQEC** Queries from the NLQEC dataset [13] with 30 tokens or
less.

In [46]:
list ( df_final [ df_final['query_type']=="other"].drop_duplicates(['query'])['query'].head(100) )

[' revent 80 cfm',
 '# do not disturb',
 '# mom life',
 '# sharp not hashtag shirt',
 '#10 cans applesauce',
 '#10 envelopes self seal',
 '#14 x 1-1/2 stainless self tapping',
 '#15 charm',
 '#2 pencils',
 '#5 machine screws',
 '#5 pull cord',
 '#8 phillips head wood screws',
 '#9x5" wood screws',
 '#stuccoville life without a net',
 '$1',
 '$1 items for men',
 '$1 stuffed toy',
 '$10 blanket',
 '$10 candles',
 '$10 gold eagle',
 '$275 airsoft guns',
 "'grinch ornaments'",
 "'m not judging you i'm a social worker i'm diagnosing you",
 "'m team jesus i'm not religious shirt",
 "'m tired of waking up and not being in hawaii",
 "(can't live without your) love and affection nelson",
 '(i’m not your) steppin’ stone',
 '(usb-c cable and lightning cable not included',
 ') vintage surely not everyone was kung fu fighting',
 '*americanized: rebel without a green card by sara saedi',
 '*i have gone 0 days without making a dad joke* shirt',
 '*its not easy being my wifes arm candy* shirt',
 ', da

In [47]:
list ( df_final [ df_final['query_type']=="parse_pattern"].drop_duplicates(['query'])['query'].head(100) )

['.17 cleaning rod',
 '0-3 month swimsuit boy',
 '0.12 airsoft bbs',
 '00 grease',
 '08 chevy silverado led headlights',
 '1 1/2 inch binders 3 ring',
 '1 3/4 belt buckle',
 '1 blue binder',
 '1 inch mending plate',
 '1 inch paint brushes for acrylic painting',
 '1 inch ribbon satin',
 '1 minute sand timers',
 '1.5 inch carabiner clip',
 '1.5 mortise cylinder',
 '1/18 display case',
 '1/2 inch avery binder',
 '1/2 inch curling iron for hair',
 '1/2 inch high torque impact wrench',
 '1/4 npt male to 1/8 npt female bushing',
 '1/4 socket to bit',
 '1/6 hole punch',
 '1/8 cord',
 '1/8 inch grosgrain ribbon',
 '1/8 inch microphone windscreen',
 '10 deep conditioner',
 '10 exercise ball',
 '10 ft iphone charger with wall plug apple certified',
 '10 ft. x 10 ft. heavy duty tarp',
 '10 gallon hot water heater',
 '10 gifts for women',
 '10 inch fan',
 '10 inflatable pool',
 '10 lb weights',
 '10 liter trash can',
 '10 ounce sports bottle',
 '10 pack extension cord',
 '10 spanish short stories 

In [49]:
list ( df_final [ df_final['query_type']=="negations"].drop_duplicates(['query'])['query'].head(100) )

['!awnmower tires without rims',
 '!qscreen fence without holes',
 '# 10 self-seal envelopes without window',
 '# 2 pencils not sharpened',
 '# cellist thats not a hashtag',
 '#1 best and not expensive bath back brush cream color',
 '#1 black natural hair dye without ammonia or peroxide',
 '#1 rated resveratrol supplement without tea leaves',
 '#1 selling shoes for men without shoeleases',
 '#1 small corded treadmill without remote control',
 '#10 envelopes without security tint',
 '#10 standard no tint no window not self seal',
 '#10 window envelopes not self seal',
 '#10 window envelopes without plastic',
 '#11 mrs. kormel is not normal',
 '#12 black boys chain necklace without baseball stitches',
 '#2 dixon oriole pencils not sharpened',
 '#2 pencils with erasers sharpened not soft',
 '#2 pencils without erasers',
 '#20 paper bags without handle',
 '#3 metal zipper slider not made in america',
 '#4 braiding hair not stretched',
 '#4 pads without wings',
 '#5 coil zipper without lock

In [45]:
list ( df_final [ df_final['query_type']=="behavioral"].drop_duplicates(['query'])['query'].head(100) )

['$5 items',
 '$50 gifts for men',
 '110v led strip lights',
 '12v transformer led light',
 '2 tent person tent with vestibule',
 '20 lb ankle weight set',
 "3'x5' rugs",
 '3d dies for card making',
 '4moms bathtub with thermometer',
 '6xl knee sleeve with straps',
 '7x5 notebook spiral',
 '90fun',
 'a gaming set up',
 'a gift for sister',
 'a history of money and banking in the united states',
 'a ring gold',
 'a-shirts undershirts',
 'a/c thermostats for home',
 'a15 case tactical',
 'a2 cow ghee organic',
 'aaa aa batteries combo',
 'aby bouncer',
 'acura floor mats',
 'ada paper towel dispenser',
 'adam and eve toys for women',
 'adnoc',
 'adult learning linking theory and practice',
 'adulting stickers',
 "aeon's end",
 'after birth sitzbath',
 'airsoft gear and equipment',
 'airxwills',
 'aiyima a07',
 'aiyuan',
 'akibento',
 'akileine',
 'albion chelated magnesium',
 'all superhero lego sets',
 'allulose sweetener',
 'almost anywhere',
 'aloe powder organic',
 'alogic usb-c dock

In [44]:
list ( df_final [ df_final['query_type']=="nlqec"].drop_duplicates(['query'])['query'].tail(100) )

['I want a warm, waterproof coat with a hood. One that zips up. Preferably black or khaki green in colour. ',
 'I want the burgundy hoodie jacket with the two big center pockets',
 'I want the new laptop to have a replaceable battery and to be solid state and ruggedized to deal with bumps and scrapes.',
 'I want to buy a laptop that has a lot of features, and is of good quality.',
 'I want to buy a laptop that is touchscreen and has a long battery time. I also want it to be sleek and lightweight.',
 'I want to buy a powerful laptop to put linux on, so maybe a lenovo or dell with a clear screen and lightweight.',
 'I want to get fancier one than the laptop I had. I think I want one of the ones with a touchscreen that folds in different ways',
 'I would be looking for a laptop that is good value and lightweight with long battery life.',
 'I would buy a baubax travel jacket since I love all the pockets.',
 'I would buy a long coat, like a fitted trench coat? ',
 'I would buy a macbook pro