In [13]:
# !pip install pyarrow fastparquet

In [14]:
# !pip install pandas
# !pip install  pandarallel mitosheet --quiet
# !pip install nltk --quiet

In [3]:
import pandas as pd
from collections import Counter
import itertools
import mitosheet
from nltk.stem import PorterStemmer
import nltk
from nltk.corpus import stopwords

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', None)

In [5]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/rky/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/rky/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [7]:
stemmer = PorterStemmer()
stops = set(stopwords.words('english'))

In [15]:
df = pd.read_parquet("../data/cleaned_input.parquet")

In [16]:
df_product = df[['product_id','product_title','url_product','url_image']].drop_duplicates(['product_id']).reset_index(drop=True)
df_product

df.shape

(1790990, 11)

In [17]:
def normalize_and_stem(query):
    words = nltk.word_tokenize(query.lower())
    return [stemmer.stem(word) for word in words]
    
def query_title_similarity(query,title):
    score = 0
    
    query = normalize_and_stem(query)
    title = normalize_and_stem(title)
    
    title_words = set(title)
    query_words = set(query)
    
    for word in query_words:
        if word not in stops and word not in title:
            score+=1
            
    return score

In [18]:
df_res = df [ df['relevance_label']=="Exact" ]

df_res = df_res.groupby(['product_id'],as_index=False).agg(queries=('query',pd.Series.unique), num_queries=('query',len ), title=('product_title', "first") )

df_res = df_res [ df_res['num_queries'] >= 2 ]

In [28]:
df_res

Unnamed: 0,product_id,queries,num_queries,title
16,0060245867,"[best childrens books by age 1-3, if you give a pig the white house]",2,If You Give a Mouse a Cookie
18,0060256656,"[children books by age 1-3, best childrens books by age 1-3, children's books ages 3-5, best books for kids]",4,The Giving Tree
19,0060256672,"[best books for kids, rosie revere engineer, classic poetry books]",3,Where the Sidewalk Ends: Poems and Drawings
23,0060555661,"[forex trading for dummies, the interpretation of financial statements benjamin graham]",2,The Intelligent Investor Rev Ed.: The Definitive Book on Value Investing
35,0060935464,"[book of lists 2, give thanks to the lord, pbs great american read]",3,To Kill a Mockingbird
...,...,...,...,...
109609,B09FK9DFL1,"[watch tv without cable, indoor antenna for tv without cable]",2,"Long Range Signal Booster TV 6.35mm Mono 1/4"" Male Stereo to XLR Male KTV,Microphone,fa2762"
109628,B09FX4C7SJ,"[amplified tv antenna outdoor, digital tv antenna outdoor]",2,HDTV Antenna
109645,B09G76JHXL,"[didlo for women realistic thick, didos toys for women realistic thick]",2,Ðịllo Tọy for Womeṇ Bịg - Ðịdlọ for Womeṇ Pleasụre - Ðịdos for Sẹx Men Sụctioṇ - Tọys Adụlt Pọrtablẹ Sẹx Reạlistic Pleasụre G Bạlls Begiṇners
109665,B09GPFLZ4W,"[pennis extender sleeves realistic, girth enhancer for men]",2,Reusable Male Péňîs Cöndöm Ḉöck ḊîḈk Péňnîs Sléévé 7 Inch Adúlt Šëx Tõÿs Pénnǐs Extension Sleeve for Men-Realistic-Condom-Thick-Girth-Enhancer-Enlarger-Extender-Growth K22


In [29]:

df_res [ df_res['num_queries'] > 3 ]

Unnamed: 0,product_id,queries,num_queries,title
18,0060256656,"[children books by age 1-3, best childrens books by age 1-3, children's books ages 3-5, best books for kids]",4,The Giving Tree
49,0061992275,"[books for 4th grade boys, best books for kids, tuesday mooney talks to ghosts, award winning childrens books ages 9-12]",4,The One and Only Ivan
90,0062377027,"[books for 12 year old girls best sellers, books for 4th grade boys, best books for kids, books for boys age 8, nature books for kids 5-7]",5,Pax
138,0062841742,"[children books by age 1-3, children’s board books, best childrens books by age 1-3, best selling books for toddlers, picture books for babies, celebrity childrens books]",6,I've Loved You Since Forever
355,030797586X,"[book pack for 3 year olds, best selling books for toddlers, books for 2 year olds, dr suess books for babies]",4,The Little Blue Box of Bright and Early Board Books by Dr. Seuss (Bright & Early Board Books(TM))
...,...,...,...,...
108877,B096GR9NX9,"[blood sugar monitor without finger pricks, diabetes test without blood, sugar tester diabetes kit without needle, test blood sugar without pricking your finger, diabetic testing kit without blood]",5,"eTouch ETM-G01 World's First Noninvasive Blood Glucose MonitorMeter, White, 1 Count (Pack of 1)"
108884,B096KBJQQL,"[bathtub mats without suction cups, bath mat without suction cups, tub mat without suction cups, bath mat for tub without suction cups]",4,"Non-Slip Bathtub Mat PVC Loofah Bath Mat for Tub Comfort Shower Tub Mat for Wet Areas, Quick Drying Soft Anti-Skid Bathroom Mats DIY Cutting (S)"
109187,B098JS3C8X,"[A standard new laptop that is budget friendly and which has internet access, a windows operating system, large clean screen and user friendly , I would choose a laptop that has a lot of memory storage, a long battery life, a decent screen size and in a nice colour. , An everyday laptop for surfing the net and emails and not too expensive., A lightweight one with long battery life and quick processor with adequate storage. One that can be transported easily with at least a 13 inch screen and backlit keyboard., Anything with high power capabilities/a gaming laptop.]",5,"TECLAST 15.6” Windows 10 Laptop Computer, 8GB+256GB SSD, Up to 2.6GHz Quad Core Intel N4120 Windows Laptop, 1920x1080 Traditional Laptop 2.4G+5G WiFi, Bluetooth Mini-HDMI for Work and Entertainment"
109199,B098N7KWTB,"[shower mat without suction cups for textured tub surface, bathtub mats non slip mildew resistant without suction cups, bathtub mats without suction cups, bath mat without suction cups, tub mat without suction cups, bath mat for tub without suction cups]",6,"Bath Tub Slip Mat, 16x24 Inch Non Slip Bath Mat for Tub, Shower Mats for Showers Anti Slip , PVC Quick Drying Soft Comfort Bathroom Mats for Wet Areas or Floor, Grey"


In [30]:
df_res = df [ df['relevance_label']=="Exact" ]

df_res['new_tokens'] = df_res.parallel_apply(lambda x: query_title_similarity(x['query'] , x['product_title']), axis=1)

df_res  =df_res [ df_res['new_tokens']>2 ]

df_res = df_res.groupby(['product_id'],as_index=False).agg(queries=('query',pd.Series.unique), num_queries=('query',len ), title=('product_title', "first") )
df_res = df_res [ df_res['num_queries'] >= 2 ]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=122967), Label(value='0 / 122967')…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_res['new_tokens'] = df_res.parallel_apply(lambda x: query_title_similarity(x['query'] , x['product_title']), axis=1)


In [31]:
df_res.sample(100)[['product_id','queries','title']]

Unnamed: 0,product_id,queries,title
96471,B0828PS47L,"[shelves without screws, wall shelves without nails]","KES Small Shelf for Wall 8 Inches Floating Shelf with Charger Cable Hole for Bluetooth Speaker Smart Speaker Aluminum 2 Pack Matte Black, BSC410S20DG-BK-P2"
46276,B01NCEXCWO,"[smart wool socks pack, smart wool socks womens medium]",Smartwool Women's Hike Ultra Light Crew Glacial Blue M
104230,B08G11Y2XK,"[breathable face masks for women, vented face mask]","5 Pack colors Protective Covers with 10 Carbon Filter,Washable Reusable Cotton Blend Protection Cover with breathing valve"
36116,B01AK991SC,"[mens sweatpants open bottom with pockets, sweat pants men streight leg, mens open bottom sweatpants]","Hanes Men's Jersey Pant, Black, X-Large"
8195,B000UVVX28,"[bajas para el estomago para mujer, stomach fat burner belt for men]","McDavid Waist Trimmer Belt Neoprene Fat Burning Sauna Waist Trainer - Promotes Healthy Sweat, Weight Loss, Lower Back Posture (Includes 1 Belt) , Black"
...,...,...,...
79180,B07NLKD4FY,"[pacman video game machine, pacman arcade game, pinball machine pacman]",ARCADE1UP Classic Cabinet Riser (Pac-Man)
49306,B071JQ6LCC,"[kids smart watch, smart watch for kids, kids watches girls, kids fitness tracker for girls, tech smart friends, kid smart watches, kids phone watch]","VTech KidiZoom Smartwatch DX2, Pink"
69264,B07GG1RMGB,"[18 mo boys long sleeve, 18 month long sleeve onesie boy]","Gerber Baby Multi-Pack Long-Sleeve Onesies Bodysuit, 6-Pack White, 18 Months"
31019,B00VWM6DT2,"[xl fanny pack for plus size, women belt bag, belt bag for women]","Herschel Fifteen Waist Pack, Black, 2.0L"


In [23]:
# !pip install langchain

In [None]:
# from langchain.llms import OpenAI
# from langchain.prompts import PromptTemplate

In [38]:
# !pip install openai

In [30]:
from typing import List

In [31]:
from langchain.schema import StrOutputParser
from langchain.cache import SQLiteCache
from langchain.globals import set_llm_cache
from langchain.callbacks import get_openai_callback

In [32]:
set_llm_cache(SQLiteCache(database_path=".langchain.db"))

In [47]:
from langchain.schema import BaseOutputParser
import json
class CommaSeparatedListOutputParser(BaseOutputParser[List[str]]):
    """Parse the output of an LLM call to a comma-separated list."""


    def parse(self, text: str) -> [str]:
        """Parse the output of an LLM call."""
        llm = text.split('\n')
        llm = [l[4:-1] for l  in llm]
        return llm

from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import ChatPromptTemplate

class GetQueriesFromLLM:
    
    def __init__(self,api_key, model='gpt-4'):
        self.template = "You are a helpful assistant that generates the queries needed to find products in an ecommerce."
        self.human_template = "Given two queries to find {product_title} are {sample_query_1} and {sample_query_2}, write {output_queries_size} more similar customer search queries."
        self.chat_prompt = ChatPromptTemplate.from_messages([
            ("system", self.template),
            ("human", self.human_template),
        ])
        self.chain = self.chat_prompt | ChatOpenAI(openai_api_key=api_key, model=model) | CommaSeparatedListOutputParser()

    def get_top_n_queries(self,product_title, input_queries, return_query_size=2):
        """
        Takes product title and two input queries. 
        """
        response = self.chain.invoke({"product_title": product_title, 
                                      "sample_query_1": input_queries[0], 
                                      "sample_query_2": input_queries[1], 
                                      "output_queries_size": return_query_size})
        
        return response
        

# chain = chat_prompt | ChatOpenAI(openai_api_key='') | CommaSeparatedListOutputParser()

In [48]:
object_LLM = GetQueriesFromLLM(api_key='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')

In [49]:
title = 'Leg Avenue Women\'s Costume, Black, Medium'
input_queries = ['batwoman costume accessories adult', 'halloween sexy bodysuit']
res = object_LLM.get_top_n_queries(title,input_queries,10)

In [50]:
title

"Leg Avenue Women's Costume, Black, Medium"

In [51]:
res

["Leg Avenue Women's Batwoman Costume Medium Size",
 "Women's Black Medium Costume by Leg Avenue",
 'Medium Size Sexy Halloween Bodysuit for Women',
 'Batwoman Adult Costume Accessories, Medium Size',
 'Black Leg Avenue Costume for Women in Medium',
 "Medium Size Leg Avenue Women's Halloween Costume",
 'Leg Avenue Batwoman Outfit in Medium Black',
 'Black Batwoman Costume by Leg Avenue, Medium',
 'Sexy Bodysuit Halloween Costume Leg Avenue Medium',
 '"Women\'s Medium Size Black Costume from Leg Avenue']

In [40]:
df_sample = df_res.sample(1000)[['product_id','queries','title']]
df_sample.set_index('product_id',inplace=True)
sample_dict = df_sample.to_dict(orient='index')


In [41]:
final_dict = dict()

In [44]:
counter = 0
for k,v in sample_dict.items():
    if k not in final_dict:
        final_dict[k] = v
        final_dict[k]['llm'] = object_LLM.get_top_n_queries(v['title'],v['queries'], 4)
    counter +=1
    if counter%50==0:
        print(counter)
    

50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000


In [28]:
len(final_dict)

NameError: name 'final_dict' is not defined

In [60]:
import pickle

# save dictionary to person_data.pkl file
with open('llm_amazon_data_4_vals.pkl', 'wb') as fp:
    pickle.dump(final_dict, fp)
    print('dictionary saved successfully to file')

dictionary saved successfully to file


In [1]:
import pickle

# Read dictionary pkl file
with open('llm_amazon_data_4_vals.pkl', 'rb') as fp:
    another_final_dict = pickle.load(fp)

In [15]:
errors = []

In [16]:
for k,v in another_final_dict.items():
    try:
        llm = v['llm'].split('\n')
        llm = [l[4:-1] for l in llm]
        another_final_dict[k]['llm_updated'] = llm
    except:
        errors.append(k)

In [50]:
1

1

In [24]:
df_output = pd.DataFrame.from_dict(another_final_dict, orient='index').reset_index().rename(columns={'index':'product_id'})

In [27]:
df_output.to_parquet('cleaned_llm_output.parquet',index=False)

In [29]:
df_output

Unnamed: 0,product_id,queries,title,llm,llm_updated
0,B076C296F8,"[faux fur bedspread, white furry bed set, bed spreads fluffy]","Ceruleanhome 1pc 100% Velvet Flannel Duvet Cover, Solid Color, No Inside Filler, Zipper Close (Queen 1pc Duvet Cover, White)","1. ""Ceruleanhome white velvet flannel duvet cover""\n2. ""Queen size white zippered duvet cover""\n3. ""Solid color velvet flannel bedding""\n4. ""No filler white duvet cover""","[Ceruleanhome white velvet flannel duvet cover, Queen size white zippered duvet cover, Solid color velvet flannel bedding, No filler white duvet cover]"
1,B07T1FHFDN,"[10 set earbuds without plastic ear caps, otium bluetooth replacement eartips]",Earbudz 10 Pairs Medium Silicone Replacement Earbud Ear Buds Tips – Black,"1. ""Medium size silicone earbud replacements in black by Earbudz""\n2. ""10 pairs of Earbudz replacement ear tips in black""\n3. ""Black silicone earbud tips without plastic caps from Earbudz""\n4. ""Replacement ear tips for Otium Bluetooth earbuds""","[Medium size silicone earbud replacements in black by Earbudz, 10 pairs of Earbudz replacement ear tips in black, Black silicone earbud tips without plastic caps from Earbudz, Replacement ear tips for Otium Bluetooth earbuds]"
2,B085SVMMVJ,"[air conditioner wall mounted unit, window air conditioner]","TCL 6W3ER1-A Home Series Window-air-Conditioner, 6,000 BTU, White","1. ""TCL 6000 BTU Home Series air conditioner""\n2. ""White window-mounted air conditioner TCL 6W3ER1-A""\n3. ""TCL 6W3ER1-A Home Series 6,000 BTU air conditioner""\n4. ""TCL Home Series White window air conditioner unit""","[TCL 6000 BTU Home Series air conditioner, White window-mounted air conditioner TCL 6W3ER1-A, TCL 6W3ER1-A Home Series 6,000 BTU air conditioner, TCL Home Series White window air conditioner unit]"
3,B0030BEPPW,"[small kitchen shelf for amazon show, shelving unit 24 inches wide x 12 inches deep, metal storage rack, regency chrome two basket and one shelf cart]",5-Tier Chrome Heavy-Duty Adjustable Shelving Unit with 200-lb Per Shelf Weight Capacity,"1. ""Heavy-duty chrome shelving unit with 200-lb shelf capacity""\n2. ""5-Tier adjustable storage rack for kitchen on Amazon""\n3. ""200-lb capacity shelving unit for small kitchen""\n4. ""24x12 inch heavy-duty chrome shelving unit"".","[Heavy-duty chrome shelving unit with 200-lb shelf capacity, 5-Tier adjustable storage rack for kitchen on Amazon, 200-lb capacity shelving unit for small kitchen, 24x12 inch heavy-duty chrome shelving unit""]"
4,B000KKOKIS,"[bromine test strips for hot tubs, hot tub test kit]","Poolmaster 22212 Smart Test 6-Way Swimming Pool and Spa Water Chemistry Test Strips, 50 count","1. ""Poolmaster 22212 Smart Test 6-Way Pool and Spa Water Testing Strips""\n2. ""Swimming Pool and Spa Water Chemistry Test Strips 50 count""\n3. ""Smart Test 6-Way Water Chemistry Test Strips for Pools and Spas""\n4. ""50 count bromine test strips for swimming pools and hot tubs""","[Poolmaster 22212 Smart Test 6-Way Pool and Spa Water Testing Strips, Swimming Pool and Spa Water Chemistry Test Strips 50 count, Smart Test 6-Way Water Chemistry Test Strips for Pools and Spas, 50 count bromine test strips for swimming pools and hot tubs]"
...,...,...,...,...,...
995,B01J8RBRSY,"[lnsta facelifter fac, arm and bra fat workout, waist trainer for weight loss, fat burner cream for belly skin tightening, best fat burning cream for belly]",Premium Hot Cream Sweat Enhancer - Firming Body Lotion for Women and Men and Body Sculpting Cellulite Workout Cream - Invigorating and Moisturizing Body Lotion and Body Firming Cream with Natural Oils,"1. ""High-quality sweat-enhancing hot cream for body firming - Suitable for both genders""\n2. ""Search for workout body cream for cellulite reduction and skin moisturizing - Contains natural oils""\n3. ""Premium Body Sculpting Cream for arm and bra fat workout - Enriched with Natural Oils""\n4. ""Looking for Invigorating Body Lotion for Body Sculpting and Cellulite Workout""\n5. ""Need Insta facelifter face and arm toning cream with moisturizing properties""\n6. ""Searching for Men and Women's Firming lotion - Hot Cream Sweat Enhancer""","[High-quality sweat-enhancing hot cream for body firming - Suitable for both genders, Search for workout body cream for cellulite reduction and skin moisturizing - Contains natural oils, Premium Body Sculpting Cream for arm and bra fat workout - Enriched with Natural Oils, Looking for Invigorating Body Lotion for Body Sculpting and Cellulite Workout, Need Insta facelifter face and arm toning cream with moisturizing properties, Searching for Men and Women's Firming lotion - Hot Cream Sweat Enhancer]"
996,B003RYIWEM,"[oil rubbed bronze tub faucet without diverter, oil rubbed bronze tub spout without diverter]","Moen T2153EPORB Brantford Posi-Temp Pressure Balancing Eco-Performance Tub and Shower Trim Kit Valve Required, Oil-Rubbed Bronze","1. ""Moen T2153EPORB Brantford Posi-Temp Tub and Shower Trim in Oil-Rubbed Bronze without diverter""\n2. ""Oil-Rubbed Bronze Eco-Performance Tub faucet with no diverter""\n3. ""Brantford Posi-Temp Pressure Balancing Shower Trim Kit in Oil-Rubbed Bronze without diverter""\n4. ""Moen pressure balancing tub and shower kit in oil-rubbed bronze, no diverter required"".","[Moen T2153EPORB Brantford Posi-Temp Tub and Shower Trim in Oil-Rubbed Bronze without diverter, Oil-Rubbed Bronze Eco-Performance Tub faucet with no diverter, Brantford Posi-Temp Pressure Balancing Shower Trim Kit in Oil-Rubbed Bronze without diverter, Moen pressure balancing tub and shower kit in oil-rubbed bronze, no diverter required""]"
997,B07HVK4172,"[plus size night party dress, masquerade plus size dresses for women, sexy club outfits for women party club night]",GOBLES Women's Summer Sexy One Shoulder Ruffle Bodycon Midi Cocktail Dress Black,"1. ""GOBLES Women's Plus Size One Shoulder Cocktail Dress""\n2. ""Sexy One Shoulder Ruffle Midi Dress for Plus Size Women""\n3. ""Black Bodycon Dress for Plus Size Women for Night Parties""\n4. ""Plus Size Women's Masquerade Cocktail Dresses in Black"".","[GOBLES Women's Plus Size One Shoulder Cocktail Dress, Sexy One Shoulder Ruffle Midi Dress for Plus Size Women, Black Bodycon Dress for Plus Size Women for Night Parties, Plus Size Women's Masquerade Cocktail Dresses in Black""]"
998,B004G0GKW2,"[bliss flat iron for hair, babyliss pro nano titanium cool mist, babyliss pro nano titanium plated ionic straightening iron]","BaBylissPRO BABNTBK3070TN Nano Titanium Ultra-Sleek Straightening Iron, 1 Inch, Black","1. ""BaBylissPRO Nano Titanium 1 Inch Straightening Iron""\n2. ""Ultra-Sleek Black Straightening Iron by BaBylissPRO""\n3. ""BaBylissPRO BABNTBK3070TN Hair Straightener""\n4. ""Nano Titanium Flat Iron BaBylissPRO BABNTBK3070TN""","[BaBylissPRO Nano Titanium 1 Inch Straightening Iron, Ultra-Sleek Black Straightening Iron by BaBylissPRO, BaBylissPRO BABNTBK3070TN Hair Straightener, Nano Titanium Flat Iron BaBylissPRO BABNTBK3070TN]"


In [None]:
1. Finetuning of using alpaca format and axloctl library.
2. Format the data runpod