In [1]:
import numpy as np
import pandas as pd
import random
import json
from opensearchpy import OpenSearch
import os
# Connection settings for the OpenSearch cluster; both can be overridden
# through environment variables of the same name.
OPENSEARCH_HOST = os.getenv("OPENSEARCH_HOST", "localhost")
# os.getenv returns a str whenever the variable is set, so convert explicitly:
# the port is then an int whether it comes from the environment or the default.
OPENSEARCH_PORT = int(os.getenv("OPENSEARCH_PORT", "9200"))

In [2]:
def interleave(listA, listB, k, rng=None):
    """Interleave two ranked hit lists into one list of at most ``k`` unique ids.

    The lists are consumed in rank order. At each step the list that has been
    consumed less goes next; when both have been consumed equally a coin flip
    decides. A hit whose ``_id`` was already emitted is skipped (it still
    counts as consumed). When one list runs out, the remaining slots are
    filled from the other list and credited to that list's team.

    Args:
        listA: sequence of hits, each indexable by ``'_id'``; credited 'TeamA'.
        listB: sequence of hits, each indexable by ``'_id'``; credited 'TeamB'.
        k: maximum number of results to return.
        rng: optional object exposing ``randint(a, b)`` (e.g. a seeded
            ``random.Random``) used for tie-breaks; defaults to the ``random``
            module.

    Returns:
        A list of ``(id, team)`` tuples, in interleaved order, with at most
        ``k`` entries and no repeated ids.
    """
    if rng is None:
        rng = random

    ids = []
    teams = []
    seen = set()  # O(1) duplicate check; ids/teams keep the output order
    idx_a = 0
    idx_b = 0
    len_a = len(listA)
    len_b = len(listB)

    while len(ids) < k:
        a_left = idx_a < len_a
        b_left = idx_b < len_b
        if not (a_left or b_left):
            # both lists are exhausted
            break
        if a_left and b_left:
            # the coin is only flipped when both lists are equally consumed
            a_first = idx_a < idx_b or (idx_a == idx_b and rng.randint(0, 1))
        else:
            # only one list still has hits: keep drawing from that one
            a_first = a_left

        if a_first:
            hit_id, team = listA[idx_a]['_id'], 'TeamA'
            idx_a += 1
        else:
            hit_id, team = listB[idx_b]['_id'], 'TeamB'
            idx_b += 1

        if hit_id not in seen:
            seen.add(hit_id)
            ids.append(hit_id)
            teams.append(team)

    return list(zip(ids, teams))


def get_list(listA, listB, k):
    """Interleave the hit lists of two search responses.

    Each response is indexed as ``response['hits']['hits']``; the two hit
    lists are passed to ``interleave`` together with ``k`` and its result is
    returned unchanged.
    """
    return interleave(listA['hits']['hits'], listB['hits']['hits'], k)

def get_search_config(client, name):
    """Look up a stored search configuration by name.

    Sends a ``match`` query on the ``name`` field to the
    ``search-relevance-search-config`` index, requesting a single hit.

    Returns:
        The ``_source`` of the first hit, or ``{}`` when there are no hits.
    """
    # NOTE(review): a `match` query may be analyzed, in which case a name such
    # as "baseline" could also match longer names containing that word and the
    # top-scoring hit wins — confirm against the index mapping.
    request = {"query": {"match": {"name": name}}, "size": 1}
    response = client.search(body=request, index='search-relevance-search-config')
    found = response['hits']['hits']
    if not len(found):
        return {}
    return found[0]['_source']

def populate_query(query, config, size=10, source=["title", "description", "asin"]):
    """Build a search request body from a stored configuration and a user query.

    ``config['query']`` is a JSON template in which ``%SearchText%`` marks where
    the user's query text goes (inside a JSON string). The placeholder is
    replaced with the JSON-escaped query, the result is parsed, and ``size``
    and ``_source`` are set on the parsed body.

    Args:
        query: the user's query text.
        config: mapping with a ``'query'`` entry holding the JSON template.
        size: value stored under ``body['size']``.
        source: value stored under ``body['_source']``.

    Returns:
        The request body as a dict.

    Raises:
        TypeError: if ``query`` is not a string.
        KeyError: if ``config`` has no ``'query'`` entry.
        json.JSONDecodeError: if the filled-in template is not valid JSON.
    """
    if not isinstance(query, str):
        raise TypeError(f"query must be a str, got {type(query).__name__}")
    # json.dumps yields a complete JSON string literal; dropping its surrounding
    # quotes leaves text that is safe to splice between the template's own
    # quotes (handles quotes, backslashes and control characters alike).
    escaped = json.dumps(query)[1:-1]
    body = json.loads(config['query'].replace("%SearchText%", escaped))
    body['size'] = size
    # Copy list values so callers mutating the returned body cannot alter the
    # shared default argument (or their own list).
    body['_source'] = list(source) if isinstance(source, list) else source
    return body

def run_AB(client, query, configA, configB, size=10):
    """Run one query against two stored search configurations and interleave.

    Both configurations are looked up by name, filled in with ``query``, and
    executed through ``client``; the two responses are then interleaved.

    Args:
        client: search client exposing ``search(body=..., index=...)``.
        query: the user's query text.
        configA: name of the configuration credited as 'TeamA'.
        configB: name of the configuration credited as 'TeamB'.
        size: number of hits requested from each configuration and the
            maximum length of the interleaved result.

    Returns:
        The interleaved ``(id, team)`` pairs produced by ``get_list``.
    """
    confA = get_search_config(client, configA)
    confB = get_search_config(client, configB)
    qA = populate_query(query, confA, size=size)
    qB = populate_query(query, confB, size=size)
    # Target the index named in each configuration; when none is recorded the
    # search is sent without an index, as before.
    # TODO: also take the endpoint from the search config rather than reusing
    # the client passed in here.
    resA = client.search(body=qA, index=confA.get('index') or None)
    resB = client.search(body=qB, index=confB.get('index') or None)
    # Interleave to the requested depth instead of a fixed 10.
    return get_list(resA, resB, size)

# NOTE: events are matched on object id and query text only, so events that belong to different query-ids but share the same user_query are conflated. Needs review.
def get_events(client, id, query, event_type=None):
    """Search the ``ubi_events`` index for events on one object and query.

    The request is a ``bool``/``must`` of ``match`` clauses on
    ``event_attributes.object.object_id`` and ``user_query``; when
    ``event_type`` is truthy a third ``match`` on ``action_name`` is added.
    Up to 1000 hits are requested.

    Returns:
        The raw response from ``client.search``.
    """
    must_clauses = [
        {"match": {"event_attributes.object.object_id": id}},
        {"match": {"user_query": query}},
    ]
    if event_type:
        must_clauses.append({"match": {"action_name": event_type}})

    evq = {"query": {"bool": {"must": must_clauses}}, "size": 1000}
    return client.search(body=evq, index='ubi_events')

def get_clicks(client, id, query):
    """Return the ``get_events`` response restricted to the ``'click'`` event type."""
    return get_events(client, id, query, event_type='click')

def count_clicks(client, id, query):
    """Return the total hit count reported for click events on ``id`` and ``query``.

    The value is read from ``['hits']['total']['value']`` of the
    ``get_clicks`` response.
    """
    click_response = get_clicks(client, id, query)
    total = click_response['hits']['total']
    return total['value']

In [3]:
# Create the OpenSearch client shared by the cells below.
# The connection is configured without SSL/TLS: certificate verification,
# hostname assertion and the related warnings are all switched off, and no
# credentials are passed.
# NOTE(review): presumably meant for a local development cluster only —
# confirm before pointing OPENSEARCH_HOST at a shared environment.
client = OpenSearch(
    hosts = [{'host': OPENSEARCH_HOST, 'port': OPENSEARCH_PORT}],
    http_compress = True, # enables gzip compression for request bodies
    use_ssl = False,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False
)

In [4]:
# Fetch up to 10,000 documents from the `ubi_queries` index and keep only the
# text of each logged query.
ubi_queries = client.search(
    body={'query': {'match_all': {}}, 'size': 10000},
    index='ubi_queries',
)
ubi_user_queries = [hit['_source']['user_query'] for hit in ubi_queries['hits']['hits']]
ubi_user_queries

['A long (mid calf length), lightweight beige/brown single breasted trench jacket ',
 'iwo jima statue',
 'diamond band rings for women size 4',
 'hair dryer',
 'cursive kickoff handwriting without tears',
 'tiered spice rack organizer',
 'robot vacuum',
 'jeansian shirts for men',
 'yoga outfits for women 2 piece set high waist',
 'gaming chair',
 'mattress queen',
 'truck roof tent',
 'unscented toilet paper',
 'align tank top dupes lululemon',
 'cbd oil not hemp oil',
 'my life would suck without you',
 'heated floor thermostat',
 'chilli seasoning packets',
 'liquid probiotic for women',
 'work will suck without you water bottle',
 'apple airpods',
 'toiletries travel bag men',
 'maxpider tesla model y',
 'halo',
 '10 pocket folders without brads',
 '110v led strip lights',
 'kids farm boots for girls',
 'juicy toddler girl',
 'tan towel set',
 'delish white jordan almonds',
 '1 1/2 black drain without overflow',
 'walnut knife block without knives',
 'seat cushion not memory foam'

In [11]:
# Sanity check: fill the stored 'baseline' configuration with a sample query
# to inspect the request body that would be sent.
populate_query('sipsnap', get_search_config(client, 'baseline'))

{'query': {'multi_match': {'query': 'sipsnap',
   'fields': ['id',
    'title',
    'category',
    'bullets',
    'description',
    'attrs.Brand',
    'attrs.Color']}},
 'size': 10,
 '_source': ['title', 'description', 'asin']}

In [5]:
# For the first 100 logged queries, interleave the two configurations and
# print every result that has at least one logged event of any type.
# The loop variable is `doc_id` rather than `id`, so the builtin `id()` is not
# rebound at notebook scope for later cells.
for query in ubi_user_queries[:100]:
    for doc_id, team in run_AB(client, query, 'baseline', 'baseline with title weight'):
        events = get_events(client, doc_id, query)
        count = events['hits']['total']['value']
        if count:
            print(f"{team} {query}, {doc_id} => {count}")

TeamB diamond band rings for women size 4, B07VRNXW2G => 121
TeamB lidded ice cube trays, B08CDCBPSC => 112


In [6]:
# Same sweep as the event report, but counting only click events.
# The loop variable is `doc_id` rather than `id`, so the builtin `id()` is not
# rebound at notebook scope for later cells.
for query in ubi_user_queries[:100]:
    for doc_id, team in run_AB(client, query, 'baseline', 'baseline with title weight'):
        count = count_clicks(client, doc_id, query)
        if count:
            print(f"{team} {query}, {doc_id} => {count}")

TeamB lidded ice cube trays, B08CDCBPSC => 8


In [19]:
# One row per logged query, in a single `query` column.
query_set_df = pd.DataFrame({'query': ubi_user_queries})
query_set_df

Unnamed: 0,query
0,"A long (mid calf length), lightweight beige/br..."
1,iwo jima statue
2,diamond band rings for women size 4
3,hair dryer
4,cursive kickoff handwriting without tears
...,...
9995,nope not today
9996,09 kia rio front bumper cover without fog lights
9997,#6 fishing hook without barb
9998,fortnight socks do not disturb


In [20]:
# For every query, interleave the two configurations and record one
# (id, click count, team) tuple per interleaved result.
query_set_df['res'] = query_set_df['query'].apply(
    lambda user_query: [
        (doc_id, count_clicks(client, doc_id, user_query), team)
        for doc_id, team in run_AB(client, user_query, 'baseline', 'baseline with title weight')
    ]
)
# Expand to one row per interleaved result.
# NOTE: this rebinds `query_set_df`, so re-running this cell on its own
# operates on the already-exploded frame; re-run the cell that builds the
# frame first.
query_set_df = query_set_df.explode('res')

In [24]:
# Split each (id, clicks, team) tuple in `res` into its own column. The helper
# frame reuses `query_set_df.index` (which repeats after the explode) so the
# assignment lines up row for row.
# NOTE(review): `clicks` is displayed as float below — presumably some rows
# have no `res` tuple and end up as NaN; confirm.
query_set_df[['id', 'clicks', 'team']] = pd.DataFrame(query_set_df['res'].tolist(), index=query_set_df.index)
query_set_df

Unnamed: 0,query,res,id,clicks,team
0,"A long (mid calf length), lightweight beige/br...","(B075WTPCQ5, 0, TeamA)",B075WTPCQ5,0.0,TeamA
0,"A long (mid calf length), lightweight beige/br...","(B071GRS2QR, 0, TeamB)",B071GRS2QR,0.0,TeamB
0,"A long (mid calf length), lightweight beige/br...","(B000A33JI2, 0, TeamA)",B000A33JI2,0.0,TeamA
0,"A long (mid calf length), lightweight beige/br...","(B077Y731X7, 0, TeamB)",B077Y731X7,0.0,TeamB
0,"A long (mid calf length), lightweight beige/br...","(B07XCPBM7Z, 0, TeamA)",B07XCPBM7Z,0.0,TeamA
...,...,...,...,...,...
9999,vintage suitcase for women,"(B07YRDYBNQ, 0, TeamB)",B07YRDYBNQ,0.0,TeamB
9999,vintage suitcase for women,"(B01BV8D10U, 0, TeamA)",B01BV8D10U,0.0,TeamA
9999,vintage suitcase for women,"(B071FVGCRW, 0, TeamB)",B071FVGCRW,0.0,TeamB
9999,vintage suitcase for women,"(B07ZPNDHYN, 0, TeamA)",B07ZPNDHYN,0.0,TeamA


In [25]:
# Summary statistics for results credited to TeamA.
query_set_df.loc[query_set_df['team'].eq('TeamA')].describe()

Unnamed: 0,clicks
count,46858.0
mean,0.024073
std,0.57557
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,36.0


In [26]:
# Summary statistics for results credited to TeamB.
query_set_df.loc[query_set_df['team'].eq('TeamB')].describe()

Unnamed: 0,clicks
count,51029.0
mean,0.028415
std,0.570478
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,28.0


In [27]:
# TeamA results that received at least one click (single combined mask).
query_set_df[(query_set_df['team'] == 'TeamA') & (query_set_df['clicks'] > 0)]

Unnamed: 0,query,res,id,clicks,team
112,tiger stuffed animal,"(B007JLUE4U, 9, TeamA)",B007JLUE4U,9.0,TeamA
194,rug deep cleaner,"(B07D46SQ63, 2, TeamA)",B07D46SQ63,2.0,TeamA
285,gifts for godparent,"(B07WMPLHD7, 11, TeamA)",B07WMPLHD7,11.0,TeamA
357,fruitcakes with rum or brandy,"(B00QKYOLAK, 12, TeamA)",B00QKYOLAK,12.0,TeamA
405,best orbital polisher buffer,"(B002654I46, 10, TeamA)",B002654I46,10.0,TeamA
...,...,...,...,...,...
9761,try flex mirror,"(B07QQZL14Q, 2, TeamA)",B07QQZL14Q,2.0,TeamA
9763,decora white switch,"(B000U3DV2Q, 6, TeamA)",B000U3DV2Q,6.0,TeamA
9793,silk bathing suit cover up,"(B075M68XXX, 3, TeamA)",B075M68XXX,3.0,TeamA
9876,marble composition notebook wide ruled,"(B0034XS3I6, 6, TeamA)",B0034XS3I6,6.0,TeamA


In [28]:
# TeamB results that received at least one click (single combined mask).
query_set_df[(query_set_df['team'] == 'TeamB') & (query_set_df['clicks'] > 0)]

Unnamed: 0,query,res,id,clicks,team
64,lidded ice cube trays,"(B08CDCBPSC, 8, TeamB)",B08CDCBPSC,8.0,TeamB
105,mesh bag for washing shoes,"(B08PPDMQ8N, 8, TeamB)",B08PPDMQ8N,8.0,TeamB
159,l.o.l. surprise! winter disco cottage playhouse,"(B085B26YB2, 19, TeamB)",B085B26YB2,19.0,TeamB
449,black and white air max shoes for women,"(B07HLHVPS6, 13, TeamB)",B07HLHVPS6,13.0,TeamB
596,corning ware lids french white,"(B00080XGNA, 6, TeamB)",B00080XGNA,6.0,TeamB
...,...,...,...,...,...
9600,navy blue bathroom towel set,"(B097Q3425Y, 7, TeamB)",B097Q3425Y,7.0,TeamB
9869,gshock solar mens watch,"(B00791R1MI, 10, TeamB)",B00791R1MI,10.0,TeamB
9884,brakecrafters,"(B087KVRDKN, 1, TeamB)",B087KVRDKN,1.0,TeamB
9932,aqua full sheets,"(B00VIWSZ6U, 10, TeamB)",B00VIWSZ6U,10.0,TeamB


In [29]:
# Click statistics over TeamB results with at least one click.
query_set_df[(query_set_df['team'] == 'TeamB') & (query_set_df['clicks'] > 0)].describe()

Unnamed: 0,clicks
count,187.0
mean,7.754011
std,5.390496
min,1.0
25%,4.0
50%,7.0
75%,10.0
max,28.0


In [30]:
# Click statistics over TeamA results with at least one click.
query_set_df[(query_set_df['team'] == 'TeamA') & (query_set_df['clicks'] > 0)].describe()

Unnamed: 0,clicks
count,171.0
mean,6.596491
std,6.906586
min,1.0
25%,1.5
50%,5.0
75%,9.0
max,36.0


In [31]:
# Write the per-result click table to CSV.
# NOTE(review): the file name is hard-coded and relative to the current
# working directory.
query_set_df.to_csv('ubi_queries_df_3.csv')

In [17]:
# Show the stored configuration passed as the second (TeamB) ranker in the
# run_AB calls in this notebook.
get_search_config(client, name="baseline with title weight")

{'id': '40ae9a31-eaf0-40bd-b6fb-589979bd0604',
 'name': 'baseline with title weight',
 'timestamp': '2025-06-17T17:37:26.717Z',
 'index': 'ecommerce',
 'query': '{"query":{"multi_match":{"query":"%SearchText%","fields":["id","title^25","category","bullets","description","attrs.Brand","attrs.Color"]}}}',
 'searchPipeline': ''}

In [18]:
# Show the stored configuration passed as the first (TeamA) ranker in the
# run_AB calls in this notebook.
get_search_config(client, name="baseline")

{'id': '78482772-d169-47dc-bb25-bb1a16a73c48',
 'name': 'baseline',
 'timestamp': '2025-06-17T17:37:26.557Z',
 'index': 'ecommerce',
 'query': '{"query":{"multi_match":{"query":"%SearchText%","fields":["id","title","category","bullets","description","attrs.Brand","attrs.Color"]}}}',
 'searchPipeline': ''}