In [1]:
import numpy as np
import pandas as pd
import random
import json
from opensearchpy import OpenSearch
import os
OPENSEARCH_HOST = os.getenv("OPENSEARCH_HOST", "localhost")
OPENSEARCH_PORT = os.getenv("OPENSEARCH_PORT", 9200)

In [10]:
def interleave(listA, listB, k):
    ids = []
    teams = []
    rank = 1
    idx_a = 0
    idx_b = 0
    len_a = len(listA)
    len_b = len(listB)

    while rank <= k:
        a_val = listA[idx_a]['_id'] if idx_a < len_a else None
        b_val = listB[idx_b]['_id'] if idx_b < len_b else None
        if not (a_val and b_val):
            # lists are done
            return zip(ids, teams)
        if not a_val:
            # take the rest of listB
            for hit in listB[idx_b:k]['_id']:
                if hit not in ids:
                    ids.append(hit)
                    teams.append('TeamB')
            return zip(ids, teams)
        if not b_val:
            # take the rest of listA
            for hit in listA['_id']:
                if hit not in ids:
                    ids.append(hit)
                    teams.append('TeamB')
            return zip(ids, teams)
        a_first = idx_a < idx_b or idx_a == idx_b and random.randint(0,1)
        if a_first:
            if a_val not in ids:
                ids.append(a_val)
                teams.append('TeamA')
                rank += 1
            idx_a += 1
        else:
            if b_val not in ids:
                ids.append(b_val)
                teams.append('TeamB')
                rank += 1
            idx_b += 1       
    return list(zip(ids, teams))


def get_list(listA, listB, k):
    A = listA['hits']['hits']
    B = listB['hits']['hits']
    interleaving = interleave(A, B, k)
    return interleaving

def get_search_config(client, name):
    search_configs_index = 'search-relevance-search-config'
    conf = client.search( body = {
      "query": {
        "match": {"name": name}
      },
      "size": 1
    }, index=search_configs_index)
    return conf['hits']['hits'][0]['_source'] if len(conf['hits']['hits']) else {}

def populate_query(query, config, size=10, source=["title", "description", "asin"]):
    query = query.replace('"', '\\"')
    body = json.loads(config['query'].replace("%SearchText%", query))
    body['size'] = size
    body['_source'] = source
    return body

def run_AB(client, query, configA, configB, size=10):
    confA = get_search_config(client, configA)
    confB = get_search_config(client, configB)
    qA = populate_query(query, confA, size=size)
    qB = populate_query(query, confB, size=size)
    #TODO: extract the endpoint value from the search config, rather than using the client here
    resA = client.search(body=qA)
    resB = client.search(body=qB)
    result = get_list(resA, resB, 10)
    return result

def get_events(client, id, query, event_type=None):
    if event_type:
        evq = {
          "query": {
            "bool": {
             "must": [   
                 {"match": {"event_attributes.object.object_id": id}},
                 {"match": {"user_query": query}},
                 {"match": {"action_name": event_type}}
             ]
            }
          },
          "size": 1000
        }
    else:
        evq = {
          "query": {
            "bool": {
             "must": [   
                 {"match": {"event_attributes.object.object_id": id}},
                 {"match": {"user_query": query}}
             ]
            }
          },
          "size": 1000
        }
    results = client.search(body=evq, index='ubi_events')
    return results

def get_clicks(client, id, query):
    results = get_events(client, id, query, 'click')
    return results

def count_clicks(client, id, query):
    results = get_clicks(client, id, query)
    return results['hits']['total']['value']

In [3]:
# Create the client with SSL/TLS and hostname verification disabled.
client = OpenSearch(
    hosts = [{'host': OPENSEARCH_HOST, 'port': OPENSEARCH_PORT}],
    http_compress = True, # enables gzip compression for request bodies
    use_ssl = False,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False
)

In [4]:
ubi_queries = client.search(body={'query': {'match_all': {}}, "size":10000}, index='ubi_queries')
ubi_user_queries = [x['_source']['user_query'] for x in ubi_queries['hits']['hits']]
ubi_user_queries

['canned tuna solid white albacore water',
 'vitamin e oil without soy',
 'bikini high waisted',
 'budha mural wall art',
 'adidas golf shoes men',
 'and pepper shaker',
 'bee traps outdoor hanging',
 'leather sofas for living room',
 'scetchpad for drawing',
 '115 piece drill bit index without drill bits',
 'clif bars cool mint chocolate',
 'clock',
 'outdoor fans without lights',
 'planters for outdoor plants',
 'calcium supplements without vitamin d',
 'dimplex ignite xl 74',
 'resin patio chairs',
 'yeti',
 'ffp2 mask without valve',
 'gaming monitor',
 'round baking pans',
 'mct oil without lauric acid',
 'mouthguard toothbrush for adults',
 'japan deep frying pan',
 'hoppes bore snake 9mm pistol',
 'little ikes turn slide',
 'floating rafts for ocean',
 'albany park sofa',
 'sipsnap',
 'magnified lighted mirror',
 'red green yellow party decorations',
 'maritini glasses',
 '6 x 12 floor register oil rubbed bronze',
 'boys nautica shorts',
 'foam roller',
 'black teddy jacket',
 '

In [11]:
populate_query('sipsnap', get_search_config(client, 'baseline'))

{'query': {'multi_match': {'query': 'sipsnap',
   'fields': ['id',
    'title',
    'category',
    'bullets',
    'description',
    'attrs.Brand',
    'attrs.Color']}},
 'size': 10,
 '_source': ['title', 'description', 'asin']}

In [12]:
for query in ubi_user_queries[:10]:
    for id,team in run_AB(client, query, 'baseline', 'baseline with title weight'):
        events = get_events(client, id, query)
        count = events['hits']['total']['value']
        if count:
            print(f"{team} {query}, {id} => {count}")

In [14]:
for query in ubi_user_queries[:100]:
    for id,team in run_AB(client, query, 'baseline', 'baseline with title weight'):
        count = count_clicks(client, id, query)
        if count:
            print(f"{team} {query}, {id} => {count}")

TeamB resin patio chairs, B06Y4C6ZXZ => 3
TeamA yeti, B073WJRJZZ => 36
TeamB red green yellow party decorations, B07R4Y3LXZ => 1


In [15]:
query_set_df = pd.DataFrame(ubi_user_queries, columns=['query'])
query_set_df

Unnamed: 0,query
0,canned tuna solid white albacore water
1,vitamin e oil without soy
2,bikini high waisted
3,budha mural wall art
4,adidas golf shoes men
...,...
9995,burn to brown maui
9996,dog costumes for large dogs
9997,mua kit
9998,toronto blue jays cap


In [16]:
query_set_df['res'] = query_set_df['query'].apply(lambda x: [ (id, count_clicks(client, id, x), team) for id,team in run_AB(client, x, 'baseline', 'baseline with title weight')])
query_set_df = query_set_df.explode('res')

In [17]:
query_set_df = query_set_df.join(pd.DataFrame(query_set_df['res'].values.tolist(), columns=['id', 'clicks', 'team']))
query_set_df

Unnamed: 0,query,res,id,clicks,team
0,canned tuna solid white albacore water,"(B076CQZXXL, 0, TeamB)",B076CQZXXL,0.0,TeamB
0,canned tuna solid white albacore water,"(B002MDSY56, 0, TeamA)",B076CQZXXL,0.0,TeamB
0,canned tuna solid white albacore water,"(B08N4TGWXF, 0, TeamB)",B076CQZXXL,0.0,TeamB
0,canned tuna solid white albacore water,"(B005TOXCX4, 0, TeamB)",B076CQZXXL,0.0,TeamB
0,canned tuna solid white albacore water,"(B06Y45L4K2, 0, TeamA)",B076CQZXXL,0.0,TeamB
...,...,...,...,...,...
9999,daddy daughter not always eye to eye,"(B01FCQUHFY, 0, TeamA)",B06XQKQYSB,0.0,TeamA
9999,daddy daughter not always eye to eye,"(B089YTJRB3, 0, TeamB)",B06XQKQYSB,0.0,TeamA
9999,daddy daughter not always eye to eye,"(B0838J8RQG, 0, TeamA)",B06XQKQYSB,0.0,TeamA
9999,daddy daughter not always eye to eye,"(B07VRSP12M, 0, TeamB)",B06XQKQYSB,0.0,TeamA


In [18]:
query_set_df[query_set_df['team'] == 'TeamA'].describe()

Unnamed: 0,clicks
count,46872.0
mean,0.019201
std,0.604767
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,36.0


In [19]:
query_set_df[query_set_df['team'] == 'TeamB'].describe()

Unnamed: 0,clicks
count,50766.0
mean,0.013789
std,0.312274
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,11.0


In [20]:
query_set_df[query_set_df['team'] == 'TeamA'][query_set_df[query_set_df['team'] == 'TeamA']['clicks'] > 0]

Unnamed: 0,query,res,id,clicks,team
178,echo dot 3rd gen,"(B083NTHSJH, 0, TeamA)",B073WJRJZZ,36.0,TeamA
178,echo dot 3rd gen,"(B08XB3XCQ4, 0, TeamB)",B073WJRJZZ,36.0,TeamA
178,echo dot 3rd gen,"(B07GPN3MRY, 0, TeamA)",B073WJRJZZ,36.0,TeamA
178,echo dot 3rd gen,"(B07XDB4Z5Q, 0, TeamB)",B073WJRJZZ,36.0,TeamA
178,echo dot 3rd gen,"(B07H4ZNHPY, 0, TeamA)",B073WJRJZZ,36.0,TeamA
...,...,...,...,...,...
9709,klein cable cutters,"(B084P11D8M, 0, TeamA)",B09CR6XHDD,2.0,TeamA
9709,klein cable cutters,"(B000GASHJU, 0, TeamB)",B09CR6XHDD,2.0,TeamA
9709,klein cable cutters,"(B071LCCGT1, 0, TeamA)",B09CR6XHDD,2.0,TeamA
9709,klein cable cutters,"(B083Q8K2N5, 0, TeamA)",B09CR6XHDD,2.0,TeamA


In [21]:
query_set_df[query_set_df['team'] == 'TeamB'][query_set_df[query_set_df['team'] == 'TeamB']['clicks'] > 0]

Unnamed: 0,query,res,id,clicks,team
160,thomas the train,"(B07J6FDHR9, 0, TeamA)",B06Y4C6ZXZ,3.0,TeamB
160,thomas the train,"(B077XKQNG9, 0, TeamB)",B06Y4C6ZXZ,3.0,TeamB
160,thomas the train,"(B07T8T45G2, 0, TeamB)",B06Y4C6ZXZ,3.0,TeamB
160,thomas the train,"(B01NCZ5PTU, 0, TeamB)",B06Y4C6ZXZ,3.0,TeamB
160,thomas the train,"(B000BXJTIK, 0, TeamA)",B06Y4C6ZXZ,3.0,TeamB
...,...,...,...,...,...
9960,ruger american compact 45 holster,"(B01EIHELAA, 0, TeamA)",B098X2GWS1,8.0,TeamB
9960,ruger american compact 45 holster,"(B08943PKSC, 0, TeamA)",B098X2GWS1,8.0,TeamB
9960,ruger american compact 45 holster,"(B01N0P3Y08, 0, TeamA)",B098X2GWS1,8.0,TeamB
9960,ruger american compact 45 holster,"(B0764LQJ99, 0, TeamB)",B098X2GWS1,8.0,TeamB


In [22]:
query_set_df[query_set_df['team'] == 'TeamB'][query_set_df[query_set_df['team'] == 'TeamB']['clicks'] > 0].describe()

Unnamed: 0,clicks
count,130.0
mean,5.384615
std,3.038218
min,1.0
25%,3.0
50%,6.0
75%,7.0
max,11.0


In [23]:
query_set_df[query_set_df['team'] == 'TeamA'][query_set_df[query_set_df['team'] == 'TeamA']['clicks'] > 0].describe()

Unnamed: 0,clicks
count,100.0
mean,9.0
std,9.566355
min,2.0
25%,4.0
50%,5.0
75%,10.0
max,36.0


In [24]:
query_set_df.to_csv('ubi_queries_df_2.csv')

In [None]:
get_search_config(client, name="baseline with title weight")

In [None]:
get_search_config(client, name="baseline")