In [1]:
import numpy as np
import pandas as pd
import random
import json
from opensearchpy import OpenSearch
import os
OPENSEARCH_HOST = os.getenv("OPENSEARCH_HOST", "localhost")
OPENSEARCH_PORT = os.getenv("OPENSEARCH_PORT", 9200)

In [2]:
def interleave(listA, listB, k):
    ids = []
    teams = []
    rank = 1
    idx_a = 0
    idx_b = 0
    len_a = len(listA)
    len_b = len(listB)

    while rank <= k:
        a_val = listA[idx_a]['_id'] if idx_a < len_a else None
        b_val = listB[idx_b]['_id'] if idx_b < len_b else None
        if not (a_val and b_val):
            # lists are done
            return zip(ids, teams)
        if not a_val:
            # take the rest of listB
            for hit in listB[idx_b:k]['_id']:
                if hit not in ids:
                    ids.append(hit)
                    teams.append('TeamB')
            return zip(ids, teams)
        if not b_val:
            # take the rest of listA
            for hit in listA['_id']:
                if hit not in ids:
                    ids.append(hit)
                    teams.append('TeamB')
            return zip(ids, teams)
        a_first = idx_a < idx_b or idx_a == idx_b and random.randint(0,1)
        if a_first:
            if a_val not in ids:
                ids.append(a_val)
                teams.append('TeamA')
                rank += 1
            idx_a += 1
        else:
            if b_val not in ids:
                ids.append(b_val)
                teams.append('TeamB')
                rank += 1
            idx_b += 1       
    return list(zip(ids, teams))


def get_list(listA, listB, k):
    A = listA['hits']['hits']
    B = listB['hits']['hits']
    interleaving = interleave(A, B, k)
    return interleaving

def get_search_config(client, name):
    search_configs_index = '.plugins-search-relevance-search-config'
    conf = client.search( body = {
      "query": {
        "match": {"name": name}
      },
      "size": 1
    }, index=search_configs_index)
    return conf['hits']['hits'][0]['_source'] if len(conf['hits']['hits']) else {}

def populate_query(query, config, size=10, source=["title", "description", "asin"]):
    query = query.replace('"', '\\"')
    body = config['queryBody'].replace("%SearchText%", query)
    return {
        "query": json.loads(body),
        "size": size,
        "_source": source
    }

def run_AB(client, query, configA, configB, size=10):
    confA = get_search_config(client, configA)
    confB = get_search_config(client, configB)
    qA = populate_query(query, confA, size=size)
    qB = populate_query(query, confB, size=size)
    #TODO: extract the endpoint value from the search config, rather than using the client here
    resA = client.search(body=qA)
    resB = client.search(body=qB)
    result = get_list(resA, resB, 10)
    return result

def get_events(client, id, query, event_type=None):
    if event_type:
        evq = {
          "query": {
            "bool": {
             "must": [   
                 {"match": {"event_attributes.object.object_id": id}},
                 {"match": {"user_query": query}},
                 {"match": {"action_name": event_type}}
             ]
            }
          },
          "size": 1000
        }
    else:
        evq = {
          "query": {
            "bool": {
             "must": [   
                 {"match": {"event_attributes.object.object_id": id}},
                 {"match": {"user_query": query}}
             ]
            }
          },
          "size": 1000
        }
    results = client.search(body=evq, index='ubi_events')
    return results

def get_clicks(client, id, query):
    results = get_events(client, id, query, 'click')
    return results

def count_clicks(client, id, query):
    results = get_clicks(client, id, query)
    return results['hits']['total']['value']

In [3]:
# Create the client with SSL/TLS and hostname verification disabled.
client = OpenSearch(
    hosts = [{'host': OPENSEARCH_HOST, 'port': OPENSEARCH_PORT}],
    http_compress = True, # enables gzip compression for request bodies
    use_ssl = False,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False
)

In [4]:
ubi_queries = client.search(body={'query': {'match_all': {}}, "size":10000-31, "from": 31}, index='ubi_queries')
ubi_user_queries = [x['_source']['user_query'] for x in ubi_queries['hits']['hits']]
ubi_user_queries

['vibrator',
 'weighted blanket',
 'apple watch',
 'air fryer',
 'surge protector power strip',
 'firestick',
 'gift not included batteries',
 'futon frames full size without mattress',
 'tv',
 'portable charger',
 'earbuds',
 'boots',
 'dresser',
 'bite not collar',
 'yoga mat',
 'water bottles',
 'futon frames full size without mattress',
 'hawaiian shirts for women',
 'tv',
 'chromebook',
 'minecraft',
 'portable charger',
 'tv',
 'apple watch',
 'tv',
 'nintendo switch games',
 'bluetooth headphones',
 'poleras largas mujer',
 'gloves',
 'christmas lights',
 'pokemon',
 'rc drone without camera',
 'gifts for women',
 'rc drone without camera',
 'shoes',
 'floor lamps for living room',
 'food',
 'yeti',
 '3d printer',
 'toilet paper',
 'ipod touch',
 'ipad',
 'basketball',
 'printer',
 'echo dot',
 'ps4',
 'sewing machine',
 'prime movies',
 'ugg slippers women',
 'paper towels',
 'airpods',
 'barbie dolls',
 'iphone 11 case',
 '30 inch ceiling fan without light',
 'shoes for women'

In [6]:
for query in ubi_user_queries[:10]:
    for id,team in run_AB(client, query, 'Demo 2', 'Demo 3'):
        events = get_events(client, id, query)
        count = events['hits']['total']['value']
        if count:
            print(f"{team} {query}, {id} => {count}")

TeamA surge protector power strip, B00TP1C51M => 2


In [7]:
for query in ubi_user_queries[:100]:
    for id,team in run_AB(client, query, 'Demo 2', 'Demo 3'):
        count = count_clicks(client, id, query)
        if count:
            print(f"{team} {query}, {id} => {count}")

TeamA pokemon, B07DMFDC6W => 9
TeamB ipod touch, B07FM5Z5C3 => 21
TeamA 30 inch ceiling fan without light, B001JBPHRU => 7


In [8]:
query_set_df = pd.DataFrame(ubi_user_queries, columns=['query'])
query_set_df

Unnamed: 0,query
0,vibrator
1,weighted blanket
2,apple watch
3,air fryer
4,surge protector power strip
...,...
9964,metal razor for women
9965,metagenics phytomulti without iron
9966,wheelchair cushions for pressure sores
9967,barbie dolls


In [9]:
query_set_df['res'] = query_set_df['query'].apply(lambda x: [ (id, count_clicks(client, id, x), team) for id,team in run_AB(client, x, 'Demo 2', 'Demo 3')])
query_set_df = query_set_df.explode('res')

In [10]:
query_set_df = query_set_df.join(pd.DataFrame(query_set_df['res'].values.tolist(), columns=['id', 'clicks', 'team']))
query_set_df

Unnamed: 0,query,res,id,clicks,team
0,vibrator,"(B07GZHJ3NL, 0, TeamA)",B07GZHJ3NL,0.0,TeamA
0,vibrator,"(B07F9RR8PN, 0, TeamB)",B07GZHJ3NL,0.0,TeamA
1,weighted blanket,"(B097DWWTLM, 0, TeamA)",B07F9RR8PN,0.0,TeamB
1,weighted blanket,"(B075W9VSW7, 0, TeamB)",B07F9RR8PN,0.0,TeamB
1,weighted blanket,"(B07C9N1GXD, 0, TeamA)",B07F9RR8PN,0.0,TeamB
...,...,...,...,...,...
9968,lifevac choking device,"(B07P5TTMLJ, 0, TeamA)",B07G5VLVNM,0.0,TeamA
9968,lifevac choking device,"(B00TRM0UI4, 0, TeamB)",B07G5VLVNM,0.0,TeamA
9968,lifevac choking device,"(B01MSWT7H7, 0, TeamA)",B07G5VLVNM,0.0,TeamA
9968,lifevac choking device,"(B07L4BFQBK, 0, TeamA)",B07G5VLVNM,0.0,TeamA


In [11]:
query_set_df[query_set_df['team'] == 'TeamA'].describe()

Unnamed: 0,clicks
count,47991.0
mean,0.019795
std,0.459468
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,17.0


In [12]:
query_set_df[query_set_df['team'] == 'TeamB'].describe()

Unnamed: 0,clicks
count,48038.0
mean,0.022378
std,0.550752
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,21.0


In [13]:
query_set_df[query_set_df['team'] == 'TeamA'][query_set_df[query_set_df['team'] == 'TeamA']['clicks'] > 0]

Unnamed: 0,query,res,id,clicks,team
267,desk lamp,"(B08R3HB3QB, 0, TeamA)",B07DMFDC6W,9.0,TeamA
267,desk lamp,"(B072PS1YBH, 0, TeamB)",B07DMFDC6W,9.0,TeamA
267,desk lamp,"(B01M4NPL9Z, 0, TeamA)",B07DMFDC6W,9.0,TeamA
267,desk lamp,"(B07G8SCB99, 0, TeamB)",B07DMFDC6W,9.0,TeamA
267,desk lamp,"(B01JFRX3FC, 0, TeamA)",B07DMFDC6W,9.0,TeamA
...,...,...,...,...,...
9384,baby dolls,"(B005537IFW, 0, TeamB)",B00HD0ELFK,8.0,TeamA
9384,baby dolls,"(B004QPEWVS, 0, TeamA)",B00HD0ELFK,8.0,TeamA
9384,baby dolls,"(B07RP7JWNM, 0, TeamB)",B00HD0ELFK,8.0,TeamA
9384,baby dolls,"(B095LF377Q, 0, TeamB)",B00HD0ELFK,8.0,TeamA


In [14]:
query_set_df[query_set_df['team'] == 'TeamB'][query_set_df[query_set_df['team'] == 'TeamB']['clicks'] > 0]

Unnamed: 0,query,res,id,clicks,team
366,yoga swing without handles,"(B000BPBXL4, 0, TeamB)",B07FM5Z5C3,21.0,TeamB
366,yoga swing without handles,"(B076ZY2438, 0, TeamA)",B07FM5Z5C3,21.0,TeamB
366,yoga swing without handles,"(B07CGP4SB2, 0, TeamA)",B07FM5Z5C3,21.0,TeamB
366,yoga swing without handles,"(B01JH4J90G, 0, TeamB)",B07FM5Z5C3,21.0,TeamB
366,yoga swing without handles,"(B07B1RXDGQ, 0, TeamA)",B07FM5Z5C3,21.0,TeamB
...,...,...,...,...,...
8044,10 x 20-feet pop up canopy without sidewalls,"(B000BPBXL4, 0, TeamA)",B07XFR5M5G,1.0,TeamB
8044,10 x 20-feet pop up canopy without sidewalls,"(B01D7VBKQG, 0, TeamB)",B07XFR5M5G,1.0,TeamB
8044,10 x 20-feet pop up canopy without sidewalls,"(B083CBCQL1, 0, TeamA)",B07XFR5M5G,1.0,TeamB
8044,10 x 20-feet pop up canopy without sidewalls,"(B0119Q7J48, 0, TeamB)",B07XFR5M5G,1.0,TeamB


In [15]:
query_set_df[query_set_df['team'] == 'TeamB'][query_set_df[query_set_df['team'] == 'TeamB']['clicks'] > 0].describe()

Unnamed: 0,clicks
count,117.0
mean,9.188034
std,6.3774
min,1.0
25%,3.0
50%,7.0
75%,14.0
max,21.0


In [16]:
query_set_df[query_set_df['team'] == 'TeamA'][query_set_df[query_set_df['team'] == 'TeamA']['clicks'] > 0].describe()

Unnamed: 0,clicks
count,120.0
mean,7.916667
std,4.700408
min,2.0
25%,4.5
50%,7.0
75%,9.75
max,17.0
