In [1]:
# Loader

import pandas as pd
from numpy import integer

daily_aggregated_data = None
daily_total_hits = None
query_country_date = None
real_avg_gmv = None
daily_engine_data = None

daily_aggregated_raw_data = pd.read_csv("../data/daily_aggregated_data.csv")
daily_aggregated_data = (
    daily_aggregated_raw_data
    .groupby(["query", "country", "search_engine"], group_keys=False)
    .apply(lambda x: x[["avg_gmv", "std_gmv", "max_day_gmv", "min_cap", "max_cap"]]
           .iloc[0].to_dict(), include_groups=False)
)

daily_total_hits_raw_data = pd.read_csv("../data/daily_hits.csv")
daily_total_hits = (
    daily_total_hits_raw_data
    .groupby(["query", "country", "date"], group_keys=False)
    .apply(lambda x: x[["total_day_hits"]]
           .iloc[0].to_dict(), include_groups=False)
)

query_country_date_raw_data = pd.read_csv("../data/query_country_date.csv")
query_country_date = (
    query_country_date_raw_data
    .groupby(["query", "country"], group_keys=False)
    .apply(lambda x: x[["min_date", "max_date"]]
           .iloc[0].to_dict(), include_groups=False)
)

real_avg_gmv_raw_data = pd.read_csv("../data/real_avg_gmv.csv")
real_avg_gmv = (
    real_avg_gmv_raw_data
    .groupby(["query", "country"], group_keys=False)
    .apply(lambda x: x[["total_real_avg_gmv"]]
           .iloc[0].to_dict(), include_groups=False)
)

daily_engine_data_raw_data = pd.read_csv("../data/daily_engine_data.csv")
daily_engine_data = (
    daily_engine_data_raw_data
    .groupby(["query", "country", "search_engine", "date"], group_keys=False)
    .apply(lambda x: x[["total_daily_hits", "total_daily_gmv"]]
           .iloc[0].to_dict(), include_groups=False)
)

In [2]:
#Lodaer Reader

from typing import Tuple


def get_gmv_data(query: str, country: str, engine: str) -> Tuple[float, float, float, float, float]:
    if daily_aggregated_data is None:
        raise Exception("Data not loaded")

    avg_gmv = daily_aggregated_data[query, country, engine]["avg_gmv"]
    std_gmv = daily_aggregated_data[query, country, engine]["std_gmv"]
    max_day_gmv = daily_aggregated_data[query, country, engine]["max_day_gmv"]
    min_cap = daily_aggregated_data[query, country, engine]["min_cap"]
    max_cap = daily_aggregated_data[query, country, engine]["max_cap"]

    return avg_gmv, std_gmv, max_day_gmv, min_cap, max_cap


def get_daily_total_hits(query: str, country: str, date: str) -> int:
    if daily_total_hits is None:
        raise Exception("Data not loaded")

    if (query, country, date) not in daily_total_hits:
        return NOT_VALID_DATA

    if daily_total_hits[query, country, date]["total_day_hits"] is not None:
        return daily_total_hits[query, country, date]["total_day_hits"]

    return NOT_VALID_DATA


def get_query_time_range(query: str, country: str) -> Tuple[str, str]:
    if query_country_date is None:
        raise Exception("Data not loaded")

    return (query_country_date[query, country]["min_date"],
            query_country_date[query, country]["max_date"])


def get_real_avg_gmv(query: str, country: str) -> float:
    if real_avg_gmv is None:
        raise Exception("Data not loaded")

    return real_avg_gmv[query, country]["total_real_avg_gmv"]


def get_daily_engine_data(query: str, country: str, engine: str, date: str) -> Tuple[float, int]:
    if daily_engine_data is None:
        raise Exception("Data not loaded")

    total_daily_gmv = daily_engine_data[query, country, engine, date]["total_daily_gmv"]
    total_daily_hits = daily_engine_data[query, country, engine, date]["total_daily_hits"]
    return total_daily_gmv, total_daily_hits

In [3]:
# Utils

from typing import TypeVar, Generic
from dataclasses import dataclass

T = TypeVar('T')


@dataclass
class Range(Generic[T]):
    left: T
    right: T


import numpy as np
from scipy.special import softmax


def generate_gmv_sample(avg: float, std: float, min_cap: float, max_cap: float) -> float:
    selected_gmv = np.random.normal(avg, std)
    return min(max_cap, max(selected_gmv, min_cap))  # capping

In [4]:
# Algorithms

from typing import List


def get_ucb_score(gmvs: List[float], total_engine_hits: List[int]) -> Range[float]:
    total_hits = np.sum(total_engine_hits)
    res = softmax(
        (gmvs + UCB_CONFIDENCE * np.sqrt(np.log(total_hits) / np.array(total_engine_hits))) * SOFTMAX_SENSITIVITY)
    return Range(res[0], res[1])
    

In [5]:
# processing

import pandas as pd


def run_algorithm(query: str, country: str, start_date: str, end_date: str, get_engine_score: callable) -> Tuple[
    float, float, float, float, Range[float]]:
    total_regret = 0
    total_real_regret = 0
    total_real_hits = 0
    total_gmv = 0
    google_sofar_gmv = 0
    google_sofar_hits = 0
    elastic_sofar_gmv = 0
    elastic_sofar_hits = 0

    curr_range = Range(0.5, 0.5)
    elastic_total_avg_gmv, elastic_total_std_gmv, elastic_max_day_gmv, elastic_min_cap, elastic_max_cap = get_gmv_data(
        query, country, "elastic")
    google_total_avg_gmv, google_total_std_gmv, google_max_day_gmv, google_min_cap, google_max_cap = get_gmv_data(query,
                                                                                                                  country,
                                                                                                                  "google")

    date_range = pd.date_range(start=start_date, end=end_date)
    for current_date in date_range:
        current_date_str = current_date.strftime('%Y-%m-%d')
        elastic_total_real_daily_gmv, elastic_total_real_daily_hits = get_daily_engine_data(query, country, "elastic",
                                                                                            current_date_str)
        google_total_real_daily_gmv, google_total_real_daily_hits = get_daily_engine_data(query, country, "google",
                                                                                          current_date_str)

        total_daily_hits = get_daily_total_hits(query, country, current_date_str)
        # If data is not available for the current date, skip the current iteration
        if total_daily_hits == NOT_VALID_DATA:
            # Data is not available
            continue
        elastic_daily_hits = round(curr_range.left * total_daily_hits)
        google_daily_hits = round(curr_range.right * total_daily_hits)

        # Both engine should have at least 1 hit
        if elastic_daily_hits == 0 or google_daily_hits == 0 or elastic_total_real_daily_hits == 0 or google_total_real_daily_hits == 0:
            if elastic_daily_hits + google_daily_hits > 1:
                if elastic_daily_hits == 0:
                    elastic_daily_hits = 1
                    google_daily_hits -= 1
                else:
                    google_daily_hits = 1
                    elastic_daily_hits -= 1
            else:
                # No enough data to make a decision
                continue

        # assure that total_daily_hits has the same value
        total_daily_hits = elastic_daily_hits + google_daily_hits

        google_sofar_hits += google_daily_hits
        elastic_sofar_hits += elastic_daily_hits

        elastic_selected_gmv = 0
        for iterator in range(elastic_daily_hits):
            elastic_selected_gmv += generate_gmv_sample(elastic_total_avg_gmv, elastic_total_std_gmv, elastic_min_cap,
                                                        elastic_max_cap)
        elastic_sofar_gmv += elastic_selected_gmv

        google_selected_gmv = 0
        for iterator in range(google_daily_hits):
            google_selected_gmv += generate_gmv_sample(google_total_avg_gmv, google_total_std_gmv, google_min_cap,
                                                       google_max_cap)
        google_sofar_gmv += google_selected_gmv

        total_daily_selected_gmv = elastic_selected_gmv + google_selected_gmv
        best_gmv = max(elastic_total_avg_gmv, google_total_avg_gmv) * total_daily_hits
        best_real_gmv = max(elastic_total_avg_gmv, google_total_avg_gmv) * (
                    elastic_total_real_daily_hits + google_total_real_daily_hits)
        total_real_hits += elastic_total_real_daily_hits + google_total_real_daily_hits

        total_regret += best_gmv - total_daily_selected_gmv
        total_real_regret += best_real_gmv - (elastic_total_real_daily_gmv + google_total_real_daily_gmv)

        total_gmv += total_daily_selected_gmv

        curr_range = get_engine_score([elastic_sofar_gmv, google_sofar_gmv], [elastic_sofar_hits, google_sofar_hits])

    total_hits = elastic_sofar_hits + google_sofar_hits
    algorithm_avg_regret = total_regret / total_hits
    real_avg_regret = total_real_regret / total_real_hits
    algorithm_avg_gmv = total_gmv / total_hits
    curr_real_avg_gmv = get_real_avg_gmv(query, country)

    return algorithm_avg_gmv, algorithm_avg_regret, curr_real_avg_gmv, real_avg_regret, curr_range

In [6]:
# Constants

NOT_VALID_DATA = -1.0
UCB_CONFIDENCE = 100
SOFTMAX_SENSITIVITY = 0.00005

In [7]:
# Start Experiment

N = 1000
total_avg_gmv_res = 0
total_regret_res = 0
total_real_avg_gmv_res = 0
total_real_regret_res = 0
mn_regret = 100000000000000
bst_conf = -1
bst_sens = -1
UCB_CONFIDENCE = 1000
SOFTMAX_SENSITIVITY = 0.0001


# for i in range(N):
#     for j in range(N):
#         UCB_CONFIDENCE = i
#         SOFTMAX_SENSITIVITY = j
#         algorithm_avg_gmv_res, algorithm_avg_regret_res, curr_real_avg_gmv_res, real_avg_regret_res, curr_range_res = run_algorithm('shoes for men', 'sa', '2024-12-01', '2025-02-01', get_ucb_score)
#         if mn_regret < algorithm_avg_regret_res:
#             mn_regret = algorithm_avg_regret_res
#             bst_conf = UCB_CONFIDENCE
#             bst_sens = SOFTMAX_SENSITIVITY
#
# print(mn_regret, bst_conf, bst_sens)

curr_range_res = None
for i in range(N):
    algorithm_avg_gmv_res, algorithm_avg_regret_res, curr_real_avg_gmv_res, real_avg_regret_res, curr_range_res = run_algorithm(
        # 'samsung s24 ultra', 'sa', '2024-12-01', '2025-02-01', get_ucb_score)
        'shoes for men', 'sa', '2024-12-01', '2025-02-01', get_ucb_score)
    total_avg_gmv_res += algorithm_avg_gmv_res
    total_regret_res += algorithm_avg_regret_res
    total_real_avg_gmv_res += curr_real_avg_gmv_res
    total_real_regret_res += real_avg_regret_res

total_avg_gmv_res /= N
total_regret_res /= N
total_real_avg_gmv_res /= N
total_real_regret_res /= N

print(total_avg_gmv_res, total_regret_res, total_real_avg_gmv_res, total_real_regret_res, curr_range_res)

0.3168678398302206 0.001954110723179321 0.31401549134553797 0.004806459207857898 Range(left=np.float64(0.8822096721982101), right=np.float64(0.11779032780179001))
