In [59]:
# Loader

import pandas as pd

daily_aggregated_data = None
daily_total_hits = None
query_country_date = None
real_avg_gmv = None
daily_engine_data = None

daily_aggregated_raw_data = pd.read_csv("../data/bad/daily_aggregated_data.csv")
daily_aggregated_data = (
    daily_aggregated_raw_data
    .groupby(["query", "country", "search_engine"], group_keys=False)
    .apply(lambda x: x[["avg_gmv", "std_gmv", "min_cap", "max_cap"]]
           .iloc[0].to_dict(), include_groups=False)
)

daily_total_hits_raw_data = pd.read_csv("../data/bad/daily_hits.csv")
daily_total_hits = (
    daily_total_hits_raw_data
    .groupby(["query", "country", "date"], group_keys=False)
    .apply(lambda x: x[["total_day_hits"]]
           .iloc[0].to_dict(), include_groups=False)
)

query_country_date_raw_data = pd.read_csv("../data/bad/query_country_date.csv")
query_country_date = (
    query_country_date_raw_data
    .groupby(["query", "country"], group_keys=False)
    .apply(lambda x: x[["min_date", "max_date"]]
           .iloc[0].to_dict(), include_groups=False)
)

real_avg_gmv_raw_data = pd.read_csv("../data/bad/real_avg_gmv.csv")
real_avg_gmv = (
    real_avg_gmv_raw_data
    .groupby(["query", "country"], group_keys=False)
    .apply(lambda x: x[["total_real_avg_gmv"]]
           .iloc[0].to_dict(), include_groups=False)
)

daily_engine_data_raw_data = pd.read_csv("../data/bad/daily_engine_data.csv")
daily_engine_data = (
    daily_engine_data_raw_data
    .groupby(["query", "country", "search_engine", "date"], group_keys=False)
    .apply(lambda x: x[["total_daily_hits", "total_daily_gmv"]]
           .iloc[0].to_dict(), include_groups=False)
)

In [18]:
#Lodaer Reader

from typing import Tuple


def get_gmv_data(query: str, country: str, engine: str) -> Tuple[float, float, float, float]:
    if daily_aggregated_data is None:
        raise Exception("Data not loaded")

    if (query, country, engine) not in daily_aggregated_data:
        return NOT_VALID_DATA, NOT_VALID_DATA, NOT_VALID_DATA, NOT_VALID_DATA

    avg_gmv = daily_aggregated_data[query, country, engine]["avg_gmv"]
    std_gmv = daily_aggregated_data[query, country, engine]["std_gmv"]
    min_cap = daily_aggregated_data[query, country, engine]["min_cap"]
    max_cap = daily_aggregated_data[query, country, engine]["max_cap"]

    return avg_gmv, std_gmv, min_cap, max_cap


def get_daily_total_hits(query: str, country: str, date: str) -> int:
    if daily_total_hits is None:
        raise Exception("Data not loaded")

    if (query, country, date) not in daily_total_hits:
        return NOT_VALID_DATA

    if daily_total_hits[query, country, date]["total_day_hits"] is not None:
        return daily_total_hits[query, country, date]["total_day_hits"]

    return NOT_VALID_DATA


def get_all_query_time_range(query: str, country: str) -> Tuple[str, str]:
    if query_country_date is None:
        raise Exception("Data not loaded")

    if (query, country) not in query_country_date:
        return NOT_VALID_DATA, NOT_VALID_DATA

    return (query_country_date[query, country]["min_date"],
            query_country_date[query, country]["max_date"])


def get_real_avg_gmv(query: str, country: str) -> float:
    if real_avg_gmv is None:
        raise Exception("Data not loaded")

    if (query, country) not in real_avg_gmv:
        return NOT_VALID_DATA

    return real_avg_gmv[query, country]["total_real_avg_gmv"]


def get_daily_engine_data(query: str, country: str, engine: str, date: str) -> Tuple[float, int]:
    if daily_engine_data is None:
        raise Exception("Data not loaded")

    if (query, country, engine, date) not in daily_engine_data:
        return NOT_VALID_DATA, NOT_VALID_DATA

    total_daily_gmv = daily_engine_data[query, country, engine, date]["total_daily_gmv"]
    total_daily_hits = daily_engine_data[query, country, engine, date]["total_daily_hits"]
    return total_daily_gmv, total_daily_hits

In [49]:
# Constants

NOT_VALID_DATA = -1.0
Z_SCORE = 1.96
MIN_ENGINE_RANGE = 0.03
DEFAULT_ENGINES_SPLIT = {
    'elastic': 0.2,
    'google': 0.8
}

In [36]:
# Utils

import numpy as np
from scipy.special import softmax

def adjusted_softmax(arr: np.array, min_val: float) -> np.array:
    arr = softmax(arr)
    # ensure that every element is at least min_val
    arr = np.maximum(arr, min_val)
    # re-normalize the array
    arr = arr / np.sum(arr)

    return arr

def generate_gmv_sample(avg: float, std: float, min_cap: float, max_cap: float) -> float:
    selected_gmv = np.random.normal(avg, std)
    return min(max_cap, max(selected_gmv, min_cap))  # capping

In [42]:
# Algorithms

from typing import List, Dict
from abc import ABC, abstractmethod


class ScoringAlgorithm(ABC):
    @abstractmethod
    def get_score(self, engines_gmvs: List[float]) -> List[float]:
        pass


class UCBScoringAlgorithm(ScoringAlgorithm):

    def __init__(self, S: float, C: float):
        self.S = S
        self.C = C

    def __str__(self):
        return f"UCB: S:{self.S}, C:{self.C}"

    def __repr__(self):
        return f"UCB: S:{self.S}, C:{self.C}"

    def get_score(self, engines_gmvs: List[float]) -> List[float]:
        total_engine_hits = sum(len(engine_gmv) for engine_gmv in engines_gmvs)
        agg_hits = [len(engine_gmv) for engine_gmv in engines_gmvs]
        agg_gmvs = [sum(engine_gmv) for engine_gmv in engines_gmvs]

        agg_gmvs = np.array(agg_gmvs)
        agg_hits = np.array(agg_hits)
        agg_hits = np.maximum(agg_hits, 1)
        ucb_res = softmax((agg_gmvs + self.C * np.sqrt(np.log(total_engine_hits) / agg_hits)) * self.S)

        return ucb_res.tolist()


THOMPSON_ITERATIONS = 10000


class ThompsonScoringAlgorithm(ScoringAlgorithm):

    def __init__(self, S: float, n_iterations=THOMPSON_ITERATIONS):
        self.S = S
        self.n_iterations = n_iterations

    def __str__(self):
        return f"TS: S:{self.S}, N:{self.n_iterations}"

    def __repr__(self):
        return f"TS: S:{self.S}, N:{self.n_iterations}"

    def get_score(self, engines_gmvs: List[float]) -> List[float]:
        avg_gmvs = [np.mean(engine_gmvs) for engine_gmvs in engines_gmvs]
        std_gmvs = [np.std(engine_gmvs) for engine_gmvs in engines_gmvs]
        min_caps = [max(0, avg - std * Z_SCORE) for avg, std in zip(avg_gmvs, std_gmvs)]
        max_caps = [avg + std * Z_SCORE for avg, std in zip(avg_gmvs, std_gmvs)]

        num_elements = len(engines_gmvs)
        scores = [0] * num_elements

        for _ in range(self.n_iterations):
            sampled_gmvs = [generate_gmv_sample(avg, std, min_cap, max_cap) for avg, std, min_cap, max_cap in
                            zip(avg_gmvs, std_gmvs, min_caps, max_caps)]
            max_sampled_gmv = max(sampled_gmvs)
            scores = [score + 1 if sampled_gmv == max_sampled_gmv else score for score, sampled_gmv in
                      zip(scores, sampled_gmvs)]

        scores = np.array(scores)
        thompson_res = adjusted_softmax((scores / self.n_iterations) * self.S, MIN_ENGINE_RANGE)

        return thompson_res.tolist()



In [50]:
# processing

import pandas as pd


class SimulationResult:
    def __init__(self):
        self.total_avg_regrets: List[float] = []
        self.total_avg_real_regrets: List[float] = []
        self.avg_gmvs: List[float] = []
        self.total_avg_real_gmvs: List[float] = []
        self.engines_ranges: List[Dict[str, float]] = []
        self.days: List[str] = []


class EngineAggData:
    def __init__(self, avg_gmv: float, std_gmv: float, min_cap: float, max_cap: float):
        self.avg_gmv = avg_gmv
        self.std_gmv = std_gmv
        self.min_cap = min_cap
        self.max_cap = max_cap


class EngineDailyData:
    def __init__(self, daily_gmv: float, daily_hits: int):
        self.daily_gmv = daily_gmv
        self.daily_hits = daily_hits


class Data:
    def __init__(self, total_gmv: float, total_regret: float, total_hits: int):
        self.total_gmv = total_gmv
        self.total_regret = total_regret
        self.total_hits = total_hits


def run_algorithm(query: str, country: str, start_date: str, end_date: str,
                  scoring_algorithm: ScoringAlgorithm, engines: List[str] = None,
                  default_ranges:Dict[str, float] = None) -> SimulationResult:
    if engines is None:
        engines = ['elastic', 'google']

    algo_data = Data(0, 0, 0)
    real_data = Data(0, 0, 0)
    engines_sofar_gmv = {engine: [] for engine in engines}
    engines_range = {engine: 1 / len(engines) for engine in engines}
    if default_ranges is not None:
        engines_range = default_ranges
    simulationResult = SimulationResult()
    # use this if engine is introduced in the middle of the simulation
    is_engine_active = {engine: False for engine in engines}

    # Get the real aggregated gmv data for each engine
    engines_agg_data = {}
    for engine in engines:
        avg_gmv, std_gmv, min_cap, max_cap = get_gmv_data(query, country, engine)
        if avg_gmv == NOT_VALID_DATA:
            return simulationResult
        engines_agg_data[engine] = EngineAggData(avg_gmv, std_gmv, min_cap, max_cap)

    date_range = pd.date_range(start=start_date, end=end_date)
    for current_date in date_range:
        current_date_str = current_date.strftime('%Y-%m-%d')

        # Get the real daily gmv data for each engine
        engines_daily_data = {}
        for engine in engines:
            daily_gmv, daily_hits = get_daily_engine_data(query, country, engine, current_date_str)
            if daily_hits == NOT_VALID_DATA:
                daily_hits, daily_gmv = 0, 0
            else:
                is_engine_active[engine] = True
            engines_daily_data[engine] = EngineDailyData(daily_gmv, daily_hits)

        total_daily_hits = get_daily_total_hits(query, country, current_date_str)
        # If data is not available for the current date, skip the current iteration
        if total_daily_hits == NOT_VALID_DATA:
            continue

        # Get the daily algo hits for each engine
        engines_daily_algo_hits = {}
        for engine in engines:
            engines_daily_algo_hits[engine] = round(total_daily_hits * engines_range[engine]) * is_engine_active[engine]
            algo_data.total_hits += engines_daily_algo_hits[engine]

        # Make sure that total_daily_hits has the same value as the sum of the hits of the all engines
        total_daily_hits = sum(engines_daily_algo_hits.values())
        algo_data.total_gmv += total_daily_hits

        # Aggregate the total real hits for all engines
        for engine in engines:
            real_data.total_hits += engines_daily_data[engine].daily_hits

        total_algo_daily_selected_gmv = 0

        # Simulate engines gmvs
        for engine in engines:
            for _ in range(engines_daily_algo_hits[engine]):
                gmv_sample = generate_gmv_sample(engines_agg_data[engine].avg_gmv, engines_agg_data[engine].std_gmv,
                                                 engines_agg_data[engine].min_cap, engines_agg_data[engine].max_cap)
                total_algo_daily_selected_gmv += gmv_sample
                engines_sofar_gmv[engine].append(gmv_sample)

        algo_data.total_gmv += total_algo_daily_selected_gmv
        simulationResult.avg_gmvs.append(algo_data.total_gmv / algo_data.total_hits)

        # Best daily gmv is the max avg gmv of all engines multiplied by the total daily hits
        best_gmv = max([engines_agg_data[engine].avg_gmv for engine in engines]) * total_daily_hits
        # Best real gmv is the max avg gmv of all engines multiplied by the total real hits
        best_real_gmv = max([engines_agg_data[engine].avg_gmv for engine in engines]) * sum(
            [engines_daily_data[engine].daily_hits for engine in engines])

        # Calc algo regret
        algo_data.total_regret += max(0, best_gmv - total_algo_daily_selected_gmv)
        simulationResult.total_avg_regrets.append(algo_data.total_regret / algo_data.total_hits)

        # Calc real regret
        real_data.total_regret += max(0,
                                      best_real_gmv - sum([engines_daily_data[engine].daily_gmv for engine in engines]))
        simulationResult.total_avg_real_regrets.append(real_data.total_regret / real_data.total_hits)

        # Calc real gmv
        real_data.total_gmv += (sum([engines_daily_data[engine].daily_gmv for engine in engines]))
        simulationResult.total_avg_real_gmvs.append(real_data.total_gmv / real_data.total_hits)

        # Calc ranges
        algo_scores = scoring_algorithm.get_score([engines_sofar_gmv[engine] for engine in engines])
        engines_range = {engine: rng for engine, rng in zip(engines, algo_scores)}
        simulationResult.engines_ranges.append(engines_range)

        # Log days
        simulationResult.days.append(current_date_str)

    return simulationResult

In [26]:
#     total_avg_gmv_res += algorithm_avg_gmv_res
#     total_regret_res += algorithm_avg_regret_res
#     total_real_avg_gmv_res += curr_real_avg_gmv_res
#     total_real_regret_res += real_avg_regret_res
#
# total_avg_gmv_res /= N
# total_regret_res /= N
# total_real_avg_gmv_res /= N
# total_real_regret_res /= N

# print(total_avg_gmv_res, total_regret_res, total_real_avg_gmv_res, total_real_regret_res, curr_range_res)
#  -----------------------------------------------------------
# | ThompsonScoringAlgorithm                                  |
# | SOFTMAX_SENSITIVITY min = 0.128 max = 65.536              |
# |                                                           |
# | UCBScoringAlgorithm                                       |
# | UCB_CONFIDENCE = 10000                                    |
# | SOFTMAX_SENSITIVITY min = 2.62144e-06 max = 0.00067108864 |
#  -----------------------------------------------------------


In [7]:
# drow

from matplotlib import pyplot as plt
import matplotlib.dates as mdates
from typing import List


def show_graph(values_to_draw: List[float], days: List[str], graph_title: str, graph_label: str,
               graph_color: str = 'b'):
    graph_days = pd.to_datetime(days)
    plt.plot(graph_days, values_to_draw, color=graph_color, label=graph_label)
    plt.xlabel('date')
    plt.ylabel(graph_label)
    plt.title(graph_title)
    plt.gca().xaxis.set_major_locator(mdates.WeekdayLocator(interval=1))
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))
    plt.show()


def show_graph_multiple(values_to_draw: List[List[float]], days: List[str], ylabel: str, graph_title: str,
                        graph_labels: List[str], y_limit:List[float] = None):
    graph_days = pd.to_datetime(days)
    for i in range(len(values_to_draw)):
        plt.plot(graph_days, values_to_draw[i], label=graph_labels[i])
    plt.xlabel('date')
    plt.ylabel(ylabel)
    if y_limit is not None:
        plt.ylim(y_limit[0], y_limit[1])
    plt.title(graph_title)
    plt.gca().xaxis.set_major_locator(mdates.WeekdayLocator(interval=1))
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m-%d'))
    plt.legend(fontsize=8, bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    plt.style.use('dark_background')
    plt.figure(figsize=(16, 10))

    plt.show()

In [60]:
import time

# Start Experiment Thompson

# Custom queries to train with
queries_train = [
    ('ملصقات', 'sa', '2024-12-01', '2025-02-01')
    , ('مزيل عرق', 'sa', '2024-12-01', '2025-02-01'),
    ('adidas shoes', 'sa', '2024-12-01', '2025-02-01'),
    ('جزمة رجالي', 'sa', '2024-12-01', '2025-02-01'),
    ('puma shoes for men', 'sa', '2024-12-01', '2025-02-01'),
    ('boots', 'eg', '2024-12-01', '2025-02-01'),
    ('samsung', 'sa', '2024-12-01', '2025-02-01'),
    ('شنط جس', 'sa', '2024-12-01', '2025-02-01'),
    ('boots for ladies', 'sa', '2024-12-01', '2025-02-01'),
    ('abaya', 'ae', '2024-12-01', '2025-02-01'),
    ('iphone 11', 'sa', '2024-12-01', '2025-02-01'),
    ('ماكينة حلاقة', 'sa', '2024-12-01', '2025-02-01'),
    ('airpods', 'ae', '2024-12-01', '2025-02-01'),
    ('استشوار شعر', 'sa', '2024-12-01', '2025-02-01'),
    ('سماعه بلوتوث', 'sa', '2024-12-01', '2025-02-01'),
    ('ابجوره طويله', 'sa', '2024-12-01', '2025-02-01'),
    ('لوحه جداريه', 'sa', '2024-12-01', '2025-02-01'),
    ('نيو بالانس احذية نسائية', 'sa', '2024-12-01', '2025-02-01'),
    ('new balance shoes for men', 'ae', '2024-12-01', '2025-02-01'),
    ('العاب', 'sa', '2024-12-01', '2025-02-01'),
    ('iphone 13 pro max', 'sa', '2024-12-01', '2025-02-01')
]

column_order = ['query', 'country', 'min_date', 'max_date']
query_country_date_raw_data = query_country_date_raw_data[column_order]
start = 10
end = 100
num_of_steps = 5
sensitivities = np.logspace(np.log10(start), np.log10(end), num=num_of_steps)
ALGO_N_ITERATIONS = 10

experiment_results = pd.DataFrame(
    columns=['query', 'country', 'date', 'engines_range', 'config', 'algo_regret', 'real_regret', 'algo_gmv',
             'real_gmv'])
total_queries = len(queries_train)  #len(query_country_date_raw_data)

for query_it, country_it, start_date_it, end_date_it in queries_train:  #query_country_date_raw_data.itertuples(index=False, name=None):
    start_time = time.perf_counter()
    for sensitivity in sensitivities:
        res = run_algorithm(
            query_it, country_it, start_date_it, end_date_it, ThompsonScoringAlgorithm(sensitivity, 10000),
            ['elastic', 'google', 'she7ata_engine'])

        if len(res.days) == 0:
            break

        for _ in range(ALGO_N_ITERATIONS - 1):
            temp = run_algorithm(
                query_it, country_it, start_date_it, end_date_it, ThompsonScoringAlgorithm(sensitivity, 10000),
                ['elastic', 'google', 'she7ata_engine'])
            res.total_avg_regrets = [x + y for x, y in zip(res.total_avg_regrets, temp.total_avg_regrets)]
            res.total_avg_real_regrets = [x + y for x, y in
                                          zip(res.total_avg_real_regrets, temp.total_avg_real_regrets)]
            res.avg_gmvs = [x + y for x, y in zip(res.avg_gmvs, temp.avg_gmvs)]
            res.total_avg_real_gmvs = [x + y for x, y in zip(res.total_avg_real_gmvs, temp.total_avg_real_gmvs)]
            for i in range(len(res.engines_ranges)):
                for k in res.engines_ranges[i].keys():
                    res.engines_ranges[i][k] += temp.engines_ranges[i][k]

        res.total_avg_regrets = [x / ALGO_N_ITERATIONS for x in res.total_avg_regrets]
        res.total_avg_real_regrets = [x / ALGO_N_ITERATIONS for x in res.total_avg_real_regrets]
        res.avg_gmvs = [x / ALGO_N_ITERATIONS for x in res.avg_gmvs]
        res.total_avg_real_gmvs = [x / ALGO_N_ITERATIONS for x in res.total_avg_real_gmvs]
        for i in range(len(res.engines_ranges)):
            for k in res.engines_ranges[i].keys():
                res.engines_ranges[i][k] /= ALGO_N_ITERATIONS

        result_data = {}

        result_data['query'] = [query_it] * len(res.days)
        result_data['country'] = [country_it] * len(res.days)
        result_data['date'] = res.days
        result_data['engines_range'] = res.engines_ranges
        result_data['config'] = [ThompsonScoringAlgorithm(sensitivity)] * len(res.days)
        result_data['algo_regret'] = res.total_avg_regrets
        result_data['real_regret'] = res.total_avg_real_regrets
        result_data['algo_gmv'] = res.avg_gmvs
        result_data['real_gmv'] = res.total_avg_real_gmvs

        experiment_results = pd.concat([experiment_results, pd.DataFrame(result_data)])
        # show_graph_multiple([[x.left for x in res.engines_ranges], [x.right for x in res.engines_ranges]], res.days, f'engines_ranges over dates query = {query_it}, thompson soft_max_sens = {sensitivity}', ['elastic', 'google'], ['b', 'r'])
        # show_graph(res.total_avg_regrets, res.days, f'avg_regret over dates query = {query_it}, thompson soft_max_sens = {sensitivity}', 'avg_regret', 'r')
        # show_graph(res.total_avg_real_regrets, res.days, f'avg_real_regret over dates query = {query_it}, thompson soft_max_sens = {sensitivity}', 'avg_real_regret', 'g')
        # show_graph(res.avg_gmvs, res.days, f'avg_gmv over dates query = {query_it}, thompson soft_max_sens = {sensitivity}', 'avg_gmv', 'b')
    end_time = time.perf_counter()
    process_time = (end_time - start_time) / 60
    total_queries -= 1
    print("processing done for query: ", query_it, "country: ", country_it, 'in ', process_time, 'minutes',
          "remaining queries: ", total_queries)
print(experiment_results)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  experiment_results = pd.concat([experiment_results, pd.DataFrame(result_data)])


processing done for query:  ملصقات country:  sa in  1.1822371277832038 minutes remaining queries:  20
processing done for query:  مزيل عرق country:  sa in  1.218921738183902 minutes remaining queries:  19
processing done for query:  adidas shoes country:  sa in  1.232954759716328 minutes remaining queries:  18
processing done for query:  جزمة رجالي country:  sa in  1.196062334034165 minutes remaining queries:  17
processing done for query:  puma shoes for men country:  sa in  1.3358185847163744 minutes remaining queries:  16
processing done for query:  boots country:  eg in  1.2203031757002463 minutes remaining queries:  15
processing done for query:  samsung country:  sa in  1.3747537930505738 minutes remaining queries:  14
processing done for query:  شنط جس country:  sa in  1.1888402840161385 minutes remaining queries:  13
processing done for query:  boots for ladies country:  sa in  1.2003922867996153 minutes remaining queries:  12
processing done for query:  abaya country:  ae in  

In [61]:
experiment_results.to_csv('../experiment_output/thompson_results_bad_engine_adjusted_softmax_sens_10_100.csv', index=False)

In [26]:
import time

# Start Experiment UCB

queries_train_ucb = [
    ('ملصقات', 'sa', '2024-12-01', '2025-02-01')
    , ('مزيل عرق', 'sa', '2024-12-01', '2025-02-01'),
    ('adidas shoes', 'sa', '2024-12-01', '2025-02-01'),
    ('جزمة رجالي', 'sa', '2024-12-01', '2025-02-01'),
    ('puma shoes for men', 'sa', '2024-12-01', '2025-02-01'),
    ('boots', 'eg', '2024-12-01', '2025-02-01'),
    ('samsung', 'sa', '2024-12-01', '2025-02-01'),
    ('شنط جس', 'sa', '2024-12-01', '2025-02-01'),
    ('boots for ladies', 'sa', '2024-12-01', '2025-02-01'),
    ('abaya', 'ae', '2024-12-01', '2025-02-01'),
    ('iphone 11', 'sa', '2024-12-01', '2025-02-01'),
    ('ماكينة حلاقة', 'sa', '2024-12-01', '2025-02-01'),
    ('airpods', 'ae', '2024-12-01', '2025-02-01'),
    ('استشوار شعر', 'sa', '2024-12-01', '2025-02-01'),
    ('سماعه بلوتوث', 'sa', '2024-12-01', '2025-02-01'),
    ('ابجوره طويله', 'sa', '2024-12-01', '2025-02-01'),
    ('لوحه جداريه', 'sa', '2024-12-01', '2025-02-01'),
    ('نيو بالانس احذية نسائية', 'sa', '2024-12-01', '2025-02-01'),
    ('new balance shoes for men', 'ae', '2024-12-01', '2025-02-01'),
    ('العاب', 'sa', '2024-12-01', '2025-02-01'),
    ('iphone 13 pro max', 'sa', '2024-12-01', '2025-02-01')
]
column_order_ucb = ['query', 'country', 'min_date', 'max_date']
query_country_date_raw_data_ucb = query_country_date_raw_data[column_order_ucb]

start_sens_ucb = 2.62144e-06
end_sens_ucb = 0.00067108864
start_conf_ucb = 100000
end_conf_ucb = 100000
num_of_steps_ucb = 1
sensitivities_ucb = np.logspace(np.log10(start_sens_ucb), np.log10(end_sens_ucb), num=num_of_steps_ucb)
confidences_ucb = np.logspace(np.log10(start_conf_ucb), np.log10(end_conf_ucb), num=num_of_steps_ucb)
ALGO_N_ITERATIONS_UCB = 50

experiment_results_ucb = pd.DataFrame(
    columns=['query', 'country', 'date', 'engines_range', 'config', 'algo_regret', 'real_regret', 'algo_gmv',
             'real_gmv'])
total_queries_ucb = len(queries_train_ucb)  #len(query_country_date_raw_data)
for query_it_ucb, country_it_ucb, start_date_it_ucb, end_date_it_ucb in queries_train_ucb:  #query_country_date_raw_data.itertuples(index=False, name=None):
    start_time_ucb = time.perf_counter()
    for sensitivity_ucb in sensitivities_ucb:
        for confidence_ucb in confidences_ucb:
            res_ucb = run_algorithm(
                query_it_ucb, country_it_ucb, start_date_it_ucb, end_date_it_ucb,
                UCBScoringAlgorithm(sensitivity_ucb, confidence_ucb), ['elastic', 'google'], DEFAULT_ENGINES_SPLIT)

            if len(res_ucb.days) == 0:
                break

            for _ in range(ALGO_N_ITERATIONS_UCB - 1):
                temp_ucb = run_algorithm(
                    query_it_ucb, country_it_ucb, start_date_it_ucb, end_date_it_ucb,
                    UCBScoringAlgorithm(sensitivity_ucb, confidence_ucb), ['elastic', 'google'], DEFAULT_ENGINES_SPLIT)
                res_ucb.total_avg_regrets = [x + y for x, y in
                                             zip(res_ucb.total_avg_regrets, temp_ucb.total_avg_regrets)]
                res_ucb.total_avg_real_regrets = [x + y for x, y in
                                                  zip(res_ucb.total_avg_real_regrets, temp_ucb.total_avg_real_regrets)]
                res_ucb.avg_gmvs = [x + y for x, y in zip(res_ucb.avg_gmvs, temp_ucb.avg_gmvs)]
                res_ucb.total_avg_real_gmvs = [x + y for x, y in
                                               zip(res_ucb.total_avg_real_gmvs, temp_ucb.total_avg_real_gmvs)]
                for i in range(len(res_ucb.engines_ranges)):
                    for k in res_ucb.engines_ranges[i].keys():
                        res_ucb.engines_ranges[i][k] += temp_ucb.engines_ranges[i][k]

            res_ucb.total_avg_regrets = [x / ALGO_N_ITERATIONS_UCB for x in res_ucb.total_avg_regrets]
            res_ucb.total_avg_real_regrets = [x / ALGO_N_ITERATIONS_UCB for x in res_ucb.total_avg_real_regrets]
            res_ucb.avg_gmvs = [x / ALGO_N_ITERATIONS_UCB for x in res_ucb.avg_gmvs]
            res_ucb.total_avg_real_gmvs = [x / ALGO_N_ITERATIONS_UCB for x in res_ucb.total_avg_real_gmvs]
            for i in range(len(res_ucb.engines_ranges)):
                for k in res_ucb.engines_ranges[i].keys():
                    res_ucb.engines_ranges[i][k] /= ALGO_N_ITERATIONS

            result_data_ucb = {}

            result_data_ucb['query'] = [query_it_ucb] * len(res_ucb.days)
            result_data_ucb['country'] = [country_it_ucb] * len(res_ucb.days)
            result_data_ucb['date'] = res_ucb.days
            result_data_ucb['engines_range'] = res_ucb.engines_ranges
            result_data_ucb['config'] = [UCBScoringAlgorithm(sensitivity_ucb, confidence_ucb)] * len(res_ucb.days)
            result_data_ucb['algo_regret'] = res_ucb.total_avg_regrets
            result_data_ucb['real_regret'] = res_ucb.total_avg_real_regrets
            result_data_ucb['algo_gmv'] = res_ucb.avg_gmvs
            result_data_ucb['real_gmv'] = res_ucb.total_avg_real_gmvs

            experiment_results_ucb = pd.concat([experiment_results_ucb, pd.DataFrame(result_data_ucb)])
            # show_graph_multiple([[x.left for x in res.engines_ranges], [x.right for x in res.engines_ranges]], res.days, f'engines_ranges over dates query = {query_it}, thompson soft_max_sens = {sensitivity}', ['elastic', 'google'], ['b', 'r'])
            # show_graph(res.total_avg_regrets, res.days, f'avg_regret over dates query = {query_it}, thompson soft_max_sens = {sensitivity}', 'avg_regret', 'r')
            # show_graph(res.total_avg_real_regrets, res.days, f'avg_real_regret over dates query = {query_it}, thompson soft_max_sens = {sensitivity}', 'avg_real_regret', 'g')
            # show_graph(res.avg_gmvs, res.days, f'avg_gmv over dates query = {query_it}, thompson soft_max_sens = {sensitivity}', 'avg_gmv', 'b')
    end_time_ucb = time.perf_counter()
    process_time_ucb = (end_time_ucb - start_time_ucb) / 60
    total_queries_ucb -= 1
    print("processing done for query: ", query_it_ucb, "country: ", country_it_ucb, 'in ', process_time_ucb, 'minutes',
          "remaining queries: ", total_queries_ucb)
print(experiment_results_ucb)

  experiment_results_ucb = pd.concat([experiment_results_ucb, pd.DataFrame(result_data_ucb)])


processing done for query:  ملصقات country:  sa in  0.017196118750143798 minutes remaining queries:  20
processing done for query:  مزيل عرق country:  sa in  0.024963793050361952 minutes remaining queries:  19
processing done for query:  adidas shoes country:  sa in  0.02502900833351305 minutes remaining queries:  18
processing done for query:  جزمة رجالي country:  sa in  0.014263135416452617 minutes remaining queries:  17
processing done for query:  puma shoes for men country:  sa in  0.04483889305023089 minutes remaining queries:  16
processing done for query:  boots country:  eg in  0.01595128958336621 minutes remaining queries:  15
processing done for query:  samsung country:  sa in  0.06017094513323779 minutes remaining queries:  14
processing done for query:  شنط جس country:  sa in  0.013725725700108644 minutes remaining queries:  13
processing done for query:  boots for ladies country:  sa in  0.013014288200065494 minutes remaining queries:  12
processing done for query:  abaya 

In [22]:
experiment_results_ucb.to_csv('../experiment_output/ucb_results_bad_engine.csv', index=False)