### Step 1: Import packages

In [70]:
from google.cloud import bigquery
from google.cloud import bigquery_storage
import pandas as pd
import numpy as np
import os
import datetime as dt
from datetime import datetime, timedelta
from math import ceil
import uuid
import scipy.stats
import warnings
warnings.filterwarnings(action="ignore")

### Step 2: Load the historical data and define the input parameters

In [2]:
# Load the SQL script that builds the historical-data table.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("sql_queries.sql", mode="r", encoding="utf-8") as f:
    query = f.read()

# One client submits query jobs; the storage read client is used later to download results
client = bigquery.Client(project="logistics-data-staging-flat")
bqstorage_client = bigquery_storage.BigQueryReadClient()

# Submit the script and block until the job has finished
client.query(query=query).result()

In [3]:
# Download the final table generated by the query into a pandas data frame
final_table_sql = """SELECT * FROM `dh-logistics-product-ops.pricing.ab_test_individual_orders_augmented_randomization_algo_analysis`"""
df = (
    client.query(final_table_sql)
    .result()
    .to_dataframe(bqstorage_client=bqstorage_client, progress_bar_type="tqdm")
)

Downloading: 100%|██████████| 1422904/1422904 [01:07<00:00, 21182.12rows/s]


In [4]:
# Zone groups used in the analysis. Despite the name, this is a list with one dictionary per zone group;
# each dictionary carries the entity ID, the ASA ID, the zone names, and a zone group identifier.
entity_asa_zone_dict = [
    # SG
    {
        "entity_id": "FP_SG",
        "asa_id": 559,
        "zone_names": ["Bukitpanjang", "Jurongwest", "Woodlands"],
        "zone_group_identifier": "zg_1",
    },
    {
        "entity_id": "FP_SG",
        "asa_id": 560,
        "zone_names": ["Far_east", "Jurong east"],
        "zone_group_identifier": "zg_2",
    },
    # HK
    {
        "entity_id": "FP_HK",
        "asa_id": 402,
        "zone_names": ["To kwa wan rider", "Kowloon city rider", "Lai chi kok rider"],
        "zone_group_identifier": "zg_3",
    },
    {
        "entity_id": "FP_HK",
        "asa_id": 406,
        "zone_names": [
            "Ma liu shui rider",
            "Kwai chung rider",
            "Sai kung rider",
            "Sheung shui rider",
            "Tai po rider",
            "Tai wai rider",
            "Tin shui wai rider",
            "Tsing yi rider",
            "Tsuen wan rider",
            "Tuen mun rider",
            "Tun chung rider",
            "Yuen long rider",
        ],
        "zone_group_identifier": "zg_4",
    },
    {
        "entity_id": "FP_HK",
        "asa_id": 398,
        "zone_names": ["Admiralty cwb rider", "Happy valley cwb rider", "Kennedy town rider", "Quarry bay rider"],
        "zone_group_identifier": "zg_5",
    },
    # PH
    {
        "entity_id": "FP_PH",
        "asa_id": 496,
        # NOTE(review): the third zone name may contain a typo — verify it against the zone_name values in the data
        "zone_names": ["South alabang atc", "Paranaque", "North Ias pinas", "North alabang atc", "Bf homes"],
        "zone_group_identifier": "zg_6",
    },
    {
        "entity_id": "FP_PH",
        "asa_id": 525,
        "zone_names": ["Bacoor north", "Tagaytay", "Dasmarinas", "Imus"],
        "zone_group_identifier": "zg_7",
    },
    {
        "entity_id": "FP_PH",
        "asa_id": 528,
        "zone_names": ["Antipolo north", "Malabon", "Sjdm", "Valenzuela"],
        "zone_group_identifier": "zg_8",
    },
    {
        "entity_id": "FP_PH",
        "asa_id": 508,
        "zone_names": ["Makati", "Pasay"],
        "zone_group_identifier": "zg_9",
    },
]

In [12]:
# Build one data frame holding only the entity / ASA / zone combinations listed in entity_asa_zone_dict
df_reduced_parts = []
for zone_group in entity_asa_zone_dict:
    is_in_zone_group = (
        (df["entity_id"] == zone_group["entity_id"])
        & (df["asa_id"] == zone_group["asa_id"])
        & (df["zone_name"].isin(zone_group["zone_names"]))
    )
    # .assign() returns a new frame, so the label is never written onto a filtered slice of df
    # (writing onto the slice is chained assignment and triggers SettingWithCopyWarning)
    df_reduced_parts.append(
        df[is_in_zone_group].assign(zone_group_identifier=zone_group["zone_group_identifier"])
    )

# Stack the per-zone-group frames into a single data frame with a fresh 0..n-1 index
df_reduced = pd.concat(df_reduced_parts, ignore_index=True)

# Add a new field to df_reduced holding "dps_sessionid_created_at_utc" as it looks after a round trip through the
# format followed by DPS, "%Y-%m-%dT%H:%M:%SZ": truncated to whole seconds and timezone-aware (UTC).
# The round trip is done column-wise instead of with a per-row lambda; missing timestamps stay NaT instead of raising.
df_reduced["dps_sessionid_created_at_utc_formatted"] = pd.to_datetime(
    pd.to_datetime(df_reduced["dps_sessionid_created_at_utc"]).dt.strftime("%Y-%m-%dT%H:%M:%SZ"),
    utc=True,
)

In [90]:
# The shell script that runs the randomization algorithm takes the starting time of the experiment as one of its inputs.
# We define it as the earliest "dps_sessionid_created_at_utc_formatted" per entity and zone_group_identifier
# (the latest one is kept alongside it).
session_ts_per_zone_group = df_reduced.groupby(["entity_id", "zone_group_identifier"])[
    "dps_sessionid_created_at_utc_formatted"
]
df_min_max_dps_session_start_ts = session_ts_per_zone_group.agg(
    min_dps_session_start_ts="min",
    max_dps_session_start_ts="max",
).reset_index()

In [31]:
# Create a function that takes the zone_group_identifier and writes the CSV file holding the details necessary to run the randomization algorithm
def input_csv_func(zg_identifier, output_path="input.csv"):
    """Write the randomization-algorithm input file for one zone group.

    Selects the rows of the module-level ``df_reduced`` whose ``zone_group_identifier``
    equals ``zg_identifier``, keeps the order code, zone ID and formatted session
    timestamp, sorts them by that timestamp and writes them as a CSV without header
    or index.

    Parameters
    ----------
    zg_identifier : str
        Value of ``zone_group_identifier`` to export (e.g. ``"zg_1"``).
    output_path : str, default "input.csv"
        Destination file. Any existing file at this path is overwritten. The default
        matches the file name used by the shell cells that follow.

    Returns
    -------
    None
    """
    input_columns = ["platform_order_code", "zone_id", "dps_sessionid_created_at_utc_formatted"]
    df_stg = (
        df_reduced.loc[df_reduced["zone_group_identifier"] == zg_identifier, input_columns]
        .sort_values("dps_sessionid_created_at_utc_formatted")
        .reset_index(drop=True)
    )
    # Timestamps are written using their default string representation, one value per row
    df_stg["dps_sessionid_created_at_utc_formatted"] = df_stg["dps_sessionid_created_at_utc_formatted"].map(str)
    # No date_format is passed: the timestamp column is already text at this point, so it would have no effect
    df_stg.to_csv(output_path, index=False, header=False)

# Invoke the function that creates the input file. Keep in mind that this overwrites the already existing input.csv file
input_csv_func(zg_identifier="zg_1")

In [32]:
%%script "C:/Program Files/Git/bin/bash.exe"
# Convert input.csv to Unix line endings in place (the file is rewritten).
# NOTE(review): the interpreter path on the magic line is an absolute, Windows-specific Git Bash location — adjust it on other machines.
dos2unix input.csv

dos2unix: converting file input.csv to Unix format...


In [33]:
%%script "C:/Program Files/Git/bin/bash.exe"
# Run the allocation script; according to its own log the results are written to output.csv.
# -k and -s are the key and salt that the script echoes back in its output.
# NOTE(review): -t is presumably the experiment start time (the minimum session timestamp of the zone group),
# and -w / -v look like the switchback parameters the script validates — confirm against run-allocation.sh.
./run-allocation.sh -w 3.0 -v 3 -t 2023-01-01T00:48:04Z -k 28115 -s DB0720FD-326E-407F-9EA2-512BF8154DDE

key: 28115
salt: DB0720FD-326E-407F-9EA2-512BF8154DDE
Switchback parameters are valid, starting experiment..
Allocation is complete. Results are available in output.csv file


In [71]:
# Once output.csv has been created, read the variants from it and attach them to the orders of the zone group being analysed
df_variants = pd.read_csv("output.csv")

# Work on a copy of the df_reduced rows that belong to the zg_id being analysed
zone_group_orders = df_reduced[df_reduced["zone_group_identifier"] == "zg_1"].copy()

# Left join keeps every order; "OrderID" only duplicates "platform_order_code", so it is dropped after the join
df_analysis = zone_group_orders.merge(
    df_variants,
    how="left",
    left_on="platform_order_code",
    right_on="OrderID",
).drop(columns="OrderID")

In [73]:
# Add a column with the local order date and a column indicating the week number of the test
TEST_START_DATE = pd.Timestamp("2023-01-01")  # First day of week_1
NUM_TEST_WEEKS = 4  # week_1 ... week_4, seven days each

# Truncate the local order timestamp to midnight, column-wise. Any timezone information is dropped
# (keeping the local wall-clock time) so the dates can be compared with the timezone-naive week boundaries.
order_placed_at_local = pd.to_datetime(df_analysis["order_placed_at_local"])
if order_placed_at_local.dt.tz is not None:
    order_placed_at_local = order_placed_at_local.dt.tz_localize(None)
df_analysis["created_date_local"] = order_placed_at_local.dt.normalize()

# One inclusive [week start, week start + 6 days] condition per test week
week_starts = [TEST_START_DATE + pd.Timedelta(weeks=week_idx) for week_idx in range(NUM_TEST_WEEKS)]
conditions = [
    df_analysis["created_date_local"].between(week_start, week_start + pd.Timedelta(days=6))
    for week_start in week_starts
]
week_labels = [f"week_{week_idx}" for week_idx in range(1, NUM_TEST_WEEKS + 1)]

# Orders outside the test window are labelled "0". The default is passed explicitly as a string: np.select's
# implicit default is the integer 0, which does not share a dtype with the string labels (older NumPy silently
# turns it into "0", recent NumPy releases raise a TypeError).
df_analysis["week_num"] = np.select(condlist=conditions, choicelist=week_labels, default="0")

In [116]:
def hr_interval_func_random(sb_interval):
    """Map every switchback window of a day to a freshly generated random identifier.

    The range from 0 to 24 hours is divided into ``int(24 / sb_interval)`` left-closed
    bins with ``pd.cut``. A 2-hour switchback interval yields 12 bins:
    [0, 2), [2, 4), ..., with the last bin also absorbing the value 24.

    Parameters
    ----------
    sb_interval : int or float
        Switchback window size in hours. Supported values are any value >= 1, plus 0.5 and 0.25.

    Returns
    -------
    pandas.DataFrame
        One row per unique bin with the columns ``hr_interval`` (the bin) and
        ``treatment_status_by_time`` (a ``uuid.uuid4()`` value, unique per row).

    Raises
    ------
    ValueError
        If ``sb_interval`` is not positive, or is below 1 and neither 0.5 nor 0.25.
    """
    if sb_interval <= 0:
        raise ValueError(f"sb_interval must be positive, got {sb_interval!r}")
    if sb_interval >= 1:
        end_of_range = 25
    elif sb_interval in (0.5, 0.25):
        # One extra step past 24 so that np.arange includes 24 itself
        end_of_range = 24 + sb_interval
    else:
        raise ValueError(f"Unsupported sb_interval {sb_interval!r}: use a value >= 1, 0.5 or 0.25")

    bins = int(24 / sb_interval)  # The number of bins by which we divide the range from 0 to 24
    window_starts = np.arange(0, end_of_range, sb_interval)

    # The bins should be closed from the left. The value 24 falls into the same bin as the
    # last window start, so it shows up as a duplicate and is dropped.
    df_mapping = pd.DataFrame(
        data={"hr_interval": list(pd.cut(window_starts, bins=bins, right=False))}
    ).drop_duplicates()

    # One fresh random identifier per bin. The identifiers are already random, so they are not shuffled again.
    interval_ids = [uuid.uuid4() for _ in range(len(df_mapping))]
    return df_mapping.assign(treatment_status_by_time=interval_ids)

In [118]:
def hr_interval_date_func_random(test_start, test_length, sb_interval):
    """Stack one hr_interval_func_random mapping per test day into a single data frame.

    Parameters
    ----------
    test_start : datetime
        Start date of the test in datetime format.
    test_length : int
        The length of the test in days.
    sb_interval : int or float
        The switchback window size, passed through to hr_interval_func_random.

    Returns
    -------
    pandas.DataFrame
        The per-day mappings concatenated under a fresh 0..n-1 index, with two added columns:
        ``sim_run`` (1 for the first day, 2 for the second, ...) and ``created_date_local``
        (``test_start`` plus the day offset).
    """
    daily_mappings = [
        hr_interval_func_random(sb_interval).assign(
            sim_run=day_offset + 1,
            created_date_local=test_start + timedelta(days=day_offset),
        )
        for day_offset in range(test_length)
    ]
    return pd.concat(daily_mappings).reset_index(drop=True)