In [1]:
import random
import subprocess
import sys
from collections import Counter

try:
    import numpy as np
except ImportError:
    # numpy is not importable in this kernel: install the pinned version with
    # the very interpreter that runs the kernel (sys.executable), then retry.
    # check_call raises CalledProcessError if pip exits with a non-zero status,
    # so a failed install is reported instead of being silently ignored.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy==1.17.4"])
    import numpy as np

### TODO: Document the code

In [2]:
# Fix the seed of both generators used by this notebook -- the stdlib
# `random` module and numpy's global generator -- so that the random draws
# are repeatable from a fresh kernel.
SEED = 10
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# Data generation
#
# Builds one synthetic sample per variation. Every observation is an integer
# from `unique_values`, drawn with the variation's probability vector (one
# probability per value, in the same order as `unique_values`). The cells
# further down treat a value of 0 as "no conversion" (they filter on `> 0`).

unique_values = [value for value in range(10)]
variations_distributions = dict(
    A=[0.6, 0.2, 0.1, 0.05, 0.05, 0, 0, 0, 0, 0],
    B=[0.65, 0.15, 0.1, 0.05, 0.05, 0, 0, 0, 0, 0],
    C=[0.7, 0.12, 0.13, 0.02, 0.03, 0, 0, 0, 0, 0],
    D=[0.7, 0.05, 0.05, 0.05, 0.05, 0.04, 0.02, 0.02, 0.01, 0.01],
)

purchases = {}
for variation_name, value_probabilities in variations_distributions.items():
    # The sample size comes from the stdlib generator, the values themselves
    # from numpy's generator -- in that order for every variation.
    n_observations = random.randint(85000, 100000)
    purchases[variation_name] = np.random.choice(
        unique_values, (n_observations,), p=value_probabilities)

In [4]:
# Binary variable
#
# Each observation is reduced to converted (value > 0) / not converted.
# For every variation the conversion rate gets a Beta posterior,
# Beta(conversions + alpha_prior, non_conversions + beta_prior), from which
# `beta_simulation_size` draws are taken. "X beats Y" is then estimated as the
# percentage of draws in which X's simulated rate is strictly greater than Y's.

alpha_prior, beta_prior = 1, 1  # Beta(1, 1) prior, i.e. uniform on [0, 1]
beta_simulation_size = 100000   # number of posterior draws per variation

simulation_data_binary = {}
for variation, variation_data in purchases.items():
    # count_nonzero counts the matches directly instead of materialising a
    # filtered copy of the sample just to take its length.
    alpha = np.count_nonzero(variation_data > 0)
    beta = len(variation_data) - alpha
    simulation = np.random.beta(alpha + alpha_prior, beta + beta_prior, size=beta_simulation_size)
    simulation_data_binary[variation] = simulation

hypothesis_results_binary = {}
for segment_control, simulation_control in simulation_data_binary.items():
    all_others = []
    for segment_test, simulation_test in simulation_data_binary.items():
        if segment_test == segment_control:
            continue
        control_beats_test = simulation_control > simulation_test
        # ndarray.sum() counts the True entries in compiled code; the builtin
        # sum() would walk all draws one by one in Python.
        control_beats_test_probability = (control_beats_test.sum() / beta_simulation_size) * 100
        hypothesis_results_binary[f'{segment_control}_beats_{segment_test}'] = control_beats_test_probability
        all_others.append(control_beats_test)
    # Rows = one comparison each, columns = draws; the control "beats all
    # others" in a draw only if it wins every pairwise comparison there.
    all_others = np.array(all_others)
    control_beats_all_others = np.all(all_others, axis=0).sum()
    control_beats_all_others_probability = (control_beats_all_others / beta_simulation_size) * 100
    hypothesis_results_binary[f'{segment_control}_beats_all_others'] = control_beats_all_others_probability

print(hypothesis_results_binary)

{'A_beats_B': 100.0, 'A_beats_C': 100.0, 'A_beats_D': 100.0, 'A_beats_all_others': 100.0, 'B_beats_A': 0.0, 'B_beats_C': 100.0, 'B_beats_D': 100.0, 'B_beats_all_others': 0.0, 'C_beats_A': 0.0, 'C_beats_B': 0.0, 'C_beats_D': 27.035999999999998, 'C_beats_all_others': 0.0, 'D_beats_A': 0.0, 'D_beats_B': 0.0, 'D_beats_C': 72.964, 'D_beats_all_others': 0.0}


In [5]:
def estimate_probability_continuous(test_name, variations_names, medians):
    """Estimate how often one variation has the highest simulated median.

    Args:
        test_name: Key in ``medians`` of the variation being evaluated.
        variations_names: Keys in ``medians`` of the variations it is
            compared against.
        medians: Mapping from variation name to a 1-D sequence of simulated
            medians. The selected sequences are stacked row-wise and compared
            position by position, so they must all have the same length.

    Returns:
        The percentage (0-100, rounded to two decimals) of positions at which
        ``test_name`` holds the maximum. Ties are credited to ``test_name``:
        it is placed in row 0 and ``np.argmax`` reports the first maximum.
    """
    # Row 0 is the evaluated variation. It is read from the `medians` argument
    # so the function does not depend on a notebook-level global.
    rows = [medians[test_name]] + [medians[variation] for variation in variations_names]
    best_row_per_position = np.argmax(np.vstack(rows), axis=0)

    # Only the share of positions won by row 0 is needed; every position has
    # exactly one argmax, so the denominator is simply the number of positions.
    share_control_is_best = (best_row_per_position == 0).sum() / best_row_per_position.size

    return round(share_control_is_best * 100, 2)

In [6]:
# Continuous variable
#
# For every variation the median of the non-zero values is bootstrapped:
# `bootstrap_simulation_size` resamples of the same size as the original
# non-zero sample are drawn, and the median of each resample is kept. Those
# medians are then resampled to `probability_sample_size` entries so that all
# variations end up with series of equal length that can be compared
# position by position.

bootstrap_simulation_size = 5000
probability_sample_size = 50000

simulation_data_continuous = {}
for variation, variation_data in purchases.items():
    non_zero_values = variation_data[variation_data > 0]
    sample_size = len(non_zero_values)

    # Collapse the sample to (distinct value, frequency) pairs so that the
    # resampling draws from a handful of values with matching probabilities.
    value_frequencies = Counter(non_zero_values)
    values_aggregated = np.array(list(value_frequencies.keys()))
    weights_aggregated = np.array(list(value_frequencies.values()))
    pvals_aggregated = weights_aggregated / weights_aggregated.sum()

    # One row per bootstrap resample. NOTE: this is a
    # (bootstrap_simulation_size x sample_size) matrix held in memory at once.
    resample_shape = (bootstrap_simulation_size, sample_size)
    draws = np.random.choice(values_aggregated, size=resample_shape,
                             replace=True, p=pvals_aggregated)
    medians = np.median(draws, axis=1, overwrite_input=True)
    simulation_data_continuous[variation] = np.random.choice(
        medians, probability_sample_size, replace=True)

hypothesis_results_continuous = {}
segment_names = list(purchases.keys())
for segment_control in segment_names:
    all_others = [segment for segment in segment_names if segment != segment_control]

    # Pairwise comparisons first, in the order the variations were defined...
    for segment_test in all_others:
        result_key = f'{segment_control}_beats_{segment_test}'
        hypothesis_results_continuous[result_key] = estimate_probability_continuous(
            segment_control, [segment_test], simulation_data_continuous)

    # ...then the control against every other variation simultaneously.
    result_key = f'{segment_control}_beats_all_others'
    hypothesis_results_continuous[result_key] = estimate_probability_continuous(
        segment_control, all_others, simulation_data_continuous)

print(hypothesis_results_continuous)

{'A_beats_B': 28.38, 'A_beats_C': 28.38, 'A_beats_D': 0.0, 'A_beats_all_others': 0.0, 'B_beats_A': 100.0, 'B_beats_C': 100.0, 'B_beats_D': 0.0, 'B_beats_all_others': 0.0, 'C_beats_A': 100.0, 'C_beats_B': 100.0, 'C_beats_D': 0.0, 'C_beats_all_others': 0.0, 'D_beats_A': 100.0, 'D_beats_B': 100.0, 'D_beats_C': 100.0, 'D_beats_all_others': 100.0}
