# Epsilon Decreasing


In [1]:
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from collections import namedtuple
import random
from Arm import Arm
from runner import graph_impressions, run_round, graph_regret, display_real_ctrs

## Basics

Epsilon Greedy used a flat rate to determine how often we should explore vs exploit. If epsilon was 0.1, we would explore 10% of the time. 90% of the time we would choose the highest performing arm. The obvious drawback is if we find a clear winner in the first few pulls, we'll still be wasting 10% of our pulls on exploration. 

Epsilon Decreasing solves this by, unsurprisingly, decreasing the value of epsilon over time. So, as we become more confident that a given arm is the best one, we exploit it more and more. 

In [7]:
class EpsilonDecreasingArm(Arm):
    """Arm used by the epsilon-decreasing simulation.

    Adds no behaviour of its own: construction is forwarded unchanged to
    ``Arm``.
    """

    def __init__(self, name, expected_win_rate):
        super().__init__(name, expected_win_rate)
        

The exploit_or_explore function below describes how we choose which to do. The first time we ask, epsilon equals 1, so we are guaranteed to explore. The second time, epsilon equals 1/2, so it's a coin flip. Epsilon keeps decreasing in inverse proportion to the number of decisions made: its value is 1 divided by the number of arm pulls so far (counting the current one).

In [3]:
def exploit_or_explore():
    """Build a stateful exploit/explore decision function.

    The returned callable takes no arguments and yields ``True`` to exploit
    or ``False`` to explore. On its k-th call (counting from 1) it uses
    ``epsilon = 1 / k`` and exploits only when a fresh ``random.random()``
    draw is strictly greater than epsilon, so the very first call always
    explores.
    """
    calls_made = 0

    def _decide():
        nonlocal calls_made

        # Count this call first so that epsilon is 1 on the first decision.
        calls_made += 1
        threshold = 1 / calls_made

        return random.random() > threshold

    return _decide

def get_winning_arm(arms):
    """Return an arm whose ``win_rate`` is the highest among ``arms``.

    When several arms share the top ``win_rate``, one of them is picked
    uniformly at random with ``random.choice``.
    """
    best_rate = max(candidate.win_rate for candidate in arms)
    leaders = [candidate for candidate in arms if candidate.win_rate == best_rate]
    return random.choice(leaders)
    
def choose_arm():
    """Build an arm-selection function backed by its own decision state.

    The returned callable takes the list of arms. For each call it first
    asks a private ``exploit_or_explore()`` decider; on "exploit" it returns
    ``get_winning_arm(arms)``, otherwise a uniformly random arm.
    """
    should_exploit = exploit_or_explore()

    def _pick(arms):
        # The decider is always consulted first, before any arm is drawn.
        if not should_exploit():
            return random.choice(arms)
        return get_winning_arm(arms)

    return _pick
    

Below you can see how exploration vs exploitation changes over time. Rather quickly, exploitation comes to dominate as epsilon decreases, although epsilon never reaches zero, so an occasional exploration can still happen.

In [4]:
def example_arm_choice(num_trials):
    """Plot cumulative explore vs. exploit counts over ``num_trials`` decisions.

    A fresh decider from ``exploit_or_explore()`` is queried ``num_trials``
    times; the running totals of each outcome are drawn with
    ``sns.lineplot``.
    """
    decide = exploit_or_explore()

    exploit_flags = [decide() for _ in range(num_trials)]
    explore_flags = [not flag for flag in exploit_flags]

    running_totals = pd.DataFrame(data={
        'explore': np.cumsum(explore_flags),
        'exploit': np.cumsum(exploit_flags),
    })
    sns.lineplot(data=running_totals)
        
# Wire example_arm_choice to a manually triggered widget: num_trials comes
# from an integer slider (initial value 10, maximum 50, step 1).
widget = interact_manual(example_arm_choice, 
                     num_trials=widgets.IntSlider(step=1, value=10, max=50, continuous_update=False));
# Bare expression so the notebook echoes the object returned by interact_manual.
widget

interactive(children=(IntSlider(value=10, continuous_update=False, description='num_trials', max=50), Button(d…

<function __main__.example_arm_choice(num_trials)>

In [5]:
def run(num_events, probs):
    """Simulate epsilon-decreasing arm selection and plot the averaged results.

    ``probs`` is a comma-separated string of floats; one
    ``EpsilonDecreasingArm`` is created per value. The simulation is
    repeated ``RUN_N_TRIALS`` times, each with a fresh ``choose_arm()``
    chooser and ``num_events`` calls to ``run_round``. The per-arm 'wins'
    and 'impressions' histories are then averaged element-wise across
    trials and passed to the graphing/display helpers.

    NOTE(review): the same arm objects are reused for every trial; if
    ``Arm`` or ``run_round`` keep per-arm state, it carries over from one
    trial to the next -- confirm this is intended.
    """
    RUN_N_TRIALS = 100

    def _empty_history():
        # One blank record per arm, keyed by the arm object itself.
        return {arm: {'wins': [], 'impressions': []} for arm in arms}

    prob_list = [float(token.strip()) for token in probs.split(',')]
    arms = [EpsilonDecreasingArm(str(rate), rate) for rate in prob_list]

    trials = []
    for _ in range(RUN_N_TRIALS):
        history = _empty_history()
        arm_chooser = choose_arm()

        for _event in range(num_events):
            history = run_round(arm_chooser, history)

        trials.append(history)

    # Average each arm's history position-by-position across all trials.
    averaged = {}
    for arm in arms:
        averaged[arm] = {
            'wins': np.mean([trial[arm]['wins'] for trial in trials], axis=0),
            'impressions': np.mean([trial[arm]['impressions'] for trial in trials], axis=0),
        }

    graph_impressions(averaged)
    graph_regret(averaged)
    display_real_ctrs(averaged)

## Full Simulation

You may notice that for epsilon decreasing, regret quickly levels off. 

In [8]:
# Wire run() to a manually triggered widget: num_events comes from an integer
# slider (1 to 1000, initial value 10) and probs from a text box holding
# comma-separated floats, one per arm.
widget = interact_manual(run, 
                     num_events=widgets.IntSlider(min=1, max=1000, step=1, value=10, continuous_update=False),
                     probs=widgets.Text(
                        placeholder='Comma-sep floats',
                        description='Action Rate:',
                        value='0.05,0.075,0.1',
                        continuous_update=False
                    ));
# Bare expression so the notebook echoes the object returned by interact_manual.
widget

interactive(children=(IntSlider(value=10, continuous_update=False, description='num_events', max=1000, min=1),…

<function __main__.run(num_events, probs)>