In [None]:
import pandas as pd 
path = './'

## read in the lice data CSV
def read_in_data(filename, path):
    """Load ``<path><filename>.csv`` into a DataFrame.

    Args:
        filename: File name without the ``.csv`` extension.
        path: Prefix prepended verbatim to the file name, so a directory
            must carry its trailing separator (e.g. ``'./'``).

    Returns:
        A ``(df, header)`` tuple: the parsed DataFrame (first row used as
        the column names) and its column index.
    """
    filename_path = path + filename + '.csv'
    # pandas opens and closes the file on its own; the former surrounding
    # ``open()`` only held a second, never-used handle on the same file.
    df = pd.read_csv(filename_path, header=0)
    return df, df.columns

# Load the raw CSV, keep only the columns used by the experiments, round the
# numeric values to ``dp_round`` decimals and drop rows with missing entries.
df, header = read_in_data("lice_data", path)

dp_round = 2
lice_data = (
    df[['week',
        "sea_temp",
        "adult_femalelice",
        'active_cleanerfirsh',
        'bath_treatment',
        'feed_treatment',
        'mechanical_removal',
        'release_of_cleanerfish',
        'locality_number',
        'date',
        'year']]
    .round(dp_round)
    .dropna()
)

## Model-Free Q-Learning with 8 Possible Actions:

- 0: no treatment
- 1: bath only
- 2: feed only
- 3: mechanical only
- 4: bath + feed
- 5: bath + mechanical
- 6: feed + mechanical
- 7: bath + feed + mechanical

In [None]:
import collections
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

class QLearning:
    """Container for a tabular Q-learning model.

    Attributes are the table dimensions (``num_states``, ``num_actions``),
    the hyper-parameters (``discount``, ``alpha``) and the Q-table ``Q``.
    """

    def __init__(self, num_states, num_actions, discount, Q=None, alpha=0.5):
        self.num_states, self.num_actions = num_states, num_actions
        self.discount, self.alpha = discount, alpha
        # Reuse a caller-supplied table unless it is missing or empty;
        # otherwise start from an all-zero (num_states, num_actions) table.
        has_table = Q is not None and Q.size != 0
        self.Q = Q if has_table else np.zeros((num_states, num_actions))

def get_state_idx(lice_level, min_lice, dp_round):
    """Map a lice level onto its integer row index in the Q-table.

    Args:
        lice_level: Lice level to discretise.
        min_lice: Lice level that corresponds to index 0.
        dp_round: Number of decimals of resolution; one state per
            ``10 ** -dp_round`` step.

    Returns:
        The index as a Python ``int``.
    """
    # Round to nearest instead of truncating: binary floats make e.g.
    # 0.29 * 100 evaluate to 28.999999999999996, which int() alone would
    # floor to 28 and merge with the neighbouring state.
    return int(round((lice_level - min_lice) * (10 ** dp_round)))

def get_action(model, state_idx, eps):
    """Choose an action for ``state_idx`` with an epsilon-greedy rule.

    Args:
        model: Object exposing ``Q`` (indexable by state) and ``num_actions``.
        state_idx: Row of the Q-table to act on.
        eps: Probability of taking a uniformly random action.

    Returns:
        The selected action index.
    """
    explore = np.random.random() < eps
    if not explore:
        # Exploit: highest-valued action in this state's Q-row.
        return np.argmax(model.Q[state_idx])
    # Explore: uniform draw over every action index.
    return np.random.randint(0, model.num_actions)

def get_action_from_data(row):
    """Translate a row's treatment flags into an action index.

    Reads 'bath_treatment', 'feed_treatment' and 'mechanical_removal' from
    ``row`` and interprets each by truthiness.

    Index meaning:
        0: no treatment
        1: bath only
        2: feed only
        3: mechanical only
        4: bath + feed
        5: bath + mechanical
        6: feed + mechanical
        7: bath + feed + mechanical
    """
    combo = (
        bool(row['bath_treatment']),
        bool(row['feed_treatment']),
        bool(row['mechanical_removal']),
    )
    # Keys are (bath, feed, mechanical).
    index_by_combo = {
        (False, False, False): 0,
        (True, False, False): 1,
        (False, True, False): 2,
        (False, False, True): 3,
        (True, True, False): 4,
        (True, False, True): 5,
        (False, True, True): 6,
        (True, True, True): 7,
    }
    return index_by_combo[combo]

def get_action_cost(action):
    """Look up the cost of an action index (0-7).

    Costs in action order: none 0.0, bath 1.0, feed 1.5, mechanical 2.0,
    bath+feed 2.5, bath+mechanical 3.0, feed+mechanical 3.5, all three 4.5.
    An unknown index raises ``KeyError``.
    """
    cost_by_action = dict(enumerate((0.0, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5)))
    return cost_by_action[action]

def calculate_reward(current_lice, next_lice, action, beta1, beta2, threshold=0.5):
    """Reward for one step, combining lice penalties and treatment cost.

    Args:
        current_lice: Lice level at the current step.
        next_lice: Lice level at the following step.
        action: Action index, priced through ``get_action_cost``.
        beta1: Weight of the lice-level terms.
        beta2: Weight of the treatment cost.
        threshold: Lice level above which the extra penalty applies.

    Returns:
        Sum of the base penalty, threshold penalty, reduction bonus and
        (negative) treatment cost.
    """
    excess = max(0, current_lice - threshold)   # amount above the threshold
    drop = max(0, current_lice - next_lice)     # decrease until the next step

    reward = -beta1 * current_lice               # penalty for the lice level itself
    reward += -10 * beta1 * excess               # 10x heavier penalty above threshold
    reward += 5 * beta1 * drop                   # bonus for a falling lice level
    reward += -beta2 * get_action_cost(action)   # price of the chosen treatment
    return reward

def update(model, s, a, r, s2, model_large):
    """Apply one Q-learning update for the transition (s, a, r, s2).

    The update is skipped when a state or action index lies outside the
    table. ``model_large`` is accepted but not used.

    Returns:
        The same ``model`` object (its ``Q`` is modified in place).
    """
    states_ok = 0 <= s < model.num_states and 0 <= s2 < model.num_states
    action_ok = 0 <= a < model.num_actions
    if not (states_ok and action_ok):
        return model

    best_next = np.max(model.Q[s2, :])
    td_error = r + model.discount * best_next - model.Q[s, a]
    model.Q[s, a] += model.alpha * td_error
    return model

def simulate_episode(model, sublice_data, eps, beta1, beta2, is_training=True, dp_round=2,
                     min_lice=None):
    """Replay one locality's lice series as a single episode.

    The lice trajectory comes from the recorded data: the chosen action only
    influences the reward (through its cost) and the Q-update, never the next
    observed lice level.

    Args:
        model: QLearning-like object; its Q-table is updated when training.
        sublice_data: DataFrame with an 'adult_femalelice' column holding one
            locality's rows (assumed to be in chronological order).
        eps: Exploration probability for epsilon-greedy action selection.
        beta1: Weight of the lice-level terms in the reward.
        beta2: Weight of the treatment cost in the reward.
        is_training: If True, act epsilon-greedily and update the Q-table;
            if False, act uniformly at random and skip the update.
        dp_round: Number of decimals used to discretise lice levels.
        min_lice: Lice level mapped to state 0. Defaults to this episode's
            own minimum (the historical behaviour).

    Returns:
        Tuple ``(model, total_reward)``.
    """
    lice = sublice_data['adult_femalelice'].to_numpy()
    if min_lice is None:
        min_lice = sublice_data['adult_femalelice'].min()

    rewards = []
    # Positional (t, t+1) pairs: unlike ``.loc[i]`` this does not require the
    # frame to carry a fresh 0..n-1 index.
    for current_lice, next_lice in zip(lice[:-1], lice[1:]):
        state_idx = get_state_idx(current_lice, min_lice, dp_round)
        next_state_idx = get_state_idx(next_lice, min_lice, dp_round)

        if is_training:
            action = get_action(model, state_idx, eps)
        else:
            action = np.random.randint(0, model.num_actions)

        reward = calculate_reward(current_lice, next_lice, action, beta1, beta2)
        rewards.append(reward)

        if is_training:
            model = update(model, state_idx, action, reward, next_state_idx, True)

    return model, sum(rewards)

def run_experiment(model, epochs, locations, lice_data, eps, beta1, beta2, dp_round, num_simulations=3):
    """Run repeated simulations comparing Q-learning with a random policy.

    For every simulation the Q-table is reset to zeros; then, for each epoch,
    every locality with more than one row is replayed once in training mode
    and once with the random policy.

    Args:
        model: QLearning-like object; its ``Q`` is overwritten.
        epochs: Number of passes over all localities per simulation.
        locations: Iterable of 'locality_number' values to replay.
        lice_data: DataFrame with 'locality_number' and 'adult_femalelice'.
        eps, beta1, beta2, dp_round: Forwarded to ``simulate_episode``.
        num_simulations: Number of independent repetitions.

    Returns:
        Two arrays of shape ``(num_simulations, epochs)`` with the summed
        episode rewards of the Q-learning and the random policy.
    """
    q_learning_results = np.zeros((num_simulations, epochs))
    random_results = np.zeros((num_simulations, epochs))

    # Use one dataset-wide offset so that a Q-table row denotes the same lice
    # level at every locality; a per-locality minimum would shift the states
    # of localities whose minimum differs, although they share one table.
    global_min_lice = lice_data['adult_femalelice'].min()

    # The per-locality frames do not change between epochs: slice them once.
    episodes = []
    for location in locations:
        sublice_data = lice_data[lice_data['locality_number'] == location].reset_index(drop=True)
        if len(sublice_data) > 1:
            episodes.append(sublice_data)

    for sim in range(num_simulations):
        model.Q = np.zeros((model.num_states, model.num_actions))

        for epoch in range(epochs):
            q_learning_epoch_reward = 0
            random_epoch_reward = 0

            for sublice_data in episodes:
                model, q_reward = simulate_episode(model, sublice_data, eps, beta1, beta2,
                                                   is_training=True, dp_round=dp_round,
                                                   min_lice=global_min_lice)
                q_learning_epoch_reward += q_reward

                # random policy
                _, r_reward = simulate_episode(model, sublice_data, eps, beta1, beta2,
                                               is_training=False, dp_round=dp_round,
                                               min_lice=global_min_lice)
                random_epoch_reward += r_reward

            q_learning_results[sim, epoch] = q_learning_epoch_reward
            random_results[sim, epoch] = random_epoch_reward

    return q_learning_results, random_results

def plot_comparison(q_learning_results, random_results):
    """Plot per-epoch mean total reward, with a +/- one std band, for both policies.

    Args:
        q_learning_results: 2-D array indexed as (simulation, epoch).
        random_results: Array with the same layout for the random policy.

    Also prints the last epoch's mean and standard deviation for each policy
    and then shows the figure.
    """
    x = np.arange(q_learning_results.shape[1])

    plt.figure(figsize=(12, 8))

    # Aggregate over simulations (axis 0) for every epoch.
    stats = {}
    for key, results in (('q', q_learning_results), ('random', random_results)):
        stats[key] = (np.mean(results, axis=0), np.std(results, axis=0))
    q_mean, q_std = stats['q']
    random_mean, random_std = stats['random']

    # Mean curves first, then the shaded bands.
    for mean, fmt, label in ((q_mean, 'b-', 'Q-Learning'), (random_mean, 'r--', 'Random Policy')):
        plt.plot(x, mean, fmt, label=label, linewidth=2)
    for mean, std, colour in ((q_mean, q_std, 'b'), (random_mean, random_std, 'r')):
        plt.fill_between(x, mean - std, mean + std, color=colour, alpha=0.2)

    plt.xlabel('Epoch')
    plt.ylabel('Total Reward')
    plt.title('Q-Learning vs Random Policy: Total Rewards Over Time')
    plt.legend()
    plt.grid(True, alpha=0.3)

    print(f"\nFinal Statistics (over {len(q_learning_results)} simulations):")
    print(f"Q-Learning - Final Mean Reward: {q_mean[-1]:.2f} ± {q_std[-1]:.2f}")
    print(f"Random Policy - Final Mean Reward: {random_mean[-1]:.2f} ± {random_std[-1]:.2f}")

    plt.show()

# Size the state space from the dataset-wide lice range. Series.max()/min()
# run vectorised instead of iterating the column in Python.
max_lice = lice_data['adult_femalelice'].max()
min_lice = lice_data['adult_femalelice'].min()
dp_round = 2

max_state_idx = get_state_idx(max_lice, min_lice, dp_round)
num_states = max_state_idx + 1  # Add 1 because indices start at 0

print(f"State space size: {num_states}")
print(f"Min lice: {min_lice:.2f}, Max lice: {max_lice:.2f}")

# Parameters
epochs = 10
alpha = 0.5
discount = 0.99
beta1 = 5
beta2 = 0.5
eps = 0.1
num_simulations = 3
num_actions = 8  # one index per treatment combination (0-7)

# initialize model
model = QLearning(num_states, num_actions, discount, alpha=alpha)

# experiment: every distinct locality is replayed as its own episode
locations = set(lice_data['locality_number'])
q_learning_results, random_results = run_experiment(
    model, epochs, locations, lice_data,
    eps, beta1, beta2, dp_round,
    num_simulations=num_simulations
)

plot_comparison(q_learning_results, random_results)

## Model-Free Q-Learning with 2 Actions (no treatment, mechanical treatment)

In [None]:
import collections
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

class QLearning:
    """Container for a tabular Q-learning model.

    Attributes are the table dimensions (``num_states``, ``num_actions``),
    the hyper-parameters (``discount``, ``alpha``) and the Q-table ``Q``.
    """

    def __init__(self, num_states, num_actions, discount, Q=None, alpha=0.5):
        self.num_states, self.num_actions = num_states, num_actions
        self.discount, self.alpha = discount, alpha
        # Reuse a caller-supplied table unless it is missing or empty;
        # otherwise start from an all-zero (num_states, num_actions) table.
        has_table = Q is not None and Q.size != 0
        self.Q = Q if has_table else np.zeros((num_states, num_actions))

def get_state_idx(lice_level, min_lice, dp_round):
    """Map a lice level onto its integer row index in the Q-table.

    Args:
        lice_level: Lice level to discretise.
        min_lice: Lice level that corresponds to index 0.
        dp_round: Number of decimals of resolution; one state per
            ``10 ** -dp_round`` step.

    Returns:
        The index as a Python ``int``.
    """
    # Round to nearest instead of truncating: binary floats make e.g.
    # 0.29 * 100 evaluate to 28.999999999999996, which int() alone would
    # floor to 28 and merge with the neighbouring state.
    return int(round((lice_level - min_lice) * (10 ** dp_round)))

def get_action(model, state_idx, eps):
    """Choose an action for ``state_idx`` with an epsilon-greedy rule.

    Args:
        model: Object exposing ``Q`` (indexable by state) and ``num_actions``.
        state_idx: Row of the Q-table to act on.
        eps: Probability of taking a uniformly random action.

    Returns:
        The selected action index.
    """
    explore = np.random.random() < eps
    if not explore:
        # Exploit: highest-valued action in this state's Q-row.
        return np.argmax(model.Q[state_idx])
    # Explore: uniform draw over every action index.
    return np.random.randint(0, model.num_actions)

def update(model, s, a, r, s2, model_large):
    """Apply one Q-learning update for the transition (s, a, r, s2).

    The update is skipped when a state or action index lies outside the
    table. ``model_large`` is accepted but not used.

    Returns:
        The same ``model`` object (its ``Q`` is modified in place).
    """
    states_ok = 0 <= s < model.num_states and 0 <= s2 < model.num_states
    action_ok = 0 <= a < model.num_actions
    if not (states_ok and action_ok):
        return model

    best_next = np.max(model.Q[s2, :])
    td_error = r + model.discount * best_next - model.Q[s, a]
    model.Q[s, a] += model.alpha * td_error
    return model

def simulate_episode(model, sublice_data, eps, beta1, beta2, costs, is_training=True, dp_round=2,
                     min_lice=None):
    """Replay one locality's lice series as a single episode.

    The lice trajectory comes from the recorded data: the chosen action only
    influences the reward (through its cost) and the Q-update, never the next
    observed lice level.

    Args:
        model: QLearning-like object; its Q-table is updated when training.
        sublice_data: DataFrame with an 'adult_femalelice' column holding one
            locality's rows (assumed to be in chronological order).
        eps: Exploration probability for epsilon-greedy action selection.
        beta1: Weight of the current lice level in the reward.
        beta2: Weight of the treatment cost in the reward.
        costs: Mapping from ``bool(action)`` (treated or not) to a cost.
        is_training: If True, act epsilon-greedily and update the Q-table;
            if False, act uniformly at random and skip the update.
        dp_round: Number of decimals used to discretise lice levels.
        min_lice: Lice level mapped to state 0. Defaults to this episode's
            own minimum (the historical behaviour).

    Returns:
        Tuple ``(model, total_reward)``.
    """
    lice = sublice_data['adult_femalelice'].to_numpy()
    if min_lice is None:
        min_lice = sublice_data['adult_femalelice'].min()

    rewards = []
    # Positional (t, t+1) pairs: unlike ``.loc[i]`` this does not require the
    # frame to carry a fresh 0..n-1 index.
    for current_lice, next_lice in zip(lice[:-1], lice[1:]):
        state_idx = get_state_idx(current_lice, min_lice, dp_round)
        next_state_idx = get_state_idx(next_lice, min_lice, dp_round)

        if is_training:
            action = get_action(model, state_idx, eps)
        else:
            action = np.random.randint(0, model.num_actions)

        # Negative weighted sum of the lice level and the treatment cost.
        reward = -1 * (beta1 * current_lice + beta2 * costs[bool(action)])
        rewards.append(reward)

        if is_training:
            model = update(model, state_idx, action, reward, next_state_idx, True)

    return model, sum(rewards)

def run_experiment(model, epochs, locations, lice_data, eps, beta1, beta2, costs, dp_round, num_simulations=3):
    """Run repeated simulations comparing Q-learning with a random policy.

    For every simulation the Q-table is reset to zeros; then, for each epoch,
    every locality with more than one row is replayed once in training mode
    and once with the random policy.

    Args:
        model: QLearning-like object; its ``Q`` is overwritten.
        epochs: Number of passes over all localities per simulation.
        locations: Iterable of 'locality_number' values to replay.
        lice_data: DataFrame with 'locality_number' and 'adult_femalelice'.
        eps, beta1, beta2, costs, dp_round: Forwarded to ``simulate_episode``.
        num_simulations: Number of independent repetitions.

    Returns:
        Two arrays of shape ``(num_simulations, epochs)`` with the summed
        episode rewards of the Q-learning and the random policy.
    """
    q_learning_results = np.zeros((num_simulations, epochs))
    random_results = np.zeros((num_simulations, epochs))

    # Use one dataset-wide offset so that a Q-table row denotes the same lice
    # level at every locality; a per-locality minimum would shift the states
    # of localities whose minimum differs, although they share one table.
    global_min_lice = lice_data['adult_femalelice'].min()

    # The per-locality frames do not change between epochs: slice them once.
    episodes = []
    for location in locations:
        sublice_data = lice_data[lice_data['locality_number'] == location].reset_index(drop=True)
        if len(sublice_data) > 1:
            episodes.append(sublice_data)

    for sim in range(num_simulations):
        model.Q = np.zeros((model.num_states, model.num_actions))

        for epoch in range(epochs):
            q_learning_epoch_reward = 0
            random_epoch_reward = 0

            for sublice_data in episodes:
                model, q_reward = simulate_episode(model, sublice_data, eps, beta1, beta2, costs,
                                                   is_training=True, dp_round=dp_round,
                                                   min_lice=global_min_lice)
                q_learning_epoch_reward += q_reward

                # random policy episode
                _, r_reward = simulate_episode(model, sublice_data, eps, beta1, beta2, costs,
                                               is_training=False, dp_round=dp_round,
                                               min_lice=global_min_lice)
                random_epoch_reward += r_reward

            q_learning_results[sim, epoch] = q_learning_epoch_reward
            random_results[sim, epoch] = random_epoch_reward

    return q_learning_results, random_results

def plot_comparison(q_learning_results, random_results):
    """Plot per-epoch mean total reward, with a +/- one std band, for both policies.

    Args:
        q_learning_results: 2-D array indexed as (simulation, epoch).
        random_results: Array with the same layout for the random policy.

    Also prints the last epoch's mean and standard deviation for each policy
    and then shows the figure.
    """
    x = np.arange(q_learning_results.shape[1])

    plt.figure(figsize=(12, 8))

    # Aggregate over simulations (axis 0) for every epoch.
    stats = {}
    for key, results in (('q', q_learning_results), ('random', random_results)):
        stats[key] = (np.mean(results, axis=0), np.std(results, axis=0))
    q_mean, q_std = stats['q']
    random_mean, random_std = stats['random']

    # Mean curves first, then the shaded one-standard-deviation bands.
    for mean, fmt, label in ((q_mean, 'b-', 'Q-Learning'), (random_mean, 'r--', 'Random Policy')):
        plt.plot(x, mean, fmt, label=label, linewidth=2)
    for mean, std, colour in ((q_mean, q_std, 'b'), (random_mean, random_std, 'r')):
        plt.fill_between(x, mean - std, mean + std, color=colour, alpha=0.2)

    plt.xlabel('Epoch')
    plt.ylabel('Total Reward')
    plt.title('Q-Learning vs Random Policy: Total Rewards Over Time')
    plt.legend()
    plt.grid(True, alpha=0.3)

    print(f"\nFinal Statistics (over {len(q_learning_results)} simulations):")
    print(f"Q-Learning - Final Mean Reward: {q_mean[-1]:.2f} ± {q_std[-1]:.2f}")
    print(f"Random Policy - Final Mean Reward: {random_mean[-1]:.2f} ± {random_std[-1]:.2f}")

    plt.show()

# Size the state space from the dataset-wide lice range. Series.max()/min()
# run vectorised instead of iterating the column in Python.
max_lice = lice_data['adult_femalelice'].max()
min_lice = lice_data['adult_femalelice'].min()
dp_round = 2

max_state_idx = get_state_idx(max_lice, min_lice, dp_round)
num_states = max_state_idx + 1  # Add 1 because indices start at 0

print(f"State space size: {num_states}")
print(f"Min lice: {min_lice:.2f}, Max lice: {max_lice:.2f}")

# Parameters
epochs = 10
alpha = 0.5
discount = 0.99
beta1 = 5
beta2 = 0.5
eps = 0.1
num_simulations = 3
num_actions = 2
costs = {True: 1, False: 0}  # looked up with bool(action): any non-zero action costs 1

# initialize model
model = QLearning(num_states, num_actions, discount, alpha=alpha)

# experiment: every distinct locality is replayed as its own episode
locations = set(lice_data['locality_number'])
q_learning_results, random_results = run_experiment(
    model, epochs, locations, lice_data,
    eps, beta1, beta2, costs, dp_round,
    num_simulations=num_simulations
)

plot_comparison(q_learning_results, random_results)

## Ignore me I'm old code!

In [None]:
# Calculate the number of states from the dataset-wide lice range.
# Series.max()/min() run vectorised instead of iterating the column in Python.
max_lice = lice_data['adult_femalelice'].max()
min_lice = lice_data['adult_femalelice'].min()
dp_round = 2

# Calculate the maximum possible state index
max_state_idx = get_state_idx(max_lice, min_lice, dp_round)
num_states = max_state_idx + 1  # Add 1 because indices start at 0

print(f"State space size: {num_states}")
print(f"Min lice: {min_lice:.2f}, Max lice: {max_lice:.2f}")

In [None]:
import collections
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

class QLearning:
    """Container for a tabular Q-learning model.

    Attributes are the table dimensions (``num_states``, ``num_actions``),
    the hyper-parameters (``discount``, ``alpha``) and the Q-table ``Q``.
    """

    def __init__(self, num_states, num_actions, discount, Q=None, alpha=0.5):
        self.num_states, self.num_actions = num_states, num_actions
        self.discount, self.alpha = discount, alpha
        # Reuse a caller-supplied table unless it is missing or empty;
        # otherwise start from an all-zero (num_states, num_actions) table.
        has_table = Q is not None and Q.size != 0
        self.Q = Q if has_table else np.zeros((num_states, num_actions))

def get_state_idx(lice_level, min_lice, dp_round):
    """Convert a lice level into an integer Q-table row index.

    Args:
        lice_level: Lice level to discretise.
        min_lice: Lice level that corresponds to index 0.
        dp_round: Number of decimals of resolution.

    Returns:
        The index as a Python ``int``.
    """
    scale = 10 ** dp_round
    offset = lice_level - min_lice
    # NOTE(review): int() truncates, so float error can land one index low
    # (0.29 * 100 -> 28). Left unchanged here because this cell relies on a
    # num_states value computed in an earlier cell.
    return int(offset * scale)

def get_action(model, state_idx, eps):
    """Choose an action for ``state_idx`` with an epsilon-greedy rule.

    Args:
        model: Object exposing ``Q`` (indexable by state) and ``num_actions``.
        state_idx: Row of the Q-table to act on.
        eps: Probability of taking a uniformly random action.

    Returns:
        The selected action index.
    """
    explore = np.random.random() < eps
    if not explore:
        # Exploit: highest-valued action in this state's Q-row.
        return np.argmax(model.Q[state_idx])
    # Explore: uniform draw over every action index.
    return np.random.randint(0, model.num_actions)

def update(model, s, a, r, s2, model_large):
    """Apply one Q-learning update for the transition (s, a, r, s2).

    The update is skipped when a state or action index lies outside the
    table. ``model_large`` is accepted but not used.

    Returns:
        The same ``model`` object (its ``Q`` is modified in place).
    """
    states_ok = 0 <= s < model.num_states and 0 <= s2 < model.num_states
    action_ok = 0 <= a < model.num_actions
    if not (states_ok and action_ok):
        return model

    best_next = np.max(model.Q[s2, :])
    td_error = r + model.discount * best_next - model.Q[s, a]
    model.Q[s, a] += model.alpha * td_error
    return model

def calculate_reward(current_lice, next_lice, action, beta1, beta2, costs, threshold=0.5):
    """Reward for one step, combining lice penalties and treatment cost.

    Args:
        current_lice: Lice level at the current step.
        next_lice: Lice level at the following step.
        action: Action index; only its truthiness is used for the cost.
        beta1: Weight of the lice-level terms.
        beta2: Weight of the treatment cost.
        costs: Mapping from ``bool(action)`` to a cost.
        threshold: Lice level above which the extra penalty applies.

    Returns:
        Sum of the base penalty, threshold penalty, reduction bonus and
        (negative) treatment cost.
    """
    excess = max(0, current_lice - threshold)   # amount above the threshold
    drop = max(0, current_lice - next_lice)     # decrease until the next step

    reward = -beta1 * current_lice               # penalty for the lice level itself
    reward += -10 * beta1 * excess               # 10x heavier penalty above threshold
    reward += 5 * beta1 * drop                   # bonus for a falling lice level
    reward += -beta2 * costs[bool(action)]       # price of treating at all
    return reward

def simulate_episode(model, sublice_data, eps, beta1, beta2, costs, is_training=True, dp_round=2,
                     min_lice=None):
    """Replay one locality's lice series as a single episode.

    The lice trajectory comes from the recorded data: the chosen action only
    influences the reward (through its cost) and the Q-update, never the next
    observed lice level.

    Args:
        model: QLearning-like object; its Q-table is updated when training.
        sublice_data: DataFrame with an 'adult_femalelice' column holding one
            locality's rows (assumed to be in chronological order).
        eps: Exploration probability for epsilon-greedy action selection.
        beta1: Weight of the lice-level terms in the reward.
        beta2: Weight of the treatment cost in the reward.
        costs: Cost mapping forwarded to ``calculate_reward``.
        is_training: If True, act epsilon-greedily and update the Q-table;
            if False, act uniformly at random and skip the update.
        dp_round: Number of decimals used to discretise lice levels.
        min_lice: Lice level mapped to state 0. Defaults to this episode's
            own minimum (the historical behaviour).

    Returns:
        Tuple ``(model, total_reward)``.
    """
    lice = sublice_data['adult_femalelice'].to_numpy()
    if min_lice is None:
        min_lice = sublice_data['adult_femalelice'].min()

    rewards = []
    # Positional (t, t+1) pairs: unlike ``.loc[i]`` this does not require the
    # frame to carry a fresh 0..n-1 index.
    for current_lice, next_lice in zip(lice[:-1], lice[1:]):
        state_idx = get_state_idx(current_lice, min_lice, dp_round)
        next_state_idx = get_state_idx(next_lice, min_lice, dp_round)

        if is_training:
            action = get_action(model, state_idx, eps)
        else:
            action = np.random.randint(0, model.num_actions)

        reward = calculate_reward(current_lice, next_lice, action, beta1, beta2, costs)
        rewards.append(reward)

        if is_training:
            model = update(model, state_idx, action, reward, next_state_idx, True)

    return model, sum(rewards)

def run_experiment(model, epochs, locations, lice_data, eps, beta1, beta2, costs, dp_round, num_simulations=3):
    """Run repeated simulations comparing Q-learning with a random policy.

    For every simulation the Q-table is reset to zeros; then, for each epoch,
    every locality with more than one row is replayed once in training mode
    and once with the random policy.

    Args:
        model: QLearning-like object; its ``Q`` is overwritten.
        epochs: Number of passes over all localities per simulation.
        locations: Iterable of 'locality_number' values to replay.
        lice_data: DataFrame with 'locality_number' and 'adult_femalelice'.
        eps, beta1, beta2, costs, dp_round: Forwarded to ``simulate_episode``.
        num_simulations: Number of independent repetitions.

    Returns:
        Two arrays of shape ``(num_simulations, epochs)`` with the summed
        episode rewards of the Q-learning and the random policy.
    """
    q_learning_results = np.zeros((num_simulations, epochs))
    random_results = np.zeros((num_simulations, epochs))

    # Use one dataset-wide offset so that a Q-table row denotes the same lice
    # level at every locality; a per-locality minimum would shift the states
    # of localities whose minimum differs, although they share one table.
    global_min_lice = lice_data['adult_femalelice'].min()

    # The per-locality frames do not change between epochs: slice them once.
    episodes = []
    for location in locations:
        sublice_data = lice_data[lice_data['locality_number'] == location].reset_index(drop=True)
        if len(sublice_data) > 1:
            episodes.append(sublice_data)

    for sim in range(num_simulations):
        model.Q = np.zeros((model.num_states, model.num_actions))

        for epoch in range(epochs):
            q_learning_epoch_reward = 0
            random_epoch_reward = 0

            for sublice_data in episodes:
                model, q_reward = simulate_episode(model, sublice_data, eps, beta1, beta2, costs,
                                                   is_training=True, dp_round=dp_round,
                                                   min_lice=global_min_lice)
                q_learning_epoch_reward += q_reward

                # random-policy baseline on the same series
                _, r_reward = simulate_episode(model, sublice_data, eps, beta1, beta2, costs,
                                               is_training=False, dp_round=dp_round,
                                               min_lice=global_min_lice)
                random_epoch_reward += r_reward

            q_learning_results[sim, epoch] = q_learning_epoch_reward
            random_results[sim, epoch] = random_epoch_reward

    return q_learning_results, random_results

def plot_comparison(q_learning_results, random_results):
    """Plot per-epoch mean total reward, with a +/- one std band, for both policies.

    Args:
        q_learning_results: 2-D array indexed as (simulation, epoch).
        random_results: Array with the same layout for the random policy.

    Also prints the last epoch's mean and standard deviation for each policy
    and then shows the figure.
    """
    x = np.arange(q_learning_results.shape[1])

    plt.figure(figsize=(12, 8))

    # Aggregate over simulations (axis 0) for every epoch.
    stats = {}
    for key, results in (('q', q_learning_results), ('random', random_results)):
        stats[key] = (np.mean(results, axis=0), np.std(results, axis=0))
    q_mean, q_std = stats['q']
    random_mean, random_std = stats['random']

    # Mean curves first, then the shaded bands.
    for mean, fmt, label in ((q_mean, 'b-', 'Q-Learning'), (random_mean, 'r--', 'Random Policy')):
        plt.plot(x, mean, fmt, label=label, linewidth=2)
    for mean, std, colour in ((q_mean, q_std, 'b'), (random_mean, random_std, 'r')):
        plt.fill_between(x, mean - std, mean + std, color=colour, alpha=0.2)

    plt.xlabel('Epoch')
    plt.ylabel('Total Reward')
    plt.title('Q-Learning vs Random Policy: Total Rewards Over Time')
    plt.legend()
    plt.grid(True, alpha=0.3)

    print(f"\nFinal Statistics (over {len(q_learning_results)} simulations):")
    print(f"Q-Learning - Final Mean Reward: {q_mean[-1]:.2f} ± {q_std[-1]:.2f}")
    print(f"Random Policy - Final Mean Reward: {random_mean[-1]:.2f} ± {random_std[-1]:.2f}")

    plt.show()

# Hyper-parameters for this run.
# NOTE(review): num_states, num_actions and dp_round are not assigned in this
# cell; they are presumably left over from previously executed cells - confirm.
epochs, num_simulations = 30, 3
alpha, discount = 0.5, 0.99
beta1, beta2 = 5, 0.5
eps = 0.1
costs = {True: 1, False: 0}

# Fresh model with an all-zero Q-table.
model = QLearning(num_states, num_actions, discount, alpha=alpha)

# Replay every distinct locality and collect the per-epoch rewards.
locations = set(lice_data['locality_number'])
q_learning_results, random_results = run_experiment(
    model=model,
    epochs=epochs,
    locations=locations,
    lice_data=lice_data,
    eps=eps,
    beta1=beta1,
    beta2=beta2,
    costs=costs,
    dp_round=dp_round,
    num_simulations=num_simulations,
)

# Compare both policies.
plot_comparison(q_learning_results, random_results)

In [None]:
import collections
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

class QLearning:
    """Container for a tabular Q-learning model.

    Attributes are the table dimensions (``num_states``, ``num_actions``),
    the hyper-parameters (``discount``, ``alpha``) and the Q-table ``Q``.
    """

    def __init__(self, num_states, num_actions, discount, Q=None, alpha=0.5):
        self.num_states, self.num_actions = num_states, num_actions
        self.discount, self.alpha = discount, alpha
        # Reuse a caller-supplied table unless it is missing or empty;
        # otherwise start from an all-zero (num_states, num_actions) table.
        has_table = Q is not None and Q.size != 0
        self.Q = Q if has_table else np.zeros((num_states, num_actions))

def get_state_idx(lice_level, min_lice, dp_round):
    """Convert a lice level into an integer Q-table row index.

    Args:
        lice_level: Lice level to discretise.
        min_lice: Lice level that corresponds to index 0.
        dp_round: Number of decimals of resolution.

    Returns:
        The index as a Python ``int``.
    """
    scale = 10 ** dp_round
    offset = lice_level - min_lice
    # NOTE(review): int() truncates, so float error can land one index low
    # (0.29 * 100 -> 28). Left unchanged here because this cell relies on a
    # num_states value computed in an earlier cell.
    return int(offset * scale)

def get_action(model, state_idx, eps):
    """Choose an action for ``state_idx`` with an epsilon-greedy rule.

    Args:
        model: Object exposing ``Q`` (indexable by state) and ``num_actions``.
        state_idx: Row of the Q-table to act on.
        eps: Probability of taking a uniformly random action.

    Returns:
        The selected action index.
    """
    explore = np.random.random() < eps
    if not explore:
        # Exploit: highest-valued action in this state's Q-row.
        return np.argmax(model.Q[state_idx])
    # Explore: uniform draw over every action index.
    return np.random.randint(0, model.num_actions)

def get_action_from_data(row):
    """Translate a row's treatment flags into an action index.

    Reads 'bath_treatment', 'feed_treatment' and 'mechanical_removal' from
    ``row`` and interprets each by truthiness.

    Index meaning:
        0: no treatment
        1: bath only
        2: feed only
        3: mechanical only
        4: bath + feed
        5: bath + mechanical
        6: feed + mechanical
        7: bath + feed + mechanical
    """
    combo = (
        bool(row['bath_treatment']),
        bool(row['feed_treatment']),
        bool(row['mechanical_removal']),
    )
    # Keys are (bath, feed, mechanical).
    index_by_combo = {
        (False, False, False): 0,
        (True, False, False): 1,
        (False, True, False): 2,
        (False, False, True): 3,
        (True, True, False): 4,
        (True, False, True): 5,
        (False, True, True): 6,
        (True, True, True): 7,
    }
    return index_by_combo[combo]

def get_action_cost(action):
    """Look up the cost of an action index (0-7).

    Assuming: bath = 1, feed = 1.5, mechanical = 2 cost units; a combined
    action costs the sum of its parts. In action order: 0.0, 1.0, 1.5, 2.0,
    2.5, 3.0, 3.5, 4.5. An unknown index raises ``KeyError``.
    """
    cost_by_action = dict(enumerate((0.0, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.5)))
    return cost_by_action[action]

def update(model, s, a, r, s2, model_large):
    """Apply one Q-learning update for the transition (s, a, r, s2).

    The update is skipped when a state or action index lies outside the
    table. ``model_large`` is accepted but not used.

    Returns:
        The same ``model`` object (its ``Q`` is modified in place).
    """
    states_ok = 0 <= s < model.num_states and 0 <= s2 < model.num_states
    action_ok = 0 <= a < model.num_actions
    if not (states_ok and action_ok):
        return model

    best_next = np.max(model.Q[s2, :])
    td_error = r + model.discount * best_next - model.Q[s, a]
    model.Q[s, a] += model.alpha * td_error
    return model

def calculate_reward(current_lice, next_lice, action, beta1, beta2, threshold=0.5):
    """Reward for one step, combining lice penalties and treatment cost.

    Args:
        current_lice: Lice level at the current step.
        next_lice: Lice level at the following step.
        action: Action index, priced through ``get_action_cost``.
        beta1: Weight of the lice-level terms.
        beta2: Weight of the treatment cost.
        threshold: Lice level above which the extra penalty applies.

    Returns:
        Sum of the base penalty, threshold penalty, reduction bonus and
        (negative) treatment cost.
    """
    excess = max(0, current_lice - threshold)   # amount above the threshold
    drop = max(0, current_lice - next_lice)     # decrease until the next step

    reward = -beta1 * current_lice               # penalty for the lice level itself
    reward += -10 * beta1 * excess               # 10x heavier penalty above threshold
    reward += 5 * beta1 * drop                   # bonus for a falling lice level
    reward += -beta2 * get_action_cost(action)   # price of the chosen treatment
    return reward

def simulate_episode(model, sublice_data, eps, beta1, beta2, is_training=True, dp_round=2,
                     min_lice=None):
    """Replay one locality's lice series as a single episode.

    The lice trajectory comes from the recorded data: the chosen action only
    influences the reward (through its cost) and the Q-update, never the next
    observed lice level.

    Args:
        model: QLearning-like object; its Q-table is updated when training.
        sublice_data: DataFrame with an 'adult_femalelice' column holding one
            locality's rows (assumed to be in chronological order).
        eps: Exploration probability for epsilon-greedy action selection.
        beta1: Weight of the lice-level terms in the reward.
        beta2: Weight of the treatment cost in the reward.
        is_training: If True, act epsilon-greedily and update the Q-table;
            if False, act uniformly at random and skip the update.
        dp_round: Number of decimals used to discretise lice levels.
        min_lice: Lice level mapped to state 0. Defaults to this episode's
            own minimum (the historical behaviour).

    Returns:
        Tuple ``(model, total_reward)``.
    """
    lice = sublice_data['adult_femalelice'].to_numpy()
    if min_lice is None:
        min_lice = sublice_data['adult_femalelice'].min()

    rewards = []
    # Positional (t, t+1) pairs: unlike ``.loc[i]`` this does not require the
    # frame to carry a fresh 0..n-1 index.
    for current_lice, next_lice in zip(lice[:-1], lice[1:]):
        state_idx = get_state_idx(current_lice, min_lice, dp_round)
        next_state_idx = get_state_idx(next_lice, min_lice, dp_round)

        if is_training:
            action = get_action(model, state_idx, eps)
        else:
            action = np.random.randint(0, model.num_actions)

        reward = calculate_reward(current_lice, next_lice, action, beta1, beta2)
        rewards.append(reward)

        if is_training:
            model = update(model, state_idx, action, reward, next_state_idx, True)

    return model, sum(rewards)

def run_experiment(model, epochs, locations, lice_data, eps, beta1, beta2, dp_round, num_simulations=3):
    """Run repeated simulations comparing Q-learning with a random policy.

    For every simulation the Q-table is reset to zeros; then, for each epoch,
    every locality with more than one row is replayed once in training mode
    and once with the random policy.

    Args:
        model: QLearning-like object; its ``Q`` is overwritten.
        epochs: Number of passes over all localities per simulation.
        locations: Iterable of 'locality_number' values to replay.
        lice_data: DataFrame with 'locality_number' and 'adult_femalelice'.
        eps, beta1, beta2, dp_round: Forwarded to ``simulate_episode``.
        num_simulations: Number of independent repetitions.

    Returns:
        Two arrays of shape ``(num_simulations, epochs)`` with the summed
        episode rewards of the Q-learning and the random policy.
    """
    q_learning_results = np.zeros((num_simulations, epochs))
    random_results = np.zeros((num_simulations, epochs))

    # Use one dataset-wide offset so that a Q-table row denotes the same lice
    # level at every locality; a per-locality minimum would shift the states
    # of localities whose minimum differs, although they share one table.
    global_min_lice = lice_data['adult_femalelice'].min()

    # The per-locality frames do not change between epochs: slice them once.
    episodes = []
    for location in locations:
        sublice_data = lice_data[lice_data['locality_number'] == location].reset_index(drop=True)
        if len(sublice_data) > 1:
            episodes.append(sublice_data)

    for sim in range(num_simulations):
        model.Q = np.zeros((model.num_states, model.num_actions))

        for epoch in range(epochs):
            q_learning_epoch_reward = 0
            random_epoch_reward = 0

            for sublice_data in episodes:
                model, q_reward = simulate_episode(model, sublice_data, eps, beta1, beta2,
                                                   is_training=True, dp_round=dp_round,
                                                   min_lice=global_min_lice)
                q_learning_epoch_reward += q_reward

                # random-policy baseline on the same series
                _, r_reward = simulate_episode(model, sublice_data, eps, beta1, beta2,
                                               is_training=False, dp_round=dp_round,
                                               min_lice=global_min_lice)
                random_epoch_reward += r_reward

            q_learning_results[sim, epoch] = q_learning_epoch_reward
            random_results[sim, epoch] = random_epoch_reward

    return q_learning_results, random_results

# Hyper-parameters for this run.
# NOTE(review): num_states and dp_round are not assigned in this cell; they
# are presumably left over from previously executed cells - confirm.
epochs, num_simulations = 10, 3
alpha, discount = 0.5, 0.99
beta1, beta2 = 5, 0.5
eps = 0.1
num_actions = 8

# Fresh model with an all-zero Q-table.
model = QLearning(num_states, num_actions, discount, alpha=alpha)

# Replay every distinct locality and collect the per-epoch rewards.
locations = set(lice_data['locality_number'])
q_learning_results, random_results = run_experiment(
    model=model,
    epochs=epochs,
    locations=locations,
    lice_data=lice_data,
    eps=eps,
    beta1=beta1,
    beta2=beta2,
    dp_round=dp_round,
    num_simulations=num_simulations,
)

plot_comparison(q_learning_results, random_results)