# Simulated Data Generator

#### Prerequisite
* Install VowpalWabbit (VW) by following [these instructions](https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Building).

In [None]:
# Imports
import json
import os
import shlex
import subprocess
import warnings
from collections import OrderedDict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

# Keep the star import ahead of the explicit IPython import so that
# `Markdown` and `display` always refer to the IPython versions.
from vw_offline_utilities import *
from IPython.display import Markdown, display

warnings.filterwarnings('ignore')

## Step 1. Generate a Simulated Dataset

Based on the config file, we will generate a simulated dataset and save the file for further use.

In [None]:
# Config File
config_file = r'config_data_generator.json'
# Read the JSON config through a context manager so the file handle is closed
# immediately instead of lingering until garbage collection.
with open(config_file, 'r') as config_fh:
    configs = json.load(config_fh)
configs = update_params(configs)
# Seed NumPy's global RNG from the config's `random_state` entry
np.random.seed(configs['random_state'])

In [None]:
# Build the simulated dataset from the config entries
df, context_action_stats = generate_data(**configs)

# Widen the lead of the best action by the configured margin
context_actions = summarize_dataset(df, configs, show_results=False)
df = increase_lead(
    df,
    context_actions,
    add_value=configs['increase_winning_margin'],
)

# Optionally center the rewards around zero
if configs['center']:
    df['reward'] = df['reward'].sub(df['reward'].mean())

In [None]:
# Sample count and mean reward for every (context, action) combination,
# with one column group per action
display(
    df
    .groupby([*configs['contexts'].keys(), 'action'])
    .agg({'action': 'count', 'reward': 'mean'})
    .unstack(-1)
)
context_actions = summarize_dataset(df, configs, show_results=True)

In [None]:
# Shuffle all rows reproducibly, then persist the dataset to the configured CSV path
df = df.reset_index()
df = df.sample(frac=1, random_state=configs['random_state'])
df.to_csv(configs['df_file'], index=False)

## Step 2. Transform to DSLogs and Train a VW Model

### 2.1 Data Overview

In this section, we list the contexts, actions and winning actions for each unique context.

In [None]:
# Select matplotlib's "notebook" backend for the figures created in the cells below
# (presumably so the accuracy plot can be redrawn during training - TODO confirm).
%matplotlib notebook

In [None]:
# Names of the context, action and reward columns used throughout the analysis
action_col, reward_col = 'action', 'reward'
context_cols = list(configs['contexts'].keys())
# Index columns: every context column followed by the action column
idx_cols = [*context_cols, action_col]
# All columns of interest: contexts, then action, then reward
df_cols = [*context_cols, action_col, reward_col]

In [None]:
# Reshape data for the analysis: drop incomplete rows, store the action as a
# string, then sort by and index on the context + action columns
df = (
    df.dropna()
      .assign(**{action_col: lambda frame: frame[action_col].astype(str)})
      .sort_values(idx_cols)
      .set_index(idx_cols)
)

In [None]:
# Context and action spaces as declared in the config
contexts = configs['contexts']
actions = list(map(str, configs['actions']))
# Positional index -> action name
action_mapping = dict(enumerate(actions))
display(Markdown('**Contexts**:'), dict(contexts))
display(Markdown('**Actions**:'), actions)

In [None]:
# Mean reward per context (rows) and action (columns)
df_summary = (
    df.reset_index()
      .groupby([*context_cols, action_col])[reward_col]
      .mean()
      .unstack(-1)
)
# Apply the optimal-action highlighting row by row (rewards are maximized here)
df_summary.style.apply(lambda row: highlight_optimal(row, is_minimization=False), axis=1)

### 2.2 VW Command Lines
We will specify the training parameters and commands for VowpalWabbit (VW).

In [None]:
# Training-loop parameters and VW command-line options from the config
tc = configs['model_parameters']
vwc = configs['vw_commands']

# First training pass: learn from the batch log and write a fresh model file
cmd_train_initial = (
    'vw --dsjson {data} --cb_explore_adf {explore} --cb_type {cb_type} '
    '{interactions} -l {lr} -f {model} {extra}'
).format(
    data=configs['batch_dsjson_path'],
    explore=vwc['exploration_policy'],
    cb_type=vwc['cb_type'],
    interactions=vwc['interactions'],
    lr=vwc['learning_rate'],
    model=configs['model_file'],
    extra=vwc['other_commands'],
)

# Later passes: load the existing model (-i), learn from the new batch and
# overwrite the same model file (-f)
cmd_train_continued = (
    'vw -i {model} --dsjson {data} --cb_explore_adf {explore} '
    '--cb_type {cb_type} -l {lr} -f {model} {extra}'
).format(
    data=configs['batch_dsjson_path'],
    explore=vwc['exploration_policy'],
    cb_type=vwc['cb_type'],
    lr=vwc['learning_rate'],
    model=configs['model_file'],
    extra=vwc['other_commands'],
)

# Test-only pass (-t): write predictions (-p) for the unique-context log
cmd_pred_unique_context = (
    'vw -t -i {model} --dsjson {data} -p {pred} -l {lr} {extra}'
).format(
    model=configs['model_file'],
    data=configs['context_dsjson_path'],
    pred=configs['context_pred_path'],
    lr=vwc['learning_rate'],
    extra=vwc['other_commands'],
)

### 2.3 Transform Data for VW Modeling

VW requires a special data format, DSJson, as input. We will transform our tabular data to this format. For details, please visit this [example](https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Conditional-Contextual-Bandit#example-2).

In [None]:
# Unique context: derive one row per context from the summary table, convert
# the rows to DSJson and export them to the configured path
df_contexts = get_unique_context(df_summary, action_col, reward_col, is_minimization=False)
df_contexts_json = transform_dsjson(df_contexts, context_cols, reward_col, action_col, actions, is_minimization=False)
export_dsjson(df_contexts_json, configs['context_dsjson_path'])

# DSLog preview
display(Markdown('**DSLog Preview**'))
# Parse the first DSJson line with the JSON parser rather than eval(): eval()
# fails on JSON literals such as true/false/null and would execute arbitrary code.
# NOTE(review): assumes 'output_json' holds valid JSON text - confirm against transform_dsjson.
display(json.loads(df_contexts_json['output_json'][0]))

### 2.4 Train a Model with VW 

We will train a Contextual Bandit model with VW in this section. We can monitor the accuracy of exploit actions in the meantime. The training logs will be saved in a \log subfolder.

In [None]:
# Prep plot: one accuracy slot per training iteration plus the final evaluation-only pass
df_batch_accuracy = [np.nan]*(tc['iterations']+1)
fig, ax = init_plot(tc['iterations'])


def run_vw_command(command):
    """Run a single VW command line and wait for it to finish.

    Raises subprocess.CalledProcessError when VW exits with a non-zero status,
    so a failed training or prediction step stops the loop instead of being
    silently ignored.
    """
    # Without shell=True a command *string* is only accepted on Windows; on POSIX
    # it would be taken as the program name, so split it into an argument list.
    args = command if os.name == 'nt' else shlex.split(command)
    subprocess.run(args, check=True)


# Training
trajectory_batches = []  # per-iteration batches, concatenated once after the loop
for i in tqdm(range(tc['iterations'] + 1)):
    # Select data
    df_batch, control_identifier = select_data(i, df, df_contexts, configs, action_mapping, context_cols, action_col, reward_col)
    trajectory_batches.append(df_batch)
    # Export to dsjson format
    df_batch_json = transform_dsjson(df_batch, context_cols, reward_col, action_col, actions, is_minimization=False, other_values=control_identifier)
    export_dsjson(df_batch_json, configs['batch_dsjson_path'])
    # Plot: rows whose chosen-action probability equals the largest probability in
    # 'prob_list' are treated as exploit actions and compared with df_contexts
    df_batch_exploit = df_batch.loc[df_batch['action_prob']==df_batch['prob_list'].apply(lambda x: max(x))]
    df_batch_compare = pd.merge(
        df_batch_exploit[idx_cols], df_contexts,
        how='left', left_on=context_cols, right_on=context_cols, suffixes=['_pred', '_opt'])
    df_batch_accuracy[i] = (df_batch_compare[action_col+'_pred']==df_batch_compare[action_col+'_opt']).mean()
    plt_dynamic(fig, ax, df_batch_accuracy)
    # Train model (When i=iterations, only summarize the model prediction from the last batch without updating the model)
    if i!=tc['iterations']:
        # Update model: the first pass creates the model file, later passes continue from it
        run_vw_command(cmd_train_initial if i == 0 else cmd_train_continued)
        # Predict with new model
        run_vw_command(cmd_pred_unique_context)
        # Keep all inputs by renaming them; os.replace overwrites an existing target on every platform
        new_name = configs['batch_dsjson_path'].replace('.json', '{0}.json'.format(i))
        os.replace(configs['batch_dsjson_path'], new_name)
        # Create control group
        if tc['add_control_group']:
            create_control_logs(i, df, new_name, configs, actions, context_cols, action_col, reward_col)
# DataFrame.append was removed in pandas 2.0; a single concat also avoids quadratic copying
trajectory = pd.concat(trajectory_batches)
print('Training logs are save in {0}'.format(os.path.dirname(configs['batch_dsjson_path'])))

### 2.5 Predictions and Regret

We can compare the model predictions with the optimal (ground truth) to validate that the model is taking the best actions.

We will also look at the average regret (distance from the optimal) over the training session.

In [None]:
# Load the final predictions and line them up with the optimal action per context.
# NOTE(review): this reads configs['pred_file'], while the prediction command writes
# to configs['context_pred_path'] - confirm both refer to the same file.
pred_context = load_pred_context(configs['pred_file'], df_contexts, context_cols, action_mapping)
df_compare = df_contexts.merge(
    pred_context,
    how='left',
    left_on=context_cols,
    right_on=context_cols,
)
df_compare = df_compare.rename(columns={action_col: 'optimal_action'})
compare_cols = context_cols + ['optimal_action', 'exploit_action']
df_compare = df_compare[compare_cols].astype(str)
# Style the 'exploit_action' column against the 'optimal_action' column
df_compare.style.apply(lambda col: highlight_suboptimal(col, df_compare['optimal_action'], ['exploit_action']))

The table above shows that the model predictions match optimal actions in all contexts. 

Next, we'll look at the regret by context. Regret is defined as the distance between the optimal reward and the reward from the chosen action, so once the optimal action is learned, exploit actions incur no regret. In this particular example we used _epsilon 0.2_, which means that we always randomly explore for 20% of the population, so the regret will never reach 0 but should stay at a low level.

In [None]:
# Compute regret from the collected training trajectory (rewards are maximized)
regret = get_regrets(
    trajectory,
    df_contexts,
    context_cols,
    reward_col,
    vwc['exploration_policy'],
    is_minimization=False,
)

# Plot regret grouped by context, exploration flag and iteration, using a rolling window of 10
groups = [*context_cols, 'exploration', 'n_iteration']
plot_data = plot_regrets(regret, groups, rolling_window=10)

---