## Imports

In [20]:
%run base.ipynb
%load_ext autoreload
%autoreload 2

import pickle
from pprint import pprint
import tqdm.notebook as tqdm

import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import tabulate

# import signature policy and q-learning algorithm
import qlearning_method as qlearning
from qlearning_policies import SigQFunction
import utils

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


If results and plots from this notebook should be saved, set `save_flag = True` and select an identifying prefix used for file names when saving results and plots, e.g. the current date followed by a letter or number.

In [None]:
# Toggle persisting results/figures and choose the file-name prefix used for them.
save_flag = False
file_prefix = '20240903_A'

if save_flag:
    from pathlib import Path

    # Directories in which files carrying the prefix would be written.
    output_dirs = (Path('../results'), Path('../figures'))

    # Keep asking until the prefix is unused in *both* directories. A single
    # re-prompt is not enough: the replacement prefix could collide as well
    # and would then silently overwrite earlier results.
    while any(any(directory.glob(file_prefix + '*')) for directory in output_dirs):
        file_prefix = input('Files with the chosen prefix already exist.\nPlease enter new prefix.')
    print('Results will be saved with chosen prefix:', file_prefix)
else:
    print('Results will not be saved.')

## Environment 

In [None]:
# Maximum number of steps per episode, passed to the environment as `max_episode_steps`.
steps = 200

# MountainCar environment used for all training and test runs in this notebook.
env = gym.make(
    'MountainCar-v0',
    max_episode_steps=steps,
)

## Train Q-function with different truncation orders

We want to perform training runs for different signature truncation orders $N\in\mathbb{N}.$

Specify the configuration of the Signature-Q-Function approximation and the training algorithm hyper-parameters.

In [None]:
# Signature truncation orders to compare and number of independent training runs per order.
truncation_orders = list(range(2, 9))
training_runs = 4

# Signature-Q-function configuration; `sig_depth` is supplied separately
# for each truncation order when the Q-function is instantiated.
sigq_params = {
    'in_channels': 2,
    'out_dimension': 3,
    'initial_basepoint': [-0.5, 1.],
    'initial_bias': -0.1,
}

# Hyper-parameters forwarded as keyword arguments to `qlearning.train`.
training_params = {
    'episodes': 4000,
    'discount': 0.99,
    'learning_rate': 0.005,
    'learning_rate_decay': {'mode': None},
    'epsilon': 0.3,
    'epsilon_decay': {'mode': 'linear', 'end_value': 0.05, 'epochs': 2000},
    'decay_increment': 'success',
    'batch_size': 1,
}

For each truncation order, we perform four training runs with all other hyperparameters held fixed. 

In [None]:
# Maps each truncation order to the list of results of its training runs.
train_res_dict = {}

for trunc_order in truncation_orders:
    trunc_order_results = []

    # Iterate over the progress bar itself so that it advances once per
    # finished training run (looping over a plain `range` leaves it at 0).
    pbar = tqdm.trange(
        training_runs,
        desc="Truncation order: {}  |  Training runs".format(trunc_order),
    )
    for _ in pbar:
        # Fresh Q-function for every run so that runs do not share learned weights.
        sigQfunction = SigQFunction(sig_depth=trunc_order, **sigq_params)
        run_results = qlearning.train(env, sigQfunction, **training_params)
        trunc_order_results.append(run_results)

    train_res_dict[trunc_order] = trunc_order_results

### Plot training results

#### Single run
We plot the results for a specific run, set by ``run_id``. The results of interest are the reward obtained, the total loss incurred, the car's end position, and the number of steps before reaching the goal or terminating, in each episode.

In [None]:
# Select which truncation order and which of its training runs to inspect.
trunc_order = 5
run_id = 0

selected_runs = train_res_dict[trunc_order]

# Entry 2 of a run's results is compared against 0.5 and counted as a success
# (presumably the per-episode end position - confirm against `qlearning.train`).
end_positions = np.array(selected_runs[run_id][2])
success_count = sum(end_positions >= 0.5)

print('Truncation order {} in training run {}: {} Successes'.format(
    trunc_order, run_id, success_count
))

plot_title = "Results for truncation {}, run {}".format(trunc_order, run_id)
utils.plot_results(selected_runs, run=run_id, ma_window=100, title=plot_title)

#### Averaged over all runs for specific truncation order

For a fixed truncation order, we average the results over all training runs and calculate the mean and standard deviation of
- reward per episode
- loss per episode
- end position per episode
- steps until termination per episode

and plot the average results. Additionally we create one plot for each of the statistics and save under ``../figures``.

In [None]:
# Truncation order whose runs are averaged.
trunc_order = 5

# Keep the first four result series of every run (reward, loss, end position, steps)
# and stack them along a leading "run" axis.
per_run_series = [run_results[0:4] for run_results in train_res_dict[trunc_order]]
results_array = np.array(per_run_series, ndmin=3)

# Statistics across runs (axis 0).
training_results_means = np.mean(results_array, axis=0)
training_results_stds = np.std(results_array, axis=0)

utils.plot_mean_results(
    training_results_means,
    training_results_stds,
    title='Results averaged over training runs',
)

#### First Q-values

We check whether the Q-values at the beginning of the episode for a specified truncation order converge towards the same value as the overall reward obtained over the episode. This is an indication that the algorithm learns the true optimal Q-values of the problem. Unfortunately, this is not the case here.

The first Q-values are plotted in two ways:
- The actual first Q-values saved during training for each episode and calculated for the respective observation $o_0$ encountered in each episode.
- The first Q-values for a fixed observation at $t=0$, for example $o_0 = (-0.5, 1.)$, calculated with the intermediate Q-functions saved every 10 episodes during training.

In [None]:
# choose truncation order, starting position and run_id
trunc_order = 5
start_position = -0.6
run_id = -1

sigqfunction = SigQFunction(sig_depth=trunc_order, **sigq_params)

# plot with intermediate Q-function approximations
utils.plot_first_Q_values(train_res_dict[trunc_order], run_id, intermediate_qfunctions=True,
                          window=(1000,training_params['episodes']), sigq_container=sigqfunction,
                          start_position=start_position, show=True)

#### First observation value
Additionally we take a look at the value of the history at time $t=0$ for the start position $p_0 = -0.5$, which corresponds to the middle of the interval $[-0.6, -0.4]$ in which the car is placed randomly at the start of the episode. The value is calculated as the Q-values averaged over actions and is given by
$$
    V_0(\hat{h}_0) = \frac{1}{3}\sum_{i=0}^2 Q(\hat{h}_0, a_i),   
$$
It gives the value over an episode following a greedy policy based on the current approximate Q-function $Q$. If $V_0(\hat{h}_0)$ converges towards the average reward over an episode, this indicates that the algorithm has converged to the true Q-values (at least for the Q-values at the start of the episode). Unfortunately, this is not the case here, since the algorithm overestimates the Q-values.

The plot displays the mean and standard deviation of $V_0(\hat{h}_0)$ calculated over all performed training runs. 

In [None]:
# choose truncation order
trunc_order = 5

# create array containing saved Q-values for all runs
first_observations_array = np.array(
    [train_res_dict[trunc_order][run][-3] for run in range(training_runs)]
)
value_means = first_observations_array.mean(axis=0)
values_stds = first_observations_array.std(axis=0)

utils.plot_first_obs_value(value_means, values_stds)

## Test Q-functions

We test the final Q-approximations for each truncation order and from each training run over a number of episodes and report statistics. 

Checkpoints of the `signatureQFunction` approximations were saved every 10 episodes during training and after the last training episode in each training run. The final learned approximation corresponds to the last checkpoint, given as the `state_dict` of the `SigQFunction` class instance at the checkpoint time. Other approximations from training may be tested by selecting the appropriate checkpoint.

In [None]:
# choose number of test episodes and Q-function checkpoint
test_episodes = 500
sigq_checkpoint_id = -1

test_res_dict = {}

for trunc_order in truncation_orders:
    sigqfunction = SigQFunction(sig_depth=trunc_order, **sigq_params)
    training_results = train_res_dict[trunc_order]
    test_results = []
    
    pbar = tqdm.trange(training_runs)
    for run in pbar:
        pbar.set_description("Truncation order:  {}  |  Test runs".format(trunc_order, run))
        # load last Sig-Q approximation
        sigqfunction.load_state_dict(training_results[run][-1][sigq_checkpoint_id])
        sigqfunction.eval()
        results = qlearning.test_multiple_episodes(env, sigqfunction, 
                                                   test_episodes, epsilon=0.0)
        test_results.append(results)
    
    test_res_dict[trunc_order] = test_results

### Test result statistics

We display statistics as one table for each truncation order. This output format is not the best since we want to compare performance over truncation orders, however it will do to transfer it into a different format in the thesis document.

In [None]:
# Per truncation order: one row of summary statistics for each tested training run.
test_stats_dict = {}

# One header per value appended below, in the same order. The min and max episode
# lengths each get their own column; the mean start position is not reported.
cols = ['Mean\nreward', 'Std\nreward', 'Successes\n(%)', 'Mean\nsteps', 'Std\nsteps',
        'Min\nsteps', 'Max\nsteps', 'Mean first\nobs value']
rows = ['Run {}'.format(i) for i in range(training_runs)]

for trunc_order in truncation_orders:
    trunc_order_stats = []
    for test_run in test_res_dict[trunc_order]:
        # Rows of the array: 0 reward, 1 success flag, 2 steps, 3 start position,
        # 4 first observation value - one entry per test episode.
        test_run_array = np.array(test_run[0:5])
        rewards, successes, episode_steps = test_run_array[0:3]
        first_obs_values = test_run_array[4]

        # Use the number of recorded episodes instead of the global `test_episodes`,
        # so the table is also correct for results loaded from disk.
        success_percentage = 100 * successes.sum() / successes.size

        test_run_stats = [
            rewards.mean(),             # mean reward
            rewards.std(),              # std reward
            success_percentage,         # percentage of successful episodes
            episode_steps.mean(),       # mean episode length
            episode_steps.std(),        # std episode length
            int(episode_steps.min()),   # min episode steps
            int(episode_steps.max()),   # max episode steps
            first_obs_values.mean(),    # mean first observation value
        ]
        trunc_order_stats.append(test_run_stats)

    test_stats_dict[trunc_order] = trunc_order_stats
    print(f'\n\nTest statistics for truncation order {trunc_order}:\n')
    print(tabulate.tabulate(trunc_order_stats, headers=cols, showindex=rows, floatfmt='.4f'))

### Scatterplots

To gain a more qualitative insight into the policy derived from the learned Q-function approximation, we may plot
- the number of steps vs. start positions for a specific truncation order and run,
- the observation trajectory for a specific truncation order, run and episode.

In [None]:
# Scatterplots for one tested run (-1 = last run) of the chosen truncation order.
trunc_order = 5
run_id = -1

selected_test_run = test_res_dict[trunc_order][run_id]
start_positions = selected_test_run[3]

# One figure per (result index, y-axis label): start position vs. number of steps,
# then start position vs. reward.
for series_index, y_label in ((2, "Number of steps"), (0, "Reward")):
    plt.figure(figsize=(5.5, 4.125))
    plt.scatter(start_positions, selected_test_run[series_index], marker='x')
    plt.xlabel("Start position", fontsize=11)
    plt.ylabel(y_label, fontsize=11)
    plt.xticks(fontsize=11)
    plt.yticks(fontsize=11)
    plt.tight_layout()
    # Saving is disabled; to enable it, call plt.savefig here with
    # '../figures/{}_start_pos_vs_steps.png' resp. '../figures/{}_start_pos_vs_reward.png'
    # formatted with file_prefix.
    plt.show()

## Save results

In [None]:
if save_flag:
    # Target file: results directory + chosen prefix + fixed suffix.
    file_path = '../results/' + file_prefix + '_truncation_order_results.pkl'

    # Bundle the configuration together with all training and test results.
    data_to_save = {
        'truncation_orders': truncation_orders,
        'sigq_params': sigq_params,
        'training_params': training_params,
        'train_res_dict': train_res_dict,
        'test_res_dict': test_res_dict,
    }

    # Serialize the bundle as a single pickle file.
    with open(file_path, 'wb') as f:
        pickle.dump(data_to_save, f)

    confirmation = 'Training, test results, parameter configuration saved under \n \"{}\".'
    print(confirmation.format(file_path))

## Load results

Instead of performing new training and testing runs, prior saved training and testing results, together with the parameter configuration used in training, may be loaded. Saved results can be found in `../results/` and are identified by a distinct `file_prefix`, e.g. the date the runs were performed, in the format `YYYYMMDD`, together with an upper case letter which enumerates the result saved on the same date. To load results select the respective `file_prefix` of the results to be loaded and set `execute_cell_flag = True`.

The following objects are loaded:
- `truncation_orders` - list containing the tested signature truncation orders
- `sigq_params` - dict containing the signature-Q-function parameters
- `training_params` - dict containing the training algorithm parameters
- `train_res_dict` - dict, contains training run results for each truncation order
- `test_res_dict` - dict, contains test run results for each truncation order
- `training_runs` - int, number of performed training runs per truncation order

**Note:** To display results after loading them, the respective cells above this section need to be executed. 

In [21]:
# Set to True to load previously saved results instead of training/testing anew.
execute_cell_flag = False

# Prefix identifying the saved results to load.
load_file_prefix = '20240903_A'
file_path = '../results/' + load_file_prefix + '_truncation_order_results.pkl'

if execute_cell_flag:
    # NOTE(security): unpickling can execute arbitrary code - only load result
    # files that were produced by this project.
    # The `with` block closes the file on exit; no explicit close() is needed.
    with open(file_path, 'rb') as f:
        loaded_data_dict = pickle.load(f)

    # Restore configuration and results under the names used by the cells above.
    truncation_orders = loaded_data_dict['truncation_orders']
    sigq_params = loaded_data_dict['sigq_params']
    training_params = loaded_data_dict['training_params']
    train_res_dict = loaded_data_dict['train_res_dict']
    test_res_dict = loaded_data_dict['test_res_dict']
    # The number of runs is taken from the first truncation order's result list.
    training_runs = len(next(iter(train_res_dict.values())))

    print('Training, test results, parameter configuration loaded from: \n \"{}\"\n'.format(file_path))
    print('Number of training runs: {}.\nWith parameters:\n'.format(training_runs))
    # Show the configuration only; the result dicts are too large to print.
    pprint({key: value for key, value in loaded_data_dict.items()
            if key not in ('train_res_dict', 'test_res_dict')})

Training, test results, parameter configuration loaded from: 
 "../results/20240903_A_truncation_order_results.pkl"

Number of training runs: 4.
With parameters:

{'sigq_params': {'in_channels': 2,
                 'initial_basepoint': [-0.5, 1.0],
                 'initial_bias': 0.1,
                 'out_dimension': 3},
 'training_params': {'batch_size': 1,
                     'decay_increment': 'success',
                     'discount': 0.99,
                     'episodes': 4000,
                     'epsilon': 0.3,
                     'epsilon_decay': {'end_value': 0.05,
                                       'epochs': 2000,
                                       'mode': 'linear'},
                     'learning_rate': 0.005,
                     'learning_rate_decay': {'mode': None}},
 'truncation_orders': [2, 3, 4, 5, 6, 7, 8]}
