In [1]:
import os

# Step up to the project root exactly once per kernel session. An unguarded
# os.chdir('..') climbs one directory further on every re-run of this cell,
# which silently breaks the relative 'results/' and 'figs/' paths used below.
if 'PROJECT_ROOT' not in globals():
    os.chdir('..')
    PROJECT_ROOT = os.getcwd()
print(os.getcwd())

/Users/Placebo/OMSCS/CS7641-ML/MachineLearningProjects/ReinforcementLearning


In [2]:
import numpy as np
import gymnasium as gym
from bettermdptools.algorithms.planner import Planner
from bettermdptools.utils.test_env import TestEnv
from bettermdptools.algorithms.rl import RL
from bettermdptools.utils.grid_search import GridSearch
from bettermdptools.envs.cartpole_wrapper import CartpoleWrapper

import pickle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from typing import List, Dict, Union, Optional

In [3]:
%load_ext autoreload
%autoreload 2
from src.experiments.cartpole_exp import CartPoleExperiment
from src.utils.plotting import create_param_heatmap, save_plot, create_v_iters_plot, set_plot_style

In [29]:
# Wrap CartPole-v1 in the discretising wrapper with two cart-position bins and
# display the size of the resulting discrete observation space.
env = CartpoleWrapper(gym.make('CartPole-v1'), position_bins=2)
env.observation_space.n

4600

In [32]:
# With position_bins=2 the wrapper reports 4600 states, i.e. 2300 states per
# position bin; print the state-space size for every discretisation in the grid.
STATES_PER_POSITION_BIN = 2300
for n_position_bins in (2, 5, 10, 20, 50):
    print(n_position_bins * STATES_PER_POSITION_BIN)

4600
11500
23000
46000
115000


# Value Iteration and Policy Iteration Grid Search

In [None]:
# Grid shared by value iteration and policy iteration. Only the cart-position
# discretisation is swept; velocity and angular-velocity bins are not part of
# this grid.
params = dict(
    position_bins=[2, 5, 10, 20, 50],
    gamma=[0.2, 0.4, 0.6, 0.8, 0.9, 0.99],
    theta=[0.1, .01, 0.001, 1e-6],
    n_iters=[10000],
)

cartpole_exp = CartPoleExperiment(
    env_name='cartpole_200_n10000',
    result_dir='results',
    fig_dir='figs',
    random_seed=17,
)
cartpole_exp.run_vi_pi_grid_search(
    params=params,
    test_iters=200,
    verbose=False,
    log_name='pi_vi_grid_search',
)


# Q-Learning Grid Search

In [None]:
env_name = 'cartpole_q_n10000'

# Learner hyper-parameters: alpha and epsilon each take 0.2, 0.4, ..., 1.0.
params = dict(
    gamma=[0.2, 0.4, 0.6, 0.8, 0.9, 0.99],
    init_alpha=np.round(np.linspace(0.2, 1.0, 5), 1),
    init_epsilon=np.round(np.linspace(0.2, 1.0, 5), 1),
    n_episodes=[10000],
)
# State-discretisation settings are passed separately from the learner grid.
state_params = dict(position_bins=[2, 5, 10, 20, 50])

cartpole_exp = CartPoleExperiment(
    env_name=env_name,
    result_dir='results',
    fig_dir='figs',
    random_seed=17,
)
results = cartpole_exp.run_q_learning_grid_search(
    params=params,
    state_params=state_params,
    test_iters=200,
    verbose=False,
    log_name='q_learning_grid_search',
    n_processes=None,  # per the original note: falls back to cpu_count()
)

# Persist the returned grid-search results next to the per-run result files.
with open(f'results/{env_name}/q_results.pkl', 'wb') as out_file:
    pickle.dump(results, out_file)

# SARSA Grid Search

In [None]:
env_name = 'cartpole_sarsa_n10000'

# Same learner grid as the Q-learning search: alpha and epsilon each take
# 0.2, 0.4, ..., 1.0.
params = dict(
    gamma=[0.2, 0.4, 0.6, 0.8, 0.9, 0.99],
    init_alpha=np.round(np.linspace(0.2, 1.0, 5), 1),
    init_epsilon=np.round(np.linspace(0.2, 1.0, 5), 1),
    n_episodes=[10000],
)
# State-discretisation settings are passed separately from the learner grid.
state_params = dict(position_bins=[2, 5, 10, 20, 50])

cartpole_exp = CartPoleExperiment(
    env_name=env_name,
    result_dir='results',
    fig_dir='figs',
    random_seed=17,
)
# NOTE(review): the method name is spelled "leanring" -- confirm it matches the
# definition in src.experiments.cartpole_exp before renaming either side.
results = cartpole_exp.run_sarsa_leanring_grid_search(
    params=params,
    state_params=state_params,
    test_iters=200,
    verbose=False,
    log_name='sarsa_learning_grid_search',
    n_processes=None,  # per the original note: falls back to cpu_count()
)

# Persist the returned grid-search results next to the per-run result files.
with open(f'results/{env_name}/sarsa_results.pkl', 'wb') as out_file:
    pickle.dump(results, out_file)


# Analysis of the results

In [59]:
vi_pi_folder = 'results/cartpole_200_n10000'
q_folder = 'results/cartpole_q_n10000'
sarsa_folder = 'results/cartpole_sarsa_n10000'
position_bins = [2, 5, 10, 20, 50]

# Only VI/PI runs with this convergence threshold are compared across gamma.
THETA = 1e-6

# Column accumulators for the combined results frame.
algo = []
position_bin = []
runtime = []
mean_reward = []
gamma = []


def _load_grid_search(folder, prefix, pos_bin):
    """Load the pickled grid-search result of ``prefix`` for ``pos_bin`` position bins."""
    # NOTE(security): pickle.load can execute arbitrary code; only load result
    # files that were produced by this project.
    with open(f'{folder}/{prefix}_grid_search_results_posi{pos_bin}_vel10_ang10.pkl', 'rb') as f:
        return pickle.load(f)


def _append_gamma_sweep(label, pos_bin, sweep):
    """Append the rows of ``sweep``, ordered by gamma, to the column accumulators."""
    # Sort into a new frame: sorting a filtered slice in place triggers
    # pandas' SettingWithCopyWarning.
    sweep = sweep.sort_values('gamma', ascending=True)
    mean_reward.extend(sweep['mean_reward'])
    runtime.extend(sweep['runtime'])
    gamma.extend(sweep['gamma'])
    algo.extend([label] * len(sweep))
    position_bin.extend([pos_bin] * len(sweep))


# Value iteration / policy iteration: keep the runs at the chosen theta.
for pos_bin in position_bins:
    for method in ['vi', 'pi']:
        grid = _load_grid_search(vi_pi_folder, method, pos_bin)
        runs = pd.DataFrame(grid['iteration_results'])
        _append_gamma_sweep(method, pos_bin, runs[runs['theta'] == THETA])

# Q-learning, then SARSA: keep the runs whose epsilon and alpha equal the best
# parameters recorded in the same result file, so only gamma varies.
for label, folder in [('q', q_folder), ('sarsa', sarsa_folder)]:
    for pos_bin in position_bins:
        grid = _load_grid_search(folder, label, pos_bin)
        best_results = grid['best_params']
        runs = pd.DataFrame(grid['iteration_results'])
        at_best = runs[
            (runs['init_epsilon'] == best_results['init_epsilon']) &
            (runs['init_alpha'] == best_results['init_alpha'])
        ]
        _append_gamma_sweep(label, pos_bin, at_best)

results = pd.DataFrame({
    'Algorithm': algo,
    'Position_Bin': position_bin,
    'runtime': runtime,
    'mean_reward': mean_reward,
    'gamma': gamma
})


In [70]:

# Runtime versus gamma: one colour per algorithm, one line style per
# position-bin setting.
set_plot_style(multiplier=2)
fig, ax = plt.subplots(figsize=(12, 6))

line_options = dict(
    x='gamma',
    y='runtime',
    hue='Algorithm',
    style='Position_Bin',
    markers=True,
    dashes=True,
    marker='o',
)
sns.lineplot(data=results, ax=ax, **line_options)

ax.set_xlabel('Gamma Value')
ax.set_ylabel('Runtime (seconds)')

# Anchor the legend outside the axes, then tighten the layout so it is not
# clipped.
ax.legend(bbox_to_anchor=(1.05, 1), fontsize='small')
fig.tight_layout()

fig.savefig('figs/cartpole_gamma_runtime.png', dpi=300)

In [None]:


# Mean reward versus gamma: one colour per algorithm, one line style per
# position-bin setting. The column names must match the `results` frame built
# above ('Algorithm' / 'Position_Bin'); the previous 'algo' / 'position_bin'
# do not exist in it and make seaborn raise.
plt.figure(figsize=(12, 6))
sns.lineplot(
    data=results,
    x='gamma',
    y='mean_reward',
    hue='Algorithm',
    style='Position_Bin',
    markers=True,
    dashes=True,
    marker='o'
)

plt.title('Algorithm Mean Reward vs Gamma by Position Bin', fontsize=14, pad=15)
plt.xlabel('Gamma Value')
plt.ylabel('Mean Reward')
# Place the legend outside the axes so it does not cover the lines.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [19]:
def _read_grid_search_pickle(folder: str, prefix: str, pos_bin: int):
    """Load ``{folder}/{prefix}_grid_search_results_posi{pos_bin}_vel10_ang10.pkl``."""
    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at result files produced by this project.
    with open(f'{folder}/{prefix}_grid_search_results_posi{pos_bin}_vel10_ang10.pkl', 'rb') as f:
        return pickle.load(f)


def process_rl_results(
    vi_pi_folder: str,
    q_folder: str,
    sarsa_folder: str,
    position_bins: List[int],
    param_type: str = 'gamma',
    theta: float = 1e-6
) -> pd.DataFrame:
    """
    Collect a one-parameter sweep from the pickled grid-search results of each algorithm.

    For Q-learning and SARSA, the two learner hyper-parameters that are not being
    analysed are pinned to the values stored under ``best_params`` in the same
    result file, so that only ``param_type`` varies. VI/PI results are included
    only when ``param_type`` is ``'gamma'`` and are restricted to runs whose
    ``theta`` equals the given threshold.

    Args:
        vi_pi_folder (str): Path to folder containing VI/PI results
        q_folder (str): Path to folder containing Q-learning results
        sarsa_folder (str): Path to folder containing SARSA results
        position_bins (List[int]): Position-bin settings whose result files are read
        param_type (str): Parameter to analyze ('gamma', 'init_epsilon', or 'init_alpha').
            Any other value pins gamma and init_epsilon, like 'init_alpha' does.
        theta (float): Threshold value for VI/PI results filtering

    Returns:
        pd.DataFrame: Combined results with columns
        [Algorithm, position_bins, runtime, mean_reward, <param_type>], ordered
        VI/PI (if included), then Q-learning, then SARSA; within each
        algorithm/bin the rows are sorted by ``param_type`` ascending.

    Raises:
        FileNotFoundError: If an expected result file is missing.
        KeyError: If a result file lacks a required key or column.
    """
    algo = []
    position_bin = []
    runtime = []
    mean_reward = []
    param_values = []

    def collect(label: str, pos_bin: int, rows: pd.DataFrame) -> None:
        # Sort into a new frame: sorting a filtered slice in place triggers
        # pandas' SettingWithCopyWarning.
        rows = rows.sort_values(param_type, ascending=True)
        mean_reward.extend(rows['mean_reward'])
        runtime.extend(rows['runtime'])
        param_values.extend(rows[param_type])
        algo.extend([label] * len(rows))
        position_bin.extend([pos_bin] * len(rows))

    # Process VI/PI results - only applicable for gamma analysis
    if param_type == 'gamma':
        for pos_bin in position_bins:
            for method in ['vi', 'pi']:
                grid = _read_grid_search_pickle(vi_pi_folder, method, pos_bin)
                runs = pd.DataFrame(grid['iteration_results'])
                collect(method, pos_bin, runs[runs['theta'] == theta])

    # The two learner hyper-parameters held at their best values for each sweep.
    pinned_by_sweep = {
        'gamma': ('init_epsilon', 'init_alpha'),
        'init_epsilon': ('gamma', 'init_alpha'),
    }
    first_pinned, second_pinned = pinned_by_sweep.get(param_type, ('gamma', 'init_epsilon'))

    # Process Q-learning results, then SARSA results
    for label, folder in (('q', q_folder), ('sarsa', sarsa_folder)):
        for pos_bin in position_bins:
            grid = _read_grid_search_pickle(folder, label, pos_bin)
            best_results = grid['best_params']
            runs = pd.DataFrame(grid['iteration_results'])
            mask = ((runs[first_pinned] == best_results[first_pinned]) &
                    (runs[second_pinned] == best_results[second_pinned]))
            collect(label, pos_bin, runs[mask])

    # Combine results into DataFrame
    return pd.DataFrame({
        'Algorithm': algo,
        'position_bins': position_bin,
        'runtime': runtime,
        'mean_reward': mean_reward,
        param_type: param_values
    })

In [20]:
def plot_rl_results(
    results: pd.DataFrame,
    x_metric: str,
    y_metric: str,
    style: str,
    output_path: str,
    plot_title: Optional[str] = None,
    figsize: tuple = (12, 6),
    dpi: int = 300,
    legend_fontsize: str = 'small'
) -> None:
    """
    Draw a seaborn line plot of the results, coloured by algorithm, and save it to disk.

    Args:
        results (pd.DataFrame): Results frame; must contain an 'Algorithm' column
            plus the columns named by ``x_metric``, ``y_metric`` and ``style``
        x_metric (str): Column plotted on the x-axis
        y_metric (str): Column plotted on the y-axis (e.g. 'runtime' or 'mean_reward')
        style (str): Column that selects the line style / marker of each line
        output_path (str): File the figure is written to
        plot_title (str, optional): Title for the plot; omitted when falsy
        figsize (tuple): Figure size (width, height)
        dpi (int): Resolution of the saved figure
        legend_fontsize (str): Font size for the legend
    """
    # Human-readable axis labels; unknown column names are shown verbatim.
    x_labels = {
        'gamma': 'Gamma Value',
        'init_epsilon': 'Initial Epsilon Value',
        'init_alpha': 'Initial Alpha Value',
        'position_bins': 'Position Bins'
    }
    y_labels = {
        'runtime': 'Runtime (seconds)',
        'mean_reward': 'Mean Reward'
    }

    # Apply the project plotting style before the figure is created.
    set_plot_style(multiplier=2)
    fig, ax = plt.subplots(figsize=figsize)

    sns.lineplot(
        data=results,
        x=x_metric,
        y=y_metric,
        hue='Algorithm',
        style=style,
        markers=True,
        dashes=True,
        marker='o',
        markersize=10,
        linewidth=4,
        ax=ax
    )

    ax.set_xlabel(x_labels.get(x_metric, x_metric))
    ax.set_ylabel(y_labels.get(y_metric, y_metric))
    if plot_title:
        ax.set_title(plot_title)

    # Anchor the legend outside the axes, then tighten the layout around it.
    ax.legend(bbox_to_anchor=(1.05, 1), fontsize=legend_fontsize)
    fig.tight_layout()

    fig.savefig(output_path, dpi=dpi, bbox_inches='tight')

    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [21]:
# All three sweeps read the same result folders and discretisations; only the
# analysed hyper-parameter differs.
sweep_inputs = dict(
    vi_pi_folder='results/cartpole_200_n10000',
    q_folder='results/cartpole_q_n10000',
    sarsa_folder='results/cartpole_sarsa_n10000',
    position_bins=[2, 5, 10, 20, 50],
)
results_gamma = process_rl_results(param_type='gamma', **sweep_inputs)
results_epsilon = process_rl_results(param_type='init_epsilon', **sweep_inputs)
results_alpha = process_rl_results(param_type='init_alpha', **sweep_inputs)

In [22]:

# Gamma sweep: one figure per y-axis metric, with one line style per
# position-bin setting.
gamma_figures = [
    ('runtime', 'figs/cartpole_gamma_runtime.png'),
    ('mean_reward', 'figs/cartpole_gamma_mean_reward.png'),
]
for gamma_y_metric, gamma_figure_path in gamma_figures:
    plot_rl_results(
        results=results_gamma,
        x_metric='gamma',
        y_metric=gamma_y_metric,
        style='position_bins',
        output_path=gamma_figure_path,
    )


In [None]:

# Epsilon sweep: mean reward against the initial epsilon, with one line style
# per position-bin setting.
plot_rl_results(
    results_epsilon,
    x_metric='init_epsilon',
    y_metric='mean_reward',
    style='position_bins',
    output_path='figs/cartpole_epsilon_mean_reward.png',
)

In [None]:

# Alpha sweep: mean reward against the initial alpha, with one line style per
# position-bin setting. `x_metric` is a required argument of plot_rl_results
# (omitting it raises TypeError), and `style` follows the gamma/epsilon plots.
plot_rl_results(
    results=results_alpha,
    x_metric='init_alpha',
    style='position_bins',
    y_metric='mean_reward',
    output_path='figs/cartpole_alpha_mean_reward.png'
)


In [15]:
import pandas as pd
import pickle
from typing import List

def process_episode_metric(
    q_folder: str,
    sarsa_folder: str,
    position_bins: List[int],
    metric: str = 'max_value',
    alpha: float = 0.8,
    epsilon: float = 0.8,
    gamma: float = 0.99
) -> pd.DataFrame:
    """
    Collect a per-episode metric of one Q-learning and one SARSA run per position-bin setting.

    For every entry of ``position_bins`` the matching grid-search pickle is read,
    the first run whose ``gamma``, ``init_epsilon`` and ``init_alpha`` equal the
    given values is selected, and its ``metric`` sequence is flattened into one
    row per episode (episodes are numbered from 1).

    Args:
        q_folder (str): Path to folder containing Q-learning results
        sarsa_folder (str): Path to folder containing SARSA results
        position_bins (List[int]): Position-bin settings whose result files are read
        metric (str): Column of ``iteration_results`` holding the per-episode sequence.
            NOTE(review): the default 'max_value' differs from the plural names
            ('max_values', 'mean_values') used by this notebook's calls -- confirm
            that such a column exists.
        alpha (float): ``init_alpha`` of the run to select
        epsilon (float): ``init_epsilon`` of the run to select
        gamma (float): ``gamma`` of the run to select

    Returns:
        pd.DataFrame: Columns [Algorithm, position_bins, episode, <metric>], with
        all Q-learning rows first, followed by all SARSA rows.

    Raises:
        FileNotFoundError: If an expected result file is missing.
        KeyError: If ``metric`` or a hyper-parameter column is absent.
        IndexError: If no run matches the requested hyper-parameters.
    """
    algo = []
    position_bin = []
    metric_values = []
    episode = []

    # Process Q-learning results, then SARSA results
    for label, folder in (('q', q_folder), ('sarsa', sarsa_folder)):
        for pos_bin in position_bins:
            path = f'{folder}/{label}_grid_search_results_posi{pos_bin}_vel10_ang10.pkl'
            # NOTE(security): pickle.load can execute arbitrary code; only load
            # result files produced by this project.
            with open(path, 'rb') as f:
                grid = pickle.load(f)
            runs = pd.DataFrame(grid['iteration_results'])

            mask = ((runs['gamma'] == gamma) &
                    (runs['init_epsilon'] == epsilon) &
                    (runs['init_alpha'] == alpha))
            selected = runs[mask][metric]
            if selected.empty:
                # Same exception type as the bare `.values[0]` lookup would
                # raise, but with enough context to see which run is missing.
                raise IndexError(
                    f'No {label} run with gamma={gamma}, init_epsilon={epsilon}, '
                    f'init_alpha={alpha} for position_bins={pos_bin} in {path}'
                )
            per_episode = selected.values[0]

            metric_values.extend(per_episode)
            episode.extend(range(1, len(per_episode) + 1))
            algo.extend([label] * len(per_episode))
            position_bin.extend([pos_bin] * len(per_episode))

    # Combine results into DataFrame
    return pd.DataFrame({
        'Algorithm': algo,
        'position_bins': position_bin,
        'episode': episode,
        metric: metric_values
    })


def plot_episode_results(
    results: pd.DataFrame,
    metric: str,
    output_path: str,
    plot_title: Optional[str] = None,
    figsize: tuple = (12, 6),
    dpi: int = 300,
    legend_fontsize: str = 'small',
    n_points: int = 100  # Number of points to show
) -> None:
    """
    Plot a per-episode metric as a thinned-out line chart and save it to disk.

    Args:
        results (pd.DataFrame): Episode results; must contain the columns
            'episode', 'Algorithm', 'position_bins' and ``metric``
        metric (str): Column plotted on the y-axis; also used to build the y label
        output_path (str): File the figure is written to
        plot_title (str, optional): Title for the plot; omitted when falsy
        figsize (tuple): Figure size (width, height)
        dpi (int): Resolution of the saved figure
        legend_fontsize (str): Font size for the legend text and legend title
        n_points (int): Approximate number of episodes kept per line
    """
    # Apply the project plotting style before the figure is created.
    set_plot_style(multiplier=2)
    fig, ax = plt.subplots(figsize=figsize)

    # Keep only every `stride`-th episode so that roughly n_points remain.
    last_episode = results['episode'].max()
    stride = max(1, last_episode // n_points)
    keep = results['episode'] % stride == 0
    thinned = results[keep].copy()

    sns.lineplot(
        data=thinned,
        x='episode',
        y=metric,
        hue='Algorithm',
        style='position_bins',
        markers=True,
        dashes=True,
        marker='o',
        markersize=10,
        linewidth=4,
        alpha=0.8,  # Slight transparency
        ax=ax
    )

    ax.set_xlabel('Episode')
    ax.set_ylabel(metric.replace('_', ' ').title())
    if plot_title:
        ax.set_title(plot_title)

    # Show episode ticks as integers with thousands separators.
    ax.xaxis.set_major_formatter(
        plt.FuncFormatter(lambda tick, _pos: f'{int(tick):,}')
    )

    # Anchor the legend outside the axes.
    ax.legend(
        bbox_to_anchor=(1.05, 1),
        fontsize=legend_fontsize,
        title_fontsize=legend_fontsize
    )

    # Light dashed grid for readability.
    ax.grid(True, linestyle='--', alpha=0.3)

    fig.tight_layout()
    fig.savefig(output_path, dpi=dpi, bbox_inches='tight')

    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [23]:
# Both frames describe the same runs (alpha=0.6, epsilon=0.8, gamma=0.99 at
# every position-bin setting); only the per-episode metric differs.
episode_inputs = dict(
    q_folder='results/cartpole_q_n10000',
    sarsa_folder='results/cartpole_sarsa_n10000',
    position_bins=[2, 5, 10, 20, 50],
    alpha=0.6,
    epsilon=0.8,
    gamma=0.99,
)
results_max = process_episode_metric(metric='max_values', **episode_inputs)
results_mean = process_episode_metric(metric='mean_values', **episode_inputs)

In [24]:
# Per-episode 'max_values' curves, thinned to roughly 20 points per line and
# saved without a title.
plot_episode_results(
    results_max,
    metric='max_values',
    n_points=20,
    plot_title=None,
    output_path='figs/episode_max_values_results.png',
)

In [25]:
# Per-episode 'mean_values' curves, thinned to roughly 20 points per line and
# saved without a title.
plot_episode_results(
    results_mean,
    metric='mean_values',
    n_points=20,
    plot_title=None,
    output_path='figs/episode_mean_values_results.png',
)