# Network Training 

For the actual training, I ran '$ python RLManager.py'. Getting a model to train inside the notebook would require considerable rewriting of many of the functions we used to train the model, so this notebook is meant as a quick way to show the training process rather than a replacement for that script.

Many of the functions and classes are defined in and imported from RLManager.py, and I did my best to comment where this is the case to avoid confusion when reading the code. The important functions are: 
* optimize_model()

If you want to run an interactive mode, call '$ python Interactive.py'  
* To use a model that you have trained, rename the directory 'Output' to 'GameModel' in your file explorer  
* You may want to rename the current GameModel directory or move it to somewhere you can reuse it for later

In order to run, you will need to have
* torch
* gymnasium
* numpy
* pygame
* matplotlib

This script also needs access to the functions and classes defined in RLManager.py and MissileEnv.py

# Github Access 

To simplify running the code, you can pull from the GitHub repository I set up for the project: https://github.com/matty-cua/MissileTesting

This includes requirements.txt for quicker install with pip, and also has the final trained model

There are also other exploratory notebooks, such as PathTesting.ipynb, which I used to develop the random paths followed by the target 

In [9]:
print("Custom imports...")
from MissileEnv import MissileEnv
from RLManager import * 

# Imports 
import gymnasium as gym
import math
import random
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
from collections import namedtuple, deque 
import random 
from pathlib import Path 
import numpy as np

# Torch imports 
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

print("Finished all imports") 

Custom imports...
Finished all imports


In [10]:
# User inputs 
num_episodes = 100  # number of episodes the training loop runs
save_loc = Path('Output')  # directory that receives checkpoints, plots and stats

# Hyperparameters 
BATCH_SIZE = 64  # not referenced in this notebook; presumably the replay sample size used by optimize_model() -- TODO confirm
GAMMA = 0.99        # not referenced in this notebook; presumably the reward discount factor -- TODO confirm
EPS_START = 0.8  # epsilon reported when steps_done == 0
EPS_END = 0.05  # epsilon approached as steps_done grows
EPS_DECAY = 50000  # scale of the exponential decay: exp(-steps_done / EPS_DECAY)
TAU = 0.05  # Started at 0.005; weight of the policy net in the soft target-network update
LR = 1e-4  # learning rate passed to the AdamW optimizer
SAVE_EVERY = 15  # checkpoint cadence in episodes; also sizes the save_points window (2*SAVE_EVERY)

In [11]:
# Set up the environment 
env = MissileEnv()

# Get number of actions from gym action space
n_actions = env.action_space.n
# Reset once so an initial observation is available
state, info = env.reset()
# Number of state observations fed to the network.
# NOTE(review): hard-coded to 5 instead of the commented-out len(state) below;
# confirm this matches the observation returned by MissileEnv.
# n_observations = len(state)
n_observations = 5

In [12]:
# Prep for training 

# Create the output directory (and any parents) if it does not exist yet;
# exist_ok=True makes this a no-op when it is already there.
save_loc.mkdir(parents=True, exist_ok=True)

# Set up policies 
# NOTE(review): DQN and device are not defined in this notebook; presumably
# they come from `from RLManager import *` -- confirm.
print("Setting up policies...")
model_class = DQN  # Use dense model (opposed to DQN_RNN model (does not train w/ current setup)) 
policy_net = model_class(n_observations, n_actions).to(device)
target_net = model_class(n_observations, n_actions).to(device)
# Start the target network as an exact copy of the policy network's weights
target_net.load_state_dict(policy_net.state_dict())

# Set up the optimizer and memory (store output from env for training) 
print("Initialize optimizer and replay memory...")
optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
memory = ReplayMemory(10000)  # presumably the buffer capacity; ReplayMemory is defined outside this notebook -- TODO confirm

# training step counter + episode duration counter
save_step = 0  # episodes counted since the last checkpoint test
point_step = 0  # next write index into save_points
save_points = np.zeros(2*SAVE_EVERY)  # fixed-size window of recent episode outcomes, filled by the training loop
check_ref = 0  # reference value a checkpoint test must exceed before weights are saved
steps_done = 0  # read by the epsilon formula in the training loop; not incremented anywhere in this notebook
episode_durations = []  # steps taken per episode
average_rewards = []  # summed reward per episode
epsilons = []  # epsilon value recorded at the end of each episode
target_hit = []  # per-episode `terminated` flag

Setting up policies...
Initialize optimizer and replay memory...


In [None]:
# Main training loop: one environment rollout per episode, with one
# optimize_model() call and one soft target-network update per step.
# NOTE(review): select_action(), optimize_model(), episode_plot and use_plots
# are not defined in this notebook (presumably they come from
# `from RLManager import *`), so the first two may operate on RLManager's own
# module-level objects rather than the policy_net / memory created here --
# confirm. Likewise the epsilon logged below is computed from this notebook's
# steps_done, which no visible cell increments.
print("Beginning training loop...")
for i_episode in range(num_episodes):
    print(f"Episode: {i_episode + 1}")
    # Initialize the environment and get its state
    state, info = env.reset()
    state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

    reward_total = 0
    for t in count():
        action = select_action(state)
        observation, reward, terminated, truncated, _ = env.step(action.item())
        reward = torch.tensor([reward], device=device)
        done = terminated or truncated

        # Accumulate the episode reward (a 1-element tensor after the first add)
        reward_total += reward

        if terminated:
            # Terminal transitions are stored with next_state=None
            next_state = None
            print("    - Hit the Target!")
        else:
            next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model()

        # Soft update of the target network's weights
        # θ′ ← τ θ + (1 −τ )θ′
        target_net_state_dict = target_net.state_dict()
        policy_net_state_dict = policy_net.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
        target_net.load_state_dict(target_net_state_dict)

        if done:
            # Record the per-episode statistics before leaving the step loop
            episode_durations.append(t + 1)
            average_rewards.append(reward_total.cpu().numpy()[0])
            cep = eps_threshold = EPS_END + (EPS_START - EPS_END) * \
                math.exp(-1. * steps_done / EPS_DECAY)
            epsilons.append(cep)
            target_hit.append(terminated)
            # plot_durations()
            break

    # Try and print out rewards
    print(f"    - Duration: {episode_durations[-1]}")
    print(f"    - Reward  : {average_rewards[-1]}")
    print(f"    - Eps     : {cep}")

    # Checkpointing: save_points is a ring buffer holding the `terminated`
    # flag (target hit) of the most recent len(save_points) episodes.
    if point_step >= len(save_points):
        point_step = 0
    save_points[point_step] = terminated
    point_step += 1
    if save_step >= SAVE_EVERY:
        save_step = 0
        # Compare the recent hit rate (mean of the ring buffer contents)
        # against the best rate seen so far. The write index point_step says
        # nothing about performance, so it must not be used here.
        recent_hit_rate = np.mean(save_points)
        if recent_hit_rate > check_ref:
            check_ref = recent_hit_rate
            torch.save(target_net_state_dict, save_loc / 'CheckpointWeights.wts')
            print("======== MODEL CHECKPOINT =======")
    else:
        save_step += 1

    # Intermediate plots + stats every 50 episodes
    if (i_episode + 1) % 50 == 0:
        if use_plots:
            # Best effort: a plotting failure must not abort training
            try:
                # Plot and save (can't display it)
                for series, label, filename in (
                    (episode_durations, 'episode', 'DurationPlot.png'),
                    (average_rewards, 'reward', 'RewardPlot.png'),
                    (target_hit, 'target hit', 'TargetHits.png'),
                    (epsilons, 'epsilon', 'EpsilonPlot.png'),
                ):
                    episode_plot(series, label)
                    plt.gcf().savefig(save_loc / filename)
            except Exception as e:
                print('ERROR: Could not show plots: ')
                print(e)
        # Intermediate stats
        torch.save(
            {
                'durations': episode_durations,
                'rewards': average_rewards,
                'impacts': target_hit,
                'epsilon': epsilons,
            },
            save_loc / 'ModelStats',
        )

In [None]:
# Persist the trained target network twice: as the weights dict left behind by
# the training loop, and as the whole pickled module.
weights_file = save_loc / 'ModelWeights.wts'
torch.save(target_net_state_dict, weights_file)
module_file = save_loc / 'ModelTorch.pkl'
torch.save(target_net, module_file)

# Render one figure per tracked statistic and write it next to the weights
if use_plots:
    plot_table = (
        (episode_durations, 'episode', 'DurationPlot.png'),
        (average_rewards, 'reward', 'RewardPlot.png'),
        (target_hit, 'target hit', 'TargetHits.png'),
        (epsilons, 'epsilon', 'EpsilonPlot.png'),
    )
    for _series, _label, _filename in plot_table:
        episode_plot(_series, _label)
        plt.gcf().savefig(save_loc / _filename)

# Dump the raw per-episode statistics so they can be re-plotted later
training_stats = dict(
    durations=episode_durations,
    rewards=average_rewards,
    impacts=target_hit,
    epsilon=epsilons,
)
torch.save(training_stats, save_loc / 'ModelStats')

In [None]:
# Create plots 
def movmean(data, Nmean=50):
    """Return the centered moving average of a 1-D sequence.

    The average is edge-corrected: near the ends, each output is divided by
    the fraction of the window that actually overlaps the data, so the edges
    are not pulled toward zero.

    Args:
        data: 1-D array-like of numbers (booleans are treated as 0/1).
        Nmean: Window length in samples. Windows longer than the data are
            clamped to the data length.

    Returns:
        A float numpy array with the same length as ``data``.

    Raises:
        ValueError: If ``Nmean`` is smaller than 1.
    """
    if Nmean < 1:
        raise ValueError("Nmean must be at least 1")

    # Use the argument that was passed in (not a notebook-level global).
    data = np.asarray(data, dtype=float)
    if data.size == 0:
        # np.convolve rejects empty input; an empty average is just empty
        return data.copy()

    # With mode='same' np.convolve returns max(len(a), len(v)) samples, so a
    # window longer than the data would change the output length.
    window = min(int(Nmean), data.size)
    avg_filt = np.ones(window) / window

    # Convolving an all-ones signal gives the window coverage at each sample;
    # its reciprocal undoes the attenuation at the edges.
    flat = np.ones(data.shape)
    edge_destroyer = 1 / np.convolve(flat, avg_filt, 'same')
    out = np.convolve(data, avg_filt, 'same') * edge_destroyer
    return out

# Reload the script (for debugging and such)
# `from RLManager import *` does not bind the module name itself, so the
# module and reload() are imported explicitly here before they are used.
from importlib import reload

import RLManager

reload(RLManager)

# mod_path = Path('GameModel')  # The model to be used in interactive simulations
mod_path = Path('Output')  # The last trained model

# Load the data dictionary (torch.load() unpickles the file).
# NOTE(security): weights_only=False allows arbitrary objects to be unpickled;
# only load ModelStats files that you produced yourself.
data = mod_path / 'ModelStats'
dd = torch.load(data, weights_only=False)


def epp(s):
    """Plot the per-episode statistic stored under key ``s`` of ``dd``."""
    return RLManager.episode_plot(dd[s], s)


# Get the episode length for each target impact.
# Build an explicit boolean mask so the selection does not depend on how
# numpy interprets a plain list of flags.
impact_mask = np.asarray(dd['impacts'], dtype=bool)
target_impact_lengths = np.array(dd['durations'])[impact_mask]

# One figure per saved statistic
for stat_key in ('impacts', 'rewards', 'durations', 'epsilon'):
    epp(stat_key)
    plt.show()

# Plot episode length of each target impact
RLManager.episode_plot(target_impact_lengths, 'impact_durations')
plt.show()