# Deep Q-Learning for Lunar Landing

## Part 0 - Installing the required packages and importing the libraries

### Installing Gymnasium

In [None]:
!pip install gymnasium
!pip install "gymnasium[atari, accept-rom-license]"
!apt-get install -y swig
!pip install gymnasium[box2d]

Collecting gymnasium
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-1.0.0
Collecting ale-py>=0.9 (from gymnasium[accept-rom-license,atari])
  Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling colle

### Importing the libraries

In [None]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.autograd as autograd
from torch.autograd import Variable
from collections import deque, namedtuple

## Part 1 - Building the AI

### Creating the architecture of the Neural Network

In [None]:
class Network(nn.Module):
  """Fully connected Q-network mapping a state vector to one value per action.

  Layout: state_size -> 64 -> 64 -> action_size, with a ReLU after each of the
  first two linear layers and no activation on the output layer.
  """

  def __init__(self, state_size, action_size, seed=42):
    """Seed torch's RNG, then create the three linear layers.

    Args:
      state_size: number of input features of the first layer.
      action_size: number of outputs of the last layer.
      seed: value passed to torch.manual_seed before the layers are created.
    """
    super().__init__()
    # torch.manual_seed returns the generator it seeded; it is kept on the module.
    self.seed = torch.manual_seed(seed)
    hidden_units = 64
    # Layers are created in this order so the seeded initial weights are reproducible.
    self.fc1 = nn.Linear(state_size, hidden_units)
    self.fc2 = nn.Linear(hidden_units, hidden_units)
    self.fc3 = nn.Linear(hidden_units, action_size)

  def forward(self, state):
    """Return the output of the last linear layer for the given state batch."""
    hidden = F.relu(self.fc1(state))
    hidden = F.relu(self.fc2(hidden))
    return self.fc3(hidden)




## Part 2 - Training the AI

### Setting up the environment

In [None]:
import gymnasium as gym

# Create the Lunar Lander environment (version 3 of the environment id).
env = gym.make('LunarLander-v3')

# Dimensions the network and agent are built from: the observation shape,
# its first (and only) axis length, and the number of discrete actions.
state_shape = env.observation_space.shape
state_size = state_shape[0]
number_actions = env.action_space.n

print('State shape: ', state_shape)
print('State size: ', state_size)
print('Number of actions: ', number_actions)


State shape:  (8,)
State size:  8
Number of actions:  4


### Initializing the hyperparameters

In [None]:
# Hyperparameters read by the replay buffer and agent defined further down.
learning_rate = 0.0005             # step size handed to the Adam optimizer
minibatch_size = 100               # minibatch size; learning waits until the buffer holds more than this
gamma = 0.99                       # discount factor applied to the next-state value
replay_buffer_size = 100_000       # maximum number of experiences kept in the replay buffer
interpolation_parameter = 0.001    # tau: weight of the local network in each soft target update




### Implementing Experience Replay

In [None]:
class ReplayMemory(object):
  """Bounded list of experience tuples with uniform random minibatch sampling.

  Each stored event is indexed as (state, action, reward, next_state, done).
  """

  def __init__(self, capacity):
    """Create an empty buffer that keeps at most `capacity` events."""
    use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda:0" if use_cuda else "cpu")
    self.capacity = capacity
    self.memory = []

  def push(self, event):
    """Append `event`; if that exceeds the capacity, drop the oldest event."""
    self.memory.append(event)
    if len(self.memory) > self.capacity:
      self.memory.pop(0)

  def sample(self, batch_size):
    """Draw `batch_size` distinct events at random and stack them into tensors.

    Returns:
      Tuple (states, next_states, actions, rewards, dones) of tensors on
      self.device. Note the order: next_states comes second. Actions are
      int64; every other tensor is float32. `done` values are converted to
      0.0 / 1.0.
    """
    batch = [event for event in random.sample(self.memory, k = batch_size) if event is not None]

    def column(index):
      # Stack field `index` of every sampled event into one row per event.
      return np.vstack([event[index] for event in batch])

    def as_float(array):
      return torch.from_numpy(array).float().to(self.device)

    states = as_float(column(0))
    actions = torch.from_numpy(column(1)).long().to(self.device)
    rewards = as_float(column(2))
    next_states = as_float(column(3))
    # Go through uint8 first so the done flags become numeric before the float cast.
    dones = as_float(column(4).astype(np.uint8))
    return states, next_states, actions, rewards, dones

### Implementing the DQN class

In [None]:
class Agent():
  """DQN agent: local and target Q-networks, experience replay, soft target updates.

  Reads the notebook-level hyperparameters `learning_rate`, `replay_buffer_size`,
  `minibatch_size`, `gamma` and `interpolation_parameter`.
  """

  def __init__(self, state_size, action_size):
    """Build both Q-networks, the optimizer for the local one, and the replay buffer.

    Args:
      state_size: number of features in a state vector.
      action_size: number of discrete actions.
    """
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.state_size = state_size
    self.action_size = action_size
    self.local_qnetwork = Network(state_size, action_size).to(self.device)
    self.target_qnetwork = Network(state_size, action_size).to(self.device)
    # Only the local network is optimized; the target network follows it via soft_update.
    self.optimizer = optim.Adam(self.local_qnetwork.parameters(), lr = learning_rate)
    self.memory = ReplayMemory(replay_buffer_size)
    self.t_step = 0  # counts environment steps modulo 4

  def step(self, state, action, reward, next_state, done):
    """Store one transition and, on every 4th call, learn from a sampled minibatch.

    Learning is skipped until the buffer holds more than `minibatch_size`
    experiences; the sampled batch has exactly `minibatch_size` entries.
    """
    self.memory.push((state, action, reward, next_state, done))
    self.t_step = (self.t_step + 1) % 4
    if self.t_step == 0:
      if len(self.memory.memory) > minibatch_size:
        # Sample with the configured minibatch size rather than a hard-coded count,
        # so the threshold above and the batch size cannot drift apart.
        experiences = self.memory.sample(minibatch_size)
        self.learn(experiences, gamma)

  def act(self, state, epsilon = 0.):
    """Choose an action for `state` with an epsilon-greedy policy.

    Args:
      state: NumPy array holding a single state.
      epsilon: probability-like threshold for picking a random action;
        with the default 0. the highest-valued action is chosen.

    Returns:
      A NumPy integer action index.
    """
    # Add a leading batch axis before feeding the network.
    state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
    self.local_qnetwork.eval()
    with torch.no_grad():
      action_values = self.local_qnetwork(state)
    self.local_qnetwork.train()  # restore training mode for subsequent learn() calls
    if random.random() > epsilon:
      return np.argmax(action_values.cpu().numpy())
    else:
      return random.choice(np.arange(self.action_size))

  def learn(self, experiences, gamma):
    """Run one gradient step on the local network, then soft-update the target.

    Args:
      experiences: tuple (states, next_states, actions, rewards, dones) of
        tensors, in the order ReplayMemory.sample returns them.
      gamma: discount factor applied to the next-state value.
    """
    states, next_states, actions, rewards, dones = experiences
    # Targets are constants for the loss, so no gradients are tracked for them.
    with torch.no_grad():
      next_q_targets = self.target_qnetwork(next_states).max(1)[0].unsqueeze(1)
      # (1 - dones) removes the bootstrap term for transitions flagged as done.
      q_targets = rewards + gamma * next_q_targets * (1 - dones)
    q_expected = self.local_qnetwork(states).gather(1, actions)
    loss = F.mse_loss(q_expected, q_targets)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    self.soft_update(self.local_qnetwork, self.target_qnetwork, interpolation_parameter)

  def soft_update(self, local_model, target_model, interpolation_parameter):
    """Blend the local parameters into the target model in place.

    Each target parameter becomes
    interpolation_parameter * local + (1 - interpolation_parameter) * target.
    """
    with torch.no_grad():
      for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.copy_(interpolation_parameter * local_param + (1.0 - interpolation_parameter) * target_param)

### Initializing the DQN agent

In [None]:
# Build the agent from the environment's state size and number of actions.
agent = Agent(state_size, number_actions)

### Training the DQN agent

In [None]:
number_episodes = 2000  # upper bound on training episodes
maximum_number_timesteps_per_episode = 1000
epsilon_starting_value  = 1.0
epsilon_ending_value  = 0.01
epsilon_decay_value  = 0.995
epsilon = epsilon_starting_value
scores_on_100_episodes = deque(maxlen = 100)  # rolling window of the most recent episode scores

for episode in range(1, number_episodes + 1):
  state, _ = env.reset()
  score = 0
  for t in range(0, maximum_number_timesteps_per_episode):
    action = agent.act(state, epsilon)
    next_state, reward, terminated, truncated, _ = env.step(action)
    # Only true termination is stored as `done`: a truncated episode did not reach
    # a terminal state, so its last transition should still bootstrap from next_state.
    agent.step(state, action, reward, next_state, terminated)
    state = next_state
    score += reward
    # Stop on either flag; the environment must be reset once one of them is set.
    done = terminated or truncated
    if done:
      break
  scores_on_100_episodes.append(score)
  # Multiplicative decay, floored at the ending value.
  epsilon = max(epsilon_ending_value, epsilon_decay_value * epsilon)
  average_score = np.mean(scores_on_100_episodes)
  # '\r' with end="" rewrites the same console line every episode.
  print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, average_score), end = "")
  if episode % 100 == 0:
    # Every 100 episodes, print with a newline so the line is kept.
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, average_score))
  if average_score >= 200.0:
    # {:d} formats the episode count as a decimal integer.
    print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(episode, average_score))
    # Persist the weights of the local Q-network.
    torch.save(agent.local_qnetwork.state_dict(), 'checkpoint.pth')
    break

Episode 100	Average Score: -155.39
Episode 200	Average Score: -113.39
Episode 300	Average Score: -34.71
Episode 400	Average Score: -14.22
Episode 500	Average Score: 90.62
Episode 600	Average Score: 125.87
Episode 700	Average Score: 144.34
Episode 800	Average Score: 176.41
Episode 862	Average Score: 200.80
Environment solved in 762 episodes!	Average Score: 200.80


## Part 3 - Visualizing the results

In [None]:
import glob
import io
import base64
import imageio
from IPython.display import HTML, display

def show_video_of_model(agent, env_name):
    """Run one episode of `agent` in `env_name` and save the frames to 'video.mp4'.

    The episode ends when the environment reports termination or truncation,
    so a policy that never reaches a terminal state cannot loop forever.

    Args:
        agent: object whose `act(state)` returns an action exposing `.item()`.
        env_name: environment id passed to `gym.make`.
    """
    env = gym.make(env_name, render_mode='rgb_array')
    frames = []
    try:
        state, _ = env.reset()
        done = False
        while not done:
            # Capture the frame for the current state before acting on it.
            frames.append(env.render())
            action = agent.act(state)
            state, _, terminated, truncated, _ = env.step(action.item())
            done = terminated or truncated
    finally:
        # Release the environment even if rendering or stepping raised.
        env.close()
    imageio.mimsave('video.mp4', frames, fps=30)

# Record one episode of the agent on Lunar Lander into video.mp4.
show_video_of_model(agent, 'LunarLander-v3')

def show_video():
    """Embed the first '*.mp4' file of the working directory in the cell output.

    The file is read in binary read-only mode inside a context manager, so no
    write permission is needed and the handle is always closed. The bytes are
    base64-encoded into an inline HTML <video> element. Prints a message when
    no '.mp4' file is found.
    """
    mp4list = glob.glob('*.mp4')
    if not mp4list:
        print("Could not find video")
        return
    with open(mp4list[0], 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())
    display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

# Embed the recorded video in the notebook output.
show_video()

