In [None]:
import torch as T
from torch import nn
from torch import optim
from torch.nn import functional as F
import numpy as np
import gym
import matplotlib.pyplot as plt
import random

In [None]:
# Create the CartPole environment used by the training cell further down.
env = gym.make('CartPole-v0')
# Display the observation shape; the recorded output is (4,), matching the
# input_dims tuple passed to Policy_Network when the agent is constructed.
env.observation_space.shape

(4,)

In [50]:
class Policy_Network(nn.Module):
    """REINFORCE-style policy network: a two-hidden-layer MLP with a softmax head.

    The module owns its own SGD optimizer and exposes two helpers used by the
    training loop: ``get_actions`` samples an action for a single state, and
    ``update_policy`` performs one policy-gradient step from a finished episode.

    Args:
        lr: Learning rate for the SGD optimizer.
        input_dims: Tuple whose elements are unpacked as the input size of the
            first linear layer (e.g. ``(4,)``).
        n_actions: Number of discrete actions (size of the softmax output).
        gamma: Discount factor used when computing returns.
        fc1: Width of the first hidden layer.
        fc2: Width of the second hidden layer.
    """

    def __init__(self, lr, input_dims, n_actions, gamma=0.999, fc1 = 256, fc2 = 128):
        super(Policy_Network,self).__init__()
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.gamma = gamma

        # Hidden layer widths are kept as attributes alongside the layers.
        self.fc1 = fc1
        self.fc2 = fc2

        self.fc1_l = nn.Linear(*self.input_dims,self.fc1)
        self.fc2_l = nn.Linear(self.fc1,self.fc2)
        self.final = nn.Linear(self.fc2,self.n_actions)

        self.device = 'cuda:0' if T.cuda.is_available() else 'cpu'
        # Move parameters to the target device before building the optimizer,
        # which is the order the PyTorch optimizer docs recommend.
        self.to(self.device)
        self.optimizer = optim.SGD(self.parameters(), lr = lr)

    def forward(self,X):
        """Return action probabilities for ``X`` (softmax over the last dimension)."""
        X = X.float()
        X = F.relu(self.fc1_l(X))
        X = F.relu(self.fc2_l(X))
        # Explicit dim: calling softmax without it is deprecated and ambiguous.
        X = F.softmax(self.final(X), dim=-1)
        return X

    def get_actions(self,state):
        """Sample an action from the current policy for one state.

        Args:
            state: Array-like observation accepted by ``T.tensor``.

        Returns:
            ``(action, log_prob)`` where ``action`` is the sampled action index
            and ``log_prob`` is a tensor holding the log-probability of that
            action, still attached to the autograd graph.
        """
        # Build the input on the same device as the network parameters.
        state = T.tensor(state, dtype=T.float32, device=self.device)
        probs = self.forward(state)
        # numpy() only works on CPU tensors, so copy back before sampling.
        sampling_probs = np.squeeze(probs.detach().cpu().numpy())
        action = np.random.choice(self.n_actions, p=sampling_probs)
        log_prob = T.log(probs.squeeze(0)[action])
        return action, log_prob

    def update_policy(self,log_probs,rewards):
        """Run one policy-gradient update from a single episode.

        Args:
            log_probs: Sequence of log-probability tensors, one per step, as
                returned by ``get_actions``.
            rewards: Sequence of per-step rewards for the same episode.

        Does nothing when either sequence is empty.
        """
        if len(rewards) == 0 or len(log_probs) == 0:
            return

        # Discounted return G_t = r_t + gamma * G_{t+1}, accumulated backwards
        # in a single pass instead of re-summing the tail for every t.
        discounted_rewards = []
        running_return = 0.0
        for reward in reversed(rewards):
            running_return = reward + self.gamma * running_return
            discounted_rewards.append(running_return)
        discounted_rewards.reverse()

        discounted_rewards = T.tensor(discounted_rewards, dtype=T.float32, device=self.device)
        # std() of a single element is NaN, which would poison every weight;
        # only normalise when there are at least two returns.
        if discounted_rewards.numel() > 1:
            discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9)

        # Loss is the negated sum of log-prob * return; zip pairs each step
        # with its return and stops at the shorter of the two sequences.
        policy_gradients = T.stack(
            [-log_prob * Gt for log_prob, Gt in zip(log_probs, discounted_rewards)]
        ).sum()

        self.optimizer.zero_grad()
        policy_gradients.backward()
        self.optimizer.step()

In [53]:
# Training configuration.
episodes = 10000
gamma = 0.999
# Pass gamma explicitly so that changing the value above actually changes the
# agent's discount factor instead of silently relying on the constructor default.
agent = Policy_Network(0.0001,(4,),2,gamma=gamma)

scores, eps_history = [],[]
for i in range(episodes):
    # The unpacking below expects reset() to return the observation alone and
    # step() to return a 4-tuple (observation, reward, done, info).
    state = env.reset()
    log_probs = []
    rewards = []
    done = False
    # Roll out one full episode, recording the log-prob and reward per step.
    while not done:
        action,log_prob = agent.get_actions(state)
        state_,reward,done,info = env.step(action)
        state = state_
        log_probs.append(log_prob)
        rewards.append(reward)
    # One policy-gradient update per finished episode.
    agent.update_policy(log_probs,rewards)
    episode_score = sum(rewards)
    scores.append(episode_score)
    # Running average over (at most) the last 100 episodes.
    avg_score = np.mean(scores[-100:])
    if i%200==0:
        print("episode : {} | score : {} | average score :{} | gamma : {}".format(
                                                                            i,episode_score,avg_score,agent.gamma))



episode : 0 | score : 19.0 | average score :19.0 | gamma : 0.999
episode : 200 | score : 53.0 | average score :25.92 | gamma : 0.999
episode : 400 | score : 10.0 | average score :27.08 | gamma : 0.999
episode : 600 | score : 12.0 | average score :28.79 | gamma : 0.999
episode : 800 | score : 14.0 | average score :31.83 | gamma : 0.999
episode : 1000 | score : 47.0 | average score :30.11 | gamma : 0.999
episode : 1200 | score : 28.0 | average score :29.29 | gamma : 0.999
episode : 1400 | score : 24.0 | average score :35.42 | gamma : 0.999
episode : 1600 | score : 32.0 | average score :37.94 | gamma : 0.999
episode : 1800 | score : 55.0 | average score :39.83 | gamma : 0.999
episode : 2000 | score : 17.0 | average score :44.96 | gamma : 0.999
episode : 2200 | score : 55.0 | average score :47.77 | gamma : 0.999
episode : 2400 | score : 35.0 | average score :47.72 | gamma : 0.999
episode : 2600 | score : 88.0 | average score :52.45 | gamma : 0.999
episode : 2800 | score : 20.0 | average sc