In [None]:
'''
Code adapted from
https://github.com/berkeleydeeprlcourse/homework/blob/master/hw3/dqn.py
'''
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import numpy as np
import random
import gym
import math
import matplotlib.pyplot as plt
from gym.wrappers.monitor import Monitor
import time
import sys
import random
from collections import deque

In [None]:
class DQNAgent:
    """Deep Q-Network agent with a replay memory and a separate target network.

    Two networks are built by :meth:`DQNNet`: ``model`` is the one that is
    fitted and used to pick actions, while ``target_model`` supplies the
    bootstrap values used in :meth:`train` and is refreshed from ``model``
    by :meth:`updateTarget`.
    """

    def __init__(self, state_size, action_size):
        """Create the agent.

        Args:
            state_size: Number of inputs of the Q-network (length of one state).
            action_size: Number of outputs of the Q-network (one per action).
        """
        self.state_size = state_size
        self.action_size = action_size

        # Hyper-parameters.
        self.gamma = 0.99            # discount applied to the bootstrap value
        self.epsilon = 1.0           # current probability of a random action
        self.epsilon_min = 0.01      # epsilon stops decaying at or below this
        self.epsilon_decay = 0.999   # multiplicative decay, applied in remember()
        self.learning_rate = 0.001
        self.batch_size = 64         # upper bound on the minibatch size
        self.train_start = 1000      # minimum stored transitions before training
        self.render = False          # read by the driver loop, not by the agent

        # Replay memory: bounded deque, oldest transitions fall off the front.
        self.memory = deque([], 2000)
        self.score_history = []
        self.model = self.DQNNet()
        self.target_model = self.DQNNet()

        # Start with both networks holding identical weights.
        self.updateTarget()

    def DQNNet(self):
        """Build and compile the Q-network.

        Returns:
            A compiled Keras ``Sequential`` model with two hidden ReLU layers
            of 24 units and a linear output of ``action_size`` units, trained
            with MSE loss and the Adam optimizer.
        """
        model = Sequential()

        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))

        return model

    def updateTarget(self):
        """Copy the weights of ``model`` into ``target_model``."""
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Store one transition and decay epsilon by one step.

        Args:
            state: State the action was taken in.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: State reached after the action.
            done: Whether the transition ended the episode.
        """
        self.memory.append((state, action, reward, next_state, done))
        # Exploration decays once per stored transition, not per episode.
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def getAction(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Input passed unchanged to ``self.model.predict``; the first
                row of the prediction is used.

        Returns:
            A uniformly random action index with probability ``epsilon``,
            otherwise the argmax of the predicted action values.
        """
        r = np.random.uniform(low=0, high=1)

        if r <= self.epsilon:
            action = random.randrange(self.action_size)
        else:
            actionValues = self.model.predict(state)
            action = np.argmax(actionValues[0])
        return action

    def train(self):
        """Fit ``model`` on one minibatch sampled from the replay memory.

        Does nothing until at least ``train_start`` transitions are stored.
        The minibatch holds ``min(batch_size, len(memory))`` transitions, and
        that sampled size (not ``self.batch_size``) drives every loop below, so
        a memory smaller than ``batch_size`` no longer indexes past the sample.
        """
        if len(self.memory) < self.train_start:
            return

        batch_size = min(self.batch_size, len(self.memory))
        if batch_size == 0:
            # Only reachable when train_start <= 0 and nothing is stored yet.
            return
        mb = random.sample(self.memory, batch_size)

        new = np.zeros((batch_size, self.state_size))
        newTarget = np.zeros((batch_size, self.state_size))
        action = []
        reward = []
        done = []

        # Unpack the sampled transitions into per-field containers.
        for i, (s, a, r, s_next, d) in enumerate(mb):
            new[i] = s
            action.append(a)
            reward.append(r)
            newTarget[i] = s_next
            done.append(d)

        # Current estimates serve as the regression target; only the entry of
        # the action actually taken is overwritten below.
        target = self.model.predict(new)
        target_val = self.target_model.predict(newTarget)

        for i in range(batch_size):
            if done[i]:
                # Terminal transition: no bootstrap term.
                target[i][action[i]] = reward[i]
            else:
                target[i][action[i]] = reward[i] + self.gamma * (np.amax(target_val[i]))

        self.model.fit(new, target, batch_size=batch_size, epochs=1, verbose=0)


In [18]:
# Number of training episodes the driver loop will run.
numEpisodes = 200

# Build the CartPole environment and size the agent from its spaces:
# one network input per observation component, one output per discrete action.
env = gym.make('CartPole-v1')
action_size = env.action_space.n
state_size = env.observation_space.shape[0]
agent = DQNAgent(state_size, action_size)

In [19]:
# Per-episode results filled in by the training loop: the logged score and the
# index of the episode it belongs to (two separate, initially empty lists).
scores, episodes = [], []

In [20]:
for e in range(numEpisodes):
    # Fresh episode; states are handled as (1, state_size) row vectors.
    score = 0
    done = False
    state = np.reshape(env.reset(), [1, state_size])

    while not done:
        if agent.render:
            env.render()

        action = agent.getAction(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])

        # Penalise the step that ends the episode, unless the running score
        # was already 499 before this step.
        if done and score != 499:
            reward = -100

        agent.remember(state, action, reward, next_state, done)
        agent.train()
        score += reward
        state = next_state

    # The while-loop only exits once `done` is true, so this is the
    # end-of-episode bookkeeping: sync the target network and log the score.
    agent.updateTarget()

    # Any score other than 500 gets 100 added back before logging (this
    # offsets the -100 applied to the final step above).
    if score != 500:
        score += 100
    scores.append(score)
    episodes.append(e)
    print(e, score)

#     if np.mean(scores[-min(10, len(scores)):]) > 490:
#         sys.exit()

0 24.0
1 15.0
2 32.0
3 10.0
4 13.0
5 11.0
6 15.0
7 49.0
8 51.0
9 76.0
10 18.0
11 31.0
12 61.0
13 73.0
14 67.0
15 79.0
16 25.0
17 146.0
18 105.0
19 68.0
20 117.0
21 51.0
22 20.0
23 8.0
24 121.0
25 42.0
26 122.0
27 98.0
28 199.0
29 93.0
30 87.0
31 136.0
32 130.0
33 370.0
34 268.0
35 166.0
36 149.0
37 153.0
38 188.0
39 185.0
40 194.0
41 174.0
42 169.0
43 150.0
44 158.0
45 178.0
46 163.0
47 155.0
48 189.0
49 150.0
50 170.0
51 158.0
52 167.0
53 154.0
54 165.0
55 158.0
56 155.0
57 160.0
58 160.0
59 176.0
60 170.0
61 179.0
62 178.0
63 178.0
64 174.0
65 170.0
66 187.0
67 165.0
68 179.0
69 194.0
70 186.0
71 189.0
72 238.0
73 196.0
74 187.0
75 251.0
76 249.0
77 453.0
78 320.0
79 380.0
80 500.0
81 292.0
82 99.0
83 23.0
84 18.0
85 16.0
86 14.0
87 123.0
88 185.0
89 500.0
90 151.0
91 140.0
92 171.0
93 175.0
94 152.0
95 159.0
96 173.0
97 175.0
98 187.0
99 176.0
100 192.0
101 175.0
102 190.0
103 171.0
104 177.0
105 169.0
106 178.0
107 162.0
108 175.0
109 174.0
110 164.0
111 190.0
112 180.0
113 209.0
1

KeyboardInterrupt: 