In [1]:
import gym

In [2]:
# Make Environment
env = gym.make('CartPole-v0')

In [3]:
# bring environment to initial state
env.reset()

array([-0.0154208 ,  0.00641002,  0.00846035, -0.0496595 ])

In [4]:
env.observation_space

Box(4,)

In [5]:
# all possible actions
env.action_space

Discrete(2)

In [6]:
env.action_space.n

2

In [7]:
# take random action
env.step(env.action_space.sample())

(array([-0.0152926 , -0.18883222,  0.00746716,  0.24568069]), 1.0, False, {})

In [9]:
#  Random GamePlay
env.reset()
for _ in range(200):
    random_action = env.action_space.sample()
    env.step(random_action)
    env.render()
env.close()

### Mutiple Episodes of Random Games

In [10]:
for e in range(20):
    observation = env.reset()
    for t in range(50):
        action = env.action_space.sample()
        new_observation, reward, done, info = env.step(action)
        env.render()
        
        if done:
            print("Episode {}/{} with score {}".format(e,20,t))
            break
env.close()
print("All episodes done")

Episode 0/20 with score 12
Episode 1/20 with score 22
Episode 2/20 with score 19
Episode 3/20 with score 11
Episode 4/20 with score 22
Episode 5/20 with score 21
Episode 6/20 with score 12
Episode 7/20 with score 15
Episode 8/20 with score 38
Episode 9/20 with score 9
Episode 10/20 with score 20
Episode 11/20 with score 12
Episode 12/20 with score 26
Episode 13/20 with score 24
Episode 14/20 with score 18
Episode 15/20 with score 16
Episode 16/20 with score 10
Episode 17/20 with score 14
Episode 18/20 with score 8
Episode 19/20 with score 10
All episodes done


## Q - Learning
### Agent Design and Neural Networks

In [11]:
from keras.models import Sequential
from keras.layers import *
from collections import deque
import random
import pickle
import os

Using TensorFlow backend.


In [12]:
class Agent():
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95       #hyperparameter
        
        # Explorationm : good in the beginning
        # Exploitation : good in the later stage of game
        
        self.epsilon = 1.0 # 100% random exploration in the start 
        self.epsilon_decay  = 0.995
        self.epsilon_min = 0.01
        self.learning_rate =0.001
        self.model = self.create_model()
        
    def create_model(self):
        # Neural Network to estimate Q-value function
        model = Sequential()
        model.add(Dense(24, input_dim =self.state_size, activation="relu"))
        model.add(Dense(24, activation="relu"))
        model.add(Dense(self.action_size, activation="linear"))
        model.compile(loss="mse", optimizer = "adam")
        
        return model
    
    def remember(self, state, action, new_state, reward, done):
        
        self.memory.append((state, action, new_state, reward, done))
        
    def act(self, state):
        # Sampling according to the Epsiolon Greedy Method
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        
        # Else ask NN to give the most suitable action
        pred = self.model.predict(state)
        return np.argmax(pred[0])
    
    def train(self, batch_size=32):
        # Training using the 'replay buffer'
        # Pick batch_size of experiences of tuples (s,a,s',r,d) and train NN
        
        minibatch = random.sample(self.memory, batch_size)
        
        for experience in minibatch:
            # unpack the tuples of 5 values
            state, action, reward, new_state, done = experience
            
            # X, y  = state, expected_reward
            
            if not done:
                # game is not over yet, bellman equation to approx the target value of reward
                
                target = reward + self.gamma *np.amax(self.model.predict(new_state)[0])
                
            else:
                # Game is over here
                target = reward
                
            # y values -
            target_f = self.model.predict(state)
            target_f[0][action] = target # Update one particular with higher reward 
            
            # X = state, y = target_f
            
            self.model.fit(state, target_f, epochs=1, verbose=0) # SGD Training
            
        
        # Reduce the exoploration increase the exploitation
        if self.epsilon >= self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
    def save_model(self, name):
        with open(name, 'wb') as file:
            pickle.dump(self, file)

### Training DQN Agent 'Deep Q Learner'

In [13]:
state_size = 4
action_size = 2
batch_size = 32
n_episode = 1000
done = False
output_dir = "cartpole_model/"

In [14]:
agent = Agent(state_size,action_size)

Instructions for updating:
Colocations handled automatically by placer.


In [15]:
for e in range(1,n_episode+1):
    state = env.reset()
    state = state.reshape((1,state_size)) # 1 batch that have 4 state_size
    
    for time in range(500):
        env.render()
        
        action = agent.act(state) # action is left or right (0 or 1)
        
        next_state, reward, done, info = env.step(action)
        
        reward = reward if not done else -10    # Penalise if it fails
        
        next_state = next_state.reshape((1,state_size))
        
        agent.remember(state, action, reward, next_state, done) # 1 Experience for agent 
        
        state = next_state
    
        if done:
            print("Game Episode : {}/{},  with score {} Exploration rate: {:.2}".format(e, n_episode, time+1, agent.epsilon))
            break
            
    if len(agent.memory) >= batch_size:
        agent.train(batch_size)
            
    if e%50 == 0 :
        agent.save_model(output_dir + "weights_"+ "%d"%(e) +'.h5')

print("Model Trained")
env.close()

Game Episode : 1/1000,  with score 14 Exploration rate: 1.0
Game Episode : 2/1000,  with score 22 Exploration rate: 1.0
Instructions for updating:
Use tf.cast instead.
Game Episode : 3/1000,  with score 26 Exploration rate: 0.99
Game Episode : 4/1000,  with score 28 Exploration rate: 0.99
Game Episode : 5/1000,  with score 23 Exploration rate: 0.99
Game Episode : 6/1000,  with score 15 Exploration rate: 0.98
Game Episode : 7/1000,  with score 28 Exploration rate: 0.98
Game Episode : 8/1000,  with score 13 Exploration rate: 0.97
Game Episode : 9/1000,  with score 10 Exploration rate: 0.97
Game Episode : 10/1000,  with score 33 Exploration rate: 0.96
Game Episode : 11/1000,  with score 65 Exploration rate: 0.96
Game Episode : 12/1000,  with score 30 Exploration rate: 0.95
Game Episode : 13/1000,  with score 28 Exploration rate: 0.95
Game Episode : 14/1000,  with score 16 Exploration rate: 0.94
Game Episode : 15/1000,  with score 20 Exploration rate: 0.94
Game Episode : 16/1000,  with sco

Game Episode : 129/1000,  with score 14 Exploration rate: 0.53
Game Episode : 130/1000,  with score 17 Exploration rate: 0.53
Game Episode : 131/1000,  with score 12 Exploration rate: 0.52
Game Episode : 132/1000,  with score 19 Exploration rate: 0.52
Game Episode : 133/1000,  with score 13 Exploration rate: 0.52
Game Episode : 134/1000,  with score 20 Exploration rate: 0.52
Game Episode : 135/1000,  with score 39 Exploration rate: 0.51
Game Episode : 136/1000,  with score 46 Exploration rate: 0.51
Game Episode : 137/1000,  with score 72 Exploration rate: 0.51
Game Episode : 138/1000,  with score 11 Exploration rate: 0.51
Game Episode : 139/1000,  with score 40 Exploration rate: 0.5
Game Episode : 140/1000,  with score 15 Exploration rate: 0.5
Game Episode : 141/1000,  with score 68 Exploration rate: 0.5
Game Episode : 142/1000,  with score 24 Exploration rate: 0.5
Game Episode : 143/1000,  with score 24 Exploration rate: 0.49
Game Episode : 144/1000,  with score 23 Exploration rate: 0

Game Episode : 260/1000,  with score 50 Exploration rate: 0.27
Game Episode : 261/1000,  with score 29 Exploration rate: 0.27
Game Episode : 262/1000,  with score 20 Exploration rate: 0.27
Game Episode : 263/1000,  with score 23 Exploration rate: 0.27
Game Episode : 264/1000,  with score 22 Exploration rate: 0.27
Game Episode : 265/1000,  with score 26 Exploration rate: 0.27
Game Episode : 266/1000,  with score 124 Exploration rate: 0.27
Game Episode : 267/1000,  with score 51 Exploration rate: 0.26
Game Episode : 268/1000,  with score 43 Exploration rate: 0.26
Game Episode : 269/1000,  with score 71 Exploration rate: 0.26
Game Episode : 270/1000,  with score 85 Exploration rate: 0.26
Game Episode : 271/1000,  with score 124 Exploration rate: 0.26
Game Episode : 272/1000,  with score 62 Exploration rate: 0.26
Game Episode : 273/1000,  with score 84 Exploration rate: 0.26
Game Episode : 274/1000,  with score 78 Exploration rate: 0.26
Game Episode : 275/1000,  with score 68 Exploration r

Game Episode : 390/1000,  with score 66 Exploration rate: 0.14
Game Episode : 391/1000,  with score 74 Exploration rate: 0.14
Game Episode : 392/1000,  with score 42 Exploration rate: 0.14
Game Episode : 393/1000,  with score 71 Exploration rate: 0.14
Game Episode : 394/1000,  with score 200 Exploration rate: 0.14
Game Episode : 395/1000,  with score 122 Exploration rate: 0.14
Game Episode : 396/1000,  with score 154 Exploration rate: 0.14
Game Episode : 397/1000,  with score 61 Exploration rate: 0.14
Game Episode : 398/1000,  with score 38 Exploration rate: 0.14
Game Episode : 399/1000,  with score 52 Exploration rate: 0.14
Game Episode : 400/1000,  with score 38 Exploration rate: 0.14
Game Episode : 401/1000,  with score 88 Exploration rate: 0.14
Game Episode : 402/1000,  with score 68 Exploration rate: 0.13
Game Episode : 403/1000,  with score 200 Exploration rate: 0.13
Game Episode : 404/1000,  with score 141 Exploration rate: 0.13
Game Episode : 405/1000,  with score 79 Exploratio

Game Episode : 519/1000,  with score 200 Exploration rate: 0.075
Game Episode : 520/1000,  with score 200 Exploration rate: 0.075
Game Episode : 521/1000,  with score 200 Exploration rate: 0.074
Game Episode : 522/1000,  with score 36 Exploration rate: 0.074
Game Episode : 523/1000,  with score 200 Exploration rate: 0.073
Game Episode : 524/1000,  with score 200 Exploration rate: 0.073
Game Episode : 525/1000,  with score 200 Exploration rate: 0.073
Game Episode : 526/1000,  with score 200 Exploration rate: 0.072
Game Episode : 527/1000,  with score 200 Exploration rate: 0.072
Game Episode : 528/1000,  with score 200 Exploration rate: 0.072
Game Episode : 529/1000,  with score 126 Exploration rate: 0.071
Game Episode : 530/1000,  with score 200 Exploration rate: 0.071
Game Episode : 531/1000,  with score 200 Exploration rate: 0.071
Game Episode : 532/1000,  with score 200 Exploration rate: 0.07
Game Episode : 533/1000,  with score 200 Exploration rate: 0.07
Game Episode : 534/1000,  wi

Game Episode : 646/1000,  with score 200 Exploration rate: 0.04
Game Episode : 647/1000,  with score 132 Exploration rate: 0.039
Game Episode : 648/1000,  with score 147 Exploration rate: 0.039
Game Episode : 649/1000,  with score 171 Exploration rate: 0.039
Game Episode : 650/1000,  with score 200 Exploration rate: 0.039
Game Episode : 651/1000,  with score 200 Exploration rate: 0.039
Game Episode : 652/1000,  with score 200 Exploration rate: 0.038
Game Episode : 653/1000,  with score 200 Exploration rate: 0.038
Game Episode : 654/1000,  with score 136 Exploration rate: 0.038
Game Episode : 655/1000,  with score 166 Exploration rate: 0.038
Game Episode : 656/1000,  with score 189 Exploration rate: 0.038
Game Episode : 657/1000,  with score 200 Exploration rate: 0.038
Game Episode : 658/1000,  with score 200 Exploration rate: 0.037
Game Episode : 659/1000,  with score 200 Exploration rate: 0.037
Game Episode : 660/1000,  with score 200 Exploration rate: 0.037
Game Episode : 661/1000,  

Game Episode : 773/1000,  with score 167 Exploration rate: 0.021
Game Episode : 774/1000,  with score 39 Exploration rate: 0.021
Game Episode : 775/1000,  with score 133 Exploration rate: 0.021
Game Episode : 776/1000,  with score 200 Exploration rate: 0.021
Game Episode : 777/1000,  with score 191 Exploration rate: 0.021
Game Episode : 778/1000,  with score 200 Exploration rate: 0.02
Game Episode : 779/1000,  with score 200 Exploration rate: 0.02
Game Episode : 780/1000,  with score 200 Exploration rate: 0.02
Game Episode : 781/1000,  with score 200 Exploration rate: 0.02
Game Episode : 782/1000,  with score 194 Exploration rate: 0.02
Game Episode : 783/1000,  with score 200 Exploration rate: 0.02
Game Episode : 784/1000,  with score 132 Exploration rate: 0.02
Game Episode : 785/1000,  with score 54 Exploration rate: 0.02
Game Episode : 786/1000,  with score 84 Exploration rate: 0.02
Game Episode : 787/1000,  with score 166 Exploration rate: 0.02
Game Episode : 788/1000,  with score 9

Game Episode : 901/1000,  with score 86 Exploration rate: 0.011
Game Episode : 902/1000,  with score 160 Exploration rate: 0.011
Game Episode : 903/1000,  with score 111 Exploration rate: 0.011
Game Episode : 904/1000,  with score 126 Exploration rate: 0.011
Game Episode : 905/1000,  with score 104 Exploration rate: 0.011
Game Episode : 906/1000,  with score 95 Exploration rate: 0.011
Game Episode : 907/1000,  with score 80 Exploration rate: 0.011
Game Episode : 908/1000,  with score 101 Exploration rate: 0.011
Game Episode : 909/1000,  with score 200 Exploration rate: 0.011
Game Episode : 910/1000,  with score 200 Exploration rate: 0.011
Game Episode : 911/1000,  with score 200 Exploration rate: 0.01
Game Episode : 912/1000,  with score 200 Exploration rate: 0.01
Game Episode : 913/1000,  with score 200 Exploration rate: 0.01
Game Episode : 914/1000,  with score 200 Exploration rate: 0.01
Game Episode : 915/1000,  with score 200 Exploration rate: 0.01
Game Episode : 916/1000,  with sc

In [16]:
agent.memory[100]

(array([[1.18368934, 1.41581493, 0.11969745, 0.13070449]]),
 1,
 1.0,
 array([[ 1.21200564,  1.60903699,  0.12231153, -0.12194771]]),
 False)

**LOAD MODEL**

In [33]:
with open('cartpole_model/weights_1000.h5', 'rb') as file:
    new_model = pickle.load(file)

### Test Playing 

In [31]:
def test_palying(new_model):
    total_score = 0
    
    for e in range(1,11):
        observation = env.reset()
        observation = observation.reshape((1,state_size))
        
        for t in range(1,500):
            env.render()
            action = new_model.act(observation)
            new_observation, reward, done, info = env.step(action)

            new_observation = new_observation.reshape((1,state_size))

            observation = new_observation
            
            if done:
                total_score += t
                print("Episode {}/{} with score {}".format(e,10,t))
                break
    env.close()
    print("Average Score :- {}".format(total_score/10))
    print("All episodes done")

In [36]:
for m in os.listdir("cartpole_model/"):
    file_name = "cartpole_model/"+m
    with open(file_name, 'rb') as file:
        new_model = pickle.load(file)
    print("Model - " + m)
    test_palying(new_model)
    print("----------------------------")

Model - weights_100.h5
Episode 1/10 with score 12
Episode 2/10 with score 17
Episode 3/10 with score 16
Episode 4/10 with score 24
Episode 5/10 with score 16
Episode 6/10 with score 15
Episode 7/10 with score 15
Episode 8/10 with score 12
Episode 9/10 with score 13
Episode 10/10 with score 15
Average Score :- 15.5
All episodes done
----------------------------
Model - weights_150.h5
Episode 1/10 with score 21
Episode 2/10 with score 11
Episode 3/10 with score 10
Episode 4/10 with score 14
Episode 5/10 with score 21
Episode 6/10 with score 15
Episode 7/10 with score 13
Episode 8/10 with score 38
Episode 9/10 with score 33
Episode 10/10 with score 39
Average Score :- 21.5
All episodes done
----------------------------
Model - weights_200.h5
Episode 1/10 with score 34
Episode 2/10 with score 27
Episode 3/10 with score 30
Episode 4/10 with score 33
Episode 5/10 with score 31
Episode 6/10 with score 42
Episode 7/10 with score 35
Episode 8/10 with score 46
Episode 9/10 with score 24
Episode 