In [1]:
import warnings; warnings.simplefilter('ignore')
import gym
import numpy as np
import pandas as pd
import tflearn
from tflearn.layers.core import input_data
from tflearn.layers.core import dropout
from tflearn.layers.core import fully_connected
from tflearn.layers.estimator import regression
from collections import Counter
import time
import random

In [2]:
# Shared settings: the environment plus the knobs read by the cells below.
env = gym.make('CartPole-v1')
LR = 0.001               # learning rate passed to the optimizer in neural_network_model()
initial_games = 10000    # how many random-play episodes initial_population() simulates
goal_steps = 500         # upper bound on time steps inside a single episode loop
score_requirement = 50   # minimum episode score for its samples to be kept for training

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


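`initial_population()` builds the training set: it plays episodes by taking a random action in each state (observation) and keeps the (state, action) pairs only from episodes whose total score reaches `score_requirement`.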

In [3]:
def initial_population(environment=None, n_games=None, max_steps=None, min_score=None):
    """Build a supervised training set from episodes played with random actions.

    Each episode is played by sampling actions from the environment's action
    space. Episodes whose total reward reaches ``min_score`` contribute all of
    their (observation, action) pairs to the training set; the others are
    discarded.

    Parameters (each one falls back to the notebook-level setting when None):
        environment: object exposing reset(), step(action) and
                     action_space.sample(); defaults to ``env``.
        n_games:     number of episodes to play; defaults to ``initial_games``.
        max_steps:   cap on time steps per episode; defaults to ``goal_steps``.
        min_score:   minimum episode score for acceptance; defaults to
                     ``score_requirement``.

    Returns:
        A list of [observation, one_hot_action] pairs, where one_hot_action is
        [1, 0] for action 0 and [0, 1] for action 1. Both entries are kept so
        the network can output a probability for each action.

    Raises:
        ValueError: if an accepted episode contains an action other than 0 or 1.
    """
    # Resolve overrides; calling with no arguments behaves like the original cell.
    environment = env if environment is None else environment
    n_games = initial_games if n_games is None else n_games
    max_steps = goal_steps if max_steps is None else max_steps
    min_score = score_requirement if min_score is None else min_score

    training_data = []
    accepted_scores = []

    for _game in range(n_games):  # one iteration per episode
        score = 0
        game_memory = []  # [state, action] pairs of the current episode
        observation = environment.reset()  # initial state

        for _step in range(max_steps):  # one iteration per time step
            action = environment.action_space.sample()  # random action
            # Record the state the action was chosen in, *before* stepping.
            game_memory.append([observation, action])

            observation, reward, done, info = environment.step(action)
            score += reward

            if done:  # the environment signalled the end of the episode
                break

        if score < min_score:
            continue

        accepted_scores.append(score)
        for state, action in game_memory:
            if action == 1:
                label = [0, 1]
            elif action == 0:
                label = [1, 0]
            else:
                # Previously an unexpected action silently reused the label of
                # the preceding sample (or raised NameError on the first one).
                raise ValueError(
                    'initial_population only supports actions 0 and 1, got {!r}'.format(action))
            training_data.append([state, label])

    if accepted_scores:
        print('Average accepted score: {}'.format(np.mean(accepted_scores)))
        print('Median accepted score: {}'.format(np.median(accepted_scores)))
        print(Counter(accepted_scores))  # frequency count of all accepted scores
    else:
        # np.mean / np.median of an empty list would only print nan here.
        print('No episode reached the score requirement of {}; '
              'the training set is empty.'.format(min_score))

    return training_data



# initial_population();
# training_data = initial_population()

In [4]:
def neural_network_model(input_size):
    """Assemble the fully connected classifier used to pick an action.

    The input layer takes tensors shaped (batch, input_size, 1). Five ReLU
    hidden layers of 128, 256, 512, 256 and 128 units follow, each one passed
    through dropout(…, 0.8). A 2-unit softmax layer produces the output, and
    the regression layer trains it with Adam on categorical cross-entropy at
    learning rate LR.

    input_size: number of features in one observation.
    Returns the network wrapped in a tflearn.DNN that logs to the 'log' directory.
    """
    hidden_widths = (128, 256, 512, 256, 128)

    net = input_data(shape=[None, input_size, 1], name='input')

    # Every hidden layer is the same dense + dropout pair; only the width varies.
    for width in hidden_widths:
        net = fully_connected(net, width, activation='relu')
        net = dropout(net, 0.8)

    net = fully_connected(net, 2, activation='softmax')
    net = regression(
        net,
        optimizer='adam',
        learning_rate=LR,
        loss='categorical_crossentropy',
        name='targets',
    )

    return tflearn.DNN(net, tensorboard_dir='log')

    

In [5]:
def train_model(training_data, model=False):
    """Fit the action classifier on [observation, one-hot action] samples.

    training_data: non-empty sequence of [observation, label] pairs in which
                   every observation has the same length and every label has
                   two entries.
    model:         an existing model to continue training; when falsy (the
                   default) a new one is built with neural_network_model().

    Returns the fitted model.

    Raises:
        ValueError: if training_data is empty.
    """
    # Fail early with a clear message instead of an IndexError on
    # training_data[0][0] below.
    if len(training_data) == 0:
        raise ValueError('training_data is empty; no episode met the score requirement')

    # X = observations as input features, reshaped to (n_samples, n_features, 1)
    # y = one-hot action labels, reshaped to (n_samples, 2)
    n_features = len(training_data[0][0])
    X = np.array([sample[0] for sample in training_data]).reshape(-1, n_features, 1)
    y = np.array([sample[1] for sample in training_data]).reshape(-1, 2)

    if not model:
        model = neural_network_model(input_size=len(X[0]))

    # Fit for 3 epochs, holding out 10% of the samples for validation.
    model.fit(X, y, n_epoch=3, validation_set=0.1, snapshot_step=500,
              show_metric=True, run_id='openai_learning')

    return model

In [6]:
# Play the random-action episodes and keep the [observation, one-hot action]
# samples from those whose score reached score_requirement.
training_data = initial_population()

Average accepted score: 61.515151515151516
Median accepted score: 58.0
Counter({51.0: 30, 53.0: 29, 52.0: 25, 54.0: 22, 55.0: 21, 50.0: 20, 60.0: 18, 56.0: 17, 61.0: 15, 59.0: 14, 57.0: 12, 62.0: 11, 58.0: 11, 65.0: 10, 66.0: 10, 67.0: 9, 63.0: 9, 70.0: 8, 68.0: 7, 69.0: 7, 64.0: 5, 78.0: 4, 71.0: 4, 80.0: 4, 88.0: 3, 76.0: 3, 72.0: 3, 84.0: 3, 81.0: 3, 77.0: 2, 114.0: 2, 83.0: 2, 74.0: 2, 79.0: 2, 73.0: 2, 89.0: 2, 75.0: 2, 99.0: 1, 102.0: 1, 90.0: 1, 129.0: 1, 123.0: 1, 96.0: 1, 116.0: 1, 103.0: 1, 107.0: 1, 112.0: 1})


In [7]:
# No model is passed, so train_model() builds a new network and fits it on the samples.
model = train_model(training_data)

Training Step: 944  | total loss: [1m[32m0.67922[0m[0m | time: 12.894s
| Adam | epoch: 003 | loss: 0.67922 - acc: 0.5787 -- iter: 20096/20097
Training Step: 945  | total loss: [1m[32m0.67796[0m[0m | time: 13.949s
| Adam | epoch: 003 | loss: 0.67796 - acc: 0.5834 | val_loss: 0.66656 - val_acc: 0.5920 -- iter: 20097/20097
--


In [13]:
# Evaluate the trained model: play 10 rendered episodes, always taking the
# action the network rates as most probable.
all_scores = [] # Total reward collected in each evaluation episode
choices = [] # Every action taken (0 or 1), pooled over all evaluation episodes

for each_game in range(10):
    score = 0
    game_memory = [] # [observation after the step, action] pairs; not read in this cell
    observation = env.reset()

    for _ in range(goal_steps):
        # NOTE(review): the render window is never closed in this notebook;
        # consider env.close() after the loop - confirm it is safe to re-run
        # with the installed gym version.
        env.render()

        # The observation always comes from env.reset() or env.step(), so the
        # former `len(observation) == 0` random-action fallback could never
        # run and has been removed.
        # predict = probability of both actions based on state
        predict = model.predict(observation.reshape(-1, len(observation), 1))[0]
        # action = action with the highest probability
        action = np.argmax(predict)

        choices.append(action)

        observation, reward, done, info = env.step(action)
        game_memory.append([observation, action])
        score += reward

        if done: break

    all_scores.append(score)

# The first fraction printed is the share of action 1, the second the share of action 0.
print('Average Score: {}'.format(sum(all_scores)/len(all_scores)))
print('Choices 1: {}, Choice 2: {}'.format(choices.count(1)/len(choices), \
                                           choices.count(0)/len(choices)))

Average Score: 396.5
Choices 1: 0.4968474148802018, Choice 2: 0.5031525851197982


In [14]:
# model.save('Model_Files/carpole_v1.model')

INFO:tensorflow:/home/paresh/Code/Data Science Projects/Reference_Projects/Tensorflow_Keras_RL_Tutorial/Model_Files/carpole_v1.model is not in all_model_checkpoint_paths. Manually adding it.


In [11]:
# model.load('Model_Files/carpole_v1.model')