In [1]:
# import requirements
import gym
import random
import numpy as np
from keras.models     import Sequential
from keras.layers     import Dense
from keras.optimizers import Adam
import time

In [2]:
# Create the MountainCar environment and define the run parameters.
env = gym.make('MountainCar-v0')

# Seed the random sources this notebook draws from, so that
# "Restart Kernel -> Run All" replays the same games.
random_seed = 42
random.seed(random_seed)      # random action sampling during data generation
np.random.seed(random_seed)   # NumPy's global generator
env.seed(random_seed)         # episode start states (gym API prior to 0.26)
# NOTE(review): Keras weight initialisation may need its own backend-specific
# seed for fully identical training runs -- confirm for the installed backend.

env.reset()
limit_steps = 200          # maximum number of actions per game
score_requirement = -198   # minimum total score for a game to be kept as training data
num_games = 10000          # number of random games played to collect data
num_epochs = 10            # passes over the training data when fitting

In [3]:
# play game several times and generate training data
def model_data_generation():
    """Play random games and collect training pairs from the successful ones.

    Uses the module-level ``env``, ``num_games``, ``limit_steps`` and
    ``score_requirement``. Each game takes uniformly random actions
    (left = 0, stay = 1, right = 2) until ``limit_steps`` actions are taken
    or the environment reports ``done``. A step whose resulting position
    (``state[0]``) is greater than -0.2 counts as reward 1 instead of the
    environment's reward. Games whose total score is at least
    ``score_requirement`` contribute all of their steps to the result.

    Returns:
        list: ``[state, one_hot_action]`` pairs, where ``state`` is the
        observation the action was taken from and ``one_hot_action`` is a
        3-element list with a 1 at the index of the action.
    """
    training_data = []
    for _ in range(num_games):
        # Start every game from a fresh episode and keep the observation that
        # reset() returns: the first action of the game is then recorded too,
        # and the function no longer relies on some other cell having reset
        # the environment beforehand.
        previous_state = env.reset()
        score = 0
        # (state the action was taken from, action) for every step of this game
        game_memory = []

        for _ in range(limit_steps):
            action = random.randrange(0, 3)
            state, reward, done, info = env.step(action)

            game_memory.append([previous_state, action])
            previous_state = state

            # Shaped reward: being close to the flag counts as +1.
            if state[0] > -0.2:
                reward = 1
            score += reward

            if done:
                break

        # Only games that scored well enough become training examples.
        if score >= score_requirement:
            for history, action in game_memory:
                one_hot = [0, 0, 0]
                one_hot[action] = 1
                training_data.append([history, one_hot])

    # Leave the environment freshly reset for whatever runs next.
    env.reset()
    return training_data

In [4]:
# building a model and fitting training data to that
def train_model(training_data):
    """Build a small dense network and fit it to the collected game data.

    Args:
        training_data: non-empty sequence of ``[state, one_hot_action]``
            pairs, as produced by ``model_data_generation``.

    Returns:
        The fitted Keras ``Sequential`` model. Its output has one unit per
        entry of the one-hot action vector.

    Raises:
        ValueError: if ``training_data`` is empty (for example when no game
            reached ``score_requirement``), instead of failing later with an
            unrelated ``IndexError``.
    """
    if len(training_data) == 0:
        raise ValueError(
            "training_data is empty: no game met the score requirement, "
            "so there is nothing to train on"
        )

    state_size = len(training_data[0][0])
    action_size = len(training_data[0][1])
    # X: one row per sample holding the state the action was taken from
    X = np.array([sample[0] for sample in training_data]).reshape(-1, state_size)
    # y: one row per sample holding the one-hot encoded action
    y = np.array([sample[1] for sample in training_data]).reshape(-1, action_size)

    # Two hidden ReLU layers followed by a linear output layer, trained with MSE.
    model = Sequential()
    model.add(Dense(128, input_dim=len(X[0]), activation='relu'))
    model.add(Dense(52, activation='relu'))
    model.add(Dense(len(y[0]), activation='linear'))
    model.compile(loss='mse', optimizer=Adam())

    # Fit for the module-level "num_epochs" epochs.
    model.fit(X, y, epochs=num_epochs)
    return model

In [5]:
# Time the whole pipeline: data generation followed by model fitting.
start_time = time.time()

# Play the random games and keep the (state, action) pairs of the good ones.
training_data = model_data_generation()

# Build a network and fit it to those pairs.
trained_model = train_model(training_data)

elapsed_training = time.time() - start_time
print("Training time:", elapsed_training, "s")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training time: 100.82101655006409 s


In [6]:
start_time = time.time()
actions = []
score = 0
# Start from a fresh episode and keep the observation reset() returns.
# This makes the cell safe to re-run (it no longer steps an episode that an
# earlier run already finished) and lets the model choose the first action
# as well, instead of falling back to a random one.
history = env.reset()

# This loop continues until "limit_steps" actions are taken or the game ends.
for step_index in range(limit_steps):
    # Pick the action with the highest predicted value for the current state.
    observation = np.asarray(history).reshape(1, -1)
    action = int(np.argmax(trained_model.predict(observation)[0]))

    # Record the action, then apply it to the environment.
    actions.append(action)
    state, reward, done, info = env.step(action)
    # The new (position, velocity) becomes the input for the next prediction.
    history = state
    # Accumulate the environment's reward into the game score.
    score += reward

    # The environment signals the end of the game.
    if done:
        break


print('Score:',score)
print("Testing time:", time.time() - start_time, "s")

Score: -125.0
Testing time: 4.5083723068237305 s
