In [5]:
#imports
import numpy as np
import gym

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Activation, Convolution2D, Permute
from tensorflow.keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint, Visualizer, TrainIntervalLogger, TestLogger

In [6]:
# makes the enviroment
env = gym.make('SpaceInvaders-ram-v4')

nb_actions = env.action_space.n
nb_obs = env.observation_space.shape

print("actions:", nb_actions, "   observations:", nb_obs)

IMG_SHAPE = (100,100) 
WINDOW_LENGTH = 4

input_shape = (WINDOW_LENGTH, IMG_SHAPE[0], IMG_SHAPE[1])


# setup the memory buffer
memory = SequentialMemory(limit=1000,window_length=WINDOW_LENGTH)

# create the policy
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), 
                              attr='eps',
                              value_max=1.,
                              value_min=.1,
                              value_test=.05,
                              nb_steps=150000) 

actions: 6    observations: (128,)


In [8]:
# create the neural network model
model = Sequential()
model.add(Flatten(input_shape=(4,) + nb_obs))

model.add(Dense(64))
model.add(Activation('relu'))

model.add(Dense(64))
model.add(Activation('relu'))

model.add(Dense(64))
model.add(Activation('relu'))


model.add(Dense(nb_actions))
model.add(Activation('linear'))

In [9]:
# create the agent
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=1000,
               target_model_update=1000, policy=policy, batch_size=32, gamma=0.99)

In [10]:
# compile fit and evaluate teh agent
dqn.compile(Adam(lr=1e-3), metrics=['mae']) 
train_history = dqn.fit(env, nb_steps=150000,callbacks=[logger], visualize=False, verbose=2)

Training for 150000 steps ...
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
    397/150000: episode: 1, duration: 0.885s, episode steps: 397, steps per second: 448, episode reward: 35.000, mean reward:  0.088 [ 0.000, 15.000], mean action: 2.458 [0.000, 5.000],  loss: --, mae: --, mean_q: --, mean_eps: --
    854/150000: episode: 2, duration: 0.941s, episode steps: 457, steps per second: 486, episode reward: 80.000, mean reward:  0.175 [ 0.000, 20.000], mean action: 2.501 [0.000, 5.000],  loss: --, mae: --, mean_q: --, mean_eps: --
   1482/150000: episode: 3, duration: 4.042s, episode steps: 628, steps per second: 155, episode reward: 55.000, mean reward:  0.088 [ 0.000, 20.000], mean action: 2.518 [0.000, 5.000],  loss: 83.563985, mae: 46.604588, mean_q: 61.557617, mean_eps: 0.992554
   1902/150000: episode: 4, duration: 2.933s, episode steps: 420, steps per second: 143, episode reward: 55.000, mean reward:  0.131 

In [30]:
dqn.test(env, nb_episodes=5, visualize=True)
env.close()

Testing for 5 episodes ...
Episode 1: reward: 180.000, steps: 875
Episode 2: reward: 210.000, steps: 870
Episode 3: reward: 210.000, steps: 866
Episode 4: reward: 210.000, steps: 852
Episode 5: reward: 210.000, steps: 856


In [164]:
import pandas as pd
import matplotlib.pyplot as plt

# get the text file
file = open("training_logs copy.txt").readlines()
# its a list so just get the first and only text output
text = file[0]

# list of things in the file to remove such that i will be left with lists then loop to delete them
char_del = ['{"loss": ',', "mae": ','"mean_q": ,','"mean_eps": ,','"episode_reward": ,','"nb_episode_steps": ,','"nb_steps": ','"episode": ','"duration": ','}']

textProc = ''
for i in range(9):
    temp = text.replace(char_del[i],"")
    textProc += temp
print(textProc)
textProc = textProc.replace("[","")

# this turns it from a text thing to a list
textProc = textProc.split("]")
for i in range(9):
    print(f"Item number {i}",char_del[i] in textProc[i])
    print(textProc[i])

# list of the data columns
loss = []
mae = []
mean_q = []
mean_eps = []
episode_reward = []
nb_episode_steps = []
nb_steps = []
episode = []
duration = []
data = [loss, mae, mean_q, mean_eps, episode_reward, nb_episode_steps, nb_steps, episode, duration]

for i in range(9):
    data[i] = textProc[i]

df = pd.DataFrame()


[NaN, NaN, 83.56398478565494, 33.47217636562529, 25.463258834734354, 13.017391969731795, 27.332392838739214, 10.315901911676509, 9.52195905452656, 11.275318092183221, 8.164236139817671, 6.389449019689818, 6.411522198693697, 6.341532456694617, 5.848760267815639, 13.496846169615404, 26.580278341902034, 9.82819367157248, 5.834495996696907, 4.516216748109147, 5.499556204256339, 8.17353724884209, 6.291425077535839, 5.139894945368902, 7.318139222066753, 7.948596943855286, 17.93583979131956, 26.395295578768735, 8.392455796669715, 7.7706787475777155, 7.482692897920344, 6.7445945082243925, 5.134149885699828, 5.981075499305683, 4.609054115200826, 5.317365902296284, 6.373398380961648, 6.07546409296348, 6.401007774315382, 7.122820240673342, 6.810925329945864, 7.535356485472494, 6.240430982145544, 3.7239888399102843, 7.459813634527141, 25.94347512691449, 14.090334810555762, 5.461148287022431, 4.5733769596677964, 5.735727499737711, 4.659674121257735, 4.8161554823319115, 4.414523900434467, 6.80694615