<a href="https://colab.research.google.com/github/marcinwolter/Machine-learning-KISD-2023/blob/main/CartPole.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import tensorflow as tf
import gym
import os
import datetime
from statistics import mean
from gym import wrappers

from gym.wrappers.record_video import RecordVideo

class MyModel(tf.keras.Model):
    def __init__(self, num_states, hidden_units, num_actions):
        super(MyModel, self).__init__()
        self.input_layer = tf.keras.layers.InputLayer(input_shape=(num_states,))
        self.hidden_layers = []
        for i in hidden_units:
            self.hidden_layers.append(tf.keras.layers.Dense(
                i, activation='tanh', kernel_initializer='RandomNormal'))
        self.output_layer = tf.keras.layers.Dense(
            num_actions, activation='linear', kernel_initializer='RandomNormal')

    @tf.function
    def call(self, inputs):
        z = self.input_layer(inputs)
        for layer in self.hidden_layers:
            z = layer(z)
        output = self.output_layer(z)
        return output


class DQN:
    def __init__(self, num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr):
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.optimizer = tf.optimizers.Adam(lr)
        self.gamma = gamma
        self.model = MyModel(num_states, hidden_units, num_actions)
        self.experience = {'s': [], 'a': [], 'r': [], 's2': [], 'done': []}
        self.max_experiences = max_experiences
        self.min_experiences = min_experiences

    def predict(self, inputs):
        return self.model(np.atleast_2d(inputs.astype('float32')))

    def train(self, TargetNet):
        if len(self.experience['s']) < self.min_experiences:
            return 0
        ids = np.random.randint(low=0, high=len(self.experience['s']), size=self.batch_size)
        states = np.asarray([self.experience['s'][i] for i in ids])
        actions = np.asarray([self.experience['a'][i] for i in ids])
        rewards = np.asarray([self.experience['r'][i] for i in ids])
        states_next = np.asarray([self.experience['s2'][i] for i in ids])
        dones = np.asarray([self.experience['done'][i] for i in ids])
        value_next = np.max(TargetNet.predict(states_next), axis=1)
        actual_values = np.where(dones, rewards, rewards+self.gamma*value_next)

        with tf.GradientTape() as tape:
            selected_action_values = tf.math.reduce_sum(
                self.predict(states) * tf.one_hot(actions, self.num_actions), axis=1)
            loss = tf.math.reduce_mean(tf.square(actual_values - selected_action_values))
        variables = self.model.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))
        return loss

    def get_action(self, states, epsilon):
        if np.random.random() < epsilon:
            return np.random.choice(self.num_actions)
        else:
            return np.argmax(self.predict(np.atleast_2d(states))[0])

    def add_experience(self, exp):
        if len(self.experience['s']) >= self.max_experiences:
            for key in self.experience.keys():
                self.experience[key].pop(0)
        for key, value in exp.items():
            self.experience[key].append(value)

    def copy_weights(self, TrainNet):
        variables1 = self.model.trainable_variables
        variables2 = TrainNet.model.trainable_variables
        for v1, v2 in zip(variables1, variables2):
            v1.assign(v2.numpy())


def play_game(env, TrainNet, TargetNet, epsilon, copy_step):
    rewards = 0
    iter = 0
    done = False
    print(env.reset())
    observations = env.reset()
    losses = list()
    while not done:
        action = TrainNet.get_action(observations, epsilon)
        prev_observations = observations
        observations, reward, done, _ = env.step(action)
        rewards += reward
        if done:
            reward = -200
            env.reset()

        exp = {'s': prev_observations, 'a': action, 'r': reward, 's2': observations, 'done': done}
        TrainNet.add_experience(exp)
        loss = TrainNet.train(TargetNet)
        if isinstance(loss, int):
            losses.append(loss)
        else:
            losses.append(loss.numpy())
        iter += 1
        if iter % copy_step == 0:
            TargetNet.copy_weights(TrainNet)
    return rewards, mean(losses)

def make_video(env, TrainNet):
    ###env = wrappers.Monitor(env, os.path.join(os.getcwd(), "videos"), force=True)
    ###env = gym.make('CartPole-v0', render_mode="rgb_array")
    env = RecordVideo(env, os.path.join(os.getcwd(), "videos"+str(i)),  episode_trigger = lambda episode_number: True)
    #env.reset()

    rewards = 0
    steps = 0
    done = False
    observation = env.reset()
    while not done:
        env.render()
        action = TrainNet.get_action(observation, 0)
        observation, reward, done, _ = env.step(action)
        steps += 1
        rewards += reward
    print("Testing steps: {} rewards {}: ".format(steps, rewards))


def main():
    env = gym.make('CartPole-v0')
    gamma = 0.99
    copy_step = 25
    num_states = len(env.observation_space.sample())
    num_actions = env.action_space.n
    hidden_units = [200, 200]
    max_experiences = 10000
    min_experiences = 100
    batch_size = 32
    lr = 1e-2
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = 'logs/dqn/' + current_time
    summary_writer = tf.summary.create_file_writer(log_dir)

    TrainNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
    TargetNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
    N = 1500 #50000
    total_rewards = np.empty(N)
    epsilon = 0.99
    decay = 0.9999
    min_epsilon = 0.1
    for n in range(N):
        epsilon = max(min_epsilon, epsilon * decay)
        total_reward, losses = play_game(env, TrainNet, TargetNet, epsilon, copy_step)
        total_rewards[n] = total_reward
        avg_rewards = total_rewards[max(0, n - 100):(n + 1)].mean()
        with summary_writer.as_default():
            tf.summary.scalar('episode reward', total_reward, step=n)
            tf.summary.scalar('running avg reward(100)', avg_rewards, step=n)
            tf.summary.scalar('average loss)', losses, step=n)
        if n % 100 == 0:
            print("episode:", n, "episode reward:", total_reward, "eps:", epsilon, "avg reward (last 100):", avg_rewards,
                  "episode loss: ", losses)
    print("avg reward for last 100 episodes:", avg_rewards)
    make_video(env, TrainNet)
    env.close()


if __name__ == '__main__':
    for i in range(3):
        main()

  logger.warn(
  deprecation(
  deprecation(


[ 0.03839562 -0.04451829 -0.015453   -0.00702861]
episode: 0 episode reward: 14.0 eps: 0.989901 avg reward (last 100): 14.0 episode loss:  0
[0.04242378 0.0297394  0.01469701 0.01707498]
[ 0.04988892 -0.02605876 -0.03200975  0.02266151]
[ 0.0376315  -0.02944501 -0.04072583 -0.04694194]
[0.0016589  0.04594631 0.00390275 0.0056472 ]
[-0.00323686 -0.04289654 -0.01866562 -0.0277416 ]
[-0.04911524  0.03173086 -0.04701769  0.03875876]




[ 0.02742236 -0.0407641  -0.04315785 -0.03226724]
[ 0.02658702  0.03012739 -0.0290587   0.04828034]
[-0.01829571 -0.03514311 -0.04085046 -0.00204282]
[-0.01539804  0.00366468  0.03417132 -0.00755383]
[ 0.01121092  0.04096664  0.00962693 -0.02310772]
[-0.03688818  0.01125324  0.04658045 -0.02599362]
[ 0.03785615 -0.03421121  0.00753628 -0.00831614]
[-0.02371687  0.00257577 -0.04899581 -0.04762075]
[-0.03176772  0.0195645  -0.00773997 -0.03059459]
[-0.0009213  -0.0118308   0.02325285  0.00987844]
[-0.03263411  0.03723107  0.04607034 -0.02288174]
[0.000248   0.03611098 0.01162418 0.04541934]
[-0.01065475 -0.01778712 -0.00819036  0.03400587]
[-0.02190558 -0.02163204  0.01514282 -0.04490919]
[ 0.01108575  0.02765279  0.01875961 -0.01599873]
[ 0.0053705   0.0280358   0.03762873 -0.04213225]
[ 0.02046601 -0.02026804 -0.00856054 -0.02603527]
[-0.02434029  0.02861423  0.00140222  0.04278059]
[-0.00149324  0.00442122  0.03526096  0.04643045]
[ 0.04062098 -0.04115991  0.00657584  0.03694565]
[ 0.

  logger.warn(
  logger.deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


Testing steps: 66 rewards 66.0: 
[-0.04163025 -0.00786493  0.03982296  0.02321415]
episode: 0 episode reward: 9.0 eps: 0.989901 avg reward (last 100): 9.0 episode loss:  0
[-0.04176074  0.03545858 -0.0288648   0.03377385]
[ 0.04706825  0.02027694 -0.03982023 -0.01130964]
[ 0.00530405  0.00249359  0.02140507 -0.03863729]
[ 0.04496521  0.02698085 -0.01246287  0.03477538]


  logger.warn(
  deprecation(
  deprecation(


[ 0.00426555 -0.00586402 -0.02229509 -0.02233543]
[ 0.00034049  0.0158845  -0.02717423 -0.02614154]
[-0.01883664  0.03635879 -0.0288134  -0.00988518]
[ 2.8633222e-02 -3.3465955e-02 -3.3370446e-02  7.9279584e-05]
[ 0.00682564  0.04651587 -0.00366133 -0.01932815]
[ 4.4620432e-02  4.4172932e-05 -9.3922038e-03 -1.2960922e-02]
[-0.01741924 -0.04719957 -0.02479446 -0.02249615]
[0.04087381 0.04007876 0.00653069 0.03585181]
[ 0.04667491 -0.00658685  0.02014472 -0.03493204]
[-0.03383511 -0.0446283  -0.01569053 -0.01637645]
[ 0.02602501 -0.03088436 -0.02285868  0.01403042]
[ 0.01660036 -0.00990234 -0.00959809 -0.04544621]
[-0.0146483  -0.03092724 -0.02363287  0.01853134]
[ 0.01357677 -0.00170753  0.01869346 -0.00762189]
[ 0.04687702 -0.01907361 -0.01577056  0.03137723]
[ 0.03657683 -0.02364194  0.0170086   0.00554652]
[ 0.04869343 -0.04588057  0.03108636  0.01331126]
[-0.01170744 -0.03588502 -0.0125729   0.04096508]
[ 0.01368237 -0.04476241 -0.00425648 -0.04780532]
[-0.04821375 -0.01245307  0.02

  logger.warn(
  logger.deprecation(
If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


Testing steps: 65 rewards 65.0: 
[-0.01114495  0.0166902   0.0409986   0.04933541]
episode: 0 episode reward: 25.0 eps: 0.989901 avg reward (last 100): 25.0 episode loss:  0
[-0.03624683  0.03769664  0.0233416  -0.02368866]
[ 0.02942306  0.00268787 -0.0143748   0.04886342]
[0.01795944 0.02336736 0.02863281 0.0406864 ]
[-0.02384934 -0.04126436 -0.04756442  0.0430078 ]
[ 0.04496139  0.02098415 -0.03913138  0.02610182]


  logger.warn(
  deprecation(
  deprecation(


[-0.03459796 -0.03008571 -0.04652229  0.03976858]
[-0.00805587 -0.03352104  0.00198754  0.00403396]
[-0.03896931 -0.01642132 -0.01538846 -0.03347692]
[ 0.00439321 -0.01080602 -0.019703    0.01587823]
[-0.02306877 -0.04751668  0.04639246 -0.00489731]
[ 0.0072182  -0.04549176 -0.01137235 -0.01233018]
[ 0.02678499  0.01021539 -0.0152312   0.02736987]
[-0.00435215  0.03821095  0.01079766 -0.04773921]
[-0.02129591  0.04030136 -0.02801322 -0.04787779]
[0.01996248 0.03504087 0.04865564 0.01170263]
[-0.03157119 -0.04174021  0.01538918 -0.02291704]
[ 0.04601051 -0.02639162  0.03304356  0.01437291]
[ 0.02108377 -0.00629709  0.00132963 -0.00872023]
[ 0.04799388 -0.00278801  0.00289063 -0.01212508]
[ 0.03107712 -0.01144505  0.00301799 -0.02235785]
[ 0.00792221 -0.02957495  0.04851606  0.02709733]
[ 0.04885596  0.00764586 -0.03680405 -0.00270043]
[ 0.02308899 -0.04605341  0.03959881 -0.03894111]
[-0.01289933  0.04196598  0.03614228 -0.02036481]
[ 0.04573368 -0.0351468  -0.04320906  0.0236534 ]
[ 0.

  logger.warn(
  logger.deprecation(
If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


Testing steps: 102 rewards 102.0: 
