# Using the gym version of the cartpole to start with Imitation Learning

gym.openai.com/envs/Cartpole-v1

https://www.gymlibrary.dev/environments/classic_control/cart_pole/

https://www.youtube.com/watch?v=3zeg7H6cAJw

In [91]:
#imports
import gym
import numpy as np

from statistics import mean, median, stdev
from collections import Counter

import time

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [102]:
lr = 1e-3
env = gym.make('CartPole-v1', render_mode="human")
env.reset()
goal_steps = 500
score_requirement = 50 # all random games have a score of 50 or better
initial_games = 10000


def some_random_games_first(): # just to see what the game looks like
    for episode in range(5):
        env.reset()
        for t in range(goal_steps):
            env.render() # slows down the speed of the game
            action = env.action_space.sample()
            observation, reward, done, info, _ = env.step(action)
            if done:
                break


: 

Playing some games first with random actions and picking the best ones (score requirement above 50). Then imitate these actions with the model. 

In [93]:
def initial_population():
    training_data = []
    scores = []
    accepted_scores = []

    for _ in range(initial_games):
        score = 0
        game_memory = []
        prev_observation = []

        for _ in range(goal_steps):
            action = env.action_space.sample()
            observation, reward, done, info, _ = env.step(action)

            if len(prev_observation) > 0:
                game_memory.append([prev_observation, action])
            
            prev_observation = observation
            score += reward

            if done:
                break
        
        if score >= score_requirement:
            accepted_scores.append(score)
            for data in game_memory: # convert to one-hot output

                if data[1] == 1:
                    output = [0, 1]
                elif data[1] == 0:
                    output = [1, 0]
                training_data.append([data[0], output])

        env.reset()
        scores.append(score)
    #training_data_save = np.array(training_data)
    #np.save('saved.npy', training_data_save)

    print('Average accepted score:', mean(accepted_scores))
    print('Median accepted score:', median(accepted_scores))
    print(Counter(accepted_scores))

    return training_data

env.close()
env = gym.make('CartPole-v1')
env.reset()
training_data = initial_population()

Average accepted score: 62.06849315068493
Median accepted score: 58.0
Counter({51.0: 28, 55.0: 27, 50.0: 24, 52.0: 22, 56.0: 21, 61.0: 19, 53.0: 19, 59.0: 15, 57.0: 15, 58.0: 14, 54.0: 14, 65.0: 12, 64.0: 12, 68.0: 11, 60.0: 11, 66.0: 9, 69.0: 8, 62.0: 7, 67.0: 7, 63.0: 6, 73.0: 6, 76.0: 5, 74.0: 4, 93.0: 3, 72.0: 3, 89.0: 3, 70.0: 3, 75.0: 3, 95.0: 2, 97.0: 2, 85.0: 2, 105.0: 2, 79.0: 2, 77.0: 2, 88.0: 2, 86.0: 2, 91.0: 2, 71.0: 2, 107.0: 1, 84.0: 1, 87.0: 1, 100.0: 1, 83.0: 1, 81.0: 1, 115.0: 1, 82.0: 1, 102.0: 1, 92.0: 1, 111.0: 1, 90.0: 1, 94.0: 1, 78.0: 1})


In [94]:
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

def build_model(state_size, action_size, lr=0.001):
    model = Sequential()
    model.add(Dense(24, input_dim=state_size, activation='relu'))
    model.add(Dense(48, activation='relu'))
    model.add(Dense(96, activation='relu'))
    model.add(Dense(48, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(action_size, activation='linear'))
    #model.compile(loss='mse', optimizer=Adam(learning_rate=lr))
    model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
    return model

def train_model(training_data, state_size, action_size, model=False):
    X = np.array([i[0] for i in training_data]).reshape(-1, state_size)
    y = np.array([i[1] for i in training_data]).reshape(-1, action_size) 
    print(training_data[0][0])
    print(X[0])
    print(training_data[0][1])
    print(y[0])

    if not model:
        model = build_model()

    model.fit(X, y, epochs=5, verbose=1)
    return model

model = build_model(state_size, action_size, lr)

In [95]:
# pretraining test
env.close()
env = gym.make('CartPole-v1')
env.reset()

test_episodes = 10
test_average = 0
for e in range(test_episodes):
    state = env.reset()
    state = np.reshape(state[0], [1, state_size])
    done = False
    t = 0
    while not done:
        #env.render()  # Render the environment to visualize the agent's behavior
        action = np.argmax(model.predict(state, verbose = 0)[0])
        next_state, reward, done, info, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        state = next_state
        t += 1
        if done:
            print("Test Episode: {}/{}, Score: {}".format(e + 1, test_episodes, t))
            test_average += t
            break
test_average/=test_episodes
print()
print('pretraining: score average ', test_average)
test_average = 0
env.close() # finish the rendering


Test Episode: 1/10, Score: 10
Test Episode: 2/10, Score: 10
Test Episode: 3/10, Score: 9
Test Episode: 4/10, Score: 10
Test Episode: 5/10, Score: 10
Test Episode: 6/10, Score: 10
Test Episode: 7/10, Score: 11
Test Episode: 8/10, Score: 12
Test Episode: 9/10, Score: 28
Test Episode: 10/10, Score: 22

pretraining: score average  13.2


2023-05-08 11:00:25.986147: W tensorflow/c/c_api.cc:291] Operation '{name:'dense_61/bias/Assign' id:5378 op device:{requested: '', assigned: ''} def:{{{node dense_61/bias/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](dense_61/bias, dense_61/bias/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


In [96]:
model = train_model(training_data, state_size, action_size, model) # fast training process

[-0.02687128 -0.15812276  0.0300712   0.25525638]
[-0.02687128 -0.15812276  0.0300712   0.25525638]
[1, 0]
[1 0]
Train on 22290 samples
Epoch 1/5


2023-05-08 11:00:26.259315: W tensorflow/c/c_api.cc:291] Operation '{name:'loss_10/mul' id:5514 op device:{requested: '', assigned: ''} def:{{{node loss_10/mul}} = Mul[T=DT_FLOAT, _has_manual_control_dependencies=true](loss_10/mul/x, loss_10/dense_65_loss/value)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2023-05-08 11:00:26.340127: W tensorflow/c/c_api.cc:291] Operation '{name:'training_20/Adam/dense_60/bias/v/Assign' id:5726 op device:{requested: '', assigned: ''} def:{{{node training_20/Adam/dense_60/bias/v/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](training_20/Adam/dense_60/bias/v, training_20/Adam/dense_60/bias/v/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an er

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [97]:
# postraining test
env = gym.make('CartPole-v1') # no visualization

# Test
test_episodes = 30
test_scores = []
start_time = time.time()

for e in range(test_episodes):
    state, done = env.reset()

    for t in range(501):
        state = np.reshape(state, [1, state_size])
        action = np.argmax(model.predict(state, verbose = 0)[0])
        state, reward, done, info, _ = env.step(action)
    

        if done or t == 500:
            print("Test Episode: {}/{}, Score: {}".format(e + 1, test_episodes, t))
            test_scores.append(t)
            break

test_average = mean(test_scores)
test_sigma = stdev(test_scores)
end_time = time.time()
total_time = end_time - start_time
total_steps = sum(test_scores)
average_time_per_step = total_time / total_steps

print()
print('Score average: {:.2f}, Sigma: {:.2f}'.format(test_average, test_sigma))
print('Average time per step: {:.4f} seconds'.format(average_time_per_step))

env.close()

Test Episode: 1/30, Score: 110
Test Episode: 2/30, Score: 121
Test Episode: 3/30, Score: 206
Test Episode: 4/30, Score: 202
Test Episode: 5/30, Score: 215
Test Episode: 6/30, Score: 148
Test Episode: 7/30, Score: 421
Test Episode: 8/30, Score: 145
Test Episode: 9/30, Score: 374
Test Episode: 10/30, Score: 181
Test Episode: 11/30, Score: 147
Test Episode: 12/30, Score: 308
Test Episode: 13/30, Score: 247
Test Episode: 14/30, Score: 123
Test Episode: 15/30, Score: 154
Test Episode: 16/30, Score: 87
Test Episode: 17/30, Score: 289
Test Episode: 18/30, Score: 161
Test Episode: 19/30, Score: 95
Test Episode: 20/30, Score: 120
Test Episode: 21/30, Score: 130
Test Episode: 22/30, Score: 182
Test Episode: 23/30, Score: 119
Test Episode: 24/30, Score: 198
Test Episode: 25/30, Score: 95
Test Episode: 26/30, Score: 94
Test Episode: 27/30, Score: 362
Test Episode: 28/30, Score: 280
Test Episode: 29/30, Score: 116
Test Episode: 30/30, Score: 108

Score average: 184.60, Sigma: 90.83
Average time per

In [98]:
initial_games = 100 # trying to get improved data
score_requirement = 300

def improved_data():
    training_data = []
    scores = []
    accepted_scores = []

    for g in range(initial_games):
        score = 0
        game_memory = []
        prev_observation = []

        state = env.reset()
        state = np.reshape(state[0], [1, state_size])

        for _ in range(goal_steps):
            action = np.argmax(model.predict(state, verbose = 0)[0])
            observation, reward, done, info, _ = env.step(action)
            state = np.reshape(observation, [1, state_size])

            if len(prev_observation) > 0:
                game_memory.append([prev_observation, action])
            
            prev_observation = observation
            score += reward

            if done:
                break
        
        if score >= score_requirement:
            accepted_scores.append(score)
            for data in game_memory: # convert to one-hot output

                if data[1] == 1:
                    output = [0, 1]
                elif data[1] == 0:
                    output = [1, 0]
                training_data.append([data[0], output])

        env.reset()
        scores.append(score)

        if g % 100 == 0:
            print('Game ', g, 'score: ', score)
    #training_data_save = np.array(training_data)
    #np.save('saved.npy', training_data_save)

    if len(accepted_scores) > 0:
        print('Average accepted score:', mean(accepted_scores))
        print('Median accepted score:', median(accepted_scores))
        print(Counter(accepted_scores))
    else:
        print('No scores above the score requirement')

    print(scores)
    return training_data

env.reset()
training_data = improved_data()

Game  0 score:  265.0
Average accepted score: 380.6666666666667
Median accepted score: 369.5
Counter({448.0: 1, 407.0: 1, 475.0: 1, 332.0: 1, 302.0: 1, 320.0: 1})
[265.0, 103.0, 189.0, 78.0, 179.0, 243.0, 171.0, 101.0, 166.0, 112.0, 127.0, 83.0, 85.0, 117.0, 243.0, 86.0, 229.0, 96.0, 122.0, 133.0, 89.0, 79.0, 183.0, 179.0, 221.0, 103.0, 117.0, 173.0, 448.0, 110.0, 166.0, 80.0, 179.0, 147.0, 221.0, 148.0, 191.0, 117.0, 87.0, 91.0, 187.0, 166.0, 172.0, 157.0, 135.0, 207.0, 120.0, 136.0, 192.0, 86.0, 94.0, 76.0, 145.0, 128.0, 89.0, 279.0, 204.0, 186.0, 165.0, 107.0, 125.0, 290.0, 147.0, 108.0, 196.0, 234.0, 407.0, 124.0, 118.0, 121.0, 156.0, 76.0, 99.0, 86.0, 475.0, 264.0, 332.0, 193.0, 97.0, 130.0, 234.0, 125.0, 302.0, 101.0, 157.0, 141.0, 273.0, 189.0, 129.0, 239.0, 91.0, 183.0, 130.0, 107.0, 320.0, 158.0, 156.0, 81.0, 225.0, 92.0]


In [99]:
model = train_model(training_data, state_size, action_size, model) # train again

[-0.05051852  0.14719273  0.02287117 -0.26850793]
[-0.05051852  0.14719273  0.02287117 -0.26850793]
[1, 0]
[1 0]
Train on 2278 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [100]:
# postraining 2 test

env = gym.make('CartPole-v1') # no visualization

# Test
test_episodes = 30
test_scores = []
start_time = time.time()

for e in range(test_episodes):
    state, done = env.reset()

    for t in range(501):
        state = np.reshape(state, [1, state_size])
        action = np.argmax(model.predict(state, verbose = 0)[0])
        state, reward, done, info, _ = env.step(action)
    

        if done or t == 500:
            print("Test Episode: {}/{}, Score: {}".format(e + 1, test_episodes, t))
            test_scores.append(t)
            break

test_average = mean(test_scores)
test_sigma = stdev(test_scores)
end_time = time.time()
total_time = end_time - start_time
total_steps = sum(test_scores)
average_time_per_step = total_time / total_steps

print()
print('Score average: {:.2f}, Sigma: {:.2f}'.format(test_average, test_sigma))
print('Average time per step: {:.4f} seconds'.format(average_time_per_step))

env.close()

Test Episode: 1/30, Score: 174
Test Episode: 2/30, Score: 192
Test Episode: 3/30, Score: 98
Test Episode: 4/30, Score: 138
Test Episode: 5/30, Score: 143
Test Episode: 6/30, Score: 175
Test Episode: 7/30, Score: 155
Test Episode: 8/30, Score: 80
Test Episode: 9/30, Score: 164
Test Episode: 10/30, Score: 74
Test Episode: 11/30, Score: 249
Test Episode: 12/30, Score: 107
Test Episode: 13/30, Score: 247
Test Episode: 14/30, Score: 75
Test Episode: 15/30, Score: 101
Test Episode: 16/30, Score: 151
Test Episode: 17/30, Score: 86
Test Episode: 18/30, Score: 142
Test Episode: 19/30, Score: 148
Test Episode: 20/30, Score: 500
Test Episode: 21/30, Score: 85
Test Episode: 22/30, Score: 94
Test Episode: 23/30, Score: 94
Test Episode: 24/30, Score: 172
Test Episode: 25/30, Score: 151
Test Episode: 26/30, Score: 120
Test Episode: 27/30, Score: 84
Test Episode: 28/30, Score: 500
Test Episode: 29/30, Score: 386
Test Episode: 30/30, Score: 128

Score average: 167.10, Sigma: 111.21
Average time per ste

## Creating the video

In [101]:
from moviepy.editor import ImageSequenceClip

# Load the cartpole environment
env = gym.make("CartPole-v1", render_mode="rgb_array")

# Visualization and video creation
def save_video():
    frames = []

    state, done = env.reset()
    for t in range(501):
        pixels = env.render()
        frames.append(pixels)

        state = np.reshape(state, [1, state_size])
        action = np.argmax(model.predict(state, verbose = 0)[0])
        state, reward, done, info, _ = env.step(action)

        if done or t == 500:
            break

    # Save the frames as a video
    clip = ImageSequenceClip(frames, fps=50)
    clip.write_videofile("video/ML_gym_balance.mp4", codec="libx264")

# Call the save_video function with your policy function
save_video()

env.close()

Moviepy - Building video video/ML_gym_balance.mp4.
Moviepy - Writing video video/ML_gym_balance.mp4



                                                              

Moviepy - Done !
Moviepy - video ready video/ML_gym_balance.mp4


