In [1]:
import numpy as np
import gym
import tensorflow as tf
import time
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Dropout, Conv2D, Input

  _RESOLVED_ROMS = _resolve_roms()
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.object,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.bool,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.object:
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  np.bool:
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  def randint(low, high=None, size=None, dtype=onp.int):  # pylint: disable=missing-function-docstring
  from ._conv import register_converters as _register_converters


In [2]:
def prepro(I):
    """Reduce a raw Pong frame to a binary 80x80 mask.

    The 210x160x3 frame is cropped to rows 35..194 (the score area at the top
    is irrelevant for play), every second row and column of the first colour
    channel is kept, the two background shades (144 and 109) are treated as
    empty, and every other non-zero pixel (paddles, ball) becomes 1.

    Returns a new array with the same dtype as ``I``; ``I`` is left untouched.
    """
    # crop, 2x downsample and channel selection in a single strided slice
    small = I[35:195:2, ::2, 0]
    # foreground = neither of the background shades and not already zero
    foreground = (small != 144) & (small != 109) & (small != 0)
    return foreground.astype(small.dtype)

In [3]:
def discount_rewards(r, gamma=0.99):
    """Turn sparse per-frame rewards into a discounted return for every frame.

    gym returns a reward with every single frame; most of them are 0 and only
    the frame in which a point is decided carries a non-zero value.  Every
    frame should instead carry a signal telling whether the point it belongs
    to was eventually won or lost, with frames closer to the deciding frame
    weighted more heavily than frames long before it.

    Walking backwards through the episode:
      * a frame with a positive reward gets +1.0,
      * a frame with a negative reward gets -1.0,
      * a frame with zero reward gets ``gamma`` times the value of the frame
        that follows it (0.0 if nothing follows).
    Because non-zero frames overwrite the running value, the discounting
    restarts at every decided point.

    Parameters
    ----------
    r : sequence of numbers
        Per-frame rewards of one episode, in chronological order.  Only the
        sign of each entry is used.
    gamma : float, optional
        Discount factor applied per frame (default 0.99).

    Returns
    -------
    list of float
        One discounted value per input frame; ``[]`` for empty input.  The
        input sequence is not modified.
    """
    n = len(r)
    discounted_r = [0.0] * n
    running = 0.0  # value of the frame after the current one (none yet)
    for i in range(n - 1, -1, -1):
        if r[i] > 0:    # point won in this frame
            running = 1.0
        elif r[i] < 0:  # point lost in this frame
            running = -1.0
        else:           # undecided frame: inherit the discounted future value
            running = gamma * running
        discounted_r[i] = running
    return discounted_r

In [4]:
def create_model(height, width, channels, n_actions=3, learning_rate=1e-6):
    """Build and compile the convolutional policy network.

    Architecture: two strided ReLU convolutions (16 filters of 8x8 with
    stride 4, then 32 filters of 4x4 with stride 2), a 256-unit ReLU dense
    layer, and a single softmax head with one probability per action.

    A single softmax output is used together with
    ``sparse_categorical_crossentropy``; the per-sample weighting of the
    objective is supplied through ``sample_weight`` when ``fit`` is called,
    so no separate output layers are needed.

    Parameters
    ----------
    height, width : int
        Spatial size of one preprocessed frame.
    channels : int
        Number of stacked frames fed to the network.
    n_actions : int, optional
        Size of the softmax output (default 3).
    learning_rate : float, optional
        Adam learning rate (default 1e-6).

    Returns
    -------
    tf.keras.Model
        The compiled model.
    """
    imp = Input(shape=(height, width, channels))
    mid = Conv2D(16, (8, 8), strides=4, activation='relu')(imp)
    mid = Conv2D(32, (4, 4), strides=2, activation='relu')(mid)
    mid = Flatten()(mid)
    mid = Dense(256, activation='relu')(mid)
    out0 = Dense(n_actions, activation='softmax')(mid)
    model = Model(imp, out0)

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
    )

    return model

In [5]:
# number of most-recent frames stacked along the channel axis of the network input
frames_to_net = 4
# env action ids the policy chooses between; the network output index selects one of these
# NOTE(review): presumably NOOP / up / down in Pong's action set - confirm with the env's action meanings
possible_actions = [0, 2, 3]
# policy network for 80x80 preprocessed frames
mod = create_model(80, 80, frames_to_net)
# wrap the forward pass in tf.function so direct calls mod(x) run as a compiled graph
mod.call = tf.function(mod.call, experimental_relax_shapes=True)

2022-04-29 16:15:48.986992: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
# print the layer-by-layer output shapes and parameter counts of the policy network
mod.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 80, 80, 4)]       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 19, 19, 16)        4112      
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 8, 8, 32)          8224      
_________________________________________________________________
flatten (Flatten)            (None, 2048)              0         
_________________________________________________________________
dense (Dense)                (None, 256)               524544    
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 771       
Total params: 537,651
Trainable params: 537,651
Non-trainable params: 0
_______________________________________________________

In [7]:
def play1game(model):
    """Play one full game of Pong, sampling actions from ``model``.

    The network is fed a stack of the ``frames_to_net`` most recent
    preprocessed frames (channel 0 is the newest; slots not yet filled at the
    start of the game are zero).  Its output is used as a probability
    distribution over ``possible_actions``; a sampled action is held for
    ``action_repeat`` consecutive environment steps before a new one is drawn.

    Parameters
    ----------
    model : callable
        Called as ``model(feed, training=False)`` with a
        ``(1, 80, 80, frames_to_net)`` array; the result must expose
        ``.numpy()`` and hold one probability per entry of
        ``possible_actions``.

    Returns
    -------
    frame_array : list
        The preprocessed frame that was current when each step was taken.
    action_array : list
        Index into ``possible_actions`` used at each step.
    reward_array : list
        Raw reward returned by the environment at each step.
    score : number
        Sum of all rewards in the game.
    """
    action_repeat = 4  # env steps for which one sampled action is held
    n_actions = len(possible_actions)

    env0 = gym.make("Pong-v0")
    try:
        pix = prepro(env0.reset())
        feed = np.zeros((1, 80, 80, frames_to_net))
        feed[0, :, :, 0] = pix

        frame_array = []
        action_array = []
        reward_array = []

        score = 0
        done = False
        fcount = 0  # position inside the current action-repeat window
        while not done:
            if fcount == 0:
                # draw a fresh action from the policy's output distribution
                vf = model(feed, training=False).numpy()[0]
                action = np.random.choice(n_actions, p=vf)
            fcount = (fcount + 1) % action_repeat

            pix_new, reward, done, info = env0.step(possible_actions[action])
            # record the frame the action was taken in, not the resulting one
            frame_array.append(pix)
            action_array.append(action)
            reward_array.append(reward)
            score += reward

            pix = prepro(pix_new)
            # age the stack by one slot, then put the newest frame in slot 0
            feed[0, :, :, 1:] = feed[0, :, :, :-1].copy()
            feed[0, :, :, 0] = pix
    finally:
        # a new environment is created for every game; release it even if
        # the model or the env raises part-way through
        env0.close()

    return frame_array, action_array, reward_array, score

In [8]:
# start training from a fresh, untrained network (replaces whatever `mod` was bound to before)
mod = create_model(80, 80, frames_to_net)
# wrap the forward pass in tf.function so direct calls mod(x) run as a compiled graph
mod.call = tf.function(mod.call, experimental_relax_shapes=True)

In [11]:
# --- training-loop and replay-buffer settings ---
ngames = 1_000      # how many games the training loop plays
nbatch = 10         # handed to fit() as steps_per_epoch
buffn = 200_000     # upper bound on the number of frames kept in the replay buffer
warmupgames = 50    # games played before the first call to fit()
len_buff = 0        # number of frames currently stored in the buffer
# replay buffer: three parallel lists with one entry per recorded frame
buffer = dict(frames=[], actions=[], rewards=[])

In [12]:
# Main training loop: play a game, store it in the replay buffer, then (after
# the warm-up phase) fit the policy on a reward-weighted sample of the buffer.
for game in range(ngames):
    start = time.time()
    frames, actions, rewards, score = play1game(mod)
    rewards = discount_rewards(rewards)

    # append this game's transitions to the three parallel buffer lists
    buffer['frames'] += frames
    buffer['actions'] += actions
    buffer['rewards'] += rewards
    len_buff += len(actions)
    if len_buff > buffn:
        # drop the oldest entries so the buffer never exceeds buffn frames
        excess = len_buff - buffn
        for key in ('frames', 'actions', 'rewards'):
            del buffer[key][:excess]
        len_buff = len(buffer['actions'])
    nframes = len(frames)

    # only train once the warm-up games have filled the buffer
    if game >= warmupgames:
        buffer_rewards = np.array(buffer['rewards'])

        # sample as many frames as the game just played, favouring frames
        # with a positive discounted reward by a factor of 5
        prob = np.ones(len_buff)
        prob[buffer_rewards > 0] = 5.0
        prob /= np.sum(prob)
        which_choose = np.random.choice(len_buff, size=nframes, replace=False, p=prob)

        batch_rewards = buffer_rewards[which_choose]
        batch_actions = np.array(buffer['actions'])[which_choose]

        # rebuild the frame stack for each sampled frame: channel f holds the
        # frame f steps before the sampled one (channel 0 = sampled frame)
        current_frames = np.zeros((nframes, 80, 80, frames_to_net))
        for grab, pick in enumerate(which_choose):
            for f in range(frames_to_net):
                src = pick - f
                # guard on the BUFFER index: a negative index would silently
                # wrap around to the newest frames at the end of the list
                if src >= 0:
                    current_frames[grab, :, :, f] = buffer['frames'][src]
        # NOTE(review): the buffer keeps no episode boundaries, so a stack for
        # a frame near the start of a game can include frames of the previous
        # game, whereas play1game feeds zeros there - confirm this is acceptable.

        # discounted rewards act as per-sample weights on the cross-entropy
        # loss, so negatively rewarded actions are pushed down
        # NOTE(review): no batch_size is given; with Keras' default batch size
        # steps_per_epoch=nbatch presumably consumes only part of the sampled
        # frames - confirm this is intended.
        mod.fit(current_frames, batch_actions, epochs=1, steps_per_epoch=nbatch, verbose=0,
                sample_weight=batch_rewards, use_multiprocessing=True)
    stop = time.time()
    print(game, score, stop-start, len_buff)

0 -21.0 0.8219120502471924 1025
1 -20.0 1.05250883102417 2427
2 -21.0 0.9091272354125977 3612
3 -21.0 1.0164144039154053 4960
4 -21.0 0.8180630207061768 6054
5 -21.0 0.8177399635314941 7154
6 -20.0 1.0306291580200195 8558
7 -21.0 0.9089388847351074 9753
8 -21.0 0.8009719848632812 10852
9 -21.0 0.8601930141448975 12028
10 -19.0 1.1356158256530762 13624
11 -21.0 1.029306173324585 15047
12 -21.0 1.0137641429901123 16466
13 -21.0 0.8083970546722412 17562
14 -21.0 1.0306589603424072 18981
15 -20.0 1.006746768951416 20361
16 -21.0 0.9189038276672363 21624
17 -20.0 1.1353967189788818 23180
18 -21.0 0.7882330417633057 24193
19 -20.0 1.0678739547729492 25582
20 -18.0 1.0547819137573242 26991
21 -21.0 0.9168908596038818 28159
22 -21.0 0.9021990299224854 29330
23 -21.0 0.8773369789123535 30433
24 -21.0 0.9140632152557373 31609
25 -21.0 0.9200379848480225 32788
26 -19.0 1.132863998413086 34311
27 -21.0 0.9407920837402344 35555
28 -19.0 1.1410870552062988 37061
29 -20.0 1.0882086753845215 38453
30 

235 -21.0 1.7414360046386719 200000
236 -21.0 1.690521240234375 200000
237 -21.0 2.077662944793701 200000
238 -21.0 1.856950044631958 200000
239 -21.0 2.086625099182129 200000
240 -21.0 1.7436370849609375 200000
241 -21.0 1.9216721057891846 200000
242 -21.0 1.6255691051483154 200000
243 -21.0 1.932995080947876 200000
244 -21.0 1.6805570125579834 200000
245 -21.0 2.0301272869110107 200000
246 -21.0 2.0579771995544434 200000
247 -21.0 1.8498170375823975 200000
248 -20.0 1.7025530338287354 200000
249 -20.0 1.6506812572479248 200000
250 -21.0 1.6413178443908691 200000
251 -21.0 2.0686967372894287 200000
252 -20.0 2.2071850299835205 200000
253 -20.0 1.9641122817993164 200000
254 -21.0 1.5549488067626953 200000
255 -21.0 1.747413158416748 200000
256 -21.0 1.7652738094329834 200000
257 -21.0 1.7343900203704834 200000
258 -21.0 1.5575289726257324 200000
259 -21.0 1.5498130321502686 200000
260 -21.0 1.8079659938812256 200000
261 -21.0 1.4337530136108398 200000
262 -21.0 1.6975460052490234 20000

465 -21.0 1.7489237785339355 200000
466 -21.0 1.40529203414917 200000
467 -21.0 1.610285997390747 200000
468 -21.0 1.708772897720337 200000
469 -21.0 1.68641996383667 200000
470 -21.0 1.5088081359863281 200000
471 -21.0 1.8088529109954834 200000
472 -21.0 1.5467660427093506 200000
473 -21.0 1.3847160339355469 200000
474 -21.0 1.483605146408081 200000
475 -20.0 1.8095109462738037 200000
476 -21.0 1.8308768272399902 200000
477 -21.0 1.6245381832122803 200000
478 -21.0 2.231119155883789 200000
479 -18.0 2.0836281776428223 200000
480 -20.0 1.7113428115844727 200000
481 -21.0 1.5701558589935303 200000
482 -21.0 1.7608649730682373 200000
483 -21.0 1.5642821788787842 200000
484 -21.0 1.5694780349731445 200000
485 -21.0 1.5797278881072998 200000
486 -21.0 2.166699171066284 200000
487 -21.0 1.399549961090088 200000
488 -21.0 1.4573493003845215 200000
489 -21.0 1.3688111305236816 200000
490 -20.0 1.7777929306030273 200000
491 -20.0 1.9642219543457031 200000
492 -21.0 1.4459741115570068 200000
49

695 -21.0 2.036367177963257 200000
696 -21.0 1.734203815460205 200000
697 -21.0 1.8825249671936035 200000
698 -21.0 2.0333831310272217 200000
699 -21.0 1.559391975402832 200000
700 -21.0 1.640519142150879 200000
701 -21.0 1.7820558547973633 200000
702 -20.0 1.788633108139038 200000
703 -21.0 2.0310299396514893 200000
704 -21.0 1.6009788513183594 200000
705 -21.0 1.7136640548706055 200000
706 -21.0 1.6819608211517334 200000
707 -21.0 1.7473030090332031 200000
708 -21.0 1.6824557781219482 200000
709 -21.0 1.8060059547424316 200000
710 -21.0 1.81882905960083 200000
711 -21.0 1.8689978122711182 200000
712 -20.0 1.8884761333465576 200000
713 -21.0 1.940101146697998 200000
714 -21.0 1.614206075668335 200000
715 -21.0 1.634124994277954 200000
716 -20.0 1.740342140197754 200000
717 -21.0 1.7466070652008057 200000
718 -21.0 1.544814109802246 200000
719 -21.0 1.4139819145202637 200000
720 -20.0 1.927628993988037 200000
721 -21.0 1.871931791305542 200000
722 -20.0 1.988678216934204 200000
723 -21

925 -21.0 1.5247371196746826 200000
926 -20.0 1.7027029991149902 200000
927 -20.0 1.6597959995269775 200000
928 -20.0 2.190925121307373 200000
929 -20.0 1.8014492988586426 200000
930 -21.0 1.5329930782318115 200000
931 -21.0 1.7094318866729736 200000
932 -21.0 1.459319829940796 200000
933 -20.0 2.041882038116455 200000
934 -21.0 1.6311140060424805 200000
935 -21.0 1.579408884048462 200000
936 -21.0 1.6661701202392578 200000
937 -21.0 1.3762507438659668 200000
938 -21.0 2.0011210441589355 200000
939 -21.0 1.3719801902770996 200000
940 -21.0 1.765068769454956 200000
941 -21.0 1.6795392036437988 200000
942 -21.0 2.0780029296875 200000
943 -21.0 1.9590480327606201 200000
944 -21.0 1.8733909130096436 200000
945 -21.0 1.6547901630401611 200000
946 -21.0 1.9753410816192627 200000
947 -21.0 1.720034122467041 200000
948 -21.0 1.9884538650512695 200000
949 -21.0 1.6872069835662842 200000
950 -21.0 1.413808822631836 200000
951 -19.0 2.2808969020843506 200000
952 -21.0 1.7664837837219238 200000
95