# Optimization Project 3
**Ian Arzt, Mahika Bansal, Prakhar Bansal, Brett Bartol**

In [None]:
import numpy as np
import gym
import tensorflow as tf
import time
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Dropout, Conv2D, Input
import keras.backend as K
import matplotlib.pyplot as plt
import json
import math
import random
import os
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import tensorflow as tf
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay

# Pong

In [None]:
# Mount Google Drive at /content/drive so files stored there (for example the
# ROM directory passed to the atari_py import step) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# modify the location where the ROMs are saved on your GoogleDrive
!python -m atari_py.import_roms /content/drive/MyDrive/optimization_roms/ROMS

copying adventure.bin from ROMS/Adventure (1980) (Atari, Warren Robinett) (CX2613, CX2613P) (PAL).bin to /usr/local/lib/python3.7/dist-packages/atari_py/atari_roms/adventure.bin
copying air_raid.bin from ROMS/Air Raid (Men-A-Vision) (PAL) ~.bin to /usr/local/lib/python3.7/dist-packages/atari_py/atari_roms/air_raid.bin
copying alien.bin from ROMS/Alien (1982) (20th Century Fox Video Games, Douglas 'Dallas North' Neubauer) (11006) ~.bin to /usr/local/lib/python3.7/dist-packages/atari_py/atari_roms/alien.bin
copying amidar.bin from ROMS/Amidar (1982) (Parker Brothers, Ed Temple) (PB5310) ~.bin to /usr/local/lib/python3.7/dist-packages/atari_py/atari_roms/amidar.bin
copying assault.bin from ROMS/Assault (AKA Sky Alien) (1983) (Bomb - Onbase) (CA281).bin to /usr/local/lib/python3.7/dist-packages/atari_py/atari_roms/assault.bin
copying asterix.bin from ROMS/Asterix (AKA Taz) (07-27-1983) (Atari, Jerome Domurat, Steve Woita) (CX2696) (Prototype).bin to /usr/local/lib/python3.7/dist-packages/a

In [None]:
def prepro(I):
    """Reduce a raw Pong frame to an 80x80 binary mask.

    Rows 35..194 of the frame are kept (the score strip above is dropped),
    every second row and column of the first colour channel is sampled, and
    each sampled pixel becomes 0 if it is background (value 144 or 109) or
    already 0, and 1 otherwise (paddles, ball).

    The input frame is left untouched; the returned array is a new array
    with the same dtype as the input.
    """
    # One strided slice does the crop and the 2x down-sampling together.
    sampled = I[35:195:2, ::2, 0]
    # Anything that is not one of the "empty" values counts as foreground.
    foreground = ~np.isin(sampled, (0, 109, 144))
    return foreground.astype(sampled.dtype)

In [None]:
def discount_rewards(r):
    """Turn sparse per-frame rewards into a discounted reward for every frame.

    gym returns a reward with every frame; most are 0 and a few are positive
    or negative when a point is won or lost.  Every frame with a positive
    reward is set to +1, every frame with a negative reward to -1, and every
    zero-reward frame receives 0.99 times the value of the frame after it, so
    frames closer to the point count more than frames long before it.
    Trailing zero-reward frames (no point scored after them) stay 0.

    Parameters
    ----------
    r : 1-D array-like of numbers (NumPy array, list, tuple, ...)
        Per-frame rewards in chronological order.

    Returns
    -------
    numpy.ndarray
        Float array of the same length as `r`.
    """
    delt = 0.99 # discount factor
    # Accept plain Python lists as well as arrays (the Skiing play1game
    # returns its rewards as a list).
    r = np.asarray(r, dtype = float)
    nr = r.shape[0]
    discounted_r = np.zeros(nr)

    running = 0.0 # value carried backwards from the next frame
    for t in range(nr - 1, -1, -1):
        # walk from the last frame to the first
        if r[t] > 0: # won a point in this frame
            running = 1.0
        elif r[t] < 0: # lost a point in this frame
            running = -1.0
        else: # no point here: discount whatever comes next
            running *= delt
        discounted_r[t] = running
    return discounted_r

In [None]:
def create_model():
    """Build and compile the dense Pong policy network.

    Architecture: a (1, 80, 80) input is flattened and passed through three
    ReLU layers (32, 32 and 16 units; the last two carry an L1 kernel
    penalty of 0.005), followed by a single 3-unit softmax output layer.
    The model is compiled with RMSprop (learning rate 1e-4) and sparse
    categorical cross-entropy, so targets are integer class indices.
    """
    l1_strength = 0.005

    frame_in = Input(shape = (1, 80, 80))
    hidden = Flatten()(frame_in)
    hidden = Dense(32, activation = 'relu')(hidden)
    # The two narrower hidden layers share the same regularisation setting.
    for width in (32, 16):
        hidden = Dense(
            width,
            activation = 'relu',
            kernel_regularizer = tf.keras.regularizers.L1(l1_strength),
        )(hidden)
    action_probs = Dense(3, activation = 'softmax')(hidden)

    network = Model(frame_in, action_probs)
    network.compile(
        optimizer = tf.keras.optimizers.RMSprop(learning_rate = 1e-4),
        loss = 'sparse_categorical_crossentropy',
    )
    return network

In [None]:
def play1game(model, epsilon = 0, use_model = True, hit_reward = False, return_reward = False):
    """Play one full game of Pong and record every frame, action and reward.

    Parameters
    ----------
    model : Keras model with the architecture produced by create_model();
        only its weights are read.
    epsilon : probability of picking a uniformly random action instead of the
        model's arg-max action (only used when use_model is True).
    use_model : if True the model chooses actions; if False a hand-written
        "move the paddle towards the ball" rule is used.
    hit_reward, return_reward : unused; kept so existing callers keep working.

    Returns
    -------
    (frames, actions, rewards, score) where the first three are NumPy arrays
    with one entry per environment step (frames are the prepro() masks,
    actions are indices into the module-level `possible_actions`) and score
    is the sum of all rewards equal to 1.

    A new action is chosen every 4th step and repeated in between.
    """
    decision_interval = 4
    env0 = gym.make("Pong-v0")
    mod2 = None
    frame_array = []
    action_array = []
    reward_array = []
    score = 0

    try:
        pix = prepro(env0.reset())

        if use_model:
            # Build the inference copy ONCE per game (the weights do not change
            # while a game is being played).  Building it on every decision
            # step, as before, costs one model construction every 4 frames.
            ### don't compile the dummy model!
            ### set the weights/biases of the new model to that of the old model
            mod2 = create_model()
            mod2.set_weights(model.get_weights())

        done = False
        fcount = 0
        action = 0
        while not done:
            if fcount == 0:
                if use_model:
                    # Blend the current frame with the two before it, using
                    # the same weights (1, .5, .25) and the same frame offsets
                    # (t, t-1, t-2) the training cell uses for its inputs.
                    # `pix` is not in frame_array yet, so it is added explicitly.
                    if len(frame_array) < 2:
                        feed = pix
                    else:
                        feed = pix + .5 * frame_array[-1] + .25 * frame_array[-2]
                    feed = np.asarray(feed, dtype = np.float32).reshape(-1, 1, 80, 80)

                    # calling the model directly instead of model.predict avoids predict's overhead
                    vf = mod2(feed, training = False).numpy()[0]
                    if random.random() < epsilon:
                        action = np.random.choice(3)
                    else:
                        action = np.argmax(vf)
                else:
                    ball_rows = np.where(pix[:, 10:70] == 1)[0]
                    paddle_rows = np.where(pix[:, 71] == 1)[0]

                    # Stay put unless both the ball and the paddle are visible.
                    # (The old `ball_height != np.nan` test was always True;
                    # checking for empty row sets is the reliable form.)
                    action = 0
                    if ball_rows.size and paddle_rows.size:
                        ball_height = ball_rows.mean()
                        padel_height = paddle_rows.mean()
                        if ball_height > padel_height:
                            action = 2
                        elif ball_height < padel_height:
                            action = 1

            # 0 -> 1 -> 2 -> 3 -> 0: a decision is made whenever the counter is 0
            fcount = (fcount + 1) % decision_interval

            action0 = possible_actions[action]
            pix_new, reward, done, info = env0.step(action0)

            frame_array.append(pix)
            action_array.append(action)
            reward_array.append(reward)
            pix = prepro(pix_new)

            if reward == 1:
                score += reward
    finally:
        # Always release the emulator, even if the game loop raised.
        env0.close()
        if mod2 is not None:
            del mod2
            K.clear_session()

    return np.array(frame_array), np.array(action_array), np.array(reward_array), score

In [None]:
# Environment action ids, indexed by the action index the policy produces
# (play1game steps the environment with possible_actions[action]).
possible_actions = [0, 2, 3]
# Freshly initialised, compiled Pong policy network.
mod = create_model()
# Wrap the model's call method in a tf.function with relaxed input shapes.
mod.call = tf.function(mod.call, experimental_relax_shapes = True)

In [None]:
# Print the layer-by-layer summary (output shapes, parameter counts) of `mod`.
mod.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1, 80, 80)]       0         
                                                                 
 flatten (Flatten)           (None, 6400)              0         
                                                                 
 dense (Dense)               (None, 32)                204832    
                                                                 
 dense_1 (Dense)             (None, 32)                1056      
                                                                 
 dense_2 (Dense)             (None, 16)                528       
                                                                 
 dense_3 (Dense)             (None, 3)                 51        
                                                                 
Total params: 206,467
Trainable params: 206,467
Non-trainable

In [None]:
# Settings and state read by the Pong training loop.
ngames = 5000  # exclusive upper bound of the game counter
warmupgames = 0  # games with a counter below this are played but not trained on
nbatch = 25  # passed to mod.fit as steps_per_epoch
buffn = 200000  # maximum number of entries kept in the replay buffer
len_buff = 0  # current number of buffered entries
buffer = {'frames' : [], 'actions' : [], 'rewards' : []}  # replay buffer stored as lists

In [None]:
# Replace `mod` with the checkpoint previously saved on Drive
# (the path is relative to the current working directory).
mod = tf.keras.models.load_model('drive/MyDrive/optiproject/pongmodel_v3')

In [None]:
# Pong training loop, resumed at game 2701.
#
# Each iteration plays one game with the current model, stores the game's
# training samples in the replay buffer, draws a weighted random batch from the
# buffer and fits the model on it, using the discounted rewards as
# sample weights.
#
# A training input is the frame at step t blended with the two frames before it
# (weights 1, .5, .25).  The blend is computed per game BEFORE it goes into the
# buffer, so that
#   * buffer['frames'], buffer['actions'] and buffer['rewards'] always have the
#     same length and stay index-aligned, and
#   * a blend can never mix frames from two different games.
# Frames are 0/1 masks, so 4*f[t] + 2*f[t-1] + f[t-2] fits in a uint8 (0..7);
# dividing by 4 at sampling time recovers f[t] + .5*f[t-1] + .25*f[t-2] while
# keeping the buffer as small as one raw frame per entry.
for game in range(2701, ngames):

    # train some games with a move towards the ball strategy
    # (never reached while the loop starts at 2701; kept for fresh runs)
    if game < 300:
        use_model = False
        epsilon = 0
    else:
        use_model = True
        epsilon = max(.6 - .001 * (game - 300), .1)

    # no exploration at all late in training
    if game > 3500:
        epsilon = 0

    start = time.time()
    frames, actions, rewards, score = play1game(mod, epsilon = epsilon, use_model = use_model)
    rewards = discount_rewards(rewards)

    # The first two steps of a game have no two-frame history and are dropped.
    size_grab = max(len(frames) - 2, 0)
    if size_grab > 0:
        stacked = (4 * frames[2:] + 2 * frames[1:-1] + frames[:-2]).astype(np.uint8)
        buffer['frames'] += list(stacked)
        buffer['actions'] += list(actions[2:])
        buffer['rewards'] += list(rewards[2:])
    len_buff = len(buffer['actions'])

    # Drop the oldest entries once the buffer exceeds its capacity.
    if len_buff > buffn:
        excess = len_buff - buffn
        for key in ('frames', 'actions', 'rewards'):
            del buffer[key][:excess]
        len_buff = len(buffer['actions'])

    if game >= warmupgames:
        if size_grab > 0:
            # Samples from won points are 25x more likely to be drawn.
            prob = np.ones(len_buff)
            prob[np.array(buffer['rewards']) > 0] = 25.0
            prob /= np.sum(prob)

            which_choose = np.random.choice(len_buff, size = min(size_grab, len_buff), replace = False, p = prob)

            batch_rewards = np.array([buffer['rewards'][idx] for idx in which_choose], dtype = float)
            batch_actions = np.array([buffer['actions'][idx] for idx in which_choose])
            # Decode the uint8 blends back to f[t] + .5*f[t-1] + .25*f[t-2].
            current_frames = np.array([buffer['frames'][idx] for idx in which_choose], dtype = np.float32) / 4.0
            current_frames = current_frames.reshape(-1, 1, 80, 80)

            # Positive outcomes get 5x the weight of negative ones.
            batch_rewards = np.where(batch_rewards > 0, 5 * batch_rewards, batch_rewards)
            mod.fit(current_frames, batch_actions, epochs = 1, steps_per_epoch = nbatch, verbose = 0, sample_weight = batch_rewards, use_multiprocessing = True)

        if game % 50 == 0:
          tf.keras.models.save_model(mod, 'drive/MyDrive/optiproject/pongmodel_v3')
          print('Saved at game:', game)

    stop = time.time()
    print(game, score, stop-start, len_buff)

2701 0 16.62369203567505 1260
2702 0 12.285634994506836 2378
2703 0 12.051575660705566 3466
2704 0 11.134445428848267 4483
2705 0 12.248957872390747 5596
2706 0 12.356229782104492 6696
2707 0 13.2954740524292 7871
2708 0 11.174623250961304 8887
2709 0 13.108042001724243 10068
2710 0 12.977286100387573 11250
2711 0 11.278056383132935 12257
2712 0 13.143208503723145 13436
2713 0 12.001460552215576 14538
2714 0 11.348950386047363 15554
2715 0 11.390077829360962 16572
2716 0 12.145644903182983 17663
2717 0 11.354830980300903 18687
2718 0 11.935905456542969 19762
2719 0 12.25110387802124 20858
2720 0 12.247316122055054 21959
2721 0 11.999263525009155 23056
2722 0 11.571660280227661 24091
2723 0 12.260263681411743 25193
2724 0 11.225608825683594 26201
2725 0 11.439231634140015 27225
2726 0 11.232802629470825 28231
2727 0 11.15496563911438 29233
2728 0 11.241268396377563 30248
2729 0 11.439229965209961 31270
2730 0 11.450128555297852 32303
2731 0 10.852751016616821 33291
2732 0 11.31964564323

In [None]:
# Write the current state of `mod` to the checkpoint directory on Drive.
tf.keras.models.save_model(mod, 'drive/MyDrive/optiproject/pongmodel_v3')

INFO:tensorflow:Assets written to: drive/MyDrive/optiproject/pongmodel_v3/assets


# Skiing

In [None]:
# Create a Skiing environment for interactive inspection.
env = gym.make("Skiing-v0")

In [None]:
# Show the action meanings reported by the underlying (unwrapped) environment.
env.unwrapped.get_action_meanings() 

In [None]:
# Show the valid action ids: 0 .. env.action_space.n - 1.
list(range(env.action_space.n))

In [None]:
# Reset the environment to obtain its first observation and show its shape.
raw_pixels = env.reset()
raw_pixels.shape

In [None]:
def prepro_ski(I):
    """Crop and down-sample a raw Skiing frame to an 80x80 array.

    Rows 35..194 are kept, every second row and column of the first colour
    channel is sampled, and pixels equal to 144 are set to 0.  All other
    pixel values are returned unchanged (the result is NOT binarised).

    The input frame is left untouched; the result is a new array with the
    same dtype as the input.
    """
    # np.array() copies, so the in-place edit below cannot touch the caller's frame.
    reduced = np.array(I[35:195:2, ::2, 0])
    reduced[reduced == 144] = 0
    return reduced

In [None]:
def create_model_ski(height,width,channels):
    """Build the (uncompiled) convolutional Skiing policy network.

    Architecture: input of shape (height, width, channels); a 16-filter 8x8
    convolution with stride 4; a 32-filter 4x4 convolution with stride 2;
    flatten; ReLU dense layers of 256 and 64 units; one 3-unit softmax
    output layer.  The caller is responsible for compiling the model.
    """
    frames_in = Input(shape=(height,width,channels))

    # convolutional feature extractor
    features = frames_in
    for n_filters, kernel, stride in ((16,(8,8),4), (32,(4,4),2)):
        features = Conv2D(n_filters,kernel,strides=stride,activation='relu')(features)
    features = Flatten()(features)

    # fully connected head
    for units in (256, 64):
        features = Dense(units,activation='relu')(features)
    action_probs = Dense(3,activation='softmax')(features)

    return Model(frames_in,action_probs)

In [None]:
# Skiing setup.  Note: this rebinds `possible_actions` and `mod`, which the
# Pong section also defines.
frames_to_net = 6          # how many previous frames will we feed the NN
# Environment action ids, indexed by the action index the policy samples.
possible_actions = [0,1,2]
# One input channel per stacked frame.
mod = create_model_ski(80,80,frames_to_net)
mod.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-6),loss='sparse_categorical_crossentropy')   
# Wrap the model's call method in a tf.function with relaxed input shapes.
mod.call = tf.function(mod.call,experimental_relax_shapes=True)

In [None]:
# Print the layer-by-layer summary of the Skiing model.
mod.summary()

In [None]:
def show_video():
  """Embed the first recorded episode video in the notebook output.

  Looks for files matching video/*.mp4 (relative to the working directory).
  If at least one exists, the first match is base64-encoded and displayed as
  an inline, looping HTML5 <video> element; otherwise a message is printed.
  Returns None in both cases.
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return

  # Read-only binary mode is all that is needed, and the context manager
  # guarantees the file handle is closed again.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

def wrap_env(env):
  """Return `env` wrapped in a gym Monitor that targets the ./video directory.

  The Monitor is created with force=True.
  """
  return Monitor(env, './video', force=True)

In [None]:
def play1game(model):
    """Play one full game of Skiing, sampling actions from the model.

    Parameters
    ----------
    model : Keras model mapping a (1, 80, 80, frames_to_net) frame stack to
        a probability vector over actions.

    Returns
    -------
    (frame_array, action_array, reward_array, score): three Python lists
    with one entry per environment step (frames are prepro_ski() outputs,
    actions are indices into the module-level `possible_actions`) and the
    sum of all rewards.

    The model input keeps the newest frame in channel 0 and the frame from
    f steps earlier in channel f (zeros until enough frames have been seen).
    A new action is sampled every 4th step and repeated in between.
    """
    decision_interval = 4
    env0 =  gym.make("Skiing-v0")
    frame_array = []
    action_array = []
    reward_array = []
    score = 0

    try:
        pix = prepro_ski(env0.reset())
        feed = np.zeros((1,80,80,frames_to_net))
        feed[0,:,:,0] = pix

        # The model is called directly: the inference copy built here before
        # was never used, so constructing it only cost time and memory.
        done = False
        fcount = 0
        action = 0
        while not done:
            if fcount == 0:
                vf = model(feed,training=False).numpy()[0].astype(np.float64)
                # float32 softmax outputs may miss summing to 1 by more than
                # np.random.choice tolerates, so renormalise in float64.
                vf /= vf.sum()
                action = np.random.choice(len(vf),p=vf)

            # 0 -> 1 -> 2 -> 3 -> 0: an action is sampled whenever the counter is 0
            fcount = (fcount + 1) % decision_interval

            action0 = possible_actions[action]
            pix_new, reward, done, info = env0.step(action0)
            frame_array.append(pix)
            action_array.append(action)
            reward_array.append(reward)
            pix = prepro_ski(pix_new)

            # Age every stored frame by one channel, then insert the newest.
            feed[0,:,:,1:] = feed[0,:,:,:-1].copy()
            feed[0,:,:,0] = pix
            score += reward
    finally:
        # Always release the emulator, even if the game loop raised.
        env0.close()

    return frame_array, action_array, reward_array, score

In [None]:
# Settings and state read by the Skiing training loop.
ngames = 1000  # number of games to play
nbatch = 10  # passed to mod.fit as steps_per_epoch
buffn = 200000  # maximum number of entries kept in the replay buffer
warmupgames = 25  # games with a counter below this are played but not trained on
len_buff = 0  # current number of buffered entries
buffer = {'frames':[],'actions':[],'rewards':[]}  # replay buffer: three index-aligned lists

In [None]:
# Skiing training loop.
#
# Each iteration plays one game with the current model, appends every step of
# it to the replay buffer, draws a weighted random batch of buffer positions
# (once past the warm-up games) and fits the model on the frame stacks
# ending at those positions, using the discounted rewards as sample weights.

# NaN marks games that have not been played yet (np.ndarray would leave
# arbitrary uninitialised memory in those slots).
scores = np.full(ngames, np.nan)
for game in range(ngames):
    start = time.time()
    frames, actions, rewards, score = play1game(mod)
    # play1game returns plain lists; hand discount_rewards an array.
    rewards = discount_rewards(np.asarray(rewards, dtype=float))

    buffer['frames'] += list(frames)
    buffer['actions'] += list(actions)
    buffer['rewards'] += list(rewards)
    len_buff = len(buffer['actions'])

    # Drop the oldest entries once the buffer exceeds its capacity.
    if len_buff > buffn:
        excess = len_buff - buffn
        for key in ('frames','actions','rewards'):
            del buffer[key][:excess]
        len_buff = len(buffer['actions'])

    nframes = min(len(frames), len_buff)

    if game >= warmupgames and nframes > 0:
        # Samples with a positive reward are 5x more likely to be drawn.
        prob = np.ones(len_buff)
        prob[np.array(buffer['rewards']) > 0] = 5.0
        prob /= np.sum(prob)
        which_choose = np.random.choice(len_buff,size=nframes,replace=False,p=prob)

        batch_rewards = np.array([buffer['rewards'][idx] for idx in which_choose], dtype=float)
        batch_actions = np.array([buffer['actions'][idx] for idx in which_choose])

        # Channel f holds the frame f positions before the sampled one, the
        # same layout play1game feeds the model.  Whether history exists
        # depends on the BUFFER index, not on the position inside the batch.
        # NOTE(review): a stack sampled within the first frames_to_net-1 steps
        # of a game still reaches into the end of the previous game.
        current_frames = np.zeros((nframes,80,80,frames_to_net), dtype=np.float32)
        for grab, idx in enumerate(which_choose):
            for f in range(min(frames_to_net, idx + 1)):
                current_frames[grab,:,:,f] = buffer['frames'][idx-f]

        mod.fit(current_frames,batch_actions,epochs=1,steps_per_epoch=nbatch,verbose=0,sample_weight=batch_rewards,use_multiprocessing=True)
    K.clear_session()
    scores[game] = score
    stop = time.time()
    print(game, score, stop-start,len_buff)

In [None]:
# Display the per-game scores recorded by the training loop.
scores