In [1]:
import base64
import gc
import io
import random

import gym
from gym import wrappers
import fastai
import fastprogress
from fastai.vision import *
from IPython import display
from IPython.display import HTML

# Save and load video

```
env = wrappers.Monitor(env, "./gym-results", force=True)

video = io.open('./gym-results/openaigym.video.%s.video000000.mp4' % env.file_infix, 'r+b').read()
encoded = base64.b64encode(video)
HTML(data='''
    <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
.format(encoded.decode('ascii')))
```

# Show image while playing

In [2]:
# observation, reward, done, info = env.step(action)
def prep(t):
    """Scale a byte-valued image tensor to floats and present it with 3 channels.

    The tensor is cast to float and divided by 255.0, then expanded (as a
    view, without copying) so that its leading dimension has size 3.
    """
    scaled = t.float() / 255.0
    return scaled.expand(3, -1, -1)

def create_tensor(observation):
    """Convert an environment observation into a cropped uint8 tensor.

    The observation is converted with `pil2tensor` (dtype uint8) and then
    sliced to indices 23..195 along the tensor's second axis; the first and
    last axes are kept whole.
    """
    frame = pil2tensor(observation, dtype=np.uint8)
    # NOTE(review): presumably the crop removes the score bar and bottom border
    # of the Atari frame -- confirm against a rendered observation.
    return frame[:, 23:196, :]

def show_frame(img, t):
    """Push a new frame into an existing image artist and redraw the figure.

    img: object whose `set_data` receives the converted frame (in this file,
         the value returned by `plt.imshow`).
    t:   frame tensor, converted with `image2np` before being handed to `img`.
    """
    frame = image2np(t)
    img.set_data(frame)  # only the pixel data changes; the artist is reused
    current_figure = plt.gcf()
    display.display(current_figure)
    # wait=True defers clearing until the next output arrives
    display.clear_output(wait=True)

def game_stats(game):
    """Placeholder: not implemented yet; ignores `game` and returns None."""
    return None

In [3]:
def get_action(dist):
    """Sample an index from a discrete probability distribution.

    Walks the cumulative sum of `dist` and returns the first index whose
    cumulative probability reaches a uniform random draw from [0, 1).

    dist: non-empty iterable of probabilities, expected to sum to 1.

    Returns an int that is always a valid index into `dist`. If the cumulative
    sum never reaches the draw (floating-point rounding, or an input that sums
    to less than 1) the last index is returned instead of running one past the
    end.

    Raises ValueError if `dist` is empty, since no valid index exists.
    """
    threshold = random.random()
    cumulative = 0
    last_index = -1
    for last_index, probability in enumerate(dist):
        cumulative += probability
        if threshold <= cumulative:
            return last_index
    if last_index < 0:
        raise ValueError("dist must contain at least one probability")
    # Fall-through: clamp to the final valid index rather than len(dist).
    return last_index

In [15]:
def _model_device(model):
    """Return the device of the model's first parameter, or None if it has none."""
    parameters = getattr(model, 'parameters', None)
    first = next(parameters(), None) if callable(parameters) else None
    return None if first is None else first.device


def _action_probabilities(scores):
    """Turn one row of model output into a plain list of action probabilities.

    A row that already is a distribution (non-negative, summing to 1) is used
    unchanged; anything else is treated as unnormalised scores and passed
    through a softmax over its entries.
    """
    scores = scores.detach().float().flatten()
    is_distribution = bool((scores >= 0).all()) and abs(float(scores.sum()) - 1.0) < 1e-4
    if not is_distribution:
        scores = torch.softmax(scores, dim=0)
    return scores.cpu().tolist()


def play_one(env, model, max_frames=1000, show=False):
    """Play a single episode and record what happened.

    env:        environment exposing `reset`, `step`, `render` and `action_space`.
    model:      callable mapping a batch of prepared frames to per-action
                outputs, or None to pick actions uniformly via
                `env.action_space.sample()`.
    max_frames: upper bound on the number of steps taken.
    show:       when True, render each frame inline while playing.

    Returns three parallel lists `(O, A, R)`: the observation tensor the
    action was chosen from, the action taken, and the reward received.
    """
    # Reset before rendering so the first rendered frame belongs to this episode.
    observation = create_tensor(env.reset())
    if show: img = plt.imshow(env.render(mode='rgb_array'))  # create the artist once; later frames only update its data
    device = None if model is None else _model_device(model)
    O, A, R = [], [], []
    for _ in range(max_frames):
        if model is None:
            action = env.action_space.sample()
        else:
            x = prep(observation)[None]
            if device is not None: x = x.to(device)  # follow the model instead of assuming CUDA
            # The frame is duplicated into a batch of two.
            # NOTE(review): presumably so BatchNorm layers accept the input while
            # the model is in training mode -- confirm.
            x = x.expand(2, -1, -1, -1)
            with torch.no_grad():  # rollouts never back-propagate
                res = model(x)[0]
            probs = _action_probabilities(res)
            # Probabilities, not log-probabilities: the sampler compares a
            # cumulative sum against a draw from [0, 1). The clamp keeps the
            # action valid even if the sampler runs past the last entry.
            action = min(get_action(probs), len(probs) - 1)
        obs, reward, done, _info = env.step(action)
        O.append(observation)
        A.append(action)
        R.append(reward)
        if done: break
        observation = create_tensor(obs)
        if show: show_frame(img, observation)
    return O, A, R

def play(model=None, number=100, **kwargs):
    """Play `number` SpaceInvaders episodes and attach a standardised score to each.

    model:  passed through to `play_one` (None means random actions).
    number: how many episodes to play.
    kwargs: forwarded to `play_one`.

    Returns a list with one tuple per episode:
    `(observations, actions, rewards, z)`, where `z` is the episode's total
    reward standardised over this batch (mean subtracted, divided by the
    standard deviation). When every episode has the same total (including the
    single-episode case) the standard deviation is zero and `z` is 0.0 for all
    episodes rather than NaN.
    """
    env = gym.make('SpaceInvaders-v0')
    try:
        gs = [play_one(env, model, **kwargs) for _ in range(number)]
    finally:
        # Release the emulator even if a rollout raises.
        env.close()
    if not gs:
        return []
    sums = np.array([sum(g[2]) for g in gs])
    centred = sums - np.mean(sums)
    spread = np.std(sums)
    if spread > 0:
        normalised = centred / spread
    else:
        # Avoid 0/0: with no variation every game sits exactly at the mean.
        normalised = np.zeros(len(gs))
    return [(*g, z) for g, z in zip(gs, normalised)]

In [5]:
# Open the environment only long enough to learn how many actions it has;
# `classes` holds the action ids 0 .. n-1 and is used as the label vocabulary.
env = gym.make('SpaceInvaders-v0')
classes = np.array([*range(env.action_space.n)])
env.close()

In [6]:
def open_fn(self, i):
    """Build an `Image` from an item by running it through `prep`.

    Installed as `ImageList.open` below, so `i` is whatever the list stores as
    an item (in this notebook, the frame tensors passed to `ImageList`).
    """
    scaled = prep(i)
    return Image(scaled)

# Override the loader for every ImageList in this kernel.
ImageList.open = open_fn

In [7]:
class PolicyActionList(MultiCategoryList):
    """Action labels for one game, sign-flipped when the game was a bad one.

    good: when False, every entry equal to 1 in the label data returned by the
          parent's `get` is replaced by -1.
    """
    # NOTE(review): the parent constructor may assign its own `loss_func` on the
    # instance, which would shadow this class attribute -- confirm which loss
    # the learner actually ends up using.
    loss_func = CrossEntropyFlat()

    def __init__(self, items, classes, good=True, **kwargs):
        super().__init__(items, classes, **kwargs)
        self.good = good
        # Register the flag so copies created through `new()` (indexing,
        # filtering) keep it instead of silently reverting to good=True.
        if 'good' not in self.copy_new:
            self.copy_new.append('good')

    def get(self, i):
        """Return the label for item `i`, with 1s turned into -1s for a bad game."""
        ret = super().get(i)
        if ret is not None and not self.good:
            ret.data[ret.data==1]=-1
        return ret

In [8]:
def create_learner(data):
    """Create an unfrozen resnet18 `cnn_learner` whose head ends in a softmax.

    data: the databunch the learner is built on.

    Returns the learner. Its summary is requested once as part of creation.
    """
    l = cnn_learner(data, models.resnet18, metrics=partial(accuracy_thresh, thresh=.8))
    # The head's output has one row per sample (callers take `model(x)[0]` for
    # the first sample), so the softmax must run over dim=1 -- the actions.
    # dim=0 would normalise each action across the samples in the batch,
    # making a sample's output depend on whatever else is in the batch.
    l.model[1] = nn.Sequential(*l.model[1], nn.Softmax(dim=1))
    l.unfreeze()
    l.summary()
    return l

In [9]:
def train_on_game(l, g, silent=True, its=5, lr=1e-3):
    """Fit the learner on the frames and actions of a single game.

    l:      existing learner, or None to create one from this game's data.
    g:      4-tuple `(images, actions, rewards, total_rew)`; the game counts as
            "good" when `total_rew >= 0`.
    silent: run the fit inside `progress_disabled` to suppress progress output.
    its:    number of epochs passed to `fit_one_cycle`.
    lr:     learning rate passed to `fit_one_cycle`.

    Returns the (possibly newly created) learner.
    """
    frames, taken_actions, _rewards, normalised_score = g
    is_good_game = normalised_score >= 0

    inputs = ImageList(frames)
    # Each label is wrapped in a one-element list, one list per frame.
    targets = PolicyActionList([[a] for a in taken_actions], classes, good=is_good_game)
    train_set = LabelList(inputs, targets)
    # Training set only; the validation slot is an empty label list.
    data = LabelLists('.', train_set, EmptyLabelList([])).databunch(bs=128)

    if l is None:
        l = create_learner(data)
    else:
        l.data = data

    if not silent:
        l.fit_one_cycle(its, lr)
        return l
    with progress_disabled(l) as l:
        l.fit_one_cycle(its, lr)
    return l

In [10]:
class progress_disabled():
    ''' Context manager to disable the progress update bar and Recorder print

    On entry it sets fastprogress' NO_BAR flag, swaps the progress-bar classes
    used by `fastai.basic_train` for the console ones and replaces the
    learner's first callback factory with a silent `Recorder`. On exit all
    three are put back exactly as they were found, so nesting and learners
    with a non-default first callback are left untouched.
    '''
    def __init__(self,learn:Learner):
        self.learn = learn
        self._saved = None

    def __enter__(self):
        # Remember the current state so __exit__ can restore it verbatim.
        self._saved = (
            getattr(fastprogress.fastprogress, 'NO_BAR', False),
            fastai.basic_train.master_bar,
            fastai.basic_train.progress_bar,
            self.learn.callback_fns[0],
        )
        fastprogress.fastprogress.NO_BAR = True
        fastai.basic_train.master_bar, fastai.basic_train.progress_bar = fastprogress.force_console_behavior()
        self.learn.callback_fns[0] = partial(Recorder,add_time=True,silent=True) #silence recorder

        return self.learn

    def __exit__(self,type,value,traceback):
        no_bar, saved_master_bar, saved_progress_bar, saved_callback_fn = self._saved
        fastprogress.fastprogress.NO_BAR = no_bar
        fastai.basic_train.master_bar, fastai.basic_train.progress_bar = saved_master_bar, saved_progress_bar
        self.learn.callback_fns[0] = saved_callback_fn
        self._saved = None

In [11]:
# Bootstrap data: two games played without a model, i.e. with actions drawn
# from env.action_space.sample().
games = play(number=2)



In [12]:
# Start without a learner: train_on_game creates one from the first game's data
# and reuses it (with fresh data) for every following game.
learn = None
for g in games:
    learn = train_on_game(learn, g)
    
# Save the learner under the name that the later cell loads again.
learn.save('rollout-resnet18')



In [13]:
# Show the layer-by-layer summary of the learner's model.
learn.summary()

Sequential
Layer (type)         Output Shape         Param #    Trainable 
Conv2d               [64, 87, 80]         9,408      True      
______________________________________________________________________
BatchNorm2d          [64, 87, 80]         128        True      
______________________________________________________________________
ReLU                 [64, 87, 80]         0          False     
______________________________________________________________________
MaxPool2d            [64, 44, 40]         0          False     
______________________________________________________________________
Conv2d               [64, 44, 40]         36,864     True      
______________________________________________________________________
BatchNorm2d          [64, 44, 40]         128        True      
______________________________________________________________________
ReLU                 [64, 44, 40]         0          False     
___________________________________________________

In [None]:
# Optionally free the bootstrap rollouts first with `del games`, then collect
# garbage and reload the learner saved as 'rollout-resnet18'.
# (`gc` is imported in the notebook's import cell.)
gc.collect()
learn.load('rollout-resnet18')

def stats(sums):
    """Print the mean, standard deviation, maximum and minimum of `sums`.

    sums: sequence of numbers (here, per-game total rewards). Returns None.
    """
    report = (
        ("Mean: ", np.mean),
        ("std: ", np.std),
        ("max :", np.max),
        ("min :", np.min),
    )
    for label, reducer in report:
        print(label, reducer(sums))

# 50 rounds of: play 10 games with the current model, report statistics of the
# raw per-game scores, then train on each of those games in turn.
for i in range(50):
    print(i)
    games = play(learn.model, number=10)
    # g[2] is the per-step reward list of a game (same index that play() sums).
    stats([sum(g[2]) for g in games])
    for g in games:
        learn = train_on_game(learn, g, silent=False)
    print("==============")

In [None]:
# Play one game with the trained model, rendering it inline, and report the score.
env = gym.make('SpaceInvaders-v0')
try:
    # Only the rewards are needed; indexing avoids rebinding `_` / `__`, which
    # IPython uses for its output history at notebook level.
    rewards = play_one(env, learn.model, show=True)[2]
finally:
    # Close the environment even if the rollout fails.
    env.close()
print("Score:", sum(rewards))