In [1]:
# This implementation is based on the Sarsa(lambda) algorithm from http://www-anw.cs.umass.edu/~barto/courses/cs687/


# https://deeplearningcourses.com/c/deep-reinforcement-learning-in-python
# https://www.udemy.com/deep-reinforcement-learning-in-python
from __future__ import print_function, division
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future
#
# Note: gym changed from version 0.7.3 to 0.8.0
# MountainCar episode length is capped at 200 in later versions.
# This means your agent can't learn as much in the earlier episodes
# since they are no longer as long.
#
# Adapt Q-Learning script to use TD(lambda) method instead

import gym
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
from gym import wrappers
from datetime import datetime
from sklearn.linear_model import SGDRegressor

# code we already wrote
from q_learning import plot_cost_to_go, FeatureTransformer, plot_running_avg

In [2]:
class SubModel:
    """Linear value estimator with an eligibility trace, used for one action.

    The estimate for a state is the dot product of a weight vector ``w`` and
    the state's feature vector. ``partial_fit`` performs one TD(lambda)-style
    step: it decays the eligibility trace ``el``, adds the current features to
    it, and moves ``w`` along the trace in proportion to the TD error.

    Parameters
    ----------
    env : environment exposing ``observation_space.sample()`` and
        ``action_space.n``.
    featureTransformer : callable
        Factory called as ``featureTransformer(env)``; the returned object
        must expose ``transform(list_of_observations)``.
    lumda : float
        Trace-decay factor (lambda).
    learningRate : float
        Step size applied to each weight update.
    discount : float
        Discount factor (gamma).
    """

    def __init__(self, env, featureTransformer, lumda, learningRate, discount):
        self.env = env
        self.ft = featureTransformer(env)
        self.lumda = lumda
        self.learningRate = learningRate
        self.discount = discount

        # Size the weight and trace vectors from one transformed sample.
        X = self.ft.transform([env.observation_space.sample()])
        self.vSize = X.size
        self.el = np.zeros(self.vSize)
        self.w = np.zeros(self.vSize)

        self.actionSize = env.action_space.n

    def _features(self, status):
        """Return the feature vector of ``status`` flattened to 1-D.

        ``transform`` is given a one-element batch; flattening its result
        keeps the vector shape-compatible with ``w`` and ``el``, which are
        allocated as 1-D arrays of length ``vSize``.
        """
        return np.ravel(self.ft.transform([status]))

    def predict(self, status):
        """Return the current value estimate ``w . features(status)``."""
        return self.w.dot(self._features(status))

    def partial_fit(self, reward, oldStatus, newValue):
        """Apply one eligibility-trace update for a single transition.

        Parameters
        ----------
        reward : float
            Reward received for the transition.
        oldStatus : observation the action was taken from.
        newValue : float
            Value estimate of the successor state/action used to bootstrap.
        """
        features = self._features(oldStatus)
        oldValue = self.w.dot(features)
        # Accumulating trace: decay the old trace, then ADD the gradient of
        # the linear estimate (the feature vector). Subtracting it would push
        # the weights away from the TD target.
        self.el = self.lumda * self.discount * self.el + features
        diff = reward + self.discount * newValue - oldValue
        self.w += self.learningRate * diff * self.el
        
        
        

In [3]:
class Models:
    """Collection of per-action ``SubModel`` estimators with an epsilon-greedy policy.

    Parameters
    ----------
    env : environment exposing ``action_space.n`` and ``action_space.sample()``.
    featureTransformer : callable
        Factory called as ``featureTransformer(env)`` to build the feature
        transformer.
    lumda, learningRate, discount : float
        Forwarded unchanged to every ``SubModel``.
    """

    def __init__(self, env, featureTransformer, lumda, learningRate, discount):
        self.env = env
        self.ft = featureTransformer(env)

        # Build the transformer once and hand every sub-model a factory that
        # returns that same instance, instead of constructing a separate
        # transformer per action.
        def sharedTransformer(_env):
            return self.ft

        # create a sub-model for each action
        self.actionSize = env.action_space.n
        self.models = [
            SubModel(env, sharedTransformer, lumda, learningRate, discount)
            for _ in range(self.actionSize)
        ]

    def initEligibility(self):
        """Reset every sub-model's eligibility trace to zeros."""
        for m in self.models:
            m.el = np.zeros(m.vSize)

    def update(self, reward, oldStatus, newStatus, action, newAction):
        """Update the sub-model of ``action`` from one transition.

        The bootstrap value is the estimate of ``newAction`` in ``newStatus``.
        NOTE(review): only the sub-model of the action taken is touched here;
        the traces of the other actions are neither decayed nor applied.
        """
        newValue = self.models[newAction].predict(newStatus)
        self.models[action].partial_fit(reward, oldStatus, newValue)

    def predict(self, status):
        """Return the stacked value estimates of all actions for ``status``."""
        return np.stack([m.predict(status) for m in self.models])

    def sample_action(self, status, eps):
        """Pick an action epsilon-greedily.

        With probability ``eps`` a random action is drawn from the
        environment's action space; otherwise the action with the highest
        estimate is returned.
        """
        if np.random.random() < eps:
            return self.env.action_space.sample()
        return int(np.argmax(self.predict(status)))

In [None]:
def playOne(env, models, eps):
    """Run one episode, updating ``models`` on-policy after every step.

    Parameters
    ----------
    env : environment whose ``reset()`` returns an observation and whose
        ``step(action)`` returns ``(observation, reward, done, info)``.
    models : object exposing ``initEligibility()``,
        ``sample_action(status, eps)`` and
        ``update(reward, oldStatus, newStatus, action, newAction)``.
    eps : float
        Exploration rate passed to ``sample_action``.

    Returns
    -------
    The sum of the rewards collected during the episode.
    """
    oldObs = env.reset()
    models.initEligibility()
    totalReward = 0
    done = False
    action = models.sample_action(oldObs, eps)
    while not done:
        newObs, reward, done, info = env.step(action)
        # Sarsa bootstraps from the action that will actually be taken next,
        # so choose it now and reuse it on the following iteration.
        newAction = models.sample_action(newObs, eps)
        models.update(reward, oldObs, newObs, action, newAction)
        totalReward += reward
        oldObs = newObs
        action = newAction
    return totalReward

In [None]:
def main(episodes=100, lumda=0.7, learningRate=0.01, discount=0.99):
    """Train a Sarsa(lambda) agent on MountainCar-v0.

    Parameters
    ----------
    episodes : int
        Number of episodes to play.
    lumda : float
        Trace-decay factor forwarded to ``Models``.
    learningRate : float
        Step size forwarded to ``Models``.
    discount : float
        Discount factor forwarded to ``Models``.

    The default hyper-parameters are starting points, not tuned values.
    """
    env = gym.make('MountainCar-v0')
    # Models expects a factory it can call with env, so pass the class itself
    # rather than an already-built FeatureTransformer instance.
    model = Models(env, FeatureTransformer, lumda, learningRate, discount)
    for i in range(episodes):
        # Decay exploration as training progresses.
        eps = 0.1 * (0.97 ** i)
        playOne(env, model, eps)
    