In [6]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import gym
import itertools
import matplotlib
import numpy as np
import sys
import sklearn.pipeline
import sklearn.preprocessing
import pickle
import matplotlib.pyplot as plt

from value_estimator import ValueEstimator

from sklearn.linear_model import SGDRegressor
from sklearn.kernel_approximation import RBFSampler

matplotlib.style.use('ggplot')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
def parse(s):
    split = s.split(',')
    n = np.zeros(len(split))
    for i in range(len(split)):
        n[i] = float(split[i])
    return n

In [8]:
demonstration_file = "swimmer_demonstrations_10k"
with open(demonstration_file) as f:
    for i, l in enumerate(f):
        pass
num_observations = i + 1

with open(demonstration_file) as f:
    l = f.readline()
    s, r, a, sp = l.strip().split(';')
    
    slen = len(s.split(','))
    rlen = len(r.split(','))
    alen = len(a.split(','))
    
    s_arr = np.zeros((num_observations, slen))
    r_arr = np.zeros((num_observations, rlen))
    a_arr = np.zeros((num_observations, alen))
    sprime_arr = np.zeros((num_observations, slen))
    
    
with open(demonstration_file) as f:
    i = 0
    for l in f:
        if len(l.strip().split(';')) <= 1:
            continue
        i += 1
        s, r, a, sp = l.strip().split(';')
        s_arr[i,:] = parse(s)
        r_arr[i,:] = parse(r)
        a_arr[i,:] = parse(a)
        sprime_arr[i,:] = parse(sp)

In [11]:
print(s_arr.shape)
print(r_arr.shape)
print(a_arr.shape)
print(sprime_arr.shape)

(10010000, 8)
(10010000, 1)
(10010000, 2)
(10010000, 8)


In [12]:
def td_learning(estimator, num_episodes, discount_factor=0.95):
    
    for episode in range(num_episodes):
        pkl_name = 'swimmer_value_estimator%d.pkl' % episode
        sys.stdout.flush()
        for i in itertools.count():
            if i >= s_arr.shape[0]:
                break
            state = s_arr[i,:]
            reward = r_arr[i,:]
            # action not needed for incremental update when we are only learning value function!
            # we have no idea how states transition or how to choose a good state action pair
            # but that's ok :) 
            action = a_arr[i,:]
            next_state = sprime_arr[i,:]
            
            # TD Update
            q_value_next = estimator.value([next_state])
            
            td_target = reward + discount_factor * q_value_next
            
            # Update the function approximator using our target
            estimator.update(state, td_target)
            
            print("\rStep {} @ Episode {}/{}".format(i, episode + 1, num_episodes), end="")
        
        with open(pkl_name, 'wb') as pkl:
            pickle.dump(estimator, pkl)

In [13]:
# Feature Preprocessing: Normalize to zero mean and unit variance
# We use a few samples from the observation space to do this
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(s_arr)

# Used to convert a state to a featurizes represenation.
# We use RBF kernels with different variances to cover different parts of the space
featurizer = sklearn.pipeline.FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=100))
        ])
featurizer.fit(scaler.transform(s_arr))

model = SGDRegressor(learning_rate="constant")
# allow for predictions by doing a single partial fit
model.partial_fit(featurizer.transform(scaler.transform([s_arr[0,:]])), [0])

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
       eta0=0.01, fit_intercept=True, l1_ratio=0.15,
       learning_rate='constant', loss='squared_loss', max_iter=None,
       n_iter=None, n_iter_no_change=5, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [15]:
value_estimator = ValueEstimator(scaler, featurizer, model)

In [None]:
stats = td_learning(value_estimator, 30, discount_factor=0.95)

Step 2835414 @ Episode 3/300

In [69]:
 with open('swimmer_value_estimator.pkl', 'wb') as pkl:
    pickle.dump(estimator, pkl) 