# Q learning - Linear Regression Attempt
In Q learning, the update rule is:
$$Q(\mathbf{s},\mathbf{a}) = Q(\mathbf{s},\mathbf{a}) + \alpha (r+\gamma max_{a'} Q(\mathbf{s}',\mathbf{a}') - Q(\mathbf{s},\mathbf{a}))$$
If the Q function reached the best, there would be nothing more to learn. Hence, we wanted to make $r+\gamma max_{a'} Q(\mathbf{s}',\mathbf{a}') - Q(\mathbf{s},\mathbf{a})$ close to 0. Then the problem became fitting the dataset: [$r+\gamma max_{a'} Q(\mathbf{s}',\mathbf{a}')$, $Q(\mathbf{s},\mathbf{a})$]. In every step of environment evolution, the [$r+\gamma max_{a'} Q(\mathbf{s}',\mathbf{a}')$, $Q(\mathbf{s},\mathbf{a})$] pair can form a dataset for fitting.
The most basic fitting problem is linear regression. The first attempt is to fit the dataset to a linear model: $$Q(\mathbf{s},\mathbf{a})=\sum_{i}{A_i V_i},$$ where $\mathbf{V}=(\mathbf{s},\mathbf{a})$ and $A_i$ are constants.

In [20]:
from osim.env import L2RunEnv
import numpy as np
from scipy.optimize import minimize, Bounds
from sklearn.linear_model import LinearRegression

DEFAULT_SEED = 20180101
rng = np.random.RandomState(DEFAULT_SEED)

env = L2RunEnv(visualize=False)
# Obtain the dimension observation space and action space
dim_obs = env.get_observation_space_size()
dim_act = env.get_action_space_size()

# Set the range of action values
action_low = -0.1
action_high = 0.1
bnds = Bounds(action_low, action_high)

# Set hyperparameters
discount = 0.0001
epsilon = 1
episode = 2000

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [21]:
class qfunction:
    # A class to store the coefficents of linear function
    
    def __init__(self, dim_obs, dim_act, rng=None):
        # randomly initialize coefficents
        
        if rng is None:
            rng = np.random.RandomState(DEFAULT_SEED)
        self.rng = rng
        
        self.dim_obs = dim_obs
        self.dim_act = dim_act
        self.dim = dim_obs + dim_act
        self.coeff = rng.uniform(-1, 1, self.dim)

    def __call__(self, obs, act):
        # input states and action, return value of q function
        obs_part = obs * self.coeff[:self.dim_obs]
        act_part = act * self.coeff[self.dim_obs:]
        res = np.sum(obs_part) + np.sum(act_part)
        return res

    def set_coeff(self, coeff):
        # set coefficients of q function
        self.coeff = coeff

In [22]:
# Initialize Q function
qf = qfunction(dim_obs, dim_act)
model = LinearRegression(fit_intercept=False)
# Initialize the dataset:(xdata, ydata)
xdata = np.zeros((qf.dim,))
ydata = np.zeros((1,))
print(xdata.shape, ydata.shape)

(59,) (1,)


In [23]:
action0 = np.zeros(dim_act)
for i in range(episode):
    # Initialize a new simulation
    state = np.array(env.reset())
    reward = 0

    # Run the simulation until the framework stop it
    done = False
    while not done:
        '''
        # get the action based on Q function and epsilon greedy
        if (rng.rand() < epsilon) :
            # exploration: randomly choose an action
            action = rng.uniform(action_low, action_high, dim_act)
        else:
            # exploitation: choose the action maximumizing Q function
            action_func = lambda x: -qf(state, x)
            bnds = Bounds(action_low, action_high)
            res = minimize(action_func, action0, method='SLSQP', bounds=bnds)
            action = res.x
        '''
        # randomly choose an action
        action = rng.uniform(action_low, action_high, dim_act)

        # evolve the system to the next time step
        state_, reward, done, info = env.step(action)
        state_ = np.array(state_)

        # get maximum of Q(s', a') under given s'
        action_func = lambda x: -qf(state_, x)
        res = minimize(action_func, action0, method='SLSQP', bounds=bnds)
        # note: https://en.wikipedia.org/wiki/Sequential_quadratic_programming
        max_q = -res.fun # max Q(s', a')
        
        # {s, a} and [r + gamma * max_a` Q(s`, a`)]
        xx = np.concatenate((state, action))
        yy = np.array(reward + discount * max_q)
        # put the data point into dataset
        xdata = np.vstack((xdata, xx))
        ydata = np.vstack((ydata, yy))

        # Do linear fitting and update Q function coefficients
        model.fit(xdata, ydata)
        qf.set_coeff(model.coef_.T)
        
        # Update Q values with new coefficients
        ydata = np.sum(xdata*model.coef_, axis=1, keepdims=True)

        # Update state
        state = state_

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [24]:
data = np.hstack((xdata, ydata))
np.savetxt("data.csv", data, delimiter=",")