In [2]:
## This notebook is a prototype that tries to learn a value function representing the viable region
## of a linear inverted pendulum (LIP) model.
## Author : Avadesh Meduri
## Date : 20/02/2020

import numpy as np
import IPython
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from matplotlib.animation import FuncAnimation
import pickle as p
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


In [3]:
## LIPM Environment

class lipm_env:
    """Discrete-time simulation of a linear inverted pendulum (LIP) model.

    The simulation log ``sim_data`` has one column per time step and four rows:
        row 0 : COM position
        row 1 : COM velocity
        row 2 : control input u (COP location)
        row 3 : pendulum height h
    """

    def __init__(self, h):
        """Builds the discretised dynamics for a pendulum of height ``h``."""
        self.omega = np.sqrt(9.81/h)
        self.max_leg_length = 0.6
        self.dt = 0.001
        self.h = h
        ## forward-Euler discretisation of  x_ddot = omega^2 * (x - u)
        self.A = np.matrix([[1, self.dt], [(self.omega**2)*self.dt, 1]])
        self.B = np.matrix([0, -(self.omega**2)*self.dt])
        self.t = 0

    def integrate_lip_dynamics(self, x_t, u_t):
        """Integrates the dynamics for one step.

        Input:
            x_t : array of shape (2,) holding [COM position, COM velocity]
            u_t : scalar control input (COP location)
        Returns A*x_t + B^T*u_t. Because A and B are np.matrix objects the
        result is a np.matrix, not a flat array.
        """
        assert np.shape(x_t) == (2,)
        x_t_1 = np.matmul(self.A, np.transpose(x_t)) + np.matmul(self.B.transpose(), [u_t])
        return x_t_1

    def reset_env(self, x0, u0, epi_time):
        """Initialises the environment for an episode lasting ``epi_time`` seconds.

        Allocates a fresh zero-filled log and writes the initial state ``x0``,
        the initial input ``u0`` and the height into column 0.
        """
        self.t = 0
        self.sim_data = np.zeros((4, int(epi_time/self.dt)+1))
        self.sim_data[:,0][0:2] = x0
        self.sim_data[:,0][2] = u0
        self.sim_data[:,0][3] = self.h

    def step_env(self):
        """Integrates the simulation one step, carrying u and h forward unchanged."""
        self.sim_data[:,self.t + 1][0:2] = self.integrate_lip_dynamics(self.sim_data[:,self.t][0:2],\
                                                   self.sim_data[:,self.t][2])
        self.sim_data[:,self.t + 1][2] = self.sim_data[:,self.t][2]
        self.sim_data[:,self.t + 1][3] = self.sim_data[:,self.t][3]
        self.t += 1

    def set_action(self, u):
        """Overwrites the control input stored for the current time step."""
        self.sim_data[:,self.t][2] = u

    def return_sample_data(self):
        """Returns the logged columns 0 .. t-1 (the current column t is excluded)."""
        return self.sim_data[:,0:self.t]

    def show_episode(self, freq, i_no):
        """Renders the logged episode as an HTML5 video in the notebook.

        Input:
            freq : frame rate (if freq = 5 one in 5 is shown)
            i_no : iteration number, shown in the text box of the plot
        """
        sim_data = self.sim_data[:,::freq]

        fig = plt.figure()
        ax = plt.axes(xlim=(-5, 5), ylim=(0, sim_data[:,0][3] + 0.2))
        text_str = "iter - " + str(i_no)
        line, = ax.plot([], [], lw=3)

        def init():
            line.set_data([], [])
            return line,

        def animate(i):
            ## the pendulum is drawn as a segment from (u, 0) to (x, h)
            x = sim_data[:,i][0]
            y = sim_data[:,i][3]
            u = sim_data[:,i][2]
            line.set_data([u,x], [0,y])
            return line,

        props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
        ax.text(0.05, 0.95, text_str, transform=ax.transAxes, fontsize=15,
        verticalalignment='top', bbox=props)

        anim = FuncAnimation(fig, animate, init_func=init,
                                       frames=np.shape(sim_data)[1], interval=25, blit=True)

        ## close the figure so that only the video is displayed, not a static plot
        plt.close(fig)
        IPython.display.display_html(IPython.display.HTML(anim.to_html5_video()))

    def compute_reward(self, step_time):
        """Computes the reward for the step that has just finished.

        Input:
            step_time : duration of the step in seconds; the last
                        step_time/dt logged columns are evaluated
        The reward is 5^-(d^2 + v^2), where d is the smallest |COM - COP|
        offset during the step and v the COM velocity at that instant,
        plus 1 if the control input was not changed at the start of the
        step and minus 1 if it was.
        """
        ## number of samples is derived from dt and rounded, so float noise
        ## in the product cannot shift the window by one sample
        n_steps = int(round(step_time/self.dt))
        start = self.t - n_steps

        step_data = self.sim_data[:,start:self.t].copy()
        step_data[0] = np.subtract(step_data[0], step_data[2])
        ## the offset is signed: take the argmin of its magnitude so that the
        ## closest approach is found on either side of the COP
        min_dist = np.abs(step_data[0]).argmin()
        r = step_data[0][min_dist]**2 ## min distance between COM and COP
        r += step_data[1][min_dist]**2 ## velocity when min dist is achieved

        r = np.power(5, -r)
        ## NOTE(review): for the first step of an episode start - 1 == -1, which
        ## reads the last (still zero-filled) column -- confirm this is intended
        if self.sim_data[:,start - 1][2] != self.sim_data[:,start][2]:
            r -= 1 ## penalises if step is taken
        else:
            r += 1 ## rewards if no step is taken

        return r


In [153]:
## this block is for the Q function
class ANN(nn.Module):
    """Fully connected network: two 128-unit ReLU hidden layers followed by a linear head.

    Input:
        input_size : number of input features
        outputs    : number of output units
    """

    def __init__(self, input_size, outputs):
        super().__init__()
        hidden_width = 128
        self.l1 = nn.Linear(input_size, hidden_width)
        self.l2 = nn.Linear(hidden_width, hidden_width)
        self.action_value = nn.Linear(hidden_width, outputs)

    def forward(self, x):
        """Passes x through both hidden layers and returns the un-activated head output."""
        hidden = x
        for layer in (self.l1, self.l2):
            hidden = F.relu(layer(hidden))
        return self.action_value(hidden)
        
        

In [159]:
### This block samples and stores data using an epsilon-greedy policy

def _epsilon_greedy_action(state, action_set, value_function, epsilon):
    """Returns the index into action_set picked by an epsilon-greedy rule.

    With probability 1 - epsilon the action with the highest value is taken,
    where value_function is evaluated on one row [state[0], state[1], action]
    per candidate action; otherwise a uniformly random index is returned.
    """
    if np.random.random() > epsilon:
        x_in = np.tile([state[0], state[1], 0], (len(action_set), 1))
        x_in[:,2] = action_set
        ## only the values are needed here, so skip building the autograd graph
        with torch.no_grad():
            q_values = value_function(torch.tensor((x_in), dtype=torch.float))
        return np.argmax(q_values.cpu().detach().numpy())
    ## explore over the whole action set, whatever its size
    return np.random.randint(len(action_set))


def sample_data(no_episodes, epi_t, h, action_set, value_function, show_episode = False,
                epsilon = 0.2, step_time = 0.15):
    """Samples transitions from the LIPM environment with an epsilon-greedy policy.

    Input:
        no_episodes    : number of episodes to simulate
        epi_t          : duration of one episode in seconds
        h              : pendulum height passed to lipm_env
        action_set     : 1-D array of candidate offsets; the new u is placed
                         at COM position + offset
        value_function : callable mapping a float tensor with rows
                         [state[0], state[1], action] to one value per row
        show_episode   : if True every episode is rendered as a video
        epsilon        : probability of picking a random action
        step_time      : seconds between two action choices
    Returns an array with one row of 6 entries per recorded transition:
        [u - x, x_dot, action, reward, next u - x, next x_dot]
    """
    env = lipm_env(h)
    ## step counts follow the simulator's own dt instead of an assumed 1 kHz
    steps_per_action = int(round(step_time/env.dt))
    n_sim_steps = int(round(epi_t/env.dt)) - 1

    train_data = []
    for e in range(no_episodes):
        print("running iter number - " + str(e))
        x = [0.0, 4*np.random.random() - 2]
        a = _epsilon_greedy_action(x, action_set, value_function, epsilon)
        u = x[0] + action_set[a] #starting with an epsilon-greedy action
        env.reset_env(x, u, epi_t)

        ## NOTE(review): this first state uses the u chosen above (after the action),
        ## whereas later states are recorded before the new action is applied --
        ## confirm this is intended
        sars_t = np.zeros(6) ## State, Action, reward, state array
        sars_t[0:2] = env.sim_data[:,env.t][0:2]
        sars_t[0] = env.sim_data[:,env.t][2] - sars_t[0]
        sars_t[2] = action_set[a]

        for t in range(n_sim_steps):
            if t % steps_per_action == 0 and t > 0:
                ## close the running transition with its reward and next state
                sars_t[3] = env.compute_reward(step_time)
                sars_t[4:6] = env.sim_data[:,env.t][0:2]
                sars_t[4] = env.sim_data[:,env.t][2] - sars_t[4]
                train_data.append(sars_t)

                ## open the next transition from the current state
                sars_t = np.zeros(6)
                sars_t[0:2] = env.sim_data[:,env.t][0:2]
                sars_t[0] = env.sim_data[:,env.t][2] - sars_t[0]
                a = _epsilon_greedy_action(sars_t, action_set, value_function, epsilon)
                sars_t[2] = action_set[a]
                u = env.sim_data[:,env.t][0] + action_set[a]
                env.set_action(u) ## setting action

            env.step_env()

        if show_episode:
            env.show_episode(5, e)

    return np.asarray(train_data)

def store_data(data_array, file_name, dir):
    """Pickles data_array to <dir>/<file_name>_<batch_no>.pkl.

    Input:
        data_array : object to pickle
        file_name  : file name prefix
        dir        : existing directory to write into (a trailing separator
                     is optional)
    batch_no is the number of entries already present in dir.
    """
    batch_no = str(len(os.listdir(dir)))
    ## join the path so that the file lands inside dir with or without a trailing separator
    file_path = os.path.join(dir, file_name + "_" + batch_no + ".pkl")
    print("dumping data ...")
    ## the context manager closes the file even if pickling fails
    with open(file_path, 'wb') as f:
        p.dump(data_array, f, -1)
    print("finished dumping...")

In [27]:
## This block is to sample data

action_set = np.linspace(-0.2, 0.2, 9)
## sample_data expects a value function as its fifth argument; an untrained
## network is used here and the episode is shown as a video
sample = sample_data(1, 1.5, 0.2, action_set, ANN(3, 1), show_episode=True)
# store_data(sample, "random", "../../data/")

running iter number - 0


## Smoke test: build a network with 2 inputs and 9 outputs and read off the
## index of its largest output for a random input
device = torch.device("cpu")
dqs = ANN(2, 9).to(device)

test = torch.randn(1,1,2)
print(test)
## the output has shape (1, 1, 9): take the argmax over the last axis
dqs(test).max(-1)[1]

In [161]:
#Training block
device = torch.device("cpu")
## Q-network: input is [state[0], state[1], action], output is a single value
dq_stepper = ANN(3, 1).to(device)

action_set = np.linspace(-0.2, 0.2, 9)
## sample with the network defined above, not with a leftover kernel variable
sample = sample_data(1, 1.5, 0.2, action_set, dq_stepper, False)

running iter number - 0
[[ 0.00000000e+00  3.55632014e-01  0.00000000e+00  1.81582757e+00
  -6.34802395e-02  5.68511895e-01]
 [-6.34802395e-02  5.68511895e-01 -5.00000000e-02 -4.07973109e-01
  -1.81409051e-01  1.34659154e+00]
 [-1.81409051e-01  1.34659154e+00 -2.00000000e-01 -9.49348448e-01
  -5.60085495e-01  3.90373871e+00]
 [-5.60085495e-01  3.90373871e+00 -2.00000000e-01 -1.00000000e+00
  -1.01653572e+00  7.99158379e+00]
 [-1.01653572e+00  7.99158379e+00 -2.00000000e-01 -1.00000000e+00
  -1.74621521e+00  1.45263964e+01]
 [-1.74621521e+00  1.45263964e+01 -2.00000000e-01 -1.00000000e+00
  -2.91267788e+00  2.49729212e+01]
 [-2.91267788e+00  2.49729212e+01  1.50000000e-01 -1.00000000e+00
  -4.21787178e+00  3.86082984e+01]
 [-4.21787178e+00  3.86082984e+01  2.00000000e-01 -1.00000000e+00
  -6.57185393e+00  5.99679863e+01]
 [-6.57185393e+00  5.99679863e+01 -2.00000000e-01 -1.00000000e+00
  -1.10239921e+01  9.76156625e+01]]
