Implementing more actions #8
Also, what's the difference between using Trevel.variable and env.variable?
var R = {}; // the Recurrent library
(function (global) {
"use strict";
// Utility functions
function assert(condition, message) {
// from http://stackoverflow.com/questions/15313418/javascript-assert
if (!condition) {
message = message || "Assertion failed";
if (typeof Error !== "undefined") {
throw new Error(message);
}
throw message; // Fallback
}
}
// Random numbers utils
var return_v = false;
var v_val = 0.0;
var gaussRandom = function () {
if (return_v) {
return_v = false;
return v_val;
}
var u = 2 * Math.random() - 1;
var v = 2 * Math.random() - 1;
var r = u * u + v * v;
if (r == 0 || r > 1) return gaussRandom();
var c = Math.sqrt(-2 * Math.log(r) / r);
v_val = v * c; // cache this
return_v = true;
return u * c;
}
var randf = function (a, b) {
return Math.random() * (b - a) + a;
}
var randi = function (a, b) {
return Math.floor(Math.random() * (b - a) + a);
}
var randn = function (mu, std) {
return mu + gaussRandom() * std;
}
// helper function returns array of zeros of length n
// and uses typed arrays if available
var zeros = function (n) {
if (typeof (n) === 'undefined' || isNaN(n)) {
return [];
}
if (typeof ArrayBuffer === 'undefined') {
// lacking browser support
var arr = new Array(n);
for (var i = 0; i < n; i++) {
arr[i] = 0;
}
return arr;
} else {
return new Float64Array(n);
}
}
// Mat holds a matrix
var Mat = function (n, d) {
// n is number of rows d is number of columns
this.n = n;
this.d = d;
this.w = zeros(n * d);
this.dw = zeros(n * d);
}
Mat.prototype = {
get: function (row, col) {
// slow but careful accessor function
// we want row-major order
var ix = (this.d * row) + col;
assert(ix >= 0 && ix < this.w.length);
return this.w[ix];
},
set: function (row, col, v) {
// slow but careful accessor function
var ix = (this.d * row) + col;
assert(ix >= 0 && ix < this.w.length);
this.w[ix] = v;
},
setFrom: function (arr) {
for (var i = 0, n = arr.length; i < n; i++) {
this.w[i] = arr[i];
}
},
setColumn: function (m, i) {
for (var q = 0, n = m.w.length; q < n; q++) {
this.w[(this.d * q) + i] = m.w[q];
}
},
toJSON: function () {
var json = {};
json['n'] = this.n;
json['d'] = this.d;
json['w'] = this.w;
return json;
},
fromJSON: function (json) {
this.n = json.n;
this.d = json.d;
this.w = zeros(this.n * this.d);
this.dw = zeros(this.n * this.d);
for (var i = 0, n = this.n * this.d; i < n; i++) {
this.w[i] = json.w[i]; // copy over weights
}
}
}
var copyMat = function (b) {
var a = new Mat(b.n, b.d);
a.setFrom(b.w);
return a;
}
var copyNet = function (net) {
// nets are (k,v) pairs with k = string key, v = Mat()
var new_net = {};
for (var p in net) {
if (net.hasOwnProperty(p)) {
new_net[p] = copyMat(net[p]);
}
}
return new_net;
}
var updateMat = function (m, alpha) {
// updates in place
for (var i = 0, n = m.n * m.d; i < n; i++) {
if (m.dw[i] !== 0) {
m.w[i] += -alpha * m.dw[i];
m.dw[i] = 0;
}
}
}
var updateNet = function (net, alpha) {
for (var p in net) {
if (net.hasOwnProperty(p)) {
updateMat(net[p], alpha);
}
}
}
var netToJSON = function (net) {
var j = {};
for (var p in net) {
if (net.hasOwnProperty(p)) {
j[p] = net[p].toJSON();
}
}
return j;
}
var netFromJSON = function (j) {
var net = {};
for (var p in j) {
if (j.hasOwnProperty(p)) {
net[p] = new Mat(1, 1); // not proud of this
net[p].fromJSON(j[p]);
}
}
return net;
}
var netZeroGrads = function (net) {
for (var p in net) {
if (net.hasOwnProperty(p)) {
var mat = net[p];
gradFillConst(mat, 0);
}
}
}
var netFlattenGrads = function (net) {
var n = 0;
for (var p in net) {
if (net.hasOwnProperty(p)) {
var mat = net[p];
n += mat.dw.length;
}
}
var g = new Mat(n, 1);
var ix = 0;
for (var p in net) {
if (net.hasOwnProperty(p)) {
var mat = net[p];
for (var i = 0, m = mat.dw.length; i < m; i++) {
g.w[ix] = mat.dw[i];
ix++;
}
}
}
return g;
}
// return Mat but filled with random numbers from gaussian
var RandMat = function (n, d, mu, std) {
var m = new Mat(n, d);
fillRandn(m, mu, std);
//fillRand(m,-std,std); // kind of :P
return m;
}
// Mat utils
// fill matrix with random gaussian numbers
var fillRandn = function (m, mu, std) {
for (var i = 0, n = m.w.length; i < n; i++) {
m.w[i] = randn(mu, std);
}
}
var fillRand = function (m, lo, hi) {
for (var i = 0, n = m.w.length; i < n; i++) {
m.w[i] = randf(lo, hi);
}
}
var gradFillConst = function (m, c) {
for (var i = 0, n = m.dw.length; i < n; i++) {
m.dw[i] = c
}
}
// Transformer definitions
var Graph = function (needs_backprop) {
if (typeof needs_backprop === 'undefined') {
needs_backprop = true;
}
this.needs_backprop = needs_backprop;
// this will store a list of functions that perform backprop,
// in their forward pass order. So in backprop we will go
// backwards and invoke each one (see the usage sketch after END OF RECURRENTJS)
this.backprop = [];
}
Graph.prototype = {
backward: function () {
for (var i = this.backprop.length - 1; i >= 0; i--) {
this.backprop[i](); // tick!
}
},
rowPluck: function (m, ix) {
// pluck a row of m with index ix and return it as col vector
assert(ix >= 0 && ix < m.n);
var d = m.d;
var out = new Mat(d, 1);
for (var i = 0, n = d; i < n; i++) {
out.w[i] = m.w[d * ix + i];
} // copy over the data
if (this.needs_backprop) {
var backward = function () {
for (var i = 0, n = d; i < n; i++) {
m.dw[d * ix + i] += out.dw[i];
}
}
this.backprop.push(backward);
}
return out;
},
tanh: function (m) {
// tanh nonlinearity
var out = new Mat(m.n, m.d);
var n = m.w.length;
for (var i = 0; i < n; i++) {
out.w[i] = Math.tanh(m.w[i]);
}
if (this.needs_backprop) {
var backward = function () {
for (var i = 0; i < n; i++) {
// grad for z = tanh(x) is (1 - z^2)
var mwi = out.w[i];
m.dw[i] += (1.0 - mwi * mwi) * out.dw[i];
}
}
this.backprop.push(backward);
}
return out;
},
sigmoid: function (m) {
// sigmoid nonlinearity
var out = new Mat(m.n, m.d);
var n = m.w.length;
for (var i = 0; i < n; i++) {
out.w[i] = sig(m.w[i]);
}
if (this.needs_backprop) {
var backward = function () {
for (var i = 0; i < n; i++) {
// grad for z = sigmoid(x) is z * (1 - z)
var mwi = out.w[i];
m.dw[i] += mwi * (1.0 - mwi) * out.dw[i];
}
}
this.backprop.push(backward);
}
return out;
},
relu: function (m) {
var out = new Mat(m.n, m.d);
var n = m.w.length;
for (var i = 0; i < n; i++) {
out.w[i] = Math.max(0, m.w[i]); // relu
}
if (this.needs_backprop) {
var backward = function () {
for (var i = 0; i < n; i++) {
m.dw[i] += m.w[i] > 0 ? out.dw[i] : 0.0;
}
}
this.backprop.push(backward);
}
return out;
},
mul: function (m1, m2) {
// multiply matrices m1 * m2
assert(m1.d === m2.n, 'matmul dimensions misaligned');
var n = m1.n;
var d = m2.d;
var out = new Mat(n, d);
for (var i = 0; i < m1.n; i++) { // loop over rows of m1
for (var j = 0; j < m2.d; j++) { // loop over cols of m2
var dot = 0.0;
for (var k = 0; k < m1.d; k++) { // dot product loop
dot += m1.w[m1.d * i + k] * m2.w[m2.d * k + j];
}
out.w[d * i + j] = dot;
}
}
if (this.needs_backprop) {
var backward = function () {
for (var i = 0; i < m1.n; i++) { // loop over rows of m1
for (var j = 0; j < m2.d; j++) { // loop over cols of m2
for (var k = 0; k < m1.d; k++) { // dot product loop
var b = out.dw[d * i + j];
m1.dw[m1.d * i + k] += m2.w[m2.d * k + j] * b;
m2.dw[m2.d * k + j] += m1.w[m1.d * i + k] * b;
}
}
}
}
this.backprop.push(backward);
}
return out;
},
add: function (m1, m2) {
assert(m1.w.length === m2.w.length);
var out = new Mat(m1.n, m1.d);
for (var i = 0, n = m1.w.length; i < n; i++) {
out.w[i] = m1.w[i] + m2.w[i];
}
if (this.needs_backprop) {
var backward = function () {
for (var i = 0, n = m1.w.length; i < n; i++) {
m1.dw[i] += out.dw[i];
m2.dw[i] += out.dw[i];
}
}
this.backprop.push(backward);
}
return out;
},
dot: function (m1, m2) {
// m1 m2 are both column vectors
assert(m1.w.length === m2.w.length);
var out = new Mat(1, 1);
var dot = 0.0;
for (var i = 0, n = m1.w.length; i < n; i++) {
dot += m1.w[i] * m2.w[i];
}
out.w[0] = dot;
if (this.needs_backprop) {
var backward = function () {
for (var i = 0, n = m1.w.length; i < n; i++) {
m1.dw[i] += m2.w[i] * out.dw[0];
m2.dw[i] += m1.w[i] * out.dw[0];
}
}
this.backprop.push(backward);
}
return out;
},
eltmul: function (m1, m2) {
assert(m1.w.length === m2.w.length);
var out = new Mat(m1.n, m1.d);
for (var i = 0, n = m1.w.length; i < n; i++) {
out.w[i] = m1.w[i] * m2.w[i];
}
if (this.needs_backprop) {
var backward = function () {
for (var i = 0, n = m1.w.length; i < n; i++) {
m1.dw[i] += m2.w[i] * out.dw[i];
m2.dw[i] += m1.w[i] * out.dw[i];
}
}
this.backprop.push(backward);
}
return out;
},
}
var softmax = function (m) {
var out = new Mat(m.n, m.d); // probability volume
var maxval = -999999;
for (var i = 0, n = m.w.length; i < n; i++) {
if (m.w[i] > maxval) maxval = m.w[i];
}
var s = 0.0;
for (var i = 0, n = m.w.length; i < n; i++) {
out.w[i] = Math.exp(m.w[i] - maxval);
s += out.w[i];
}
for (var i = 0, n = m.w.length; i < n; i++) {
out.w[i] /= s;
}
// no backward pass here needed
// since we will use the computed probabilities outside
// to set gradients directly on m
return out;
}
var Solver = function () {
this.decay_rate = 0.999;
this.smooth_eps = 1e-8;
this.step_cache = {};
}
Solver.prototype = {
step: function (model, step_size, regc, clipval) {
// perform parameter update
var solver_stats = {};
var num_clipped = 0;
var num_tot = 0;
for (var k in model) {
if (model.hasOwnProperty(k)) {
var m = model[k]; // mat ref
if (!(k in this.step_cache)) {
this.step_cache[k] = new Mat(m.n, m.d);
}
var s = this.step_cache[k];
for (var i = 0, n = m.w.length; i < n; i++) {
// rmsprop adaptive learning rate
var mdwi = m.dw[i];
s.w[i] = s.w[i] * this.decay_rate + (1.0 - this.decay_rate) * mdwi * mdwi;
// gradient clip
if (mdwi > clipval) {
mdwi = clipval;
num_clipped++;
}
if (mdwi < -clipval) {
mdwi = -clipval;
num_clipped++;
}
num_tot++;
// update (and regularize)
m.w[i] += -step_size * mdwi / Math.sqrt(s.w[i] + this.smooth_eps) - regc * m.w[i];
m.dw[i] = 0; // reset gradients for next iteration
}
}
}
solver_stats['ratio_clipped'] = num_clipped * 1.0 / num_tot;
return solver_stats;
}
}
var initLSTM = function (input_size, hidden_sizes, output_size) {
// hidden size should be a list
var model = {};
for (var d = 0; d < hidden_sizes.length; d++) { // loop over depths
var prev_size = d === 0 ? input_size : hidden_sizes[d - 1];
var hidden_size = hidden_sizes[d];
// gates parameters
model['Wix' + d] = new RandMat(hidden_size, prev_size, 0, 0.08);
model['Wih' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08);
model['bi' + d] = new Mat(hidden_size, 1);
model['Wfx' + d] = new RandMat(hidden_size, prev_size, 0, 0.08);
model['Wfh' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08);
model['bf' + d] = new Mat(hidden_size, 1);
model['Wox' + d] = new RandMat(hidden_size, prev_size, 0, 0.08);
model['Woh' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08);
model['bo' + d] = new Mat(hidden_size, 1);
// cell write params
model['Wcx' + d] = new RandMat(hidden_size, prev_size, 0, 0.08);
model['Wch' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08);
model['bc' + d] = new Mat(hidden_size, 1);
}
// decoder params
model['Whd'] = new RandMat(output_size, hidden_size, 0, 0.08);
model['bd'] = new Mat(output_size, 1);
return model;
}
var forwardLSTM = function (G, model, hidden_sizes, x, prev) {
// forward prop for a single tick of LSTM
// G is graph to append ops to
// model contains LSTM parameters
// x is 1D column vector with observation
// prev is a struct containing hidden and cell
// from previous iteration
if (prev == null || typeof prev.h === 'undefined') {
var hidden_prevs = [];
var cell_prevs = [];
for (var d = 0; d < hidden_sizes.length; d++) {
hidden_prevs.push(new R.Mat(hidden_sizes[d], 1));
cell_prevs.push(new R.Mat(hidden_sizes[d], 1));
}
} else {
var hidden_prevs = prev.h;
var cell_prevs = prev.c;
}
var hidden = [];
var cell = [];
for (var d = 0; d < hidden_sizes.length; d++) {
var input_vector = d === 0 ? x : hidden[d - 1];
var hidden_prev = hidden_prevs[d];
var cell_prev = cell_prevs[d];
// input gate
var h0 = G.mul(model['Wix' + d], input_vector);
var h1 = G.mul(model['Wih' + d], hidden_prev);
var input_gate = G.sigmoid(G.add(G.add(h0, h1), model['bi' + d]));
// forget gate
var h2 = G.mul(model['Wfx' + d], input_vector);
var h3 = G.mul(model['Wfh' + d], hidden_prev);
var forget_gate = G.sigmoid(G.add(G.add(h2, h3), model['bf' + d]));
// output gate
var h4 = G.mul(model['Wox' + d], input_vector);
var h5 = G.mul(model['Woh' + d], hidden_prev);
var output_gate = G.sigmoid(G.add(G.add(h4, h5), model['bo' + d]));
// write operation on cells
var h6 = G.mul(model['Wcx' + d], input_vector);
var h7 = G.mul(model['Wch' + d], hidden_prev);
var cell_write = G.tanh(G.add(G.add(h6, h7), model['bc' + d]));
// compute new cell activation
var retain_cell = G.eltmul(forget_gate, cell_prev); // what do we keep from cell
var write_cell = G.eltmul(input_gate, cell_write); // what do we write to cell
var cell_d = G.add(retain_cell, write_cell); // new cell contents
// compute hidden state as gated, saturated cell activations
var hidden_d = G.eltmul(output_gate, G.tanh(cell_d));
hidden.push(hidden_d);
cell.push(cell_d);
}
// one decoder to outputs at end
var output = G.add(G.mul(model['Whd'], hidden[hidden.length - 1]), model['bd']);
// return cell memory, hidden representation and output
return {
'h': hidden,
'c': cell,
'o': output
};
}
var sig = function (x) {
// helper function for computing sigmoid
return 1.0 / (1 + Math.exp(-x));
}
var maxi = function (w) {
// argmax of array w
var maxv = w[0];
var maxix = 0;
for (var i = 1, n = w.length; i < n; i++) {
var v = w[i];
if (v > maxv) {
maxix = i;
maxv = v;
}
}
return maxix;
}
var samplei = function (w) {
// sample argmax from w, assuming w are
// probabilities that sum to one
var r = randf(0, 1);
var x = 0.0;
var i = 0;
while (true) {
x += w[i];
if (x > r) {
return i;
}
i++;
}
return w.length - 1; // pretty sure we should never get here?
}
// various utils
global.assert = assert;
global.zeros = zeros;
global.maxi = maxi;
global.samplei = samplei;
global.randi = randi;
global.randn = randn;
global.softmax = softmax;
// classes
global.Mat = Mat;
global.RandMat = RandMat;
global.forwardLSTM = forwardLSTM;
global.initLSTM = initLSTM;
// more utils
global.updateMat = updateMat;
global.updateNet = updateNet;
global.copyMat = copyMat;
global.copyNet = copyNet;
global.netToJSON = netToJSON;
global.netFromJSON = netFromJSON;
global.netZeroGrads = netZeroGrads;
global.netFlattenGrads = netFlattenGrads;
// optimization
global.Solver = Solver;
global.Graph = Graph;
})(R);
// END OF RECURRENTJS
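// Illustrative usage sketch (a minimal example, not part of the original library):
// it builds a tiny 3-input, 2-output tanh layer with the R API above and shows how
// Graph records each op so backward() can replay them in reverse to fill gradients.
(function () {
var W = new R.RandMat(2, 3, 0, 0.08); // 2x3 weight matrix
var b = new R.Mat(2, 1); // 2x1 bias column vector
var x = new R.Mat(3, 1); // input column vector
x.setFrom([0.5, -0.1, 2.0]);
var G = new R.Graph(true); // needs_backprop = true
var out = G.tanh(G.add(G.mul(W, x), b)); // forward pass, each op gets recorded
out.dw[0] = 1.0; // pretend a loss gradient arrived on the first output
G.backward(); // replays the recorded ops backwards, fills W.dw, b.dw, x.dw
R.updateNet({ W: W, b: b }, 0.01); // SGD-style step; gradients are reset to 0
})();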
var RL = {};
(function (global) {
"use strict";
// syntactic sugar function for getting default parameter values
var getopt = function (opt, field_name, default_value) {
if (typeof opt === 'undefined') {
return default_value;
}
return (typeof opt[field_name] !== 'undefined') ? opt[field_name] : default_value;
}
var zeros = R.zeros; // inherit these
var assert = R.assert;
var randi = R.randi;
var randf = R.randf;
var setConst = function (arr, c) {
for (var i = 0, n = arr.length; i < n; i++) {
arr[i] = c;
}
}
var sampleWeighted = function (p) {
var r = Math.random();
var c = 0.0;
for (var i = 0, n = p.length; i < n; i++) {
c += p[i];
if (c >= r) {
return i;
}
}
assert(false, 'wtf');
}
// ------
// AGENTS
// ------
// DPAgent performs Value Iteration
// - can also be used for Policy Iteration if you really wanted to
// - requires model of the environment :(
// - does not learn from experience :(
// - assumes finite MDP :(
var DPAgent = function (env, opt) {
this.V = null; // state value function
this.P = null; // policy distribution \pi(s,a)
this.env = env; // store pointer to environment
this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor
this.reset();
}
DPAgent.prototype = {
reset: function () {
// reset the agent's policy and value function
this.ns = this.env.getNumStates();
this.na = this.env.getMaxNumActions();
this.V = zeros(this.ns);
this.P = zeros(this.ns * this.na);
// initialize uniform random policy
for (var s = 0; s < this.ns; s++) {
var poss = this.env.allowedActions(s);
for (var i = 0, n = poss.length; i < n; i++) {
this.P[poss[i] * this.ns + s] = 1.0 / poss.length;
}
}
},
act: function (s) {
// behave according to the learned policy
var poss = this.env.allowedActions(s);
var ps = [];
for (var i = 0, n = poss.length; i < n; i++) {
var a = poss[i];
var prob = this.P[a * this.ns + s];
ps.push(prob);
}
var maxi = sampleWeighted(ps);
return poss[maxi];
},
learn: function () {
// perform a single round of value iteration
this.evaluatePolicy(); // writes this.V
this.updatePolicy(); // writes this.P
},
evaluatePolicy: function () {
// perform a synchronous update of the value function
var Vnew = zeros(this.ns);
for (var s = 0; s < this.ns; s++) {
// integrate over actions in a stochastic policy
// note that we assume that policy probability mass over allowed actions sums to one
var v = 0.0;
var poss = this.env.allowedActions(s);
for (var i = 0, n = poss.length; i < n; i++) {
var a = poss[i];
var prob = this.P[a * this.ns + s]; // probability of taking action under policy
if (prob === 0) {
continue;
} // no contribution, skip for speed
var ns = this.env.nextStateDistribution(s, a);
var rs = this.env.reward(s, a, ns); // reward for s->a->ns transition
v += prob * (rs + this.gamma * this.V[ns]);
}
Vnew[s] = v;
}
this.V = Vnew; // swap
},
updatePolicy: function () {
// update policy to be greedy w.r.t. learned Value function
for (var s = 0; s < this.ns; s++) {
var poss = this.env.allowedActions(s);
// compute value of taking each allowed action
var vmax, nmax;
var vs = [];
for (var i = 0, n = poss.length; i < n; i++) {
var a = poss[i];
var ns = this.env.nextStateDistribution(s, a);
var rs = this.env.reward(s, a, ns);
var v = rs + this.gamma * this.V[ns];
vs.push(v);
if (i === 0 || v > vmax) {
vmax = v;
nmax = 1;
} else if (v === vmax) {
nmax += 1;
}
}
// update policy smoothly across all argmaxy actions
for (var i = 0, n = poss.length; i < n; i++) {
var a = poss[i];
this.P[a * this.ns + s] = (vs[i] === vmax) ? 1.0 / nmax : 0.0;
}
}
},
}
// TDAgent uses TD learning (Q-Learning, SARSA)
// - does not require environment model :)
// - learns from experience :)
var TDAgent = function (env, opt) {
this.update = getopt(opt, 'update', 'qlearn'); // qlearn | sarsa
this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor
this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy
this.alpha = getopt(opt, 'alpha', 0.01); // value function learning rate
// class allows non-deterministic policy, and smoothly regressing towards the optimal policy based on Q
this.smooth_policy_update = getopt(opt, 'smooth_policy_update', false);
this.beta = getopt(opt, 'beta', 0.01); // learning rate for policy, if smooth updates are on
// eligibility traces
this.lambda = getopt(opt, 'lambda', 0); // eligibility trace decay. 0 = no eligibility traces used
this.replacing_traces = getopt(opt, 'replacing_traces', true);
// optional optimistic initial values
this.q_init_val = getopt(opt, 'q_init_val', 0);
this.planN = getopt(opt, 'planN', 0); // number of planning steps per learning iteration (0 = no planning)
this.Q = null; // state action value function
this.P = null; // policy distribution \pi(s,a)
this.e = null; // eligibility trace
this.env_model_s = null; // environment model (s,a) -> s'
this.env_model_r = null; // environment model (s,a) -> r
this.env = env; // store pointer to environment
this.reset();
}
TDAgent.prototype = {
reset: function () {
// reset the agent's policy and value function
this.ns = this.env.getNumStates();
this.na = this.env.getMaxNumActions();
this.Q = zeros(this.ns * this.na);
if (this.q_init_val !== 0) {
setConst(this.Q, this.q_init_val);
}
this.P = zeros(this.ns * this.na);
this.e = zeros(this.ns * this.na);
// model/planning vars
this.env_model_s = zeros(this.ns * this.na);
setConst(this.env_model_s, -1); // init to -1 so we can test if we saw the state before
this.env_model_r = zeros(this.ns * this.na);
this.sa_seen = [];
this.pq = zeros(this.ns * this.na);
// initialize uniform random policy
for (var s = 0; s < this.ns; s++) {
var poss = this.env.allowedActions(s);
for (var i = 0, n = poss.length; i < n; i++) {
this.P[poss[i] * this.ns + s] = 1.0 / poss.length;
}
}
// agent memory, needed for streaming updates
// (s0,a0,r0,s1,a1,r1,...)
this.r0 = null;
this.s0 = null;
this.s1 = null;
this.a0 = null;
this.a1 = null;
},
resetEpisode: function () {
// an episode finished
},
act: function (s) {
// act according to epsilon greedy policy
var poss = this.env.allowedActions(s);
var probs = [];
for (var i = 0, n = poss.length; i < n; i++) {
probs.push(this.P[poss[i] * this.ns + s]);
}
// epsilon greedy policy
if (Math.random() < this.epsilon) {
var a = poss[randi(0, poss.length)]; // random available action
this.explored = true;
} else {
var a = poss[sampleWeighted(probs)];
this.explored = false;
}
// shift state memory
this.s0 = this.s1;
this.a0 = this.a1;
this.s1 = s;
this.a1 = a;
return a;
},
learn: function (r1) {
// takes reward for previous action, which came from a call to act()
if (!(this.r0 == null)) {
this.learnFromTuple(this.s0, this.a0, this.r0, this.s1, this.a1, this.lambda);
if (this.planN > 0) {
this.updateModel(this.s0, this.a0, this.r0, this.s1);
this.plan();
}
}
this.r0 = r1; // store this for next update
},
updateModel: function (s0, a0, r0, s1) {
// transition (s0,a0) -> (r0,s1) was observed. Update environment model
var sa = a0 * this.ns + s0;
if (this.env_model_s[sa] === -1) {
// first time we see this state action
this.sa_seen.push(a0 * this.ns + s0); // add as seen state
}
this.env_model_s[sa] = s1;
this.env_model_r[sa] = r0;
},
plan: function () {
// order the states based on current priority queue information
var spq = [];
for (var i = 0, n = this.sa_seen.length; i < n; i++) {
var sa = this.sa_seen[i];
var sap = this.pq[sa];
if (sap > 1e-5) { // gain a bit of efficiency
spq.push({
sa: sa,
p: sap
});
}
}
spq.sort(function (a, b) {
return a.p < b.p ? 1 : -1
});
// perform the updates
var nsteps = Math.min(this.planN, spq.length);
for (var k = 0; k < nsteps; k++) {
// random exploration
//var i = randi(0, this.sa_seen.length); // pick random prev seen state action
//var s0a0 = this.sa_seen[i];
var s0a0 = spq[k].sa;
this.pq[s0a0] = 0; // erase priority, since we're backing up this state
var s0 = s0a0 % this.ns;
var a0 = Math.floor(s0a0 / this.ns);
var r0 = this.env_model_r[s0a0];
var s1 = this.env_model_s[s0a0];
var a1 = -1; // not used for Q learning
if (this.update === 'sarsa') {
// generate random action?...
var poss = this.env.allowedActions(s1);
var a1 = poss[randi(0, poss.length)];
}
this.learnFromTuple(s0, a0, r0, s1, a1, 0); // note lambda = 0 - shouldn't use eligibility trace here
}
},
learnFromTuple: function (s0, a0, r0, s1, a1, lambda) {
var sa = a0 * this.ns + s0;
// calculate the target for Q(s,a)
if (this.update === 'qlearn') {
// Q learning target is Q(s0,a0) = r0 + gamma * max_a Q[s1,a]
var poss = this.env.allowedActions(s1);
var qmax = 0;
for (var i = 0, n = poss.length; i < n; i++) {
var s1a = poss[i] * this.ns + s1;
var qval = this.Q[s1a];
if (i === 0 || qval > qmax) {
qmax = qval;
}
}
var target = r0 + this.gamma * qmax;
} else if (this.update === 'sarsa') {
// SARSA target is Q(s0,a0) = r0 + gamma * Q[s1,a1]
var s1a1 = a1 * this.ns + s1;
var target = r0 + this.gamma * this.Q[s1a1];
}
if (lambda > 0) {
// perform an eligibility trace update
if (this.replacing_traces) {
this.e[sa] = 1;
} else {
this.e[sa] += 1;
}
var edecay = lambda * this.gamma;
var state_update = zeros(this.ns);
for (var s = 0; s < this.ns; s++) {
var poss = this.env.allowedActions(s);
for (var i = 0; i < poss.length; i++) {
var a = poss[i];
var saloop = a * this.ns + s;
var esa = this.e[saloop];
var update = this.alpha * esa * (target - this.Q[saloop]);
this.Q[saloop] += update;
this.updatePriority(s, a, update);
this.e[saloop] *= edecay;
var u = Math.abs(update);
if (u > state_update[s]) {
state_update[s] = u;
}
}
}
for (var s = 0; s < this.ns; s++) {
if (state_update[s] > 1e-5) { // save efficiency here
this.updatePolicy(s);
}
}
if (this.explored && this.update === 'qlearn') {
// have to wipe the trace since q learning is off-policy :(
this.e = zeros(this.ns * this.na);
}
} else {
// simpler and faster update without eligibility trace
// update Q[sa] towards it with some step size
var update = this.alpha * (target - this.Q[sa]);
this.Q[sa] += update;
this.updatePriority(s0, a0, update);
// update the policy to reflect the change (if appropriate)
this.updatePolicy(s0);
}
},
updatePriority: function (s, a, u) {
// used in planning. Invoked when Q[sa] += update
// we should find all states that lead to (s,a) and upgrade their priority
// of being update in the next planning step
u = Math.abs(u);
if (u < 1e-5) {
return;
} // for efficiency skip small updates
if (this.planN === 0) {
return;
} // there is no planning to be done, skip.
for (var si = 0; si < this.ns; si++) {
// note we are also iterating over impossible actions at all states,
// but this should be okay because their env_model_s should simply be -1
// as initialized, so they will never be predicted to point to any state
// because they will never be observed, and hence never be added to the model
for (var ai = 0; ai < this.na; ai++) {
var siai = ai * this.ns + si;
if (this.env_model_s[siai] === s) {
// this state leads to s, add it to priority queue
this.pq[siai] += u;
}
}
}
},
updatePolicy: function (s) {
var poss = this.env.allowedActions(s);
// set policy at s to be the action that achieves max_a Q(s,a)
// first find the maxy Q values
var qmax, nmax;
var qs = [];
for (var i = 0, n = poss.length; i < n; i++) {
var a = poss[i];
var qval = this.Q[a * this.ns + s];
qs.push(qval);
if (i === 0 || qval > qmax) {
qmax = qval;
nmax = 1;
} else if (qval === qmax) {
nmax += 1;
}
}
// now update the policy smoothly towards the argmaxy actions
var psum = 0.0;
for (var i = 0, n = poss.length; i < n; i++) {
var a = poss[i];
var target = (qs[i] === qmax) ? 1.0 / nmax : 0.0;
var ix = a * this.ns + s;
if (this.smooth_policy_update) {
// slightly hacky :p
this.P[ix] += this.beta * (target - this.P[ix]);
psum += this.P[ix];
} else {
// set hard target
this.P[ix] = target;
}
}
if (this.smooth_policy_update) {
// renormalize P if we're using smooth policy updates
for (var i = 0, n = poss.length; i < n; i++) {
var a = poss[i];
this.P[a * this.ns + s] /= psum;
}
}
}
}
var DQNAgent = function (env, opt) {
this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor
this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy
this.alpha = getopt(opt, 'alpha', 0.01); // value function learning rate
this.experience_add_every = getopt(opt, 'experience_add_every', 25); // number of time steps before we add another experience to replay memory
this.experience_size = getopt(opt, 'experience_size', 5000); // size of experience replay
this.learning_steps_per_iteration = getopt(opt, 'learning_steps_per_iteration', 10);
this.tderror_clamp = getopt(opt, 'tderror_clamp', 1.0);
this.num_hidden_units = getopt(opt, 'num_hidden_units', 100);
this.env = env;
this.reset();
}
DQNAgent.prototype = {
reset: function () {
this.nh = this.num_hidden_units; // number of hidden units
this.ns = this.env.getNumStates();
this.na = this.env.getMaxNumActions();
// nets are hardcoded for now as key (str) -> Mat
// not proud of this. better solution is to have a whole Net object
// on top of Mats, but for now sticking with this
this.net = {};
this.net.W1 = new R.RandMat(this.nh, this.ns, 0, 0.01);
this.net.b1 = new R.Mat(this.nh, 1, 0, 0.01);
this.net.W2 = new R.RandMat(this.na, this.nh, 0, 0.01);
this.net.b2 = new R.Mat(this.na, 1, 0, 0.01);
this.exp = []; // experience
this.expi = 0; // where to insert
this.t = 0;
this.r0 = null;
this.s0 = null;
this.s1 = null;
this.a0 = null;
this.a1 = null;
this.tderror = 0; // for visualization only...
},
toJSON: function () {
// save function
var j = {};
j.nh = this.nh;
j.ns = this.ns;
j.na = this.na;
j.net = R.netToJSON(this.net);
return j;
},
fromJSON: function (j) {
// load function
this.nh = j.nh;
this.ns = j.ns;
this.na = j.na;
this.net = R.netFromJSON(j.net);
},
forwardQ: function (net, s, needs_backprop) {
var G = new R.Graph(needs_backprop);
var a1mat = G.add(G.mul(net.W1, s), net.b1);
var h1mat = G.tanh(a1mat);
var a2mat = G.add(G.mul(net.W2, h1mat), net.b2);
this.lastG = G; // back this up. Kind of hacky isn't it
return a2mat;
},
act: function (slist) {
// convert to a Mat column vector
var s = new R.Mat(this.ns, 1);
s.setFrom(slist);
// epsilon greedy policy
if (Math.random() < this.epsilon) {
var a = randi(0, this.na);
} else {
// greedy wrt Q function
var amat = this.forwardQ(this.net, s, false);
var a = R.maxi(amat.w); // returns index of argmax action
}
// shift state memory
this.s0 = this.s1;
this.a0 = this.a1;
this.s1 = s;
this.a1 = a;
return a;
},
learn: function (r1) {
// perform an update on Q function
if (!(this.r0 == null) && this.alpha > 0) {
// learn from this tuple to get a sense of how "surprising" it is to the agent
var tderror = this.learnFromTuple(this.s0, this.a0, this.r0, this.s1, this.a1);
this.tderror = tderror; // a measure of surprise
// decide if we should keep this experience in the replay
if (this.t % this.experience_add_every === 0) {
this.exp[this.expi] = [this.s0, this.a0, this.r0, this.s1, this.a1];
this.expi += 1;
if (this.expi > this.experience_size) {
this.expi = 0;
} // roll over when we run out
}
this.t += 1;
// sample some additional experience from replay memory and learn from it
for (var k = 0; k < this.learning_steps_per_iteration; k++) {
var ri = randi(0, this.exp.length); // todo: priority sweeps?
var e = this.exp[ri];
this.learnFromTuple(e[0], e[1], e[2], e[3], e[4])
}
}
this.r0 = r1; // store for next update
},
learnFromTuple: function (s0, a0, r0, s1, a1) {
// want: Q(s,a) = r + gamma * max_a' Q(s',a')
// compute the target Q value
var tmat = this.forwardQ(this.net, s1, false);
var qmax = r0 + this.gamma * tmat.w[R.maxi(tmat.w)];
// now predict
var pred = this.forwardQ(this.net, s0, true);
var tderror = pred.w[a0] - qmax;
var clamp = this.tderror_clamp;
if (Math.abs(tderror) > clamp) { // huber loss to robustify
if (tderror > clamp) tderror = clamp;
if (tderror < -clamp) tderror = -clamp;
}
pred.dw[a0] = tderror;
this.lastG.backward(); // compute gradients on net params
// update net
R.updateNet(this.net, this.alpha);
return tderror;
}
}
// buggy implementation, doesn't work...
var SimpleReinforceAgent = function (env, opt) {
this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor
this.epsilon = getopt(opt, 'epsilon', 0.75); // for epsilon-greedy policy
this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate
this.beta = getopt(opt, 'beta', 0.01); // baseline net learning rate
this.env = env;
this.reset();
}
SimpleReinforceAgent.prototype = {
reset: function () {
this.ns = this.env.getNumStates();
this.na = this.env.getMaxNumActions();
this.nh = 100; // number of hidden units
this.nhb = 100; // and also in the baseline lstm
this.actorNet = {};
this.actorNet.W1 = new R.RandMat(this.nh, this.ns, 0, 0.01);
this.actorNet.b1 = new R.Mat(this.nh, 1, 0, 0.01);
this.actorNet.W2 = new R.RandMat(this.na, this.nh, 0, 0.1);
this.actorNet.b2 = new R.Mat(this.na, 1, 0, 0.01);
this.actorOutputs = [];
this.actorGraphs = [];
this.actorActions = []; // sampled ones
this.rewardHistory = [];
this.baselineNet = {};
this.baselineNet.W1 = new R.RandMat(this.nhb, this.ns, 0, 0.01);
this.baselineNet.b1 = new R.Mat(this.nhb, 1, 0, 0.01);
this.baselineNet.W2 = new R.RandMat(this.na, this.nhb, 0, 0.01);
this.baselineNet.b2 = new R.Mat(this.na, 1, 0, 0.01);
this.baselineOutputs = [];
this.baselineGraphs = [];
this.t = 0;
},
forwardActor: function (s, needs_backprop) {
var net = this.actorNet;
var G = new R.Graph(needs_backprop);
var a1mat = G.add(G.mul(net.W1, s), net.b1);
var h1mat = G.tanh(a1mat);
var a2mat = G.add(G.mul(net.W2, h1mat), net.b2);
return {
'a': a2mat,
'G': G
}
},
forwardValue: function (s, needs_backprop) {
var net = this.baselineNet;
var G = new R.Graph(needs_backprop);
var a1mat = G.add(G.mul(net.W1, s), net.b1);
var h1mat = G.tanh(a1mat);
var a2mat = G.add(G.mul(net.W2, h1mat), net.b2);
return {
'a': a2mat,
'G': G
}
},
act: function (slist) {
// convert to a Mat column vector
var s = new R.Mat(this.ns, 1);
s.setFrom(slist);
// forward the actor to get action output
var ans = this.forwardActor(s, true);
var amat = ans.a;
var ag = ans.G;
this.actorOutputs.push(amat);
this.actorGraphs.push(ag);
// forward the baseline estimator
var ans = this.forwardValue(s, true);
var vmat = ans.a;
var vg = ans.G;
this.baselineOutputs.push(vmat);
this.baselineGraphs.push(vg);
// sample action from the stochastic gaussian policy
var a = R.copyMat(amat);
var gaussVar = 0.02;
a.w[0] = R.randn(0, gaussVar);
a.w[1] = R.randn(0, gaussVar);
this.actorActions.push(a);
// shift state memory
this.s0 = this.s1;
this.a0 = this.a1;
this.s1 = s;
this.a1 = a;
return a;
},
learn: function (r1) {
// perform an update on Q function
this.rewardHistory.push(r1);
var n = this.rewardHistory.length;
var baselineMSE = 0.0;
var nup = 100; // what chunk of experience to take
var nuse = 80; // what chunk to update from
if (n >= nup) {
// lets learn and flush
// first: compute the sample values at all points
var vs = [];
for (var t = 0; t < nuse; t++) {
var mul = 1;
// compute the actual discounted reward for this time step
var V = 0;
for (var t2 = t; t2 < n; t2++) {
V += mul * this.rewardHistory[t2];
mul *= this.gamma;
if (mul < 1e-5) {
break;
} // efficiency savings
}
// get the predicted baseline at this time step
var b = this.baselineOutputs[t].w[0];
for (var i = 0; i < this.na; i++) {
// [the action delta] * [the desirability]
var update = -(V - b) * (this.actorActions[t].w[i] - this.actorOutputs[t].w[i]);
if (update > 0.1) {
update = 0.1;
}
if (update < -0.1) {
update = -0.1;
}
this.actorOutputs[t].dw[i] += update;
}
var update = -(V - b);
if (update > 0.1) {
update = 0.1;
}
if (update < -0.1) {
update = -0.1;
}
this.baselineOutputs[t].dw[0] += update;
baselineMSE += (V - b) * (V - b);
vs.push(V);
}
baselineMSE /= nuse;
// backprop all the things
for (var t = 0; t < nuse; t++) {
this.actorGraphs[t].backward();
this.baselineGraphs[t].backward();
}
R.updateNet(this.actorNet, this.alpha); // update actor network
R.updateNet(this.baselineNet, this.beta); // update baseline network
// flush
this.actorOutputs = [];
this.rewardHistory = [];
this.actorActions = [];
this.baselineOutputs = [];
this.actorGraphs = [];
this.baselineGraphs = [];
this.tderror = baselineMSE;
}
this.t += 1;
this.r0 = r1; // store for next update
},
}
// buggy implementation as well, doesn't work
var RecurrentReinforceAgent = function (env, opt) {
this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor
this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy
this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate
this.beta = getopt(opt, 'beta', 0.01); // baseline net learning rate
this.env = env;
this.reset();
}
RecurrentReinforceAgent.prototype = {
reset: function () {
this.ns = this.env.getNumStates();
this.na = this.env.getMaxNumActions();
this.nh = 40; // number of hidden units
this.nhb = 40; // and also in the baseline lstm
this.actorLSTM = R.initLSTM(this.ns, [this.nh], this.na);
this.actorG = new R.Graph();
this.actorPrev = null;
this.actorOutputs = [];
this.rewardHistory = [];
this.actorActions = [];
this.baselineLSTM = R.initLSTM(this.ns, [this.nhb], 1);
this.baselineG = new R.Graph();
this.baselinePrev = null;
this.baselineOutputs = [];
this.t = 0;
this.r0 = null;
this.s0 = null;
this.s1 = null;
this.a0 = null;
this.a1 = null;
},
act: function (slist) {
// convert to a Mat column vector
var s = new R.Mat(this.ns, 1);
s.setFrom(slist);
// forward the LSTM to get action distribution
var actorNext = R.forwardLSTM(this.actorG, this.actorLSTM, [this.nh], s, this.actorPrev);
this.actorPrev = actorNext;
var amat = actorNext.o;
this.actorOutputs.push(amat);
// forward the baseline LSTM
var baselineNext = R.forwardLSTM(this.baselineG, this.baselineLSTM, [this.nhb], s, this.baselinePrev);
this.baselinePrev = baselineNext;
this.baselineOutputs.push(baselineNext.o);
// sample action from actor policy
var gaussVar = 0.05;
var a = R.copyMat(amat);
for (var i = 0, n = a.w.length; i < n; i++) {
a.w[i] += R.randn(0, gaussVar);
}
this.actorActions.push(a);
// shift state memory
this.s0 = this.s1;
this.a0 = this.a1;
this.s1 = s;
this.a1 = a;
return a;
},
learn: function (r1) {
// perform an update on Q function
this.rewardHistory.push(r1);
var n = this.rewardHistory.length;
var baselineMSE = 0.0;
var nup = 100; // what chunk of experience to take
var nuse = 80; // what chunk to also update
if (n >= nup) {
// lets learn and flush
// first: compute the sample values at all points
var vs = [];
for (var t = 0; t < nuse; t++) {
var mul = 1;
var V = 0;
for (var t2 = t; t2 < n; t2++) {
V += mul * this.rewardHistory[t2];
mul *= this.gamma;
if (mul < 1e-5) {
break;
} // efficiency savings
}
var b = this.baselineOutputs[t].w[0];
// todo: take out the constants etc.
for (var i = 0; i < this.na; i++) {
// [the action delta] * [the desirability]
var update = -(V - b) * (this.actorActions[t].w[i] - this.actorOutputs[t].w[i]);
if (update > 0.1) {
update = 0.1;
}
if (update < -0.1) {
update = -0.1;
}
this.actorOutputs[t].dw[i] += update;
}
var update = -(V - b);
if (update > 0.1) {
update = 0.1;
}
if (update < -0.1) {
update = -0.1;
}
this.baselineOutputs[t].dw[0] += update;
baselineMSE += (V - b) * (V - b);
vs.push(V);
}
baselineMSE /= nuse;
this.actorG.backward(); // update params! woohoo!
this.baselineG.backward();
R.updateNet(this.actorLSTM, this.alpha); // update actor network
R.updateNet(this.baselineLSTM, this.beta); // update baseline network
// flush
this.actorG = new R.Graph();
this.actorPrev = null;
this.actorOutputs = [];
this.rewardHistory = [];
this.actorActions = [];
this.baselineG = new R.Graph();
this.baselinePrev = null;
this.baselineOutputs = [];
this.tderror = baselineMSE;
}
this.t += 1;
this.r0 = r1; // store for next update
},
}
// Currently buggy implementation, doesn't work
var DeterministPG = function (env, opt) {
this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor
this.epsilon = getopt(opt, 'epsilon', 0.5); // for epsilon-greedy policy
this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate
this.beta = getopt(opt, 'beta', 0.01); // baseline net learning rate
this.env = env;
this.reset();
}
DeterministPG.prototype = {
reset: function () {
this.ns = this.env.getNumStates();
this.na = this.env.getMaxNumActions();
this.nh = 100; // number of hidden units
// actor
this.actorNet = {};
this.actorNet.W1 = new R.RandMat(this.nh, this.ns, 0, 0.01);
this.actorNet.b1 = new R.Mat(this.nh, 1, 0, 0.01);
this.actorNet.W2 = new R.RandMat(this.na, this.ns, 0, 0.1);
this.actorNet.b2 = new R.Mat(this.na, 1, 0, 0.01);
this.ntheta = this.na * this.ns + this.na; // number of params in actor
// critic
this.criticw = new R.RandMat(1, this.ntheta, 0, 0.01); // row vector
this.r0 = null;
this.s0 = null;
this.s1 = null;
this.a0 = null;
this.a1 = null;
this.t = 0;
},
forwardActor: function (s, needs_backprop) {
var net = this.actorNet;
var G = new R.Graph(needs_backprop);
var a1mat = G.add(G.mul(net.W1, s), net.b1);
var h1mat = G.tanh(a1mat);
var a2mat = G.add(G.mul(net.W2, h1mat), net.b2);
return {
'a': a2mat,
'G': G
}
},
act: function (slist) {
// convert to a Mat column vector
var s = new R.Mat(this.ns, 1);
s.setFrom(slist);
// forward the actor to get action output
var ans = this.forwardActor(s, false);
var amat = ans.a;
var ag = ans.G;
// sample action from the stochastic gaussian policy
var a = R.copyMat(amat);
if (Math.random() < this.epsilon) {
var gaussVar = 0.02;
a.w[0] = R.randn(0, gaussVar);
a.w[1] = R.randn(0, gaussVar);
}
var clamp = 0.25;
if (a.w[0] > clamp) a.w[0] = clamp;
if (a.w[0] < -clamp) a.w[0] = -clamp;
if (a.w[1] > clamp) a.w[1] = clamp;
if (a.w[1] < -clamp) a.w[1] = -clamp;
// shift state memory
this.s0 = this.s1;
this.a0 = this.a1;
this.s1 = s;
this.a1 = a;
return a;
},
utilJacobianAt: function (s) {
var ujacobian = new R.Mat(this.ntheta, this.na);
for (var a = 0; a < this.na; a++) {
R.netZeroGrads(this.actorNet);
var ag = this.forwardActor(this.s0, true);
ag.a.dw[a] = 1.0;
ag.G.backward();
var gflat = R.netFlattenGrads(this.actorNet);
ujacobian.setColumn(gflat, a);
}
return ujacobian;
},
learn: function (r1) {
// perform an update on Q function
//this.rewardHistory.push(r1);
if (!(this.r0 == null)) {
var Gtmp = new R.Graph(false);
// dpg update:
// first compute the features psi:
// the jacobian matrix of the actor for s
var ujacobian0 = this.utilJacobianAt(this.s0);
// now form the features \psi(s,a)
var psi_sa0 = Gtmp.mul(ujacobian0, this.a0); // should be [this.ntheta x 1] "feature" vector
var qw0 = Gtmp.mul(this.criticw, psi_sa0); // 1x1
// now do the same thing because we need \psi(s_{t+1}, \mu_\theta(s_{t+1}))
var ujacobian1 = this.utilJacobianAt(this.s1);
var ag = this.forwardActor(this.s1, false);
var psi_sa1 = Gtmp.mul(ujacobian1, ag.a);
var qw1 = Gtmp.mul(this.criticw, psi_sa1); // 1x1
// get the td error finally
var tderror = this.r0 + this.gamma * qw1.w[0] - qw0.w[0]; // lol
if (tderror > 0.5) tderror = 0.5; // clamp
if (tderror < -0.5) tderror = -0.5;
this.tderror = tderror;
// update actor policy with natural gradient
var net = this.actorNet;
var ix = 0;
for (var p in net) {
var mat = net[p];
if (net.hasOwnProperty(p)) {
for (var i = 0, n = mat.w.length; i < n; i++) {
mat.w[i] += this.alpha * this.criticw.w[ix]; // natural gradient update
ix += 1;
}
}
}
// update the critic parameters too
for (var i = 0; i < this.ntheta; i++) {
var update = this.beta * tderror * psi_sa0.w[i];
this.criticw.w[i] += update;
}
}
this.r0 = r1; // store for next update
},
}
// exports
global.DPAgent = DPAgent;
global.TDAgent = TDAgent;
global.DQNAgent = DQNAgent;
//global.SimpleReinforceAgent = SimpleReinforceAgent;
//global.RecurrentReinforceAgent = RecurrentReinforceAgent;
//global.DeterministPG = DeterministPG;
})(RL);
var Trevel = {
//settings you can change
stop: false,
maxBet: 0.000005,
minBet: 0.00000005,
swap: true,
betSpeed: 100,//change this on init
verbose: true,
isTesting: false,
//money management
useKelly: false,//martingale performs better on live account!
kellyPercent: 5, //can't be more than 100 or less than 1
useMartingale: true, //if kelly is true this won't work
martingaleMultiplier: 2,
//bot settings, these are set automatically, don't change them
currentBalance: 0,
startingBalance: 0,
betAmount: 0,
profit: 0,
totalBets: 0,
totalWins: 0,
winRate: 0,
betHistory: [], //the sequence of outcomes that actually hit (HI/LO), not the bets we placed
betOutcomes: [],
hbProbability: 0,
lbProbability: 0,
hbCount: 0,
lbcount: 0,
nextBet: "",
previousReward: 0,
addBet: function (bet, outcome) {
if (bet === "LB" && outcome === "Win") {
Trevel.betHistory.push("LO");
Trevel.betOutcomes.push("W");
Trevel.totalWins++;
Trevel.lbcount++;
}
if (bet === "LB" && outcome === "Loose") {
Trevel.betHistory.push("HI");
Trevel.hbCount++;
Trevel.betOutcomes.push("L");
}
if (bet === "HB" && outcome === "Win") {
Trevel.betHistory.push("HI");
Trevel.totalWins++;
Trevel.hbCount++;
Trevel.betOutcomes.push("W");
}
if (bet === "HB" && outcome === "Loose") {
Trevel.betHistory.push("LO");
Trevel.lbcount++;
Trevel.betOutcomes.push("L");
}
Trevel.totalBets++;
},
calculateProbabilities: function () {
Trevel.hbProbability = Trevel.hbCount / Trevel.betHistory.length;
Trevel.lbProbability = Trevel.lbcount / Trevel.betHistory.length;
Trevel.winRate = Trevel.totalWins / Trevel.totalBets;
if (Trevel.isTesting === false) {
Trevel.profit = Trevel.getProfit();
}
},
getCurrentBalance: function () {
return parseFloat($('#balance').html());
},
placeHighBet: function () {
DoubleYourBTC('hi');
},
placeLowBet: function () {
DoubleYourBTC('lo');
},
setBetAmount: function (amount) {
var elem = stake.val(amount);
},
setOutcome: function (bet) {
if (lose.display == 'block') {
if (chance <= 2) {
chance = 2;
}
else {
chance -= 0.05;
}
payout.val(chance);
Trevel.addBet(bet, "Loose");
}
else if (win.display == 'block') {
if (chance >= 2.5) {
chance = 2.5;
}
else {
chance += 0.05;
}
payout.val(chance);
Trevel.addBet(bet, "Win");
}
},
prepareBet: function () {
Trevel.calculateProbabilities();
if (Trevel.betHistory.length < 10) {
if (Trevel.useMartingale === true) {
if (lose.display == 'block' && parseFloat(stake.val()) * parseFloat(payout.val()) < Trevel.maxBet) {
if (stake.val() > 0.00000100) {
Trevel.setBetAmount((parseFloat(stake.val()) * parseFloat(payout.val() / 2)).toFixed(8));
}
else {
Trevel.setBetAmount((parseFloat(stake.val()) * parseFloat(payout.val())).toFixed(8));
}
}
else if (win.display == 'block') {
if (parseFloat(stake.val()) / parseFloat(payout.val()) < Trevel.minBet) {
Trevel.setBetAmount(Trevel.minBet);
}
else {
if (stake.val() > 0.00000100) {
Trevel.setBetAmount(parseFloat(stake.val()) / parseFloat(payout.val() * 2).toFixed(8));
}
else {
Trevel.setBetAmount(parseFloat(stake.val()) / parseFloat(payout.val()).toFixed(8));
}
}
}
}
}
else {
if (Trevel.useKelly === true && Trevel.betHistory.length > 12) {
Trevel.currentBalance = Trevel.getCurrentBalance();
var currMulty = parseFloat(payout.val());
var kellyAmount = (((Trevel.currentBalance * Trevel.kellyPercent) / 100) * ((Trevel.winRate * currMulty - 1)) / (currMulty - 1)).toFixed(8);
if (kellyAmount > 0 && kellyAmount < Trevel.maxBet) {
Trevel.setBetAmount(kellyAmount);
}
else {
if (parseFloat(stake.val()) / parseFloat(payout.val()) < Trevel.minBet) {
Trevel.setBetAmount(Trevel.minBet);
}
else {
if (stake.val() > 0.00000100) {
Trevel.setBetAmount(parseFloat(stake.val()) / parseFloat(payout.val() * 2).toFixed(8));
}
else {
Trevel.setBetAmount(parseFloat(stake.val()) / parseFloat(payout.val()).toFixed(8));
}
}
}
}
else if (Trevel.useMartingale === true && Trevel.betHistory.length > 12) {
if (lose.display == 'block' && parseFloat(stake.val()) * parseFloat(payout.val()) < Trevel.maxBet) {
if (stake.val() > 0.00000100) {
Trevel.setBetAmount(parseFloat(stake.val()) * parseFloat(payout.val() / 2).toFixed(8));
}
else {
Trevel.setBetAmount(parseFloat(stake.val()) * parseFloat(payout.val()).toFixed(8));
}
}
else if (win.display == 'block') {
if (parseFloat(stake.val()) / parseFloat(payout.val()) < Trevel.minBet) {
Trevel.setBetAmount(Trevel.minBet);
}
else {
if (stake.val() > 0.00000100) {
Trevel.setBetAmount(parseFloat(stake.val()) / parseFloat(payout.val() * 2).toFixed(8));
}
else {
Trevel.setBetAmount(parseFloat(stake.val()) / parseFloat(payout.val()).toFixed(8));
}
}
}
}
}
},
placeBet: function () {
if (Trevel.nextBet === "HB") {
Trevel.placeHighBet();
} else if (Trevel.nextBet === "LB") {
Trevel.placeLowBet();
} else if (Trevel.betHistory.length > 0 && Trevel.swap === true) {
var prev = Trevel.betHistory[Trevel.betHistory.length - 1];
if (prev === "LO") {
Trevel.placeHighBet();
} else {
Trevel.placeLowBet();
}
} else {
Trevel.placeLowBet();
}
},
getProfit: function () {
return (Trevel.getCurrentBalance() - Trevel.startingBalance).toFixed(8);
},
getNumStates: function () {
return 8;
},
getMaxNumActions: function () {
return 2;
},
getSentiment: function (bet) {
if (bet === "HI") {
return 1;
} else {
return 0;
}
},
getPreviousBets: function () {
var hist = [];
if (Trevel.betHistory.length > 12) {
hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 1]));
hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 2]));
hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 3]));
hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 4]));
hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 5]));
hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 6]));
hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 7]));
hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 8]));
} else {
hist = [0, 1, 0, 1, 0, 1, 0, 1]; //in case we just started...
}
return hist;
},
getAgentState: function () { //we'll observe the last 8 bets
var s = Trevel.getPreviousBets();
return s;
},
getReward: function () {
var reward = 0;
var out1 = Trevel.betOutcomes[Trevel.betOutcomes.length - 1];
var out2 = Trevel.betOutcomes[Trevel.betOutcomes.length - 2];
if (out1 === "L") {
if (Trevel.previousReward < 0) {
reward = Trevel.previousReward;
reward += -0.03;
if (out2 === "L") {
reward += -0.03;
}
}
else {
reward = -0.03;
if (out2 === "L") {
reward += -0.03;
}
}
}
else {
if (Trevel.previousReward > 0) {
reward = Trevel.previousReward;
reward += 0.01;
if (out2 === "W") {
reward += 0.01;
}
}
else {
reward = 0.01;
if (out2 === "W") {
reward += 0.01;
}
}
}
return reward;
},
//for raw testing only
randomNumber: function (min, max) {
return Math.floor(Math.random() * (max - min + 1) + min);
},
getTestOutcome: function (random) {
if (random % 2 == 0) {
return "HI";
} else {
return "LO";
}
},
//initialize Trevel
init: function () {
Trevel.startingBalance = Trevel.currentBalance = Trevel.getCurrentBalance();
Trevel.setBetAmount(Trevel.minBet);
Trevel.stop = false;
Trevel.swap = true;
Trevel.betSpeed = 1000;
}
};
//Global variables
var balance = function () { return $('#balance').html(); },
stake = $('#double_your_btc_stake'), //.val()
payout = $('#double_your_btc_payout_multiplier'), //.val()
hibet = function () { DoubleYourBTC('hi'); },
lobet = function () { DoubleYourBTC('lo'); },
win = document.getElementById('double_your_btc_bet_win').style,
lose = document.getElementById('double_your_btc_bet_lose').style,
chance = parseFloat(payout.val()),
rString = function () {
var result = '';
var length = 16;
var chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz';
for (var i = length; i > 0; --i) result += chars[Math.floor(Math.random() * chars.length)];
return result;
},
rSeed = function () { $('#next_client_seed').val(rString()); };
//Deep Q learning with reinforceJS
var spec = {}
spec.update = 'qlearn';
spec.gamma = 0.9;
spec.epsilon = 0.45;
spec.alpha = 0.01;
spec.experience_add_every = 12;
spec.experience_size = 100000;
spec.learning_steps_per_iteration = 24;
spec.tderror_clamp = 0.7;
spec.num_hidden_units = 24;
// create an environment object
var env = Trevel;
if (env.isTesting === false) {
env.init();
}
// create the DQN agent
agent = new RL.DQNAgent(env, spec);
setInterval(function () {
if (env.stop === false) {
var state = env.getAgentState();
var action = agent.act(state);
var outcome = "";
if (env.isTesting === false) {
if (action === 0) {
env.nextBet = "LB";
env.prepareBet();
env.placeBet();
env.setOutcome("LB");
outcome = env.betOutcomes[env.betOutcomes.length - 1];
} else if (action === 1) {
env.nextBet = "HB";
env.prepareBet();
env.placeBet();
env.setOutcome("HB");
outcome = env.betOutcomes[env.betOutcomes.length - 1];
}
if (env.verbose === true) {
env.calculateProbabilities();
//console.log("Machine Bet: " + action + "{" + env.nextBet + "} isKelly: " + env.useKelly + " isMartingale: " + env.useMartingale);
console.log("Profit: " + env.profit + " WinRate: " + (env.winRate * 100).toFixed(2));
console.log("Balance: " + balance() + " Stake: " + stake.val() + " Payout: " + payout.val());
}
} else {
console.log("Action: " + action);
var testOutcome = env.getTestOutcome(env.randomNumber(0, 1000));
if (action === 0 && testOutcome === "LO") {
env.addBet("LB", "Win");
outcome = "W";
} else if (action === 0 && testOutcome === "HI") {
env.addBet("LB", "Loose");
outcome = "L";
} else if (action === 1 && testOutcome === "HI") {
env.addBet("HB", "Win");
outcome = "W";
} else if (action === 1 && testOutcome === "LO") {
env.addBet("HB", "Loose");
outcome = "L";
}
env.calculateProbabilities();
console.log("Winrate: " + (env.winRate * 100).toFixed(2));
}
var reward = env.getReward();
if (reward == 0) {
if (outcome === "L") {
reward = -0.03;
} else {
reward = 0.01;
}
}
agent.learn(reward);
env.previousReward = reward;
}
}, env.betSpeed);
That's what I'm asking to do, only it doesn't reward the bot. Otherwise it works pretty well, with decent profits fairly quickly. If you diff this code against the version you posted, you'll see I added a few things besides changing how the bet stake and payout multiplier are handled, such as a function that creates a random client seed, and I turned a few of the HTML elements into global variables.
@nickisghosty why don't you fork the repo and then make a pull request? That way your contributions are included in what's already there. Now, about your questions:
Back to rewards: as it stands, the "punishment" increases with a growing losing streak and the reward increases with a growing winning streak. One way to make this more pronounced without modifying much or adding new functions is to increase epsilon, up to a maximum of 0.99. This info is available on the reinforcejs website.
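A minimal sketch of that epsilon tweak, assuming you keep the existing spec object near the bottom of the script; the 0.99 value and the decayEpsilon helper are illustrative assumptions, not part of reinforcejs:
spec.epsilon = 0.99; // explore much more aggressively while training
// hypothetical helper: slowly anneal exploration back down over time
var decayEpsilon = function (agent, floor) {
agent.epsilon = Math.max(floor, agent.epsilon * 0.999);
};
// e.g. call decayEpsilon(agent, 0.1) once per tick after agent.learn(reward)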
@mbithy Tbh, I don't really understand how GitHub's pull/push/commit workflow works, haha. Sometimes I try to push new commits to my repos and it works, other times it doesn't. But I'll try right now with a minor addition: a new client seed every 500 satoshi profited, and a start/stop button with prompts to edit the configurable variables. Later I'll play around with the other settings you mentioned. One other question: what does the swap setting actually do? It seems counter-productive to say "if the last bet was hi, then bet lo"; I was under the impression it was the AI's job to determine whether the bet is hi or lo. There were a few other pieces of code that confused me as to how this works; I'll quote them once I come across them again.
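A rough sketch of that reseed idea, reusing the rSeed() helper and Trevel.getProfit() from the script above; the lastSeedProfit tracker and the 500-satoshi threshold are hypothetical additions:
var lastSeedProfit = 0; // hypothetical: profit (in satoshi) at the last reseed
var maybeReseed = function () {
var profitSats = Math.round(parseFloat(Trevel.getProfit()) * 1e8); // BTC -> satoshi
if (profitSats - lastSeedProfit >= 500) {
rSeed(); // writes a fresh random client seed into the form
lastSeedProfit = profitSats;
}
};
// call maybeReseed() once per tick inside the existing setInterval callback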
Hi @nickisghosty, just wondering, what effect would a new client seed have?
@kazi308 It guarantees that the rolls are more random. I like having a new seed for each and every roll so I know it is truly random and that the website isn't just saying it is, since you can only verify a roll after the fact.
Ah Ok, Thanks for that information! |
Swap is used when you have just started and the AI is still in training; it's not a strategy. Also, hold off on your pull request because I have made an update with a simulator dedicated to training the AI in a new tab.
Yoo, I had been using the first version for a while as a base for a bunch of test scripts on freebitco.in, and when I went to copy it again I noticed you had made a new one. Usually I'd be able to read through it, figure out how to change things myself, and improve it, and that's what I plan on doing with this, but it is much more complex than I'm used to.
I'm having trouble understanding the state/action/reward system. How do you define the actions and states? From there I should be able to understand the reward part.
What I'm ultimately trying to do is make it so that when the profit is in the negatives, the 'punishment' is greater than just losing the bet; and the same goes for when it wins while the profit is in the positives: the 'reward' will be greater than winning with a negative profit.
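One possible sketch of that shaping, assuming getReward() stays as is and Trevel.profit has been refreshed by calculateProbabilities(); the 1.5 multiplier is an arbitrary illustration, not a tuned value:
var shapeReward = function (baseReward) {
var profit = parseFloat(Trevel.profit); // signed session profit in BTC
if (baseReward < 0 && profit < 0) {
return baseReward * 1.5; // losing while already in the red hurts more
}
if (baseReward > 0 && profit > 0) {
return baseReward * 1.5; // winning while already in profit pays more
}
return baseReward;
};
// usage inside the setInterval loop: agent.learn(shapeReward(reward));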
I want the bot to know when to increase and decrease the bet. Once I get that, I'm going to want the bot to know when to increase and decrease the payout multiplier.
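A minimal sketch of how the action space could grow to cover that; the extra action codes and the adjustStake() helper are hypothetical, and getMaxNumActions() would need to report the larger count before the agent is created so the network's output layer is sized for it:
// hypothetical expansion: 0 = bet low, 1 = bet high, 2 = raise stake, 3 = lower stake
Trevel.getMaxNumActions = function () {
return 4;
};
Trevel.adjustStake = function (factor) {
var next = parseFloat(stake.val()) * factor;
next = Math.min(Math.max(next, Trevel.minBet), Trevel.maxBet); // stay within limits
Trevel.setBetAmount(next.toFixed(8));
};
// inside the setInterval callback the new codes would map to something like:
// if (action === 2) { env.adjustStake(2); } // double the stake
// if (action === 3) { env.adjustStake(0.5); } // halve the stake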
Maybe you have some articles you've been reading that you wouldn't mind sharing, other than the reinforcejs website and GitHub. Thanks.