#328 support time-series output in LSTM in webgpu backend #337

Merged
merged 1 commit on Jun 29, 2017
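This pull request addresses #328: the WebGPU LSTM kernel can now return the full hidden-state sequence (`return_sequences=True`) rather than only the last step, and it also exposes the final cell state. As a rough illustration (not taken from the PR, layer sizes arbitrary), a converted Keras model like the following is the kind of workload this enables:

```python
# Illustrative Keras model only; sizes are arbitrary and not from this PR.
from keras.models import Sequential
from keras.layers import Embedding, LSTM

model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=64, input_length=20))  # (N, T) -> (N, T, 64)
# With return_sequences=True, the converted WebGPU kernel now emits the whole
# hidden-state sequence in OrderNTC layout instead of only the final step (OrderNC).
model.add(LSTM(128, return_sequences=True))
```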
10 changes: 5 additions & 5 deletions package.json
@@ -1,8 +1,8 @@
{
"name": "mil-web-dnn",
"name": "WebDNN",
"version": "1.0.0",
"description": "Deep Neural Network Execution Framework for Web Browsers",
"main": "index.js",
"main": "dist/webdnn.js",
"directories": {
"doc": "docs",
"example": "example",
@@ -16,7 +16,7 @@
},
"repository": {
"type": "git",
"url": "git+https://github.com/mil-tokyo/mil-web-dnn.git"
"url": "git+https://github.com/mil-tokyo/webdnn.git"
},
"author": [
{
@@ -32,9 +32,9 @@
],
"license": "MIT",
"bugs": {
"url": "https://github.com/mil-tokyo/mil-web-dnn/issues"
"url": "https://github.com/mil-tokyo/webdnn/issues"
},
"homepage": "https://github.com/mil-tokyo/mil-web-dnn#readme",
"homepage": "https://mil-tokyo.github.io/webdnn/",
"devDependencies": {
"typedoc": "^0.6.0"
}
69 changes: 53 additions & 16 deletions src/graph_transpiler/webdnn/backend/webgpu/kernels/lstm.py
@@ -4,26 +4,34 @@
from webdnn.backend.code_generator.injectors.buffer_injector import BufferInjector
from webdnn.backend.code_generator.injectors.kernel_name_injector import KernelNameInjector
from webdnn.backend.webgpu.kernel import Kernel, GPUSize
from webdnn.backend.webgpu.preset_placeholders import MAX_THREADS_PER_THREADGROUP
from webdnn.graph.axis import Axis
from webdnn.graph.operators.lstm import LSTM
from webdnn.graph.order import OrderNC, OrderNTC, OrderCN

template_general = """

def generate_template_general(initial_C: bool, return_sequences: bool):
return """
kernel void %%FUNC_NAME%%(device float * %%STATIC_BUFFER%%[[buffer(0)]],
device float * %%DYNAMIC_BUFFER%%[[buffer(1)]],
const device int * %%META_BUFFER%%[[buffer(2)]],
uint global_index[[thread_position_in_grid]],
uint num_threads[[threads_per_grid]])
{
#define USE_INITIAL_C %%USE_INITIAL_C%%
#define RETURN_SEQUENCES %%RETURN_SEQUENCES%%

const device float *X = %%LOAD_BUFFER(lstm_X)%%;
device float *XH = %%LOAD_BUFFER(lstm_X_and_H)%%;
const device float *W_all = %%LOAD_BUFFER(lstm_W_all)%%;
device float *cell = %%LOAD_BUFFER(lstm_cell)%%;
device float *workspace = %%LOAD_BUFFER(lstm_workspace)%%;
device float *Y = %%LOAD_BUFFER(lstm_Y)%%;
device float *final_C = %%LOAD_BUFFER(lstm_final_C)%%;
const device float *b = %%LOAD_BUFFER(lstm_b)%%;

#if USE_INITIAL_C
const device float *initial_C = %%LOAD_BUFFER(lstm_initial_C)%%;
#endif

const int N = %%LOAD_BUFFER(lstm_N)%%;
const int T = %%LOAD_BUFFER(lstm_T)%%;
const int C1 = %%LOAD_BUFFER(lstm_C1)%%;
@@ -36,7 +44,12 @@
for (int gid = global_index; gid < N * C2; gid += num_threads)
{
XH_H[gid] = 0;
cell[gid] = 0;

#if USE_INITIAL_C
final_C[gid] = initial_C[gid];
#else
final_C[gid] = 0;
#endif
}

for (int t = 0; t < T; t++)
@@ -72,7 +85,7 @@
float f = workspace[gid + N * C2 * 1];
float a = workspace[gid + N * C2 * 2];
float o = workspace[gid + N * C2 * 3];
float cell_last = cell[gid];
float cell_last = final_C[gid];

i = i < -2.5 ? 0.0 : (i > +2.5 ? 1.0 : (i * 0.2 + 0.5));
f = f < -2.5 ? 0.0 : (f > +2.5 ? 1.0 : (f * 0.2 + 0.5));
@@ -81,18 +94,32 @@

cell_last = a * i + cell_last * f;

cell[gid] = cell_last;
XH_H[gid] = tanh(cell_last) * o;
final_C[gid] = cell_last;
const float h = tanh(cell_last) * o;
XH_H[gid] = h;

#if RETURN_SEQUENCES
const int n = gid % N;
const int c2 = gid / C2;
Y[(n * T + t) * C2 + c2] = h;
#endif
}
}


#if !RETURN_SEQUENCES
//copy final output to output variable
for (int gid = global_index; gid < C2 * N; gid += num_threads)
{
Y[gid] = XH_H[gid];
}
#endif

#undef USE_INITIAL_C
#undef RETURN_SEQUENCES
}
"""
""" \
.replace("%%USE_INITIAL_C%%", "1" if initial_C else "0") \
.replace("%%RETURN_SEQUENCES%%", "1" if return_sequences else "0")


def lstm(op: LSTM, memory_layout: MemoryLayout) -> List[Kernel]:
@@ -102,12 +129,22 @@ def lstm(op: LSTM, memory_layout: MemoryLayout) -> List[Kernel]:
x_and_h = memory_layout[op.inputs["x_and_h"]]
w_all = memory_layout[op.inputs["w_all"]]
workspace = memory_layout[op.inputs["workspace"]]
cell = memory_layout[op.inputs["cell"]]
final_c = memory_layout[op.outputs["final_c"]]

use_initial_c = op.parameters["use_initial_c"]
return_sequences = op.parameters["return_sequences"]

assert x.variable.order == OrderNTC, \
f"Current implementation supports only OrderNTC for input variable order: x.order = {x.variable.order}"
assert y.variable.order == OrderNC, \
f"Current implementation supports only OrderNC for output variable order: y.order = {y.variable.order}"

if return_sequences:
assert y.variable.order == OrderNTC, f"Current implementation supports only OrderNTC for output variable of " + \
f"LSTM in return_sequences=True mode: y.order = {y.variable.order}"
else:
assert y.variable.order == OrderNC, \
f"Current implementation supports only OrderNC for output variable of LSTM " + \
f"in return_sequences=False mode: y.order = {y.variable.order}"

assert w_all.variable.order == OrderCN

N = x.variable.shape_dict[Axis.N]
@@ -127,21 +164,21 @@ def lstm(op: LSTM, memory_layout: MemoryLayout) -> List[Kernel]:
"lstm_X_and_H": x_and_h,
"lstm_W_all": w_all,
"lstm_workspace": workspace,
"lstm_cell": cell
"lstm_final_C": final_c,
"lstm_initial_C": memory_layout[op.inputs["initial_c"]] if use_initial_c else 0,
})

name_injector = KernelNameInjector(op)

source = template_general

source = generate_template_general(use_initial_c, return_sequences)
source = buffer_injector.inject(source)
source = name_injector.inject(source)

kernel = Kernel(
{name_injector.name: source},
name_injector.name,
GPUSize(1, 1, 1),
GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
GPUSize(32, 1, 1),
buffer_injector.buffer,
buffer_injector.unresolved_value_list
)
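When `RETURN_SEQUENCES` is enabled, the kernel above writes each hidden state with `Y[(n * T + t) * C2 + c2] = h`, i.e. the row-major offset of element `[n, t, c2]` in an OrderNTC tensor of shape `(N, T, C2)`. A quick NumPy check of that index formula (an editor's sketch, not part of the diff):

```python
import numpy as np

# The flattened index (n * T + t) * C2 + c2 is the C-order (row-major) offset
# of y[n, t, c2] for an array of shape (N, T, C2), which is what OrderNTC expects.
N, T, C2 = 2, 3, 4
y = np.arange(N * T * C2).reshape(N, T, C2)
flat = y.reshape(-1)
for n in range(N):
    for t in range(T):
        for c2 in range(C2):
            assert flat[(n * T + t) * C2 + c2] == y[n, t, c2]
```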
@@ -32,13 +32,11 @@ class ConcatLSTMInputAndHidden(OptimizeRule):

v = W_all * XH

Also this optimize rule append 2 additional inputs:
Also this optimize rule appends 1 additional input:

workspace:
store the data of product of W_all and XH (=v)
store the data of product of W_all and XH (=`v` in above equations)

cell:
variable to store the temporal cell state
"""

def optimize(self, graph: Graph) -> Tuple[Graph, bool]:
@@ -66,7 +64,6 @@ def optimize(self, graph: Graph) -> Tuple[Graph, bool]:

x_and_h = Variable([C1 + C2, N], OrderCN)
workspace = Variable([N, 4 * C2], OrderNC)
cell = Variable([N, C2], OrderNC)

w_input.change_order(OrderCN)
w_hidden.change_order(OrderCN)
@@ -75,7 +72,6 @@ def optimize(self, graph: Graph) -> Tuple[Graph, bool]:
lstm.remove_input(w_hidden)
lstm.append_input("x_and_h", x_and_h)
lstm.append_input("workspace", workspace)
lstm.append_input("cell", cell)
lstm.append_input("w_all", w_all)
lstm.attributes.add(attr)

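The docstring above describes the rule's core idea: fold `w_input` and `w_hidden` into a single `w_all`, and stack the input with the previous hidden state into `x_and_h`, so the four gate pre-activations come from one matrix product `v = W_all * XH`. A NumPy sketch of that equivalence (dimensions are arbitrary; this is not code from the repository):

```python
import numpy as np

# Block-matrix view of the optimization: [W_in | W_hid] @ [x; h] == W_in @ x + W_hid @ h,
# so one matrix product per time step replaces two.
C1, C2, N = 5, 3, 2                          # input dim, hidden dim, batch size
rng = np.random.default_rng(0)
W_in = rng.standard_normal((4 * C2, C1))     # gate weights applied to x
W_hid = rng.standard_normal((4 * C2, C2))    # gate weights applied to h
x = rng.standard_normal((C1, N))
h = rng.standard_normal((C2, N))

W_all = np.concatenate([W_in, W_hid], axis=1)   # (4*C2, C1+C2)
XH = np.concatenate([x, h], axis=0)             # (C1+C2, N)
assert np.allclose(W_all @ XH, W_in @ x + W_hid @ h)
```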
12 changes: 9 additions & 3 deletions src/graph_transpiler/webdnn/graph/operators/embedding.py
@@ -30,14 +30,20 @@ def __call__(self, x: Variable, w: Variable):
self.append_input("x", x)
self.append_input("w", w)

# @TODO: this is too strict condition. It should be supported in optimization phase, not here.
if x.order != OrderNT:
raise NotImplementedError("Currently, Embedding supports only OrderNT variable for input sequence variable.")

x_shape_dict = x.shape_dict
w_shape_dict = w.shape_dict

assert set(w.order.axes) == {Axis.N, Axis.C}

batch_size = x_shape_dict[Axis.N]
assert x.order == OrderNT
assert w.order == OrderNC or w.order == OrderCN
sequence_len = x_shape_dict[Axis.T]
# n_vocabulary = w_shape_dict[Axis.C]
embedding_dim = w_shape_dict[Axis.N]

y = Variable([batch_size, sequence_len, embedding_dim], OrderNTC)

self.append_output("y", y)
return y,
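For reference, the shape contract enforced above: `x` is an OrderNT matrix of token indices, `w` carries an embedding axis (N) and a vocabulary axis (C), and the output is OrderNTC with `embedding_dim = w_shape_dict[Axis.N]`. A NumPy sketch of that contract, assuming `w` is stored as embedding_dim × vocabulary; only the shapes matter here, since the actual lookup lives in the backend kernels:

```python
import numpy as np

# Shape-only sketch: gather one column of w per token, then arrange as OrderNTC.
batch_size, sequence_len = 2, 5
n_vocabulary, embedding_dim = 100, 8
x = np.random.randint(0, n_vocabulary, size=(batch_size, sequence_len))   # OrderNT indices
w = np.random.standard_normal((embedding_dim, n_vocabulary))              # assumed N x C layout
y = w[:, x].transpose(1, 2, 0)                                            # -> (N, T, C)
assert y.shape == (batch_size, sequence_len, embedding_dim)
```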
53 changes: 38 additions & 15 deletions src/graph_transpiler/webdnn/graph/operators/lstm.py
@@ -1,8 +1,8 @@
from typing import Optional

from webdnn.graph.axis import Axis
from webdnn.graph.order import OrderNTC, OrderNT, OrderNC, OrderCN, OrderC
from webdnn.graph.operator import Operator
from webdnn.graph.order import OrderNTC, OrderNC, OrderC
from webdnn.graph.variable import Variable


@@ -25,7 +25,8 @@ def __init__(self, name: Optional[str], use_bias: bool, return_sequences: bool,
self.parameters["use_initial_c"] = use_initial_c
self.attributes = set()

def __call__(self, x: Variable, w_input: Variable, w_hidden: Variable, b: Optional[Variable]=None, initial_c: Optional[Variable]=None):
def __call__(self, x: Variable, w_input: Variable, w_hidden: Variable, b: Optional[Variable] = None,
initial_c: Optional[Variable] = None):
"""
Args:
x (:class:`~webdnn.graph.variable.Variable`): Input (sequence OrderNTC)
@@ -35,35 +36,57 @@ def __call__(self, x: Variable, w_input: Variable, w_hidden: Variable, b: Option
initial_c (:class:`~webdnn.graph.variable.Variable`): Initial hidden state

Returns:
tuple of :class:`~webdnn.graph.variable.Variable`: Output (OrderNC)
y (:class:`~webdnn.graph.variable.Variable`): Output (OrderNC)
final_c (:class:`~webdnn.graph.variable.Variable`): Last cell state (OrderNC)
"""
assert self.parameters["use_bias"] == (b is not None)
assert self.parameters["use_initial_c"] == (initial_c is not None)

self.append_input("x", x)
self.append_input("w_input", w_input)
self.append_input("w_hidden", w_hidden)
assert self.parameters["use_bias"] == (b is not None)
if self.parameters["use_bias"]:

if b is not None:
self.append_input("b", b)
assert self.parameters["use_initial_c"] == (initial_c is not None)
if initial_c is not None:
self.append_input("initial_c", initial_c)

# @TODO: this is too strict condition. It should be supported in optimization phase, not here.
if x.order != OrderNTC:
raise NotImplementedError("Currently, LSTM supports only OrderNTC variable for input sequence variable.")

x_shape_dict = x.shape_dict
w_input_shape_dict = w_input.shape_dict
w_hidden_shape_dict = w_hidden.shape_dict
batch_size = x_shape_dict[Axis.N]
assert x.order == OrderNTC
assert w_input.order == OrderNC or w_input.order == OrderCN

assert set(x.order.axes) == {Axis.N, Axis.T, Axis.C}
assert set(w_input.order.axes) == {Axis.N, Axis.C}
assert set(w_hidden.order.axes) == {Axis.N, Axis.C}
assert b.order == OrderC
assert x_shape_dict[Axis.C] == w_input_shape_dict[Axis.C]

batch_size = x_shape_dict[Axis.N]
sequence_len = x_shape_dict[Axis.T]
hidden_dim = w_input_shape_dict[Axis.N] // 4
assert hidden_dim * 4 == w_hidden_shape_dict[Axis.N] and hidden_dim == w_hidden_shape_dict[Axis.C]
assert hidden_dim * 4 == b.shape_dict[Axis.C]
input_dim = x_shape_dict[Axis.C]
hidden_dim = w_hidden_shape_dict[Axis.C]

assert x_shape_dict[Axis.N] == batch_size
assert x_shape_dict[Axis.C] == w_input_shape_dict[Axis.C] == input_dim
assert w_input_shape_dict[Axis.N] == w_hidden_shape_dict[Axis.N] == hidden_dim * 4

if initial_c is not None:
self.append_input("initial_c", initial_c)
initial_c_shape_dict = initial_c.shape_dict

assert set(initial_c.order.axes) == {Axis.N, Axis.C}
assert initial_c_shape_dict[Axis.N] == batch_size
assert initial_c_shape_dict[Axis.C] == hidden_dim

if self.parameters["return_sequences"]:
y = Variable([batch_size, sequence_len, hidden_dim], OrderNTC)
else:
y = Variable([batch_size, hidden_dim], OrderNC)

final_c = Variable([batch_size, hidden_dim], OrderNC)

self.append_output("y", y)
self.append_output("final_c", final_c)

return y, final_c
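Put together, `__call__` now returns both the output and the final cell state, and the output order depends on `return_sequences`. A minimal sketch of that contract using the graph-level API from this diff; the constructor keyword names are assumptions, since the `__init__` signature is abbreviated in the hunk above:

```python
from webdnn.graph.operators.lstm import LSTM
from webdnn.graph.order import OrderNTC, OrderNC, OrderCN, OrderC
from webdnn.graph.variable import Variable

N, T, C1, C2 = 2, 10, 16, 32
x = Variable([N, T, C1], OrderNTC)            # input sequence
w_input = Variable([C1, 4 * C2], OrderCN)     # C1 input rows, 4*C2 gate columns
w_hidden = Variable([C2, 4 * C2], OrderCN)
b = Variable([4 * C2], OrderC)

# Keyword names below mirror self.parameters[...] and may not match __init__ exactly.
op = LSTM(None, use_bias=True, return_sequences=True, use_initial_c=False)
y, final_c = op(x, w_input, w_hidden, b=b)
# y: OrderNTC, shape [N, T, C2]; with return_sequences=False it would be OrderNC, [N, C2].
# final_c: OrderNC, shape [N, C2].
```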