Skip to content

Commit

Permalink
Support mixed-order variables
Browse files Browse the repository at this point in the history
  • Loading branch information
Kiikurage committed Jun 2, 2017
1 parent 1b20ef6 commit e340a77
Show file tree
Hide file tree
Showing 11 changed files with 572 additions and 111 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def axiswise_scale(op: AxiswiseScale) -> List[Kernel]:
axis_pos = x.order.axes_dict[op.parameters["axis"]] # NCHWでaxis=Cなら、1
axis_size = x.shape[axis_pos]
assert axis_size == b.size
axis_stride = np.prod(x.shape[axis_pos + 1:]) # NCHWでaxis=Cなら、size(H)*size(W), np.prod([])==1.0
axis_stride = int(np.prod(x.shape[axis_pos + 1:])) # NCHWでaxis=Cなら、size(H)*size(W), np.prod([])==1.0

kernel = Kernel(
{"axiswise_scale": source},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ def _generate_buffer(self) -> bytes:
buffer += value

else:
raise TypeError("MetaBufferInjector supports only int, float, and bytes contents.")
raise TypeError("MetaBufferInjector supports only int, float, and bytes contents. "
+ f"\"{key} is {type(value)}\".")

self.offset_map = offset_map
self.buffer = buffer
Expand Down
186 changes: 164 additions & 22 deletions src/graph_transpiler/webdnn/backend/webgpu/kernels/axiswise_bias.py
Original file line number Diff line number Diff line change
@@ -1,60 +1,202 @@
from typing import List

import numpy as np

from webdnn.backend.webgpu.allocator import MemoryLayout
from webdnn.backend.webgpu.injectors.kernel_name_injector import KernelNameInjector
from webdnn.backend.webgpu.injectors.meta_injector import MetaInjector
from webdnn.backend.webgpu.kernel import Kernel, GPUSize
from webdnn.graph.axis import Axis
from webdnn.graph.operators.axiswise_bias import AxiswiseBias
from webdnn.graph.order import OrderHWNC, OrderNHWC, OrderNC

template = """

def axiswise_bias(op: AxiswiseBias,
                  constants_layout: MemoryLayout,
                  variables_layout: MemoryLayout) -> List[Kernel]:
    """Generate WebGPU kernel(s) for an AxiswiseBias operator.

    Dispatches on memory order: when the input and output variables share
    the same order, the specialized (flat-indexed) kernel is used;
    otherwise the general kernel, which remaps offsets, is generated.
    """
    in_alloc = variables_layout[op.inputs["x"]]
    out_alloc = variables_layout[op.outputs["y"]]

    same_order = in_alloc.variable.order == out_alloc.variable.order
    generate = axiswise_bias_same_order if same_order else axiswise_bias_general
    return generate(op, constants_layout, variables_layout)


def generate_template_same_order(D1, D3):
    """Return Metal kernel source for the same-memory-order axiswise bias.

    The tensor is treated as a flattened (D1, D2, D3) view, where D2 is the
    biased axis. The %%FLAG_*%% placeholders are specialized here so the
    shader's preprocessor can drop index arithmetic for collapsed (size-1)
    leading/trailing dimensions when OPTIMIZE is enabled.
    """
    source = """
kernel void %%FUNC_NAME%%(const device float *weight_buffer[[buffer(0)]],
                          device float *data_buffer[[buffer(1)]],
                          const device int * %%META_NAME%% [[buffer(2)]],
                          uint index[[thread_position_in_grid]],
                          uint num_threads[[threads_per_grid]])
{
#define FLAG_D1_EQUAL_1 %%FLAG_D1_EQUAL_1%%
#define FLAG_D3_EQUAL_1 %%FLAG_D3_EQUAL_1%%
    const device float *X = data_buffer + %%META_LOAD(axiswise_bias_X_offset)%%;
    const device float *B = weight_buffer + %%META_LOAD(axiswise_bias_B_offset)%%;
    device float *Y = data_buffer + %%META_LOAD(axiswise_bias_Y_offset)%%;
#if !OPTIMIZE || !FLAG_D1_EQUAL_1
    const int D1 = %%META_LOAD(axiswise_bias_D1)%%;
#endif
    const int D2 = %%META_LOAD(axiswise_bias_D2)%%;
#if !OPTIMIZE || !FLAG_D3_EQUAL_1
    const int D3 = %%META_LOAD(axiswise_bias_D3)%%;
#endif
#if OPTIMIZE && FLAG_D3_EQUAL_1
    #if OPTIMIZE && FLAG_D1_EQUAL_1
        for (int gid = index; gid < D2; gid += num_threads) {
            const int d2 = gid;
    #else
        for (int gid = index; gid < D1 * D2; gid += num_threads) {
            const int d2 = gid % D2;
    #endif
#else
    #if OPTIMIZE && FLAG_D1_EQUAL_1
        for (int gid = index; gid < D2 * D3; gid += num_threads) {
            const int d2 = gid / D3 % D2;
    #else
        for (int gid = index; gid < D1 * D2 * D3; gid += num_threads) {
            const int d2 = gid / D3 % D2;
    #endif
#endif
            float v = X[gid] + B[d2];
            Y[gid] = v;
        }
#undef FLAG_D1_EQUAL_1
#undef FLAG_D3_EQUAL_1
}
"""
    flag_values = {
        "%%FLAG_D1_EQUAL_1%%": "1" if D1 == 1 else "0",
        "%%FLAG_D3_EQUAL_1%%": "1" if D3 == 1 else "0",
    }
    for placeholder, value in flag_values.items():
        source = source.replace(placeholder, value)
    return source


def axiswise_bias_same_order(op: AxiswiseBias,
                             constants_layout: MemoryLayout,
                             variables_layout: MemoryLayout) -> List[Kernel]:
    """Build the bias kernel for the case where x and y share memory order.

    The tensor is viewed as (D1, D2, D3): D1 is the product of dimensions
    before the biased axis, D2 the biased axis itself, D3 the product of
    dimensions after it. These sizes are passed to the shader through the
    meta buffer and also specialize the template at generation time.
    """
    x = variables_layout[op.inputs["x"]]
    b = constants_layout[op.inputs["b"]]
    y = variables_layout[op.outputs["y"]]

    axis_index = x.variable.order.axes_dict[op.axis]
    shape = x.variable.shape
    # np.prod of an empty slice is 1.0 (float), hence the int() casts.
    D1 = int(np.prod(shape[:axis_index]))
    D2 = shape[axis_index]
    D3 = int(np.prod(shape[axis_index + 1:]))

    meta_injector = MetaInjector()
    meta_injector.register({
        "axiswise_bias_X_offset": x.offset,
        "axiswise_bias_B_offset": b.offset,
        "axiswise_bias_Y_offset": y.offset,
        "axiswise_bias_D1": D1,
        "axiswise_bias_D2": D2,
        "axiswise_bias_D3": D3
    })

    name_injector = KernelNameInjector(op)

    source = generate_template_same_order(D1, D3)
    source = name_injector.inject(meta_injector.inject(source))

    return [Kernel(
        {name_injector.name: source},
        name_injector.name,
        GPUSize(8, 1, 1),
        GPUSize(1024, 1, 1),
        meta_injector.buffer
    )]


# General kernel: x and y may have different memory orders. For each flat
# element index of x, the shader recomputes the element's offset in y from
# x's per-axis shape and x's per-axis strides expressed in y's layout.
# NOTE(review): the scraped diff had interleaved the deleted old template
# lines (an N/C-based loop using Y before its declaration) with the new
# ones; this is the consistent new-side template only.
template_general = """
kernel void %%FUNC_NAME%%(const device float *weight_buffer[[buffer(0)]],
                          device float *data_buffer[[buffer(1)]],
                          const device int * %%META_NAME%% [[buffer(2)]],
                          uint index[[thread_position_in_grid]],
                          uint num_threads[[threads_per_grid]])
{
    const device float *X = data_buffer + %%META_LOAD(axiswise_bias_X_offset)%%;
    const device float *B = weight_buffer + %%META_LOAD(axiswise_bias_B_offset)%%;
    device float *Y = data_buffer + %%META_LOAD(axiswise_bias_Y_offset)%%;
    const int D = %%META_LOAD(axiswise_bias_D)%%;
    const int d_target = %%META_LOAD(axiswise_bias_d_target)%%;
    const device int *x_shape = &(%%META_LOAD(axiswise_bias_x_shape)%%);
    const device int *x_stride_in_y = &(%%META_LOAD(axiswise_bias_x_stride_in_y)%%);

    int size = 1;
    for (int d = 0; d < D; d++) size *= x_shape[d];

    int D1 = 1;
    for (int d = 0; d < d_target; d++) D1 *= x_shape[d];
    const int D2 = x_shape[d_target];
    int D3 = 1;
    for (int d = d_target + 1; d < D; d++) D3 *= x_shape[d];

    for (int gid = index; gid < size; gid += num_threads) {
        int y_offset = 0;
        int s = gid;
        for (int d = D - 1; d >= 0; d--) {
            y_offset += x_stride_in_y[d] * (s % x_shape[d]);
            s /= x_shape[d];
        }

        const int d2 = gid / D3 % D2;
        float v = X[gid] + B[d2];
        Y[y_offset] = v;
    }
}
"""


def axiswise_bias(op: AxiswiseBias,
constants_layout: MemoryLayout,
variables_layout: MemoryLayout) -> List[Kernel]:
def axiswise_bias_general(op: AxiswiseBias,
constants_layout: MemoryLayout,
variables_layout: MemoryLayout) -> List[Kernel]:
x = variables_layout[op.inputs["x"]]
b = constants_layout[op.inputs["b"]]
y = variables_layout[op.outputs["y"]]

assert x.variable.order == OrderNC or x.variable.order == OrderNHWC or x.variable.order == OrderHWNC
assert y.variable.shape == x.variable.shape
assert op.parameters["axis"] == Axis.C, "[WebGPU] AxiswiseBias supports only channelwise bias."
x_shape = x.variable.shape

y_strides = []
stride = 1
for s in reversed(y.variable.shape):
y_strides.insert(0, stride)
stride *= s

x_stride_in_y = [y_strides[y.variable.order.axes_dict[axis]] for axis in x.variable.order.axes]

meta_injector = MetaInjector()
meta_injector.register({
"axiswise_bias_X_offset": x.offset,
"axiswise_bias_Y_offset": y.offset,
"axiswise_bias_B_offset": b.offset,
"axiswise_bias_N": y.variable.size // y.variable.shape_dict[Axis.C],
"axiswise_bias_C": y.variable.shape_dict[Axis.C],
"axiswise_bias_Y_offset": y.offset,
"axiswise_bias_D": x.variable.ndim,
"axiswise_bias_d_target": x.variable.order.axes_dict[op.axis],
"axiswise_bias_x_shape": np.array(x_shape, dtype=np.int32).tobytes(),
"axiswise_bias_x_stride_in_y": np.array(x_stride_in_y, dtype=np.int32).tobytes(),
})

name_injector = KernelNameInjector(op)

source = template
source = template_general
source = meta_injector.inject(source)
source = name_injector.inject(source)

Expand Down
Loading

0 comments on commit e340a77

Please sign in to comment.