// Program.cs (253 lines, 223 loc, 12.2 KB) - taken from a repository that was archived
// by its owner on Oct 26, 2021 and is now read-only.
// -----------------------------------------------------------------------------
// ILGPU Samples
// Copyright (c) 2017-2019 ILGPU Samples Project
// www.ilgpu.net
//
// File: Program.cs
//
// This file is part of ILGPU and is distributed under the University of
// Illinois Open Source License. See LICENSE.txt for details.
// -----------------------------------------------------------------------------
using ILGPU;
using ILGPU.Backends.EntryPoints;
using ILGPU.Runtime;
using System;
using System.Reflection;
namespace LowLevelKernelCompilation
{
class Program
{
/// <summary>
/// A sample implicitly-grouped kernel. Kernels of this kind receive an index as their
/// first parameter, of type <see cref="Index1"/>, <see cref="Index2"/> or <see cref="Index3"/>.
/// They hide the underlying blocking/grouping semantics of a GPU and allow convenient
/// kernel programming without having to take grouping details into account.
/// The block or group size can be defined while loading a kernel via:
/// - LoadImplicitlyGroupedKernel
/// - LoadAutoGroupedKernel.
///
/// Note that you must not use warp-shuffle functionality within implicitly-grouped
/// kernels, since not all lanes of a warp are guaranteed to participate in the warp shuffle.
/// </summary>
/// <param name="index">The current thread index.</param>
/// <param name="dataView">The view pointing to our memory buffer.</param>
/// <param name="constant">A uniform constant that is added to every index.</param>
static void MyKernel(
    Index1 index,               // The global thread index (1D in this case)
    ArrayView<int> dataView,    // A view to a chunk of memory (1D in this case)
    int constant)               // A uniform constant
{
    // Every thread writes "own index + constant" to its own slot of the view.
    int value = index + constant;
    dataView[index] = value;
}
/// <summary>
/// A sample explicitly-grouped kernel. It takes no index parameter; instead, it reads
/// its global position from <see cref="Grid.GlobalIndex"/>.
/// Explicitly-grouped kernels expose the underlying blocking/grouping semantics of a GPU
/// and allow for highly efficient implementation of kernels for different GPUs.
/// The semantics of these kernels are equivalent to kernel implementations in CUDA.
/// Explicitly-grouped kernels can use warp and group-based intrinsics without
/// restrictions (in contrast to implicitly-grouped kernels).
/// An explicitly-grouped kernel can be loaded with:
/// - LoadKernel
/// </summary>
/// <param name="dataView">The view pointing to our memory buffer.</param>
/// <param name="constant">A uniform constant.</param>
static void GroupedKernel(
    ArrayView<int> dataView,    // A view to a chunk of memory (1D in this case)
    int constant)               // A sample uniform constant
{
    // Global 1D position of this thread, used to address the data view
    var position = Grid.GlobalIndex.X;

    // Threads whose position lies beyond the end of the view have nothing to do.
    if (position >= dataView.Length)
    {
        return;
    }

    dataView[position] = position + constant;

    // Note: this explicitly grouped kernel implements the same functionality
    // as MyKernel in the ImplicitlyGroupedKernels sample.
}
/// <summary>
/// Compiles <see cref="GroupedKernel"/> as an explicitly grouped kernel, launches it on
/// a freshly allocated buffer and verifies the values that were written.
/// </summary>
/// <param name="accelerator">The accelerator to compile for and to launch on.</param>
/// <param name="groupSize">The number of threads per group; must be positive.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="groupSize"/> is zero or negative.
/// </exception>
static void CompileAndLaunchKernel(Accelerator accelerator, int groupSize)
{
    // The group count computed below divides by groupSize. Reject non-positive values
    // up front: zero would raise a DivideByZeroException only after the kernel has been
    // compiled and loaded, and a negative value would yield a negative group count.
    if (groupSize < 1)
    {
        throw new ArgumentOutOfRangeException(
            nameof(groupSize),
            groupSize,
            "The group size must be positive.");
    }

    // Shared by the launch and by the verification loop so both always agree.
    const int NumElements = 1024;
    const int Constant = 42;

    // Access the current backend for this device
    var backend = accelerator.Backend;

    // Resolve and compile method into a kernel
    var method = typeof(Program).GetMethod(nameof(GroupedKernel), BindingFlags.NonPublic | BindingFlags.Static);
    var entryPointDesc = EntryPointDescription.FromExplicitlyGroupedKernel(method);
    var compiledKernel = backend.Compile(entryPointDesc, default);
    // Info: If the current accelerator is a CudaAccelerator, we can cast the compiled kernel to a
    // PTXCompiledKernel in order to extract the PTX assembly code.

    // -------------------------------------------------------------------------------
    // Load the explicitly grouped kernel
    // Note that the kernel has to be disposed manually.
    using (var kernel = accelerator.LoadKernel(compiledKernel))
    {
        var launcher = kernel.CreateLauncherDelegate<Action<AcceleratorStream, KernelConfig, ArrayView<int>, int>>();

        // -------------------------------------------------------------------------------
        using (var buffer = accelerator.Allocate<int>(NumElements))
        {
            // Number of groups required to cover the whole buffer (rounded up)
            var numGroups = (buffer.Length + groupSize - 1) / groupSize;

            // You can also use kernel.Launch; however, the generic launch method involves boxing.
            launcher(
                accelerator.DefaultStream,
                (numGroups, groupSize),     // Launch configuration: group count and group size
                buffer.View,
                Constant);

            // Wait for the kernel to finish...
            accelerator.Synchronize();

            // Resolve and verify data
            var data = buffer.GetAsArray();
            for (int i = 0, e = data.Length; i < e; ++i)
            {
                if (data[i] != Constant + i)
                    Console.WriteLine($"Error at element location {i}: {data[i]} found");
            }
        }
    }
}
/// <summary>
/// Compiles <see cref="MyKernel"/> as an implicitly-grouped kernel, loads it with the
/// given custom group size, launches it and verifies the values that were written.
/// </summary>
/// <param name="accelerator">The accelerator to compile for and to launch on.</param>
/// <param name="groupSize">The custom group size passed when loading the kernel.</param>
static void CompileAndLaunchImplicitlyGroupedKernel(Accelerator accelerator, int groupSize)
{
    const int elementCount = 1024;
    const int offset = 42;

    // The backend of this device performs the compilation
    var deviceBackend = accelerator.Backend;

    // Locate the kernel method via reflection and compile it
    var kernelMethod = typeof(Program).GetMethod(
        nameof(MyKernel),
        BindingFlags.NonPublic | BindingFlags.Static);
    var compiled = deviceBackend.Compile(
        EntryPointDescription.FromImplicitlyGroupedKernel(kernelMethod),
        default);
    // Info: If the current accelerator is a CudaAccelerator, we can cast the compiled kernel to a
    // PTXCompiledKernel in order to extract the PTX assembly code.

    // Load the implicitly grouped kernel with the custom group size.
    // The loaded kernel has to be disposed manually, hence the using statement.
    using (var loadedKernel = accelerator.LoadImplicitlyGroupedKernel(compiled, groupSize))
    {
        // A typed launcher delegate; kernel.Launch would work as well, but the
        // generic launch method involves boxing.
        var launch = loadedKernel.CreateLauncherDelegate<Action<AcceleratorStream, Index1, ArrayView<int>, int>>();

        using (var buffer = accelerator.Allocate<int>(elementCount))
        {
            // Launch buffer.Length many threads and pass a view to buffer.
            launch(accelerator.DefaultStream, buffer.Length, buffer.View, offset);

            // Wait for the kernel to finish...
            accelerator.Synchronize();

            // Copy the results back and report every element that does not match
            var data = buffer.GetAsArray();
            for (var i = 0; i < data.Length; i++)
            {
                if (data[i] == offset + i)
                {
                    continue;
                }

                Console.WriteLine($"Error at element location {i}: {data[i]} found");
            }
        }

        // Synchronize once more after the buffer has been released.
        accelerator.Synchronize();
    }
}
/// <summary>
/// Compiles <see cref="MyKernel"/> as an implicitly-grouped kernel, loads it with an
/// automatically determined group size, launches it and verifies the values that were written.
/// </summary>
/// <param name="accelerator">The accelerator to compile for and to launch on.</param>
static void CompileAndLaunchAutoGroupedKernel(Accelerator accelerator)
{
    const int elementCount = 1024;
    const int offset = 42;

    // The backend of this device performs the compilation
    var deviceBackend = accelerator.Backend;

    // Locate the kernel method via reflection and compile it
    var kernelMethod = typeof(Program).GetMethod(
        nameof(MyKernel),
        BindingFlags.NonPublic | BindingFlags.Static);
    var description = EntryPointDescription.FromImplicitlyGroupedKernel(kernelMethod);
    var compiled = deviceBackend.Compile(description, default);
    // Info: If the current accelerator is a CudaAccelerator, we can cast the compiled kernel to a
    // PTXCompiledKernel in order to extract the PTX assembly code.

    // Load the implicitly grouped kernel with an automatically determined group size.
    // The loaded kernel has to be disposed manually, hence the using statement.
    using (var loadedKernel = accelerator.LoadAutoGroupedKernel(compiled))
    {
        // A typed launcher delegate; kernel.Launch would work as well, but the
        // generic launch method involves boxing.
        var launch = loadedKernel.CreateLauncherDelegate<Action<AcceleratorStream, Index1, ArrayView<int>, int>>();

        using (var buffer = accelerator.Allocate<int>(elementCount))
        {
            // Launch buffer.Length many threads and pass a view to buffer.
            launch(
                accelerator.DefaultStream,
                buffer.Length,
                buffer.View,
                offset);

            // Wait for the kernel to finish...
            accelerator.Synchronize();

            // Copy the results back and report every element that does not match
            var data = buffer.GetAsArray();
            var count = data.Length;
            for (var i = 0; i < count; i++)
            {
                var expected = offset + i;
                if (data[i] != expected)
                {
                    Console.WriteLine($"Error at element location {i}: {data[i]} found");
                }
            }
        }

        // Synchronize once more after the buffer has been released.
        accelerator.Synchronize();
    }
}
/// <summary>
/// Runs the sample on every available accelerator: a simple 1D kernel is compiled and
/// launched as an auto-grouped, an implicitly-grouped and an explicitly-grouped kernel.
/// This sample demonstrates the creation of launcher delegates in order to avoid boxing.
/// </summary>
static void Main()
{
    // The main context owns everything created below
    using (var ilgpuContext = new Context())
    {
        // Visit each available accelerator in turn
        foreach (var deviceId in Accelerator.Accelerators)
        {
            // Create the default accelerator for this accelerator id
            using (var accelerator = Accelerator.Create(ilgpuContext, deviceId))
            {
                Console.WriteLine($"Performing operations on {accelerator}");

                // 1) Implicitly-grouped kernel with an automatically determined group
                //    size. The latter is determined either by ILGPU or the GPU driver.
                //    This is the most convenient way to launch kernels using ILGPU.
                CompileAndLaunchAutoGroupedKernel(accelerator);

                // 2) Implicitly-grouped kernel with a custom group size. Note that a
                //    group size less than the warp size can cause dramatic performance
                //    decreases since many lanes of a warp might remain unused.
                CompileAndLaunchImplicitlyGroupedKernel(accelerator, accelerator.WarpSize);

                // 3) Explicitly-grouped kernel with a custom group size.
                CompileAndLaunchKernel(accelerator, accelerator.WarpSize);
            }
        }
    }
}
}
}