-
Notifications
You must be signed in to change notification settings - Fork 0
/
update_locations_asm.c
108 lines (88 loc) · 3.1 KB
/
update_locations_asm.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
/* This program generates N random coordinates in 3D space and N random
velocity vectors. It then iterates M times, updating the locations based
on the velocities.
Finally, it outputs the sum of all coordinates as a checksum of the computation.
Coordinates are intended to start in the range [-1000:1000] per dimension.
Velocities are intended to be chosen from the range [-1:1] per dimension.
Note: generate_random_list() as written only produces non-negative values,
so the effective ranges are [0:1000] and [0:1].
*/
#include <errno.h>
#include <inttypes.h>
#include <malloc.h>
#include <mm_malloc.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <xmmintrin.h>
// Create a 16-byte-aligned list of 'size' floats, each drawn with rand() and
// scaled into the closed range [0, bound].
//
// size:  number of elements to allocate and fill.
// bound: upper end of the value range; 0 yields a list of zeros.
//
// Returns the new list, or NULL if the byte count would overflow or the
// allocation fails. The buffer comes from _mm_malloc(), so the caller must
// release it with _mm_free().
float* generate_random_list(uint_fast32_t size, uint_fast32_t bound) {
    // Compute the byte count in integer arithmetic and refuse sizes whose
    // byte count cannot be represented in size_t.
    if (size > SIZE_MAX / sizeof(float)) {
        return NULL;
    }

    float *list = _mm_malloc(sizeof *list * (size_t)size, 16);
    if (list == NULL) {
        return NULL;
    }

    for (uint_fast32_t i = 0; i < size; i++) {
        // Normalise to [0, 1] first, then scale: this never divides by
        // 'bound' and cannot overshoot it through integer truncation.
        list[i] = (float)bound * ((float)rand() / (float)RAND_MAX);
    }
    return list;
}
// Adds the single float at *delta to the single float at *target using
// scalar SSE instructions, writing the result back to *target.
static inline void advance_axis(float* target, const float* delta) {
    const __m128 step = _mm_set_ss(*delta);
    const __m128 current = _mm_set_ss(*target);
    _mm_store_ss(target, _mm_add_ss(step, current));
}

// Advance object 'i' by one time-step: each of x[i], y[i], z[i] is
// incremented by the matching velocity component vx[i], vy[i], vz[i].
// Only the position arrays are written.
void update_coords(uint32_t i, float* x, float* y, float* z, float* vx, float* vy, float* vz) {
    advance_axis(x + i, vx + i);
    advance_axis(y + i, vy + i);
    advance_axis(z + i, vz + i);
}
// Sums an array of floats; needed in replacement of Python sum().
//
// a:            array to sum; not modified.
// num_elements: number of elements of 'a' to add up.
//
// Returns the single-precision sum of a[0] .. a[num_elements-1], accumulated
// left to right with scalar SSE adds, or 0.0f when num_elements is 0.
float sum(float* a, uint_fast32_t num_elements)
{
    // Start from an explicit zero and keep the running total in a register;
    // no heap allocation is needed for a single scalar.
    __m128 total = _mm_setzero_ps();
    for (uint_fast32_t i = 0; i < num_elements; i++) {
        // Feed the updated total back into the next addition.
        total = _mm_add_ss(total, _mm_set_ss(a[i]));
    }
    return _mm_cvtss_f32(total);
}
// Parses 'text' as a non-negative decimal integer that fits in
// uint_fast32_t. Returns 0 and stores the value in *out on success, or -1
// (leaving *out untouched) on empty input, trailing garbage, a negative
// value, or an out-of-range value.
static int parse_count(const char *text, uint_fast32_t *out) {
    char *end = NULL;
    errno = 0;
    const long long value = strtoll(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE) {
        return -1;
    }
    if (value < 0 || (unsigned long long)value > UINT_FAST32_MAX) {
        return -1;
    }
    *out = (uint_fast32_t)value;
    return 0;
}

// Main: benchmark driver.
//
// Usage: <program> N M
//   N  number of objects (positive integer)
//   M  number of time-steps to simulate (non-negative integer)
//
// Seeds rand() with N, builds the six position/velocity arrays, runs M
// update passes over all N objects while timing them with CLOCK_MONOTONIC,
// then prints the timing figures and the checksum of the final positions.
// Returns 0 on success and EXIT_FAILURE on bad arguments or allocation
// failure.
int main(int argc, char* argv[]) {
    if (argc != 3) {
        fprintf(stderr, "Required arguments: vector_length(N) and iterations_num(M)\n");
        return EXIT_FAILURE;
    }

    uint_fast32_t object_size = 0;
    uint_fast32_t iters = 0;
    // update_coords() takes a uint32_t index, so N must also fit in 32 bits.
    if (parse_count(argv[1], &object_size) != 0 || object_size == 0 ||
        object_size > UINT32_MAX || parse_count(argv[2], &iters) != 0) {
        fprintf(stderr, "N must be a positive integer and M a non-negative integer\n");
        return EXIT_FAILURE;
    }

    srand((unsigned int)object_size);

    // Allocation order is significant: it fixes which rand() draws land in
    // which array, and therefore the checksum for a given N.
    float* x = generate_random_list(object_size, 1000);
    float* y = generate_random_list(object_size, 1000);
    float* z = generate_random_list(object_size, 1000);
    float* vx = generate_random_list(object_size, 1);
    float* vy = generate_random_list(object_size, 1);
    float* vz = generate_random_list(object_size, 1);

    int status = EXIT_FAILURE;
    if (x == NULL || y == NULL || z == NULL || vx == NULL || vy == NULL || vz == NULL) {
        fprintf(stderr, "Failed to allocate coordinate/velocity arrays\n");
        goto cleanup;
    }

    struct timespec requestStart, requestEnd;
    clock_gettime(CLOCK_MONOTONIC, &requestStart);
    for (uint_fast32_t i = 0; i < iters; i++) {
        for (uint_fast32_t j = 0; j < object_size; j++) {
            update_coords((uint32_t)j, x, y, z, vx, vy, vz);
        }
    }
    clock_gettime(CLOCK_MONOTONIC, &requestEnd);

    float chksum = sum(x, object_size) + sum(y, object_size) + sum(z, object_size);

    // Subtract the fields before converting so the large absolute clock
    // values never have to be represented in floating point.
    double timeTaken = (double)(requestEnd.tv_sec - requestStart.tv_sec) +
                       (double)(requestEnd.tv_nsec - requestStart.tv_nsec) / 1000000000.;

    // Multiply in double: N * M may not fit in uint_fast32_t. With M == 0
    // there were no updates, so report 0 instead of dividing by zero.
    double updates = (double)object_size * (double)iters;
    double meanMicros = updates > 0.0 ? (1000000. * timeTaken) / updates : 0.0;

    printf(" (1000000 * %f) / (%" PRIuFAST32 " * %" PRIuFAST32 "))\n", timeTaken, object_size, iters);
    printf("Mean time per coordinate: %f us\n", meanMicros);
    printf("Final checksum is: %f\n", chksum);
    status = 0;

cleanup:
    _mm_free(x);
    _mm_free(y);
    _mm_free(z);
    _mm_free(vx);
    _mm_free(vy);
    _mm_free(vz);
    return status;
}