-
Notifications
You must be signed in to change notification settings - Fork 2
/
kernel_update_1dol.c
644 lines (557 loc) · 29.5 KB
/
kernel_update_1dol.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
__kernel void Clear(__global float2 *ARES) {
// Clear the work array ARES[NWG*NTRAN*CELLS]
int id = get_global_id(0) ;
for(int i=id; i<NWG*NTRAN*CELLS; i+=GLOBAL) {
ARES[i] = (float2)(0.0f, 0.0f) ;
}
}
__kernel void Sum(__global float2 *ARES, __global float2 *RES, __global float *VOLUME) {
// Sum over results of individual work groups
// ARES[NWG * NTRAN * CELLS] -> RES[CELLS * NTRAN]
int id = get_global_id(0) ; // CELLS
if (id>=CELLS) return ;
for(int itran=0; itran<NTRAN; itran++) {
float sij=0.0f, esc=0.0f ;
for(int i=0; i<NWG; i++) {
sij += ARES[i*NTRAN*CELLS+itran*CELLS+id].x ;
esc += ARES[i*NTRAN*CELLS+itran*CELLS+id].y ;
}
RES[id*NTRAN+itran].x = sij / VOLUME[id] ;
RES[id*NTRAN+itran].y = esc / VOLUME[id] ;
}
}
__kernel void Update(
__global float4 *CLOUD, // 0 [CELLS]: Vrad, Rc, dummy, sigma
__global float *VOLUME, // 1 cell volume / cloud volume
__global float *Aul, // 2 A_ul[NTRAN]
__global float *Blu, // 3 old A_b [NTRAN] = Blu*(h*f)/(4*pi)
__global float *GAU, // 4 precalculated gaussian profiles [CELLS*CHANNELS]
__global int2 *LIM, // 5 first and last significant channels
__global float *GN, // 6 [NTRAN]
__global float *DIRWEI, // 7 individual weights for the rays
__global float *STEP, // 8 average path length [NRAY*CELLS] (GL)
__global float *APL, // 9 average path length [CELLS] (GL)
__global float *IP, // 10 impact parameter [NRAY] [GL]
__global float *BG, // 11 background value (photons per ray)
__global float *NU, // 12 Nu [CELLS*NTRAN]
__global float *NBNB, // 13 nb_nb [CELLS*NTRAN]
__global float2 *ARES, // 14 [NWG*NTRAN*CELLS]: SIJ, ESC
__global int *SINGLE, // 15 list of single transitions
__global float *NTRUES // 16 global work space
) {
// Update normal single transitions
// WG loops over NRAY/NWG rays
// loop over transitions
// loop over channels, updating local soc[], loc[]
// add soc[] and loc[] and finally update ARES[NWG*NTRAN*CELLS]
float weight, dx, doppler, tmp_tau, tmp_emit, nb_nb, factor, escape, absorbed ;
float sum_delta_true, all_escaped, nu ;
int shift, INDEX, dstep ;
int id = get_global_id(0) ;
int lid = get_local_id(0) ; // index within the local work group
int gid = get_group_id(0) ;
if (gid>=NWG) return ; // NWG work groups
#define BYTRAN 1 // divide by transition rather than by channel
#if (BYTRAN==0)
broken ???
__local float sij[NTRAN*LOCAL] ; // used by 1 WG = 1 ray
__local float esc[NTRAN*LOCAL] ;
#endif
// NTRUE must be at least NWG*NTRAN*CHANNELS
#define LOCAL_NTRUE 1 // no effect on GPU run time
#if (LOCAL_NTRUE==1)
__local float NTRUE[NTRAN*CHANNELS] ;
__local float *ntrue ;
#else
__global float *NTRUE = &NTRUES[gid*NTRAN*CHANNELS] ;
__global float *ntrue ;
#endif
#define LOCAL_PROFILE 1 // 1 = faster
#if (LOCAL_PROFILE==1)
__local float profile[CHANNELS] ; // current cell, used by all work items, ~1kB
#else
__global float *profile ;
#endif
for(int iray=gid; iray<NRAY; iray+=NWG) {
INDEX = CELLS-1 ; // always starts with the outermost ray
dstep = -1 ; // we first go inwards (dstep<0), then outwards until ray exits
for(int itran=0; itran<NTRAN; itran++) {
for(int i=lid; i<CHANNELS; i+=LOCAL) NTRUE[itran*CHANNELS+i] = DIRWEI[iray] * BG[itran] ;
}
barrier(CLK_LOCAL_MEM_FENCE) ;
while(1) { // follow one ray through the cloud
dx = STEP[iray*CELLS+INDEX] ; // [GL]
weight = DIRWEI[iray]*(dx/APL[INDEX])*VOLUME[INDEX] ; // VOLUME == fraction of cloud volume
doppler = dstep*CLOUD[INDEX].x * sqrt( max(0.0f, 1.0f-pow(IP[iray]/CLOUD[INDEX].y, 2.0f)) ) ;
// a single profile vector used by all transitions, the whole work group
#if(LOCAL_PROFILE==1)
for(int i=lid; i<CHANNELS; i+=LOCAL) profile[i] = GAU[INDEX*CHANNELS+i] ;
#else
profile = &GAU[INDEX*CHANNELS] ;
#endif
barrier(CLK_LOCAL_MEM_FENCE) ;
shift = round(doppler/WIDTH) ;
int i1 = max(shift, LIM[INDEX].x) ;
int i2 = min(CHANNELS, shift+LIM[INDEX].y) ;
#if (BYTRAN==0)
for(int itran=0; itran<NTRAN; itran++) {
if (SINGLE[itran]==0) continue ;
ntrue = &NTRUE[itran*CHANNELS] ;
nu = NU[INDEX*NTRAN+itran] ; // [CELLS*NTRAN]
nb_nb = NBNB[INDEX*NTRAN+itran] ;
if (fabs(nb_nb)<1.0e-37f) nb_nb=1.0e-37f ;
tmp_tau = dx*GL*nb_nb*GN[itran] ; // GN DOES NOT include GL [cm]
tmp_emit = weight * nu*Aul[itran] / tmp_tau ;
sum_delta_true = all_escaped = 0.0f ;
for(int ii=(i1+lid); ii<i2; ii+=LOCAL) {
# if 0
factor = 1.0f-exp(-tmp_tau*profile[ii-shift]) ;
#else
nu = tmp_tau*profile[ii-shift] ;
factor = (fabs(nu)>1.0e-4f) ? (1.0f-exp(-nu)) : (nu) ;
#endif
escape = tmp_emit*factor ; // emitted photons that escape current cell
absorbed = ntrue[ii]*factor ; // incoming photons that are absorbed
ntrue[ii] += escape-absorbed ;
sum_delta_true += absorbed ; // ignore photons absorbed in emitting cell
all_escaped += escape ; // sum of escaping photons over the profile
}
sij[itran*LOCAL+lid] = Blu[itran] * sum_delta_true / nb_nb ;
esc[itran*LOCAL+lid] = all_escaped ;
} // NTRAN
barrier(CLK_LOCAL_MEM_FENCE) ;
// sum over sij and esc -> ARES; one work item = one transition
for(int itran=lid; itran<NTRAN; itran+=LOCAL) {
dx = 0.0f ; factor = 0.0f ;
for(int i=0; i<LOCAL; i++) {
dx += sij[itran*LOCAL+i] ;
factor += esc[itran*LOCAL+i] ;
}
ARES[gid*NTRAN*CELLS+itran*CELLS+INDEX] += (float2)(dx, factor) ;
}
#else
// loop over transitions WORK ITEM PER LEVEL
for(int itran=lid; itran<NTRAN; itran+=LOCAL) {
if (SINGLE[itran]==0) continue ;
ntrue = &NTRUE[itran*CHANNELS] ;
nu = NU[INDEX*NTRAN+itran] ; // [CELLS*NTRAN]
nb_nb = NBNB[INDEX*NTRAN+itran] ;
if (fabs(nb_nb)<1.0e-37f) nb_nb=1.0e-37f ;
tmp_tau = dx*GL*nb_nb*GN[itran] ; // GN DOES NOT include GL [cm]
tmp_emit = weight * nu*Aul[itran] / tmp_tau ;
sum_delta_true = all_escaped = 0.0f ;
// each work item processes one channel at a time
for(int ii=i1; ii<i2; ii++) {
# if 0 // problems at small optical depths
factor = 1.0f-exp(-tmp_tau*profile[ii-shift]) ;
# else
nu = tmp_tau*profile[ii-shift] ;
factor = (fabs(nu)>1.0e-4) ? (1.0f-exp(-nu)) : (nu) ;
# endif
escape = tmp_emit*factor ; // emitted photons that escape current cell
absorbed = ntrue[ii]*factor ; // incoming photons that are absorbed
ntrue[ii] += escape-absorbed ;
sum_delta_true += absorbed ; // ignore photons absorbed in emitting cell
all_escaped += escape ; // sum of escaping photons over the profile
}
// update SIJ and ESC in local array
ARES[gid*NTRAN*CELLS+itran*CELLS+INDEX].x += Blu[itran] * sum_delta_true / nb_nb ;
ARES[gid*NTRAN*CELLS+itran*CELLS+INDEX].y += all_escaped ;
} // loop over transitions
#endif // BYTRAN
barrier(CLK_LOCAL_MEM_FENCE) ;
// next cell
INDEX += dstep ;
if (INDEX>=CELLS) break ; // going outwards and outermost shell was processed
if (INDEX<0) { // we went through the centre cell, turn outwards
INDEX = 1 ; dstep = 1 ;
} else {
if (STEP[iray*CELLS+INDEX]<=0.0f) { // ray does not go that far in, turn outwards
INDEX += 2 ; dstep = 1 ;
}
}
if (INDEX>=CELLS) break ;
} // while(1) ... loop over ray
} // for iray
}
__kernel void Spectra(
__global float4 *CLOUD, // 0 [CELLS]: Vrad, Rc, dummy, sigma
__global float *GAU, // 1 precalculated gaussian profiles [CELLS*CHANNELS]
__global int2 *LIM, // 2 first and last significant channels
__global float *NU, // 3 Nu [NTRAN*CELLS]
__global float *NBNB, // 4 nb_nb [NTRAN*CELLS]
__global float *STEP, // 5 average path length [NRAY*CELLS] (GL)
__global float *IP, // 6 impact parameter [NRAY] [GL]
const float BG, // 7 background value (photons per ray)
const int itran, // 8 the transition
const float EMI, // 9 h/(4*pi)*freq*Aul*int2temp
const float GN, // 10 GL **NOT** included
__global float *NTRUES, // 11 [NRAY*CHANNELS] ... without overlap
__global float *STAUS // 12 [NRAY*CHANNELS]emi
)
{
// Normal spectra without overlapping lines
// One work item per ray; the same rays as used in the actual calculation!
// NTRUES[] must be at least NRAY*CHANNELS
int id = get_global_id(0) ;
int lid = get_local_id(0) ;
if (id>=NRAY) return ; // no more rays
#if 1
__global float *NTRUE = &(NTRUES[id*CHANNELS]) ; // NRAY*CHANNELS
#else
__local float LNTRUES[LOCAL*CHANNELS] ;
__local float *NTRUE = &LNTRUES[lid*CHANNELS] ;
#endif
#if 1
__global float *STAU = &( STAUS[id*CHANNELS]) ;
#else
__local float LSTAUS[LOCAL*CHANNELS] ;
__local float *STAU = &LSTAUS[lid*CHANNELS] ;
#endif
int i ;
for(int i=0; i<CHANNELS; i++) {
NTRUE[i] = 0.0f ; STAU[i] = 0.0f ;
}
float tau, dtau, emissivity, doppler, nu, dx ;
int shift ;
__global float* profile ;
float ip = IP[id] ; // impact parameter [GL] ;
int INDEX=CELLS-1, dstep = -1 ;
// emissivity = (PLANCK/(4.0*PI))*freq*A_Aul*C_LIGHT*C_LIGHT/(2.0*BOLZMANN*freq*freq) ;
while (1) {
dx = STEP[id*CELLS+INDEX] ; // [GL]
nu = NU[INDEX*NTRAN+itran] ;
tau = (fabs(NBNB[INDEX*NTRAN+itran])<1.0e-28f) ? (1.0e-28f) : (NBNB[INDEX*NTRAN+itran]) ;
tau *= GN*GL*dx ; // 1e5*1e18 = 1e23?
tau = clamp(tau, -2.0f, 1.0e10f) ;
doppler = dstep*CLOUD[INDEX].x * sqrt( max(0.0f, 1.0f-pow(ip/CLOUD[INDEX].y,2.0f)) ) ;
profile = &GAU[INDEX*CHANNELS] ;
shift = round(doppler/WIDTH) ;
// emissivity = H_PIx4 * freq * nu * Aul *dx * I2T ;
emissivity = EMI * nu * dx * GL ;
// i in [0, CHANNELS[
// i-shift in [0,CHANNELS[ => i in [+shift, CHANNELS+shift[
for(i=max(0, shift); i<min(CHANNELS, CHANNELS+shift); i++) {
dtau = tau*profile[i-shift] ;
dx = emissivity*profile[i-shift]*GN*exp(-STAU[i]) ;
dx *= (fabs(dtau)>1.0e-5f) ? ((1.0f-exp(-dtau))/dtau) : (1.0f-0.5f*dtau) ;
NTRUE[i] += dx ;
STAU[i] += dtau ;
}
// next cell
INDEX += dstep ;
if (INDEX>=CELLS) break ; // going outwards and outermost shell was processed
if (INDEX<0) { // we went through the centre cell, turn outwards
INDEX = 1 ; dstep = 1 ;
} else {
if (STEP[id*CELLS+INDEX]<=0.0f) { // ray does not go further in, turn outwards
INDEX += 2 ; // we went through i but not i-1 => continue with i+1
dstep = 1 ;
}
}
if (INDEX>=CELLS) break ;
} // while(1)
for (i=0; i<CHANNELS; i++) NTRUE[i] -= BG*(1.0f-exp(-STAU[i])) ;
// if (id==0) printf("CENTRE TAU %10.3e\n", STAU[CHANNELS/2]) ;
}
// ===========================================================================================
__kernel void Overlap(
__global float4 *CLOUD, // 0 [CELLS]: Vrad, Rc, dummy, sigma
__global float *VOLUME, // 1 cell volume / cloud volume
__constant float *Aul, // 2 Aul [NTRAN]
__constant float *BLU, // 3 A_b [NTRAN] = B_lu * h*f / (4*pi)
__global float *GAU, // 4 precalculated gaussian profiles [CELLS*CHANNELS]
__constant int2 *LIM, // 5 first and last significant channels
__constant float *GN, // 6 gauss normalisation [NTRAN] -- GL **NOT** inlcuded
__global float *DIRWEI, // 7 individual weights for the rays
__global float *STEP, // 8 average path length [NRAY*CELLS] (GL)
__global float *APL, // 9 average path length [CELLS] (GL)
__global float *IP, // 10 impact parameter [NRAY] [GL]
__global float *BG, // 11 [NTRAN] (photons per ray)
__global float *NU, // 12 n_u [NTRAN*CELLS]
__global float *NBNB, // 13 nb_nb [NTRAN*CELLS]
__global float2 *ARES, // 14 [NWG*NTRAN*CELLS]
const int NCMP, // 15 number of lines in this band
const int NCHN, // 16 channels in combined spectrum
__constant int *TRAN, // 17 list of transitions
__constant float *OFF // 18 offsets in channels
#if (USE_LOCALS==0)
,__global float *NTRUES // 19 [4*NRAY*CHANNELS] !!!!!!
#endif
) {
// Each work group loops over rays.
// Each work item of a work group is working on the same ray at the same time.
// ????????????????????????????????????????????????????????????????????????
// GPU sometimes works, sometimes gives wrong (different) answers for Tex
// was caused by LOCAL being changed on the host side?
// ????????????????????????????????????????????????????????????????????????
float weight, dx, doppler, tmp_tau, tmp, tmp_emit, emi, own, phi ;
float s ;
int shift, INDEX, dstep, itran ;
int id = get_global_id(0) ; // *** not used ***
int lid = get_local_id(0) ; // index within the local work group
int gid = get_group_id(0) ; // work group index
if (gid>=NWG) return ; // NWG work groups used, may be less than allocated wgs
// one kernel call = loop over rays but ONLY A SINGLE BAND
// each loop needs TAU[MAXCHN], STAU[MAXCHN] that do fit in local
#if (USE_LOCALS==0)
__global float *NTRUE = &NTRUES[ gid*NCHN] ;
__global float *EMIT = &NTRUES[2*NWG*NCHN+gid*NCHN] ;
__global float *TAU = &NTRUES[ NWG*NCHN+gid*NCHN] ;
__global float *TT = &NTRUES[3*NWG*NCHN+gid*NCHN] ;
#else
// not significant effect... from 18 to 15 seconds
__local float EMIT[MAXCHN] ;
__local float TAU[MAXCHN] ;
__local float NTRUE[MAXCHN] ;
__local float TT[MAXCHN] ;
if (NCHN>MAXCHN) {
// printf("NOT ENOUGH CHANNELS ALLOCATED IN KERNEL OVERLAP !!!\n") ;
ARES[gid*NTRAN*CELLS+itran*CELLS+INDEX].x = +1.0e30f ;
ARES[gid*NTRAN*CELLS+itran*CELLS+INDEX].y = -1.0e30f ;
return ;
}
#endif
__local float sij[MAXCMP*LOCAL] ; // 32*20 < 3kB
__local float esc[MAXCMP*LOCAL] ; // only NCMP*LOCAL used
__local float profile[CHANNELS] ;
float bg = BG[TRAN[0]] ;
for(int iray=gid; iray<NRAY; iray+=NWG) { // work group loops over rays
INDEX = CELLS-1 ; // always starts with the outermost ray
dstep = -1 ; // we first go inwards (dstep<0), then outwards until ray exits
// for(int i=lid; i<NCHN; i+=LOCAL) NTRUE[i] = BG[TRAN[0]]*DIRWEI[iray] ;
for(int i=lid; i<NCHN; i+=LOCAL) NTRUE[i] = bg*DIRWEI[iray] ;
// NTRUE[] is updated only by this work group
barrier(CLK_GLOBAL_MEM_FENCE) ;
barrier(CLK_LOCAL_MEM_FENCE) ;
while(1) { // follow one ray through the cloud
// barrier(CLK_GLOBAL_MEM_FENCE) ;
barrier(CLK_LOCAL_MEM_FENCE) ;
dx = STEP[iray*CELLS+INDEX] ; // [GL]
s = dx*GL ; // [cm]
weight = DIRWEI[iray]*(dx/APL[INDEX])*VOLUME[INDEX] ; // VOLUME == fraction of cloud volume
doppler = dstep*CLOUD[INDEX].x * sqrt( max(0.0f, 1.0f-pow(IP[iray]/CLOUD[INDEX].y, 2.0f)) ) ;
// a single profile[CHANNELS] used by all transitions, the whole work group
for(int i=lid; i<CHANNELS; i+=LOCAL) profile[i] = GAU[INDEX*CHANNELS+i] ;
for(int i=lid; i<NCHN; i+= LOCAL) TAU[i] = 0.0f ;
for(int i=lid; i<NCHN; i+= LOCAL) EMIT[i] = 0.0f ;
for(int i=lid; i<NCMP*LOCAL; i+=LOCAL) sij[i] = 0.0f ;
for(int i=lid; i<NCMP*LOCAL; i+=LOCAL) esc[i] = 0.0f ;
barrier(CLK_LOCAL_MEM_FENCE) ;
// TAU[], EMIT[], sij[], esc[] updated only by this work group
// barrier(CLK_GLOBAL_MEM_FENCE) ;
// optical depth: tau == nb_nb*s*GN*GL
// phi = profile[]*GN = actual profile function
// tau = nb_nb*s*GN*profile[]
// sum of component optical depths => TAU
for(int itran0=0; itran0<NCMP; itran0++) { // loop over NCMP components
itran = TRAN[itran0] ;
tmp_tau = s * NBNB[INDEX*NTRAN+itran] * GN[itran] ;
shift = round(doppler/WIDTH) + OFF[itran0] ; // offset, current component
int i1 = max( -shift, LIM[INDEX].x) ;
int i2 = min(NCHN-shift, LIM[INDEX].y) ;
for(int i=(i1+lid); i<i2; i+=LOCAL) { // over profile
TAU[i+shift] += tmp_tau*profile[i] ;
}
// calculate the vector of total emission: SUM(nu*Aul*phi)
// nu*Aul*phi*s => weight*nu*Aul == true photons from cell / package / cloud volume
tmp = weight * NU[INDEX*NTRAN+itran] * Aul[itran] ;
for(int i=(i1+lid); i<i2; i+=LOCAL) { // phi * dnu = profile*GN * (1/GN) == profile
EMIT[i+shift] += tmp * profile[i] ; // photons within one channel
}
// TAU[], EMIT[] updated only by this work group
barrier(CLK_LOCAL_MEM_FENCE) ; // next loop, element updated by different work item?
// barrier(CLK_GLOBAL_MEM_FENCE) ; // next loop, element updated by different work item?
}
// calculate (1-exp(-TAU))/TAU for all channels
for(int i=lid; i<NCHN; i+=LOCAL) {
TT[i] = (fabs(TAU[i])>1.0e-5f) ? ((1.0f-exp(-TAU[i]))/TAU[i]) : (1.0f-0.5f*TAU[i]) ;
}
// TT[] updated only by this work group
// barrier(CLK_GLOBAL_MEM_FENCE) ; // make sure all have the final TT
barrier(CLK_LOCAL_MEM_FENCE) ;
// count absorptions of incoming photons: BLU*ntrue*phi*(1-exp(-tau))/tau
for(int itran0=0; itran0<NCMP; itran0++) { // loop over NCMP components
itran = TRAN[itran0] ;
shift = round(doppler/WIDTH) + OFF[itran0] ; // offset, current component
int i1 = max( -shift, LIM[INDEX].x) ;
int i2 = min(NCHN-shift, LIM[INDEX].y) ;
for(int i=(i1+lid); i<i2; i+=LOCAL) { // over the profile of this transition
phi = GN[itran]*profile[i] ; // PHI (absorption)
// TT = (1-exp(-tau))/tau
// tau = NBNB * s * GN * profile
sij[itran0*LOCAL+lid] += NTRUE[i+shift]*BLU[itran]*phi*s*TT[i+shift] ;
}
}
// sij[] updated only by this work group
// barrier(CLK_GLOBAL_MEM_FENCE) ;
barrier(CLK_LOCAL_MEM_FENCE) ;
// emission-absorption -- each element updated by the same work item as above
for(int itran0=0; itran0<NCMP; itran0++) {
itran = TRAN[itran0] ;
shift = round(doppler/WIDTH) + OFF[itran0] ; // offset, current component
int i1 = max( -shift, LIM[INDEX].x) ;
int i2 = min(NCHN-shift, LIM[INDEX].y) ;
for(int i=(i1+lid); i<i2; i+=LOCAL) { // over the profile of transition itran
emi = weight*NU[INDEX*NTRAN+itran]*Aul[itran]*profile[i] ; // emitted in one channel
phi = profile[i] * GN[itran] ; // profile function in current channel = phi(freq)
tmp_tau = s*NBNB[INDEX*NTRAN+itran]*phi ;
// SIJ -- count absorptions ignoring photons from current line
#if 0
tmp = (EMIT[i+shift]-emi) * BLU[itran]*phi*s*(1.0f-TT[i+shift])/TAU[i+shift] ;
# else
tmp = (EMIT[i+shift]-emi) * BLU[itran]*phi*s ;
tmp *= (fabs(TAU[i+shift])>1.0e-3f)
? ((1.0f-TT[i+shift])/TAU[i+shift]) : ( 0.5f-TAU[i+shift]/6.0f) ;
# endif
sij[itran0*LOCAL+lid] += tmp ;
// Escape = W*nu*Aul*profile * ((1-exp(-TAU))/TAU) * (tau/TAU))
// = emi * TT * tmp_tau/TAU
// if there were no other lines:
// esc[itran0*LOCAL+lid] += emi * TT[i+shift] * tmp_tau / TAU[i+shift] ;
// taking into account overlapping:
esc[itran0*LOCAL+lid] += emi * (1.0f-(1.0f-TT[i+shift])*tmp_tau/TAU[i+shift]) ;
}
} // for itran0
barrier(CLK_LOCAL_MEM_FENCE) ;
// sij[], esc[] updated only by this work group
// barrier(CLK_GLOBAL_MEM_FENCE) ;
// update NTRUE
for(int i=lid; i<NCHN; i+=LOCAL) {
NTRUE[i] *= exp(-TAU[i]) ;
NTRUE[i] += EMIT[i] * TT[i] ;
}
barrier(CLK_LOCAL_MEM_FENCE) ;
// NTRUE[] updated only by this work group
// barrier(CLK_GLOBAL_MEM_FENCE) ;
// sij[LOCAL*itran0+lid] -> ARES[...itran*CELLS+INDEX]
for(int itran0=lid; itran0<NCMP; itran0+=LOCAL) {
itran = TRAN[itran0] ;
tmp = 0.0f ;
for(int i=0; i<LOCAL; i++) tmp += sij[itran0*LOCAL+i] ;
ARES[gid*NTRAN*CELLS+itran*CELLS+INDEX].x += tmp ; // only this WG updates this element
tmp = 0.0f ;
for(int i=0; i<LOCAL; i++) tmp += esc[itran0*LOCAL+i] ;
ARES[gid*NTRAN*CELLS+itran*CELLS+INDEX].y += tmp ; // only this WG updates this element
}
barrier(CLK_LOCAL_MEM_FENCE) ;
// each element of ARES updated only by a single work group
// barrier(CLK_GLOBAL_MEM_FENCE) ;
// next cell
INDEX += dstep ;
if (INDEX>=CELLS) break ; // going outwards and outermost shell was processed
if (INDEX<0) { // we went through the centre cell, turn outwards
INDEX = 1 ; dstep = +1 ;
} else {
if (STEP[iray*CELLS+INDEX]<=0.0f) { // ray does not go that far in, turn outwards
INDEX += 2 ; dstep = +1 ;
if (INDEX>=CELLS) break ; // went tangentially through the outermost shell
}
}
} // while(1) --- loop through the cloud
barrier(CLK_LOCAL_MEM_FENCE) ;
barrier(CLK_GLOBAL_MEM_FENCE) ;
} // for iray
}
__kernel void Spectra_OL(
__global float4 *CLOUD, // 0 [CELLS]: Vrad, Rc, dummy, sigma
__global float *GAU, // 1 precalculated gaussian profiles [CELLS*CHANNELS]
__global int2 *LIM, // 2 first and last significant channels
__global float *GN, // 3 precalculated gaussian profiles [CELLS*CHANNELS]
__global float *NU, // 4 Nu [CELLS*NTRAN]
__global float *Aul, // 5 Aul [NTRAN]
__global float *NBNB, // 6 nb_nb [NTRAN*CELLS]
__global float *STEP, // 7 step lengths (GL)
__global float *IP, // 8 impact parameter [NRAY] [GL]
__global float *BG, // 9 background value (photons per ray)
__global float *NTRUES, // 10 [NRAY*CHANNELS] ... without overlap
__global float *STAUS, // 11 [NRAY*CHANNELS]emi
const int NCMP, // 12 number of lines in this band
const int NCHN, // 13 channels in combined spectrum
__global int *TRAN, // 14 list of transitions
__global float *OFF // 15 offsets in channels
)
{
// Write spectra in case of overlapping lines; one work group per ray
int id = get_global_id(0) ;
int lid = get_local_id(0) ;
int gid = get_group_id(0) ;
if (gid>=NWG) return ;
float doppler, dx, tmp, ip ;
int shift, itran, INDEX, dstep ;
__global float *profile ;
__global float *NTRUE ;
#if 0
__global float *EMIT = &NTRUES[(2*gid+1)*NCHN] ;
__global float *STAU = &(STAUS[gid*NCHN]) ;
#else
__local float EMIT[MAXCHN] ;
__local float STAU[MAXCHN] ;
#endif
__local float TAU[MAXCHN] ;
barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE) ;
// one work group per ray, loop over rays
for(int iray=gid; iray<NRAY; iray+=NWG) {
NTRUE = &NTRUES[iray*NCHN] ;
for(int i=lid; i<NCHN; i+=LOCAL) STAU[i] = 0.0f ; // shared by this WG
for(int i=lid; i<NCHN; i+=LOCAL) NTRUE[i] = 0.0f ; // shared by this WG
INDEX = CELLS-1 ;
dstep = -1 ;
ip = IP[iray] ;
while (1) {
barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE) ;
dx = STEP[iray*CELLS+INDEX] ; // [GL]
doppler = dstep*CLOUD[INDEX].x * sqrt( max(0.0f, 1.0f-pow(ip/CLOUD[INDEX].y,2.0f)) ) ;
// doppler = 0 ; // this was a leftover from some debuggin !!! 2018-07-27
profile = &GAU[INDEX*CHANNELS] ;
// sum of component optical depths => TAU
for(int i=lid; i<NCHN; i+=LOCAL) {
TAU[i] = 0.0f ; // optical depth of current cell
EMIT[i] = 0.0f ; // emission of current cell
}
barrier(CLK_LOCAL_MEM_FENCE) ;
barrier(CLK_GLOBAL_MEM_FENCE) ;
for(int itran0=0; itran0<NCMP; itran0++) { // loop over NCMP components
barrier(CLK_LOCAL_MEM_FENCE || CLK_GLOBAL_MEM_FENCE) ;
itran = TRAN[itran0] ; // index of the current transition
tmp = dx*GL*GN[itran]*NBNB[INDEX*NTRAN+itran] ; // optical depth; GN **NOT* including GL
shift = round(doppler/WIDTH) + OFF[itran0] ; // offset of the first channel
int i1 = max( -shift, LIM[INDEX].x) ; // LIM = first and last significant channel
int i2 = min(NCHN-shift, LIM[INDEX].y) ;
for(int i=(i1+lid); i<i2; i+=LOCAL) { // loop over profile of this comp.
TAU[i+shift] += tmp*profile[i] ;
}
tmp = NU[INDEX*NTRAN+itran]*Aul[itran] ; // emitted photons / cm3 / channel
for(int i=(i1+lid); i<i2; i+=LOCAL) { // over present component profile only
EMIT[i+shift] += tmp*profile[i] ; // photons in one channel / cm3
}
}
barrier(CLK_LOCAL_MEM_FENCE) ;
barrier(CLK_GLOBAL_MEM_FENCE) ;
for(int i=lid; i<NCHN; i+=LOCAL) { // over the full band
tmp = TAU[i] ; // local optical depth
tmp = (fabs(tmp)>1.0e-5f) ? ((1.0f-exp(-tmp))/tmp) : (1.0f-0.5f*tmp) ;
NTRUE[i] += EMIT[i]*dx*GL*exp(-STAU[i])*tmp ; // observed photons
STAU[i] += TAU[i] ; // cumulative optical depth
}
// next cell
INDEX += dstep ;
if (INDEX>=CELLS) break ; // going outwards and outermost shell was processed
if (INDEX<0) { // we went through the centre cell, turn outwards
INDEX = 1 ; dstep = 1 ;
} else {
if (STEP[iray*CELLS+INDEX]<=0.0f) { // ray does not go further in, turn outwards
INDEX += 2 ; // we went through i but not into i-1 => continue with shell i+1
dstep = 1 ;
if (INDEX>=CELLS) break ;
}
}
} // while(1)
barrier(CLK_LOCAL_MEM_FENCE) ;
barrier(CLK_GLOBAL_MEM_FENCE) ;
for (int i=lid; i<NCHN; i+=LOCAL) NTRUE[i] -= BG[TRAN[0]]*(1.0f-exp(-STAU[i])) ;
barrier(CLK_GLOBAL_MEM_FENCE) ;
barrier(CLK_LOCAL_MEM_FENCE) ;
// if (iray==0) printf("CENTRE TAU %10.3e BG[%d] %10.3e\n", STAU[NCHN/2], TRAN[0], BG[TRAN[0]]) ;
} // for iray
}