/
mmul.4th
186 lines (159 loc) · 4.58 KB
/
mmul.4th
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
\ mmul.4th
\
\ Double precision floating point matrix multiplication.
\
\ Krishna Myneni
\
\ Usage: a1 a2 a3 nr1 nc1 nc2 df_mmul
\
\ Requires:
\ ans-words.4th
\ modules.4th
\ fsl/fsl-util.4th
\
\ Revisions:
\ 2017-05-17 km; created.
\ 2017-05-18 km; precompute row offsets for a1 and a2;
\ additional test case for product of
\ 3x4 with 4x4; made into FSL-style module.
\ 2017-05-19 km; fixed declaration of floating point arrays
\ in test code -- previously declared as
\ double integer.
\ 2017-05-20 km; replacing >R DFLOAT+ R> with SWAP DFLOAT+ SWAP
\ in DF_MUL_R1C2, suggested by Stephen Pelc,
\ reduces execution time by 16% for large matrices
\ under kforth-fast.
\ 2017-05-21 km; added SET_MMUL_PARAMS to be able to use DF_R1C2>A1A2
\ and DF_MUL_R1C2 independently of DF_MMUL.
\ 2021-05-09 km; added separate stack versions of words.
\ 2023-12-05 km; fix comment for DF_MMUL.
\
\ Notes:
\ 0. Matrix data is assumed to be stored in row order
\
CR .( MMUL V1.3 09 May 2021 )
BEGIN-MODULE
BASE @
DECIMAL
variable nc1
variable nc2
variable a1
variable a2
variable roffs1
variable roffs2
Public:
\ Convert row of a1 and column of a2 to
\ corresponding addresses
: df_r1c2>a1a2 ( row1 col2 -- arow1 acol2 )
dfloats a2 a@ + >r
roffs1 @ * a1 a@ + r> \ -- arow1 acol2
;
\ Multiply row of a1 with col of a2, element by element,
\ and accumulate the sum.
[DEFINED] FDEPTH [IF]
: df_mul_r1c2 ( row1 col2 -- ) ( F: -- rsum )
df_r1c2>a1a2
0e
nc1 @ 0 DO
2dup f@ f@ f* f+
roffs2 @ +
swap dfloat+ swap
LOOP
2drop ;
[ELSE]
: df_mul_r1c2 ( row1 col2 -- rsum )
df_r1c2>a1a2
2>r 0e 2r> \ rsum a1 a2
nc1 @ 0 DO
2dup 2>r >r f@ r> f@ f* f+
2r> roffs2 @ +
swap dfloat+ swap
LOOP
2drop ;
[THEN]
: set_mmul_params ( a1 a2 a3 nr1 nc1 nc2 -- a3 nr1 )
nc2 ! nc1 ! 2>r a2 ! a1 !
\ offsets to next row for a1 and a2
nc1 @ dfloats roffs1 !
nc2 @ dfloats roffs2 !
2r> ;
\ Multiply two double-precision matrices with data beginning at
\ a1 and a2, and store at a3. Proper memory allocation is
\ assumed, as are the dimensions for a2, i.e. nr2 = nc1 is
\ assumed.
[DEFINED] FDEPTH [IF]
: df_mmul ( a1 a2 a3 nr1 nc1 nc2 -- )
set_mmul_params
0 DO
nc2 @ 0 DO
J I df_mul_r1c2 dup f!
dfloat+
LOOP
LOOP
drop ;
[ELSE]
: df_mmul ( a1 a2 a3 nr1 nc1 nc2 -- )
set_mmul_params
0 DO
nc2 @ 0 DO
J I df_mul_r1c2 2 pick f!
dfloat+
LOOP
LOOP
drop ;
[THEN]
BASE !
END-MODULE
TEST-CODE? [IF]
[undefined] T{ [IF] s" ttester.4th" included [THEN]
base @
decimal
\ Allot and initialize three 2x2 matrices
2 2 dfloat matrix a{{
2 2 dfloat matrix b{{
2 2 dfloat matrix c{{
cr
t{ 1e a{{ 0 0 }} f! -> }t
t{ 2e a{{ 0 1 }} f! -> }t
t{ 3e a{{ 1 0 }} f! -> }t
t{ 4e a{{ 1 1 }} f! -> }t
t{ 5e b{{ 0 0 }} f! -> }t
t{ 6e b{{ 0 1 }} f! -> }t
t{ 7e b{{ 1 0 }} f! -> }t
t{ 8e b{{ 1 1 }} f! -> }t
TESTING df_mmul
set-near
1e-16 rel-near f!
t{ a{{ 0 0 }} b{{ 0 0 }} c{{ 0 0 }} 2 2 2 df_mmul -> }t
t{ c{{ 0 0 }} f@ -> 19e r}t
t{ c{{ 0 1 }} f@ -> 22e r}t
t{ c{{ 1 0 }} f@ -> 43e r}t
t{ c{{ 1 1 }} f@ -> 50e r}t
\ Compute the product of a 3x4 matrix with a 4x4 matrix
3 4 dfloat matrix d{{
4 4 dfloat matrix e{{
3 4 dfloat matrix f{{
t{ 1e 0.5e 0.25e 2e
1e 3e f/ 0.75e 5e 6e f/ 3e
2e 3e f/ 5e 4e f/ 6e 7e f/ 11e 12e f/
3 4 d{{ }}fput -> }t
t{ 10e 9e 8e 7e
6e 5e 4e 3e
2e 1e 0.5e 0.25e
1e 8e f/ 1e 16e f/ 1e 3e f/ 2e 3e f/
4 4 e{{ }}fput -> }t
t{ d{{ 0 0 }} e{{ 0 0 }} f{{ 0 0 }} 3 4 4 df_mmul -> }t
t{ f{{ 0 0 }} f@ -> 10e 3e f+ 0.5e f+ 0.25e f+ r}t
t{ f{{ 0 1 }} f@ -> 9e 2.5e f+ 0.25e f+ 0.125e f+ r}t
t{ f{{ 0 2 }} f@ -> 8e 2e f+ 0.125e f+ 2e 3e f/ f+ r}t
t{ f{{ 0 3 }} f@ -> 7e 1.5e f+ 1e 16e f/ f+ 4e 3e f/ f+ r}t
t{ f{{ 1 0 }} f@ -> 10e 3e f/ 4.5e f+ 5e 3e f/ f+ 3e 8e f/ f+ r}t
t{ f{{ 1 1 }} f@ -> 3e 3.75e f+ 5e 6e f/ f+ 3e 16e f/ f+ r}t
t{ f{{ 1 2 }} f@ -> 8e 3e f/ 3e f+ 5e 12e f/ f+ 1e f+ r}t
t{ f{{ 1 3 }} f@ -> 7e 3e f/ 2.25e f+ 5e 24e f/ f+ 2e f+ r}t
t{ f{{ 2 0 }} f@ -> 20e 3e f/ 7.5e f+ 12e 7e f/ f+ 11e 96e f/ f+ r}t
t{ f{{ 2 1 }} f@ -> 6e 6.25e f+ 6e 7e f/ f+ 11e 192e f/ f+ r}t
t{ f{{ 2 2 }} f@ -> 16e 3e f/ 5e f+ 3e 7e f/ f+ 11e 36e f/ f+ r}t
t{ f{{ 2 3 }} f@ -> 14e 3e f/ 3.75e f+ 3e 14e f/ f+ 11e 18e f/ f+ r}t
base !
[THEN]