forked from apache/madlib
-
Notifications
You must be signed in to change notification settings - Fork 4
/
sortasort.c
237 lines (211 loc) · 7.3 KB
/
sortasort.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
/*!
* \file sortasort.c
*
* \brief sortasort dictionary implementation
*/
/*!
* A "sortasort" is a pre-marshalled *set* (no dups) of values intended for
* append and query operations only (no deletion). It's not a
* particularly smart data structure. Cuckoo hashing would be a
* fancier solution.
*/
/*
* It is structured as a header, followed by a fixed-length "directory"
* (an array of offsets) that point to the actual Datums
* concatenated in a variable-length array at the end of the directory.
* The initial directory entries are sorted in ascending order of the Datums
* they point to, but the last < SORTA_SLOP entries are left unsorted to facilitate
* efficient insertion. Binary Search is used on all but those last entries,
* which must be scanned. At every k*SORTA_SLOP'th insert, the full directory is
* sorted.
*/
//#include <ctype.h>
#include <postgres.h>
#include <fmgr.h>
#include "sortasort.h"
#include "sketch_support.h"
#define SORTASORT_DATA(s) (((char *)(s->dir)) + \
(s->capacity * (sizeof (s->dir[0]))))
/*
* get the ith item stored in sortasort
*/
char *sortasort_getval(sortasort *s, unsigned i) {
char *data_ptr = SORTASORT_DATA(s);
char *res = NULL;
Datum dat;
if (i >= s->num_vals) {
elog(ERROR, "attempt to get item at illegal index %d in sortasort", i);
}
if (s->dir[i] >= s->storage_sz) {
elog(ERROR, "illegal offset %u in sortasort", s->dir[i]);
}
res = data_ptr + s->dir[i];
dat = PointerExtractDatum(res, s->typByVal);
if (s->dir[i]
+ ExtractDatumLen(dat, s->typLen, s->typByVal, s->storage_sz - s->dir[i])
> s->storage_sz) {
elog(ERROR, "value overruns size of sortasort");
}
return res;
}
/*!
* given a pre-allocated sortasort, set up its metadata
* first argument s is the
* \param s a pre-allocated sortasort
* \param capacity size of the sortasort directory
* \param s_sz size of s
* \param typLen Postgres type length (-1 for bytea, -2 for cstring)
*/
sortasort *
sortasort_init(sortasort *s,
size_t capacity,
size_t s_sz,
int typLen,
bool typByVal)
{
/* capacity is the size of the directory: i.e. max number of Datums it can hold */
s->capacity = capacity;
/* storage_sz is the number of bytes available for Datums at the end. */
s->storage_sz = s_sz - sizeof(sortasort) - capacity*sizeof(s->dir[0]);
if (s_sz - sizeof(sortasort) <= capacity*sizeof(s->dir[0]))
elog(
ERROR,
"sortasort initialized too small to hold its own directory");
s->typLen = typLen;
s->typByVal = typByVal;
/* number of values so far */
s->num_vals = 0;
/* offset after the directory to do the next insertion */
s->storage_cur = 0;
return(s);
}
/*! comparison function for qsort_arg */
int sorta_cmp(const void *i, const void *j, void *thunk)
{
/* the "thunk" in this case is the sortasort being sorted */
sortasort *s = (sortasort *)thunk;
char *dat1 = sortasort_getval(s, ((unsigned *)i) - s->dir);
char *dat2 = sortasort_getval(s, ((unsigned *)j) - s->dir);
int len = s->typLen;
int shorter;
/*
* we always use typByVal = true, since we've marshalled the data into place
*/
if (len < 0) {
len = (int)ExtractDatumLen(PointerGetDatum(dat1), len, s->typByVal, -1);
if ((shorter = (len - ExtractDatumLen(PointerGetDatum(dat2), len, s->typByVal, -1))))
/* order by length */
return shorter;
/* else drop through */
}
/* byte ordering */
return memcmp(dat1, dat2, len);
}
/*!
* insert a new element into s_in if there's room and return TRUE
* if not enough room, return FALSE
* \param s_in a sortasort
* \param dat a Datum to insert
* \param len the Postgres typLen
*/
int sortasort_try_insert(sortasort *s_in, Datum dat, int len)
{
/* sanity check */
void *datp = DatumExtractPointer(dat, s_in->typByVal);
/* first check to see if the element is already there */
int found = sortasort_find(s_in, dat);
if (found >= 0 && found < (int)s_in->num_vals) {
/* found! just return TRUE */
return TRUE;
}
len = ExtractDatumLen(dat, len, s_in->typByVal, -1);
/* sanity check */
if (found < -1 || found >= (int) s_in->num_vals)
elog(ERROR,
"invalid offset %d returned by sortasort_find",
found);
/* we need to insert v. return FALSE if not enough space. */
if (s_in->storage_cur + len >= s_in->storage_sz) {
/* caller will have to allocate a bigger one and try again */
return FALSE;
}
/* return -1 if no more capacity */
if (s_in->num_vals >= s_in->capacity)
return -1;
/*
* copy dat to the current storage offset, put a pointer into dir,
* update num_vals and storage_cur
*/
memcpy(SORTASORT_DATA(s_in) + s_in->storage_cur,
datp,
len);
s_in->dir[s_in->num_vals++] = s_in->storage_cur;
s_in->storage_cur += len;
if (s_in->storage_cur > s_in->storage_sz)
elog(ERROR, "went off the end of sortasort storage");
/* re-sort every SORTA_SLOP vals */
if (s_in->num_vals % SORTA_SLOP == 0)
qsort_arg(s_in->dir,
s_in->num_vals,
sizeof(s_in->dir[0]),
sorta_cmp,
(void *)s_in);
return TRUE;
}
/*!
* find items in a sortasort. this involves binary search in the sorted prefix,
* and linear search in the <SORTA_SLOP-sized suffix.
* We assume that the sorted prefix is the
* highest multiple of SORTA_SLOP less than s->num_vals
*
* \param s a sortasort
* \param v a value to find
* \return position in directory where item found, or -1 if not found.
* NOTE: return value 0 does not mean false!! It means it *found* the item
* at position 0
*/
int sortasort_find(sortasort *s, Datum dat)
{
int theguess, diff;
int hi = (s->num_vals/SORTA_SLOP)*SORTA_SLOP;
int themin = 0, themax = hi - 1;
size_t i;
int addend, subtrahend;
size_t len = ExtractDatumLen(dat, s->typLen, s->typByVal, -1);
/* binary search on the front of the sortasort */
if (themax >= (int)s->num_vals) {
elog(ERROR,
"sortasort failure: max = %d, num_vals = %zu",
themax,
s->num_vals);
}
theguess = hi / 2;
while (themin < themax ) {
if (!(diff = memcmp(sortasort_getval(s,theguess),
DatumExtractPointer(dat, s->typByVal),
len)))
return theguess;
if (themin == themax - 1) break;
else if (diff < 0) {
/* undershot */
addend = (themax - theguess) / 2;
if (!addend) addend = 1;
themin = theguess;
theguess += addend;
}
else {
/* overshot */
subtrahend = (theguess - themin) / 2;
if (!subtrahend) subtrahend = 1;
themax = theguess;
theguess -= subtrahend;
}
}
/* if we got here, continue with a naive linear search on the tail */
for (i = hi; i < s->num_vals; i++)
if (!memcmp(sortasort_getval(s, i),
DatumExtractPointer(dat, s->typByVal),
len))
return i;
return -1;
}