Skip to content

Commit

Permalink
Added support for bit to IVFFlat
Browse files Browse the repository at this point in the history
  • Loading branch information
ankane committed Apr 17, 2024
1 parent 819b6cf commit 04af15c
Show file tree
Hide file tree
Showing 14 changed files with 315 additions and 3 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

- Added `halfvec` type
- Added `sparsevec` type
- Added support for `bit` vectors to HNSW
- Added support for indexing `bit` type
- Added `binary_quantize` function
- Added `hamming_distance` function
- Added `jaccard_distance` function
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,7 @@ Supported types are:

- `vector` - up to 2,000 dimensions
- `halfvec` - up to 4,000 dimensions (unreleased)
- `bit` - up to 64,000 dimensions (unreleased)

### Query Options

Expand Down
12 changes: 12 additions & 0 deletions sql/vector--0.6.2--0.7.0.sql
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,18 @@ CREATE OPERATOR <%> (
COMMUTATOR = '<%>'
);

-- Operator class that lets an ivfflat index be built on a bit column and
-- ordered by the <~> operator.
-- <~> is registered FOR ORDER BY (an ordering operator whose result is sorted
-- via float_ops), so it drives ORDER BY val <~> ... rather than WHERE filters.
-- Support functions 1 and 3 are both bound to hamming_distance(bit, bit).
-- NOTE(review): 1 and 3 are presumably ivfflat's scan-distance and
-- k-means-distance support procs -- confirm against the access method source.
CREATE OPERATOR CLASS bit_hamming_ops
FOR TYPE bit USING ivfflat AS
OPERATOR 1 <~> (bit, bit) FOR ORDER BY float_ops,
FUNCTION 1 hamming_distance(bit, bit),
FUNCTION 3 hamming_distance(bit, bit);

-- Operator class that lets an ivfflat index be built on a bit column and
-- ordered by the <%> operator.
-- <%> is registered FOR ORDER BY (an ordering operator whose result is sorted
-- via float_ops), so it drives ORDER BY val <%> ... rather than WHERE filters.
-- Support functions 1 and 3 are both bound to jaccard_distance(bit, bit).
-- NOTE(review): 1 and 3 are presumably ivfflat's scan-distance and
-- k-means-distance support procs -- confirm against the access method source.
CREATE OPERATOR CLASS bit_jaccard_ops
FOR TYPE bit USING ivfflat AS
OPERATOR 1 <%> (bit, bit) FOR ORDER BY float_ops,
FUNCTION 1 jaccard_distance(bit, bit),
FUNCTION 3 jaccard_distance(bit, bit);

CREATE OPERATOR CLASS bit_hamming_ops
FOR TYPE bit USING hnsw AS
OPERATOR 1 <~> (bit, bit) FOR ORDER BY float_ops,
Expand Down
12 changes: 12 additions & 0 deletions sql/vector.sql
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,18 @@ CREATE OPERATOR <%> (

-- bit opclasses

-- ivfflat operator class for bit columns, ordered by the <~> operator.
-- The operator is declared FOR ORDER BY with float_ops as its sort family,
-- i.e. it is an ordering operator, not a search operator.
-- Support functions 1 and 3 both point at hamming_distance(bit, bit).
-- NOTE(review): the meaning of support numbers 1 and 3 is defined by the
-- ivfflat access method (likely distance / k-means distance) -- confirm there.
CREATE OPERATOR CLASS bit_hamming_ops
FOR TYPE bit USING ivfflat AS
OPERATOR 1 <~> (bit, bit) FOR ORDER BY float_ops,
FUNCTION 1 hamming_distance(bit, bit),
FUNCTION 3 hamming_distance(bit, bit);

-- ivfflat operator class for bit columns, ordered by the <%> operator.
-- The operator is declared FOR ORDER BY with float_ops as its sort family,
-- i.e. it is an ordering operator, not a search operator.
-- Support functions 1 and 3 both point at jaccard_distance(bit, bit).
-- NOTE(review): the meaning of support numbers 1 and 3 is defined by the
-- ivfflat access method (likely distance / k-means distance) -- confirm there.
CREATE OPERATOR CLASS bit_jaccard_ops
FOR TYPE bit USING ivfflat AS
OPERATOR 1 <%> (bit, bit) FOR ORDER BY float_ops,
FUNCTION 1 jaccard_distance(bit, bit),
FUNCTION 3 jaccard_distance(bit, bit);

CREATE OPERATOR CLASS bit_hamming_ops
FOR TYPE bit USING hnsw AS
OPERATOR 1 <~> (bit, bit) FOR ORDER BY float_ops,
Expand Down
5 changes: 5 additions & 0 deletions src/ivfbuild.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "access/tableam.h"
#include "access/parallel.h"
#include "access/xact.h"
#include "bitvector.h"
#include "catalog/index.h"
#include "catalog/pg_operator_d.h"
#include "catalog/pg_type_d.h"
Expand Down Expand Up @@ -324,6 +325,8 @@ GetMaxDimensions(IvfflatType type)

if (type == IVFFLAT_TYPE_HALFVEC)
maxDimensions *= 2;
else if (type == IVFFLAT_TYPE_BIT)
maxDimensions *= 32;

return maxDimensions;
}
Expand All @@ -338,6 +341,8 @@ GetItemSize(IvfflatType type, int dimensions)
return VECTOR_SIZE(dimensions);
else if (type == IVFFLAT_TYPE_HALFVEC)
return HALFVEC_SIZE(dimensions);
else if (type == IVFFLAT_TYPE_BIT)
return VARBITTOTALLEN(dimensions);
else
elog(ERROR, "Unsupported type");
}
Expand Down
3 changes: 2 additions & 1 deletion src/ivfflat.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@
typedef enum IvfflatType
{
IVFFLAT_TYPE_VECTOR,
IVFFLAT_TYPE_HALFVEC
IVFFLAT_TYPE_HALFVEC,
IVFFLAT_TYPE_BIT
} IvfflatType;

/* Build phases */
Expand Down
65 changes: 64 additions & 1 deletion src/ivfkmeans.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
#include <float.h>
#include <math.h>

#include "bitvector.h"
#include "halfutils.h"
#include "halfvec.h"
#include "ivfflat.h"
#include "miscadmin.h"
#include "utils/builtins.h"
#include "utils/datum.h"
#include "utils/memutils.h"
#include "vector.h"
Expand Down Expand Up @@ -134,6 +136,15 @@ CompareHalfVectors(const void *a, const void *b)
return halfvec_cmp_internal((HalfVector *) a, (HalfVector *) b);
}

/*
* Compare bit vectors
*/
static int
CompareBitVectors(const void *a, const void *b)
{
return DirectFunctionCall2(bitcmp, VarBitPGetDatum((VarBit *) a), VarBitPGetDatum((VarBit *) b));
}

/*
* Quick approach if we have little data
*/
Expand All @@ -151,6 +162,8 @@ QuickCenters(Relation index, VectorArray samples, VectorArray centers, IvfflatTy
qsort(samples->items, samples->length, samples->itemsize, CompareVectors);
else if (type == IVFFLAT_TYPE_HALFVEC)
qsort(samples->items, samples->length, samples->itemsize, CompareHalfVectors);
else if (type == IVFFLAT_TYPE_BIT)
qsort(samples->items, samples->length, samples->itemsize, CompareBitVectors);
else
elog(ERROR, "Unsupported type");

Expand Down Expand Up @@ -191,6 +204,16 @@ QuickCenters(Relation index, VectorArray samples, VectorArray centers, IvfflatTy
for (int j = 0; j < dimensions; j++)
vec->x[j] = Float4ToHalfUnchecked((float) RandomDouble());
}
else if (type == IVFFLAT_TYPE_BIT)
{
	/* Fill this center with a random bit string of the index's dimensions */
	VarBit	   *vec = DatumGetVarBitP(center);
	unsigned char *bits;

	SET_VARSIZE(vec, VARBITTOTALLEN(dimensions));
	VARBITLEN(vec) = dimensions;
	bits = VARBITS(vec);

	/*
	 * Bits are OR'ed in below, so start from all-zero bytes (defensive: does
	 * not depend on how the center buffer was allocated).
	 */
	for (uint32 k = 0; k < VARBITBYTES(vec); k++)
		bits[k] = 0;

	/*
	 * Bit j lives in byte j / 8, most significant bit first. Indexing by
	 * j / dimensions would always select byte 0 and leave the remaining
	 * bytes unset.
	 */
	for (int j = 0; j < dimensions; j++)
		bits[j / 8] |= (RandomDouble() > 0.5 ? 1 : 0) << (7 - (j % 8));
}
else
elog(ERROR, "Unsupported type");

Expand Down Expand Up @@ -263,6 +286,17 @@ ComputeNewCenters(VectorArray samples, VectorArray aggCenters, VectorArray newCe
aggCenter->x[k] += HalfToFloat4(vec->x[k]);
}
}
else if (type == IVFFLAT_TYPE_BIT)
{
for (int j = 0; j < numSamples; j++)
{
Vector *aggCenter = (Vector *) VectorArrayGet(aggCenters, closestCenters[j]);
VarBit *vec = (VarBit *) VectorArrayGet(samples, j);

for (int k = 0; k < dimensions; k++)
aggCenter->x[k] += (float) (((VARBITS(vec)[k / 8]) >> (7 - (k % 8))) & 0x01);
}
}
else
elog(ERROR, "Unsupported type");

Expand Down Expand Up @@ -308,6 +342,21 @@ ComputeNewCenters(VectorArray samples, VectorArray aggCenters, VectorArray newCe
newCenter->x[k] = Float4ToHalfUnchecked(aggCenter->x[k]);
}
}
else if (type == IVFFLAT_TYPE_BIT)
{
for (int j = 0; j < numCenters; j++)
{
Vector *aggCenter = (Vector *) VectorArrayGet(aggCenters, j);
VarBit *newCenter = (VarBit *) VectorArrayGet(newCenters, j);
unsigned char *nx = VARBITS(newCenter);

for (uint32 k = 0; k < VARBITBYTES(newCenter); k++)
nx[k] = 0;

for (int k = 0; k < dimensions; k++)
nx[k / 8] |= (aggCenter->x[k] > 0.5) << (7 - (k % 8));
}
}

/* Normalize if needed */
if (normprocinfo != NULL)
Expand Down Expand Up @@ -425,6 +474,18 @@ ElkanKmeans(Relation index, VectorArray samples, VectorArray centers, IvfflatTyp
vec->dim = dimensions;
}
}
else if (type == IVFFLAT_TYPE_BIT)
{
newCenters = VectorArrayInit(numCenters, dimensions, centers->itemsize);

for (int j = 0; j < numCenters; j++)
{
VarBit *vec = (VarBit *) VectorArrayGet(newCenters, j);

SET_VARSIZE(vec, VARBITTOTALLEN(dimensions));
VARBITLEN(vec) = dimensions;
}
}
else
elog(ERROR, "Unsupported type");

Expand Down Expand Up @@ -642,7 +703,7 @@ CheckCenters(Relation index, VectorArray centers, IvfflatType type)
elog(ERROR, "Infinite value detected. Please report a bug.");
}
}
else
else if (type != IVFFLAT_TYPE_BIT)
elog(ERROR, "Unsupported type");
}

Expand All @@ -652,6 +713,8 @@ CheckCenters(Relation index, VectorArray centers, IvfflatType type)
qsort(centers->items, centers->length, centers->itemsize, CompareVectors);
else if (type == IVFFLAT_TYPE_HALFVEC)
qsort(centers->items, centers->length, centers->itemsize, CompareHalfVectors);
else if (type == IVFFLAT_TYPE_BIT)
qsort(centers->items, centers->length, centers->itemsize, CompareBitVectors);
else
elog(ERROR, "Unsupported type");

Expand Down
3 changes: 3 additions & 0 deletions src/ivfscan.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <float.h>

#include "access/relscan.h"
#include "bitvector.h"
#include "catalog/pg_operator_d.h"
#include "catalog/pg_type_d.h"
#include "halfvec.h"
Expand Down Expand Up @@ -195,6 +196,8 @@ GetScanValue(IndexScanDesc scan)
value = PointerGetDatum(InitVector(so->dimensions));
else if (type == IVFFLAT_TYPE_HALFVEC)
value = PointerGetDatum(InitHalfVector(so->dimensions));
else if (type == IVFFLAT_TYPE_BIT)
value = PointerGetDatum(InitBitVector(so->dimensions));
else
elog(ERROR, "Unsupported type");
}
Expand Down
3 changes: 3 additions & 0 deletions src/ivfutils.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ IvfflatGetType(Relation index)
Form_pg_type type;
IvfflatType result;

if (typid == BITOID)
return IVFFLAT_TYPE_BIT;

tuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typid));
if (!HeapTupleIsValid(tuple))
elog(ERROR, "cache lookup failed for type %u", typid);
Expand Down
32 changes: 32 additions & 0 deletions test/expected/ivfflat_bit_hamming.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
SET enable_seqscan = off;
CREATE TABLE t (val bit(3));
INSERT INTO t (val) VALUES (B'000'), (B'100'), (B'111'), (NULL);
CREATE INDEX ON t USING ivfflat (val bit_hamming_ops) WITH (lists = 1);
INSERT INTO t (val) VALUES (B'110');
SELECT * FROM t ORDER BY val <~> B'111';
val
-----
111
110
100
000
(4 rows)

SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <~> (SELECT NULL::bit)) t2;
count
-------
4
(1 row)

DROP TABLE t;
-- TODO move
CREATE TABLE t (val varbit(3));
CREATE INDEX ON t USING ivfflat (val bit_hamming_ops) WITH (lists = 1);
ERROR: type not supported for ivfflat index
CREATE INDEX ON t USING ivfflat ((val::bit(3)) bit_hamming_ops) WITH (lists = 1);
NOTICE: ivfflat index created with little data
DETAIL: This will cause low recall.
HINT: Drop the index until the table has more data.
CREATE INDEX ON t USING ivfflat ((val::bit(64001)) bit_hamming_ops) WITH (lists = 1);
ERROR: column cannot have more than 64000 dimensions for ivfflat index
DROP TABLE t;
21 changes: 21 additions & 0 deletions test/expected/ivfflat_bit_jaccard.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
SET enable_seqscan = off;
CREATE TABLE t (val bit(4));
INSERT INTO t (val) VALUES (B'0000'), (B'1100'), (B'1111'), (NULL);
CREATE INDEX ON t USING ivfflat (val bit_jaccard_ops) WITH (lists = 1);
INSERT INTO t (val) VALUES (B'1110');
SELECT * FROM t ORDER BY val <%> B'1111';
val
------
1111
1110
1100
0000
(4 rows)

SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <%> (SELECT NULL::bit)) t2;
count
-------
4
(1 row)

DROP TABLE t;
19 changes: 19 additions & 0 deletions test/sql/ivfflat_bit_hamming.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
SET enable_seqscan = off;

CREATE TABLE t (val bit(3));
INSERT INTO t (val) VALUES (B'000'), (B'100'), (B'111'), (NULL);
CREATE INDEX ON t USING ivfflat (val bit_hamming_ops) WITH (lists = 1);

INSERT INTO t (val) VALUES (B'110');

SELECT * FROM t ORDER BY val <~> B'111';
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <~> (SELECT NULL::bit)) t2;

DROP TABLE t;

-- TODO move
CREATE TABLE t (val varbit(3));
CREATE INDEX ON t USING ivfflat (val bit_hamming_ops) WITH (lists = 1);
CREATE INDEX ON t USING ivfflat ((val::bit(3)) bit_hamming_ops) WITH (lists = 1);
CREATE INDEX ON t USING ivfflat ((val::bit(64001)) bit_hamming_ops) WITH (lists = 1);
DROP TABLE t;
12 changes: 12 additions & 0 deletions test/sql/ivfflat_bit_jaccard.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
SET enable_seqscan = off;

CREATE TABLE t (val bit(4));
INSERT INTO t (val) VALUES (B'0000'), (B'1100'), (B'1111'), (NULL);
CREATE INDEX ON t USING ivfflat (val bit_jaccard_ops) WITH (lists = 1);

INSERT INTO t (val) VALUES (B'1110');

SELECT * FROM t ORDER BY val <%> B'1111';
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <%> (SELECT NULL::bit)) t2;

DROP TABLE t;

0 comments on commit 04af15c

Please sign in to comment.