In [None]:
import sys

if "google.colab" in sys.modules:
    !git clone https://github.com/rapidsai/rapidsai-csp-utils.git
    !python rapidsai-csp-utils/colab/pip-install.py
    !pip install faiss-cpu

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 592, done.[K
remote: Counting objects: 100% (158/158), done.[K
remote: Compressing objects: 100% (76/76), done.[K
remote: Total 592 (delta 125), reused 82 (delta 82), pack-reused 434 (from 3)[K
Receiving objects: 100% (592/592), 194.79 KiB | 1.46 MiB/s, done.
Resolving deltas: 100% (299/299), done.
Installing RAPIDS remaining 25.04 libraries
Using Python 3.11.12 environment at: /usr
Resolved 173 packages in 1.40s
Downloading cudf-cu12 (1.7MiB)
Downloading cugraph-cu12 (3.0MiB)
Downloading rmm-cu12 (1.5MiB)
Downloading libcuspatial-cu12 (31.1MiB)
Downloading ucx-py-cu12 (2.2MiB)
Downloading libcuvs-cu12 (1.1GiB)
Downloading pylibcudf-cu12 (26.4MiB)
Downloading librmm-cu12 (2.9MiB)
Downloading dask (1.3MiB)
Downloading bokeh (6.6MiB)
Downloading shapely (2.4MiB)
Downloading datashader (17.5MiB)
Downloading libcudf-cu12 (538.8MiB)
Downloading libcugraph-cu12 (1.4GiB)
Downloading cuspatial-cu12 (4.1MiB)
Downloading raft-

In [None]:
import cuvs
import faiss

# Record the installed library versions so the results below can be tied to
# a specific cuVS / FAISS build.
cuvs_version, faiss_cpu_version = cuvs.__version__, faiss.__version__

print(
    f"cuVS version: {cuvs_version}",
    f"FAISS CPU version: {faiss_cpu_version}",
    sep="\n",
)

cuVS version: 25.02.01
FAISS CPU version: 1.11.0


# 데이터 생성

In [None]:
import numpy as np
import cupy as cp

# Dataset geometry: N_SAMPLES rows; each of the two stacked blocks contributes
# HALF_DIM columns, so every vector has 2 * HALF_DIM dimensions.
N_SAMPLES = 50_000
HALF_DIM = 25

# 1. Fix the NumPy seed so the generated data is reproducible
np.random.seed(42)

# 2. Build the dataset: horizontally stack two blocks drawn from different
#    distributions so the columns differ widely in magnitude
dataset_np = np.hstack([
    np.random.random((N_SAMPLES, HALF_DIM)),    # uniform on [0, 1)
    np.random.randn(N_SAMPLES, HALF_DIM) * 5,   # normal, mean 0, std 5
]).astype(np.float32)

# 3. Convert the NumPy array into a CuPy array for the GPU (cuVS) cells
dataset_cp = cp.asarray(dataset_np)

# 4. Number of queries (the first sample(s) of the dataset are used as queries).
#    query_np is a basic slice of dataset_np, i.e. a NumPy view rather than a
#    copy, so in-place changes to dataset_np are visible through query_np.
n_queries = 1
query_np = dataset_np[:n_queries]
query_cp = dataset_cp[:n_queries]

# 5. Show a small part of the dataset and the query vector
print("데이터셋 일부:\n", dataset_np[:2], end='\n\n')  # first 2 samples only
print("쿼리 벡터:\n", query_np)

데이터셋 일부:
 [[  0.37454012   0.9507143    0.7319939    0.5986585    0.15601864
    0.15599452   0.05808361   0.8661761    0.601115     0.7080726
    0.02058449   0.96990985   0.83244264   0.21233912   0.18182497
    0.1834045    0.30424225   0.52475643   0.43194503   0.29122913
    0.6118529    0.13949387   0.29214466   0.36636186   0.45606998
   -6.9410195   -0.7064649   -2.3158252    8.927271    -0.20885432
   -5.3248634   -2.2789273   -1.2751076    2.4537957    4.761805
    5.567778    -4.7681065    0.42192096   9.123224    -2.6262207
    0.5368253   -8.330255     1.1769367   -2.1636157    0.6136664
    1.3033437   -1.3473666    1.8423494    4.220154    11.622142  ]
 [  0.785176     0.19967379   0.5142344    0.59241456   0.04645041
    0.60754484   0.17052412   0.06505159   0.94888556   0.965632
    0.80839735   0.30461377   0.09767211   0.684233     0.4401525
    0.12203824   0.4951769    0.03438852   0.9093204    0.25877997
    0.66252226   0.31171107   0.52006805   0.54671025   0.1

# FAISS Brute-force

In [None]:
%%time

import numpy as np
import faiss

# 1. 파라미터 설정 : Top-k
k = 10

# 2. 코사인 유사도 계산에 필요한 L2 정규화 적용
faiss.normalize_L2(dataset_np)
faiss.normalize_L2(query_np)

# 3. L2 정규화된 데이터를 바탕으로 Brute-force 인덱스 생성
index = faiss.IndexFlatIP(len(dataset_np[0]))

# 4. 인덱스에 벡터 추가
index.add(dataset_np)

# 5. 벡터 검색
distances, neighbors = index.search(query_np, k)

# 6. 검색 결과 출력
print("Top-k 인덱스:", neighbors[0])
print("유사도 점수 (cosine):", distances[0])

Top-k 인덱스: [    0 48906 22098 20005 43820 21295 17192  6777 14367  7147]
유사도 점수 (cosine): [1.         0.7389719  0.6744807  0.6726349  0.67114633 0.6699053
 0.6660687  0.65731394 0.6533355  0.65039325]
CPU times: user 29.3 ms, sys: 5.03 ms, total: 34.3 ms
Wall time: 34.2 ms


# cuVS Brute-force

In [None]:
%%time
import cupy as cp
from cuvs.neighbors import brute_force

# 1. 파라미터 설정 : Top-k
k = 10

# 2. 코사인 유사도를 사용해서 Brute-force 인덱스 생성
index = brute_force.build(dataset_cp, metric="cosine")

# 3. 벡터 검색
distances, neighbors = brute_force.search(index, query_cp, k)

# 4. CuPy 배열을 Numpy 배열로 변경
distances = cp.asarray(distances)
neighbors = cp.asarray(neighbors)

# 5. 검색 결과 출력
print("Top-k 인덱스:", neighbors[0])
print("유사도 점수 (cosine):", distances[0])

Top-k 인덱스: [    0 48906 22098 20005 43820 21295 17192  6777 14367  7147]
유사도 점수 (cosine): [-1.1920929e-07  2.6102811e-01  3.2551938e-01  3.2736516e-01
  3.2885367e-01  3.3009470e-01  3.3393139e-01  3.4268612e-01
  3.4666449e-01  3.4960675e-01]
CPU times: user 170 ms, sys: 81.6 ms, total: 252 ms
Wall time: 619 ms
