## PRAKTIKUM 4

In [None]:
import numpy as np
import time
from annoy import AnnoyIndex
import faiss
import hnswlib

# ===============================
# 1. Build a dataset of 1 million 5-D points
# ===============================
n_data = 1_000_000   # try 100_000 first if RAM is limited
dim = 5
k = 10  # number of nearest neighbours requested from every index below

# Uniform random samples drawn from [0, 1), then cast to float32.
X = np.random.random(size=(n_data, dim)).astype(np.float32)

# Optional (recommended for cosine similarity): scale every row to unit length.
# X = X / np.linalg.norm(X, axis=1)[:, np.newaxis]


# A single query point with the same dimensionality and dtype as the data.
query = np.random.random(size=(1, dim)).astype(np.float32)
# Optional: normalise the query as well for cosine similarity.
# query = query / np.linalg.norm(query)

# ===============================
# 2. Annoy
# ===============================
# Benchmark Annoy with the Euclidean metric. The build phase covers adding
# every row of X plus building the forest; the query phase is one
# k-nearest-neighbour lookup for the query vector.
print("=== Annoy (Euclidean Distance) ===")
ann_index_euclidean = AnnoyIndex(dim, 'euclidean')

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() follows the wall clock, can jump, and may be
# too coarse for a sub-millisecond query.
start = time.perf_counter()
for i, vector in enumerate(X):
    ann_index_euclidean.add_item(i, vector)
ann_index_euclidean.build(10)  # 10 trees
build_time = time.perf_counter() - start

start = time.perf_counter()
# With include_distances=True the result is an (ids, distances) pair.
neighbors = ann_index_euclidean.get_nns_by_vector(query[0], k, include_distances=True)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors (first 5):", neighbors[0][:5])
print("Distances (first 5):", neighbors[1][:5])

# Benchmark Annoy with the angular metric on the same data and query.
# The build phase covers adding every row of X plus building the forest.
print("\n=== Annoy (Angular Distance) ===")
ann_index_angular = AnnoyIndex(dim, 'angular')

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() follows the wall clock and may be too coarse
# for a sub-millisecond query.
start = time.perf_counter()
for i, vector in enumerate(X):
    ann_index_angular.add_item(i, vector)
ann_index_angular.build(10)  # 10 trees
build_time = time.perf_counter() - start

start = time.perf_counter()
# With include_distances=True the result is an (ids, distances) pair.
neighbors = ann_index_angular.get_nns_by_vector(query[0], k, include_distances=True)
query_time = time.perf_counter() - start

print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors (first 5):", neighbors[0][:5])
print("Distances (first 5):", neighbors[1][:5])


# ===============================
# 3. FAISS
# ===============================
# Benchmark FAISS exact (brute-force) L2 search. "Build" here is only the
# time needed to add the vectors to the flat index.
print("\n=== FAISS (IndexFlatL2) ===")
faiss_index_l2 = faiss.IndexFlatL2(dim)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals, unlike the wall-clock time.time().
start = time.perf_counter()
faiss_index_l2.add(X)
build_time = time.perf_counter() - start

start = time.perf_counter()
# search() returns one row per query vector; query holds a single vector.
distances, indices = faiss_index_l2.search(query, k)
query_time = time.perf_counter() - start

# NOTE: IndexFlatL2 reports *squared* L2 distances, so the values printed
# below are not directly comparable with Annoy's Euclidean distances; take
# the square root to compare them.
print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors (first 5):", indices[0][:5])
print("Distances (first 5):", distances[0][:5])


# Benchmark FAISS exact inner-product search on the same data and query.
print("\n=== FAISS (IndexFlatIP) ===")
# For inner product (IP) it is often beneficial to normalize the data first;
# without normalization the score is affected by vector magnitude.
# Uncomment the normalization lines at the beginning to test with normalized data.
faiss_index_ip = faiss.IndexFlatIP(dim)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals, unlike the wall-clock time.time().
start = time.perf_counter()
faiss_index_ip.add(X)
build_time = time.perf_counter() - start

start = time.perf_counter()
# search() returns one row per query vector; query holds a single vector.
distances, indices = faiss_index_ip.search(query, k)
query_time = time.perf_counter() - start

# NOTE: for IndexFlatIP the values printed as "Distances" are inner-product
# scores (larger means more similar), not distances.
print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors (first 5):", indices[0][:5])
print("Distances (first 5):", distances[0][:5])


# ===============================
# 4. HNSW (hnswlib)
# ===============================
# Benchmark an hnswlib HNSW graph in the 'l2' space. The build phase covers
# index initialisation plus inserting every row of X.
print("\n=== HNSW (hnswlib - L2) ===")
hnsw_index_l2 = hnswlib.Index(space='l2', dim=dim)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals, unlike the wall-clock time.time().
start = time.perf_counter()
hnsw_index_l2.init_index(max_elements=n_data, ef_construction=200, M=16)
hnsw_index_l2.add_items(X)
build_time = time.perf_counter() - start

# Query-time ef is set outside both timed sections.
hnsw_index_l2.set_ef(50)

start = time.perf_counter()
# knn_query() returns one row per query vector; query holds a single vector.
labels, distances = hnsw_index_l2.knn_query(query, k=k)
query_time = time.perf_counter() - start

# NOTE: hnswlib's 'l2' space reports squared L2 distances.
print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors (first 5):", labels[0][:5])
print("Distances (first 5):", distances[0][:5])


# Benchmark an hnswlib HNSW graph in the 'ip' (inner product) space.
print("\n=== HNSW (hnswlib - IP) ===")
# For inner product (IP) it is often beneficial to normalize the data first;
# without normalization the result is affected by vector magnitude.
# Uncomment the normalization lines at the beginning to test with normalized data.
hnsw_index_ip = hnswlib.Index(space='ip', dim=dim)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals, unlike the wall-clock time.time().
start = time.perf_counter()
hnsw_index_ip.init_index(max_elements=n_data, ef_construction=200, M=16)
hnsw_index_ip.add_items(X)
build_time = time.perf_counter() - start

# Query-time ef is set outside both timed sections.
hnsw_index_ip.set_ef(50)

start = time.perf_counter()
# knn_query() returns one row per query vector; query holds a single vector.
labels, distances = hnsw_index_ip.knn_query(query, k=k)
query_time = time.perf_counter() - start

# NOTE: in the 'ip' space hnswlib reports 1 - (inner product) as the distance.
print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors (first 5):", labels[0][:5])
print("Distances (first 5):", distances[0][:5])


# Benchmark an hnswlib HNSW graph in the 'cosine' space.
print("\n=== HNSW (hnswlib - Cosine) ===")
# For cosine similarity, data should ideally be normalized to unit vectors.
# Uncomment the normalization lines at the beginning to test with normalized data.
hnsw_index_cosine = hnswlib.Index(space='cosine', dim=dim)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals, unlike the wall-clock time.time().
start = time.perf_counter()
hnsw_index_cosine.init_index(max_elements=n_data, ef_construction=200, M=16)
hnsw_index_cosine.add_items(X)
build_time = time.perf_counter() - start

# Query-time ef is set outside both timed sections.
hnsw_index_cosine.set_ef(50)

start = time.perf_counter()
# knn_query() returns one row per query vector; query holds a single vector.
labels, distances = hnsw_index_cosine.knn_query(query, k=k)
query_time = time.perf_counter() - start

# NOTE: in the 'cosine' space hnswlib reports 1 - (cosine similarity).
print("Build time:", build_time, "detik")
print("Query time:", query_time, "detik")
print("Neighbors (first 5):", labels[0][:5])
print("Distances (first 5):", distances[0][:5])

=== Annoy (Euclidean Distance) ===
Build time: 21.892763137817383 detik
Query time: 0.00019550323486328125 detik
Neighbors (first 5): [94543, 683318, 674044, 272414, 528476]
Distances (first 5): [0.05478391796350479, 0.06358018517494202, 0.06655634939670563, 0.06725229322910309, 0.07195629179477692]

=== Annoy (Angular Distance) ===
Build time: 27.20171618461609 detik
Query time: 0.0002951622009277344 detik
Neighbors (first 5): [102506, 428743, 216936, 342878, 140783]
Distances (first 5): [0.030187251046299934, 0.03625230863690376, 0.04427915811538696, 0.04495389387011528, 0.045594390481710434]

=== FAISS (IndexFlatL2) ===
Build time: 0.008498668670654297 detik
Query time: 0.006273746490478516 detik
Neighbors (first 5): [ 94543 683318 674044 272414 528476]
Distances (first 5): [0.00300128 0.00404244 0.00442975 0.00452287 0.00517771]

=== FAISS (IndexFlatIP) ===
Build time: 0.01523900032043457 detik
Query time: 0.006819486618041992 detik
Neighbors (first 5): [480322  13597 501074 137549