# Use bulkInsert to test GPU index

## Env preparation

1. Docker installation: https://docs.docker.com/engine/install/ubuntu/
2. Install nvidia-docker2:

In [None]:
# Register NVIDIA's package signing key with apt.
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
# Build the repository tag from /etc/os-release: ID immediately followed by VERSION_ID.
distribution=$(. /etc/os-release; echo "$ID$VERSION_ID")
# Add the nvidia-docker package list for this distribution to apt's sources.
curl -s -L "https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list" | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt-get update
# -y answers the confirmation prompt automatically, so the install does not
# abort or stall when this cell runs without an interactive terminal.
sudo apt-get install -y nvidia-docker2
# Restart the Docker daemon after the package has been installed.
sudo systemctl restart docker.service

3. Install NVIDIA driver

In [None]:
# Install the headless NVIDIA 535 driver and its utilities, skipping recommended
# extras. apt-get with -y matches the install command used for nvidia-docker2 and
# runs without waiting for a confirmation prompt.
sudo apt-get install -y --no-install-recommends nvidia-headless-535 nvidia-utils-535

4. (Optional) Mount a high-performance disk for the test. All of the following operations should be run on a high-performance disk. On an AWS host (for example, a g4dn instance), the NVMe SSD has to be mounted manually:

In [None]:
# WARNING: mkfs erases everything on the target device. Check the lsblk output
# first and substitute your own device path if it is not /dev/nvme1n1.
lsblk # see device path
sudo mkfs -t ext4 /dev/nvme1n1
# -p: do not fail if /data already exists (e.g. when re-running these steps).
sudo mkdir -p /data
sudo mount /dev/nvme1n1 /data
sudo blkid /dev/nvme1n1 # get /dev/nvme1n1 UUID, e.g. dd04113f-deb6-42b0-a021-03110c119295
sudo vi /etc/fstab # add to the tail: UUID=<UUID from the previous command> /data ext4 defaults 1 2
# Work from the freshly mounted disk from here on.
cd /data

## Download milvus image

In [None]:
# Fetch the Milvus v2.4.0-rc.1 image from the registry ahead of time.
# NOTE(review): confirm this tag matches the image referenced in docker-compose.yml.
sudo docker image pull milvusdb/milvus:v2.4.0-rc.1

## Use docker compose to start the milvus service 

Save the following file as docker-compose.yml.

Then, from the directory that contains docker-compose.yml, start the service with:

In [None]:
# Bring up the services defined in ./docker-compose.yml in the background.
sudo docker compose up --detach

## Prepare data 

Note: Because we are working in /data, these steps may need sudo; in that case every pip command must also be run under sudo.
Set the dataset as an environment variable:

In [None]:
# Choose the benchmark dataset: "cohere" or "openai".
DATASET="cohere"
export DATASET

In [None]:
# Install the Python packages imported by the data-preparation cell,
# one pip invocation per package.
for package in polars numpy s3fs environs; do
    pip3 install "$package"
done

In [None]:
import polars as pl
import numpy as np
import os
import shutil
import s3fs
import environs
# Select the benchmark dataset. DATASET is looked up through environs after
# loading a local ".env" file; it falls back to "cohere" when unset.
env = environs.Env()
env.read_env(".env")
dataset_name = env.str("DATASET", "cohere")

# S3 prefix that holds the parquet files of each supported dataset.
s3_paths_by_dataset = {
    "cohere": "assets.zilliz.com/benchmark/cohere_medium_1m",
    "openai": "assets.zilliz.com/benchmark/openai_medium_500k",
}
# Fail fast, before touching the disk, instead of hitting a NameError on
# `s3_path` further down when DATASET holds an unexpected value.
if dataset_name not in s3_paths_by_dataset:
    raise ValueError(
        f"Unsupported DATASET {dataset_name!r}; "
        f"expected one of: {', '.join(sorted(s3_paths_by_dataset))}"
    )
s3_path = s3_paths_by_dataset[dataset_name]

# Local layout: downloaded parquet files and the converted .npy arrays live in
# per-dataset directories relative to the current working directory.
parquet_path = dataset_name + "_data/"
npy_path = dataset_name + "_npy_data/"

base_file = parquet_path + "shuffle_train.parquet"
query_file = parquet_path + "test.parquet"
output_base = npy_path + "base.npy"
output_id = npy_path + "id.npy"

# Start from empty directories. Only a missing directory is tolerated; any other
# failure (e.g. a permission error) is reported instead of being swallowed.
for directory in (parquet_path, npy_path):
    try:
        shutil.rmtree(directory)
    except FileNotFoundError:
        pass  # nothing to clean up on the first run
    os.mkdir(directory)

# Download every object listed under the dataset prefix, using anonymous access.
fs = s3fs.S3FileSystem(anon=True, client_kwargs={"region_name": "us-west-2"})
dataset_info = fs.ls(s3_path, detail=True)
downloads = [info['Key'] for info in dataset_info]
print("download files:", downloads)
fs.download(downloads, parquet_path)

# Convert the training split into dense arrays: float32 embeddings and int64 ids.
df_train = pl.read_parquet(base_file)
base = np.stack(df_train['emb']).astype(np.float32)
# NOTE: `id` shadows the builtin of the same name; the name is kept because
# later cells may read it.
id = np.stack(df_train['id']).astype(np.int64)
# Scale every row to unit L2 norm. A row whose norm is zero would yield
# non-finite values here; the data is assumed not to contain such rows.
all_embeddings = base / np.linalg.norm(base, axis=1, keepdims=True)
np.save(output_base, all_embeddings)
np.save(output_id, id)