In [1]:
!pip install -U sentence_transformers
!/opt/bin/nvidia-smi

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m81.9/86.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m58.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[

In [2]:
import torch
# print(torch.__version__)
# print(torch.version.cuda)
# print(torch.backends.cudnn.version())
# print(torch.cuda.get_device_name(0))
# print(torch.cuda.is_available())

In [3]:
# # #Uninstall the current CUDA version
# !apt-get --purge remove cuda nvidia* libnvidia-*
# !dpkg -l | grep cuda- | awk '{print $2}' | xargs -n1 dpkg --purge
# !apt-get remove cuda-*
# !apt autoremove
# !apt-get update

# #Download CUDA 10.2
# !wget  --no-clobber https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-repo-ubuntu1804_10.2.89-1_amd64.deb
# #install CUDA kit dpkg
# !yes | dpkg -i cuda-repo-ubuntu1804_10.2.89-1_amd64.deb
# !sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
# !apt-get update
# !apt-get install cuda-10-2

In [4]:
!nvcc --version
!cat /usr/local/cuda/version.txt
!cat /usr/include/x86_64-linux-gnu/cudnn_v*.h | grep CUDNN_MAJOR -A 2

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0
cat: /usr/local/cuda/version.txt: No such file or directory
#define CUDNN_MAJOR 8
#define CUDNN_MINOR 9
#define CUDNN_PATCHLEVEL 0
--
#define CUDNN_VERSION (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)

/* cannot use constexpr here since this is a C-only file */


In [5]:
# Mount Google Drive at /mntDrive; the data paths configured later in the
# notebook live under this mount point.
from google.colab import drive
drive.mount('/mntDrive')
# SentenceTransformer loads the embedding models used by get_passage_embds
# and query_embs.
from sentence_transformers import SentenceTransformer
import os
import json
import numpy as np

Mounted at /mntDrive


In [6]:
def get_passage_embds(model, date, passages_dir_prefix, para_embs_dir_prefix, batch_size=1):
    """Embed every passage file of a passage set and store the vectors as JSON.

    For each regular file in the passage directory, the passage texts (and the
    composite texts, when present) are encoded with the given sentence
    embedding model and written to a file of the same name in the embeddings
    directory. Files that already have an output are skipped, so the function
    can be re-run to resume an interrupted job.

    Args:
        model: Model identifier passed to ``SentenceTransformer``. The part
            after the last ``/`` is used as the model name in output paths.
        date: Value substituted into both path templates.
        passages_dir_prefix: ``str.format`` template for the input directory;
            ``{0}`` is replaced by ``date``.
        para_embs_dir_prefix: ``str.format`` template for the output
            directory; ``{0}`` is the model name and ``{1}`` is ``date``.
        batch_size: Number of texts per forward pass. The default of 1 matches
            the previous one-text-per-call behaviour; raise it for speed if
            the device has enough memory.

    Each input file is JSON with a ``"passages_text"`` list of objects holding
    a ``"ptxt"`` string and, optionally, a parallel ``"passages_composite_text"``
    list of objects holding a ``"pctxt"`` string. Each output file is JSON of
    the form ``{"ptxt_embs": [...], "pctxt_embs": [...]}``.
    """
    model_name = model.split('/')[-1]
    passage_dir = passages_dir_prefix.format(date)
    para_embs_dir = para_embs_dir_prefix.format(model_name, date)

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    encoder = SentenceTransformer(model, device=device)
    total_params = sum(p.numel() for p in encoder.parameters())
    print(model_name, ": ", total_params)

    # Create the output directory once instead of re-checking per document.
    os.makedirs(para_embs_dir, exist_ok=True)

    def _encode(texts):
        # Skip the model call entirely when there is nothing to embed.
        if not texts:
            return []
        return encoder.encode(texts, batch_size=batch_size).tolist()

    for doc in os.listdir(passage_dir):
        doc_fp = os.path.join(passage_dir, doc)
        # Sub-directories (e.g. notebook checkpoint folders) are not passage
        # files; trying to open them would abort the whole run.
        if not os.path.isfile(doc_fp):
            continue
        print(doc)

        save_fp = os.path.join(para_embs_dir, doc)
        if os.path.exists(save_fp):
            continue

        with open(doc_fp, "r") as f:
            passages = json.load(f)

        has_composite = "passages_composite_text" in passages
        ptxt_list = []
        pctxt_list = []
        for pid in range(len(passages["passages_text"])):
            ptxt_list.append(passages["passages_text"][pid]["ptxt"])
            if has_composite:
                pctxt_list.append(passages["passages_composite_text"][pid]["pctxt"])

        result = {"ptxt_embs": _encode(ptxt_list), "pctxt_embs": _encode(pctxt_list)}

        # Write to a temporary name and rename afterwards: an interrupted run
        # must not leave a truncated file that the existence check above would
        # treat as finished on the next run.
        tmp_fp = save_fp + ".tmp"
        with open(tmp_fp, "w") as f:
            json.dump(result, f)
        os.replace(tmp_fp, save_fp)

In [7]:
def query_embs(model, date, groundtruth_fp, query_emb_prefix, batch_size=1):
    """Attach an embedding to every ground-truth query and save the result.

    Loads the ground-truth JSON, encodes the ``"query"`` string of every entry
    in each document's ``"questions"`` list, stores the vector under a new
    ``"qemb"`` key on that entry, and writes the augmented structure to the
    output file.

    Args:
        model: Model identifier passed to ``SentenceTransformer``. The part
            after the last ``/`` is used as the model name in the output path.
        date: Value substituted into the output path template.
        groundtruth_fp: Path of the ground-truth JSON file (a list of
            documents, each with a ``"questions"`` list).
        query_emb_prefix: ``str.format`` template for the output file;
            ``{0}`` is the model name and ``{1}`` is ``date``.
        batch_size: Number of queries per forward pass. The default of 1
            matches the previous one-query-per-call behaviour.
    """
    model_name = model.split('/')[-1]
    out_fp = query_emb_prefix.format(model_name, date)

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    encoder = SentenceTransformer(model, device=device)

    # Read the file fully and close it before the (slow) encoding starts.
    with open(groundtruth_fp, "r") as f:
        groundtruth = json.load(f)

    questions = [question for doc in groundtruth for question in doc["questions"]]
    if questions:
        vectors = encoder.encode(
            [question["query"] for question in questions], batch_size=batch_size
        ).tolist()
        for question, vector in zip(questions, vectors):
            question["qemb"] = vector

    # The per-model output folder may not exist yet when this function is run
    # before any passage embeddings were produced for the model.
    out_dir = os.path.dirname(out_fp)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(out_fp, "w") as f:
        json.dump(groundtruth, f)

In [8]:
# model1 = 'sentence-transformers/sentence-t5-xl'
# model2 = "sentence-transformers/all-MiniLM-L6-v2"
# model3 = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# model4 = "hkunlp/instructor-base"

# from sentence_transformers import SentenceTransformer
# device = "cuda:0" if torch.cuda.is_available() else "cpu"

# models = [model1, model2, model3]
# for model_name in models:
#   model = SentenceTransformer(model_name, device=device)
#   total_params = sum(p.numel() for p in model.parameters())
#   print(model_name, ": ", total_params)

In [None]:
# Project folder on the mounted Drive; every path below is derived from it.
data_p = '/mntDrive/MyDrive/Colab Notebooks/'
groundtruth_root = data_p + "data/groundtruth/"

# Tag that fills the date slot of the path templates.
date = "0708"

# Passage variant to embed. In the passage template {0} is the date; in the
# embedding template {0} is the model name and {1} is the date.
# Other variants, kept for reference (input dir -> output dir):
#   passages_{0}             -> {0}/para_embs_{1}
#   passages_proc_table_{0}  -> {0}/para_proc_table_embs_{1}
#   passages_chunk_{0}       -> {0}/para_embs_chunk_{1}
passages_dir_prefix = groundtruth_root + "passages_table2text_tid_{0}"
para_embs_dir_prefix = groundtruth_root + "{0}/para_embs_table2text_{1}"

# Ground-truth questions and the template for the query-embedding output file.
groundtruth_fp = groundtruth_root + "qa_groundtruth.json"
query_emb_prefix = groundtruth_root + "{0}/query_emb_{1}.json"

# Candidate embedding models.
model1, model2, model3, model4, model5 = (
    'sentence-transformers/sentence-t5-xl',
    "sentence-transformers/all-MiniLM-L6-v2",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "hkunlp/instructor-base",
    'sentence-transformers/sentence-t5-xxl',
)

# Only model5 is processed here; to embed with model1 or model2 instead,
# point selected_model at that name.
selected_model = model5

get_passage_embds(selected_model, date, passages_dir_prefix, para_embs_dir_prefix)
query_embs(selected_model, date, groundtruth_fp, query_emb_prefix)

Downloading (…)55764/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

Downloading (…)8828555764/README.md:   0%|          | 0.00/2.01k [00:00<?, ?B/s]

Downloading (…)28555764/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/9.73G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)55764/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading (…)8555764/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]