In [1]:
import os
import sys
# Step up one directory (import path and working directory) exactly once:
# the "../" sentinel at the end of sys.path marks that this cell already ran,
# so re-executing it does not climb another level.
already_at_repo_root = sys.path[-1] == "../"
if not already_at_repo_root:
    sys.path.append("../")
    os.chdir("../")

import numpy as np
import pandas as pd
from IPython.display import display
from random import sample
from transformers import AutoModel, AutoTokenizer

import torch
from utils.util import *
from utils.index import *
from utils.data import *

from hydra import initialize, compose

# Overrides applied on top of the "_example" hydra config; the commented
# entries are alternatives that can be switched on by hand.
overrides = [
    "base=NQ320k",
    # "base=MS300k",
    # "++plm=t5",
]

config = Config()
with initialize(version_base=None, config_path="../data/config/"):
    hydra_config = compose(config_name="_example", overrides=overrides)
    config._from_hydra(hydra_config)

# prepare_data returns a mapping that holds (at least) a "text" and a "query" loader.
loaders = prepare_data(config)

loader_text, loader_query = loaders["text"], loaders["query"]
text_dataset, query_dataset = loader_text.dataset, loader_query.dataset

# tokenizer = AutoTokenizer.from_pretrained(os.path.join(config.plm_root, config.plm_tokenizer))

[2023-08-18 02:58:54,609] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


[2023-08-18 02:58:58,699] INFO (Config) setting seed to 42...
[2023-08-18 02:58:58,711] INFO (Config) setting PLM to t5...
[2023-08-18 02:58:58,954] INFO (Config) Config: {'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'batch_size': 2, 'bf16': False, 'cache_root': 'data/cache/NQ320k', 'data_format': 'memmap', 'data_root': '/share/peitian/Data/AutoTSG', 'dataset': 'NQ320k', 'debug': False, 'deepspeed': None, 'device': 0, 'distill_src': 'none', 'early_stop_patience': 5, 'enable_all_gather': True, 'enable_distill': False, 'enable_inbatch_negative': True, 'epoch': 20, 'eval_batch_size': 2, 'eval_delay': 0, 'eval_flops': False, 'eval_metric': ['mrr', 'recall'], 'eval_metric_cutoff': [1, 5, 10, 100, 1000], 'eval_mode': 'retrieve', 'eval_posting_length': False, 'eval_set': 'dev', 'eval_step': '1e', 'fp16': False, 'grad_accum_step': 1, 'hits': 1000, 'index_shard': 32, 'index_thread': 10, 'index_type': 'invvec', 'learning_rate': 3e-06, 'load_ckpt': None, 'load_encode': False, 'l

In [2]:
# load terms
# code_type = "words_title_comma_plus_stem"
code_type = "words_comma_plus_stem"
code_tokenizer = "t5"
# for NQ320k
code_length = 26
# for MS300k
# code_length = 34
# code_length = 42
# code_length = 66

tokenizer = AutoTokenizer.from_pretrained(os.path.join(config.plm_root, code_tokenizer))

# The file is a flat int32 buffer; it is reshaped to one row per document in
# `text_dataset` and copied so that `text_codes` is an independent in-memory
# array rather than a view of the read-only map.
codes_path = f"data/cache/{config.dataset}/codes/{code_type}/{code_tokenizer}/{code_length}/codes.mmp"
text_codes = (
    np.memmap(codes_path, dtype=np.int32, mode="r")
    .reshape(len(text_dataset), -1)
    .copy()
)

In [24]:
# Disk-backed output buffer with the same shape as `text_codes`.
# mode="w+" creates "codes.mmp" in the current working directory, overwriting
# any existing file of that name.
new_codes = np.memmap("codes.mmp", dtype=np.int32, mode="w+", shape=text_codes.shape)

In [12]:
# Sanity check: shuffling a *copy* of a slice must leave the source array intact.
a = np.arange(1, 4)
b = np.copy(a[1:])
np.random.shuffle(b)
a, b

(array([1, 2, 3]), array([3, 2]))

In [30]:
import random  # the first cell only does `from random import sample`

start_shuffle_pos = 0


def _shuffle_code_words(code, start=0, sep_token_id=6, pad_token_id=-1):
    """Return a copy of one padded code row with its words shuffled.

    The row is read as ``[first, word, word, ..., last, pad, pad, ...]`` where
    every word is a run of token ids that ends with ``sep_token_id``.

    Args:
        code: 1-D integer array holding one row of ``text_codes``.
        start: index of the first word that may move; words before it keep
            their position.
        sep_token_id: token id that terminates a word.
        pad_token_id: filler id; it is stripped before shuffling and appended
            again so the result has the same length as ``code``.

    Returns:
        A new 1-D array with the dtype and length of ``code``.
    """
    tokens = code[code != pad_token_id].tolist()
    # The first and last tokens are pinned. With fewer than two tokens there is
    # no distinct last token, so `last` stays empty instead of repeating `first`.
    first, inner, last = tokens[:1], tokens[1:-1], tokens[1:][-1:]

    words, word = [], []
    for token in inner:
        word.append(token)
        if token == sep_token_id:
            words.append(word)
            word = []
    # `word` now holds any tokens after the final separator. They stay in place
    # at the end: dropping them would lose data, and moving them would glue them
    # onto the following word.

    # Shuffle the movable tail and write it back. Shuffling `words[start:]`
    # directly would only reorder a temporary copy and leave `words` unchanged.
    movable = words[start:]
    random.shuffle(movable)
    words[start:] = movable

    shuffled = first + [token for w in words for token in w] + word + last
    shuffled += [pad_token_id] * (len(code) - len(shuffled))
    return np.array(shuffled, dtype=code.dtype)


for i, code in enumerate(text_codes):
    new_codes[i] = _shuffle_code_words(code, start=start_shuffle_pos)

# Push the buffered rows to the backing file now instead of relying on
# garbage collection of the memmap.
new_codes.flush()

In [31]:
# Show the first ten original code rows next to the corresponding rows of `new_codes`.
text_codes[:10], new_codes[:10]

(array([[    0,   791,     6,  4842,     6,  1070,     6,  1622,     6,
          4175,     6,   884,     6,  5657,     6,  7387,     6,  1730,
             6,  3882,     6,   367,     6, 11947,     6,     1],
        [    0,  2039,     6,     3,  1054,     6, 18598,     6,  1736,
             6,   942,     6,     3,  6471,    63,     6,     3,    51,
            75,  1018, 10361,     6,  3912,     6,     1,    -1],
        [    0,     3,  4339,    51,     6, 20617,   257,     6,     3,
          4339,  3357, 20260,   106,     6, 11614,    32,     6,     3,
          4339,  3357, 20260,     9,     6,  6182,     6,     1],
        [    0, 20134,     6,     3,    29,    89,    40,     6,  9204,
             6,  1415,     6,  3858,    26,    63,     6,  3370,     6,
           158, 21220,     6,     3,  2434,    53,     6,     1],
        [    0,     3,    52,    32,   152,  1825,    15,     6,  6718,
            63,     6,  1513,     6, 19803,     6,   649,    15,     6,
             3, 

In [None]:
# Decode the first ten codes and, for comparison, the first 100 token ids of
# the matching documents.
indices = range(10)
text_code = text_codes[indices]
padding_mask = text_code == -1
# -1 is the filler value in the code rows; it is replaced by 0 before decoding
# (presumably because the tokenizer cannot decode negative ids).
text_code[padding_mask] = 0
display(tokenizer.batch_decode(text_code))

document_prefix_ids = np.array(text_dataset[indices]["text"]["input_ids"])[:, :100]
display(tokenizer.batch_decode(document_prefix_ids))

In [None]:
# trie = TrieIndex(save_dir=f"data/cache/{config.dataset}/codes/{code_type}/{code_tokenizer}/{code_length}")
# trie.load()

# wordset = WordSetIndex(save_dir=f"data/cache/{config.dataset}/codes/{code_type}/{code_tokenizer}/{code_length}", sep_token_id=6)
# wordset.fit(None)

# text_codes = np.sort(text_codes, axis=-1)
df = pd.DataFrame(text_codes)

# One row per distinct code plus a "size" column with the number of documents
# sharing it, most frequent first.
duplicates = (
    df.groupby(df.columns.tolist(), as_index=False)
    .size()
    .sort_values("size", ascending=False)
    .reset_index(drop=True)
)

# `dup` flags every row whose code already appeared in an earlier row.
dup = df.duplicated(keep="first").to_numpy()
dup_indices = np.argwhere(dup)[:, 0]

# (rows that repeat an earlier code, rows whose code is shared by more than one document)
group_sizes = duplicates["size"]
len(dup_indices), group_sizes[group_sizes > 1].sum()

In [None]:
train_positives = load_pickle("/share/peitian/Code/Adon/src/data/cache/MSMARCO-passage/dataset/query/train/positives.pkl")
dev_positives = load_pickle("/share/peitian/Code/Adon/src/data/cache/MSMARCO-passage/dataset/query/dev/positives.pkl")

# Invert both mappings: first positive document of each entry -> keys that point to it.
all_positives = defaultdict(list)
for positives_by_query in (train_positives, dev_positives):
    for query_key, positive_docs in positives_by_query.items():
        all_positives[positive_docs[0]].append(query_key)

# Documents that appear as the first positive of at least one train / dev entry.
train_docs = {positive_docs[0] for positive_docs in train_positives.values()}
dev_docs = {positive_docs[0] for positive_docs in dev_positives.values()}

In [None]:
# Inspect the documents that share the idx-th most frequent code.
idx = 1
# Last column of `duplicates` is the group size; everything before it is the code itself.
target_code = duplicates.iloc[idx].to_numpy()[:-1]
row_matches = (text_codes == target_code).all(-1)
most_dup_idx = np.argwhere(row_matches)[:, 0]

most_dup_code = text_codes[most_dup_idx]
most_dup_code[most_dup_code == -1] = 0
most_dup_text = np.array(text_dataset[most_dup_idx]["text"]["input_ids"])#[:, :code_length + 5]

tokenizer.batch_decode(most_dup_code), tokenizer.batch_decode(most_dup_text)

In [None]:
# Decode one training query together with its first positive document.
train_query_dataset = QueryDataset(config, "train")
qidx = 337798
tidx = train_positives[qidx][0]

positive_text_ids = text_dataset[tidx]["text"]["input_ids"]
query_ids = train_query_dataset[qidx]["query"]["input_ids"]
tokenizer.decode(positive_text_ids), tokenizer.decode(query_ids)

In [None]:
# Count the dev-set positive documents whose code repeats an earlier document's
# code (`dup` is the boolean per-document flag array built from `df.duplicated`).
positives = load_pickle("/share/peitian/Code/Adon/src/data/cache/MSMARCO-passage/dataset/query/dev/positives.pkl")
pos_docs = [v[0] for v in positives.values()]
# np.flatnonzero always returns a 1-D index array. The previous
# np.argwhere(...).squeeze() collapsed to a 0-d array when exactly one document
# matched, which made len() raise TypeError.
dup_docs = np.flatnonzero(dup[pos_docs])
len(dup_docs)

In [2]:
# create new query set based on an existing one

dataset = "MSMARCO-passage"
ori_query_set = "doct5"
query_set = "doct5-1"

# Use the cached query-id index when it exists; otherwise map every query id
# to its line number in the original query TSV.
try:
    qid2idx = load_pickle(f"data/cache/{dataset}/dataset/query/{ori_query_set}/id2index.pkl")
except FileNotFoundError:
    with open(f"{config.data_root}/{dataset}/queries.{ori_query_set}.tsv") as f:
        qid2idx = {
            line.split("\t")[0]: qidx
            for qidx, line in enumerate(tqdm(f, desc="Collecting qid2idx"))
        }

tid2idx = load_pickle(f"data/cache/{dataset}/dataset/text/id2index.pkl")

qindices = []
tid2qrels = defaultdict(list)
# maximum number of qrel lines kept per document
k = 1

# NOTE: `train_positives` is rebound from the loaded mapping to the set of its
# first positive documents.
train_positives = load_pickle(f"data/cache/{dataset}/dataset/query/train/positives.pkl")
train_positives = {x[0] for x in train_positives.values()}
miss_docs = set(range(len(text_dataset))) - train_positives
print(f"number of documents missing in training set: {len(miss_docs)}")

with open(f"{config.data_root}/{dataset}/qrels.{ori_query_set}.tsv") as ori_qrel_file, \
        open(f"{config.data_root}/{dataset}/qrels.{query_set}.tsv", "w") as qrel_file, \
        open(f"{config.data_root}/{dataset}/queries.{ori_query_set}.tsv") as ori_query_file, \
        open(f"{config.data_root}/{dataset}/queries.{query_set}.tsv", "w") as query_file:
    for line in ori_qrel_file:
        qid, _, tid, _ = line.strip().split("\t")
        # looked up before the limit check, so an unknown qid always raises
        qidx = qid2idx[qid]

        # filter out the existing ones
        # tidx = tid2idx[tid]
        # if tidx in miss_docs:
        #     tid2qrels[tid].append(line)
        #     qindices.append(qidx)

        # keep the first k elements
        kept_qrels = tid2qrels[tid]
        if len(kept_qrels) < k:
            kept_qrels.append(line)
            qindices.append(qidx)

    # Copy over only the query lines whose line number was selected above.
    qindices = set(qindices)
    query_file.writelines(
        line for i, line in enumerate(ori_query_file) if i in qindices
    )

    for qrels in tid2qrels.values():
        qrel_file.writelines(qrels)

number of documents missing in training set: 8353661


In [None]:
from transformers import AutoModel, AutoTokenizer

# model_name_or_path = "facebook/contriever-msmarco"
model_name_or_path = "sentence-transformers/gtr-t5-base"
# save_path = "/share/peitian/Data/AutoTSG/PLM/contriever"
save_path = "/share/peitian/Data/AutoTSG/PLM/gtr"

# Both objects are read from the local cache only (no download attempt) and
# then written side by side into `save_path`.
local_cache_kwargs = dict(cache_dir="/share/LMs", local_files_only=True)
model = AutoModel.from_pretrained(model_name_or_path, **local_cache_kwargs)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, **local_cache_kwargs)

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)