In [1]:
import os
import json
import nltk
import torch
import re
import sys
from tqdm import tqdm

os.chdir('/home/s2310409/workspace/coliee-2024/')

# Using summarization from the dataset

In [2]:
# summary_dict = {}

# def get_summary(txt):
#     if "Summary:" in txt and "no summary" not in txt and "for this document are in preparation." not in txt:
#         idx = txt.find("Summary:")
#         end = txt.find("- Topic", idx)
#         end2 = txt.rfind("\n", idx, end)
#         summ = txt[idx+8:end2].strip()
#         return summ
#     else:
#         return ""

# n_no_summ = 0
# for file in os.listdir("dataset-2024/task1_test_files_2024"):
#     if not file.startswith('.'):
#         with open(f"dataset-2024/task1_test_files_2024/{file}", 'r') as fp:
#             content = fp.read()
#             summ = get_summary(content)
#             summary_dict[file] = {
#                 "raw_content": content,
#                 "summary": summ
#             }
#             if summ == "":
#                 n_no_summ += 1

# for file in os.listdir("dataset-2024/task1_train_files_2024"):
#     if not file.startswith('.'):
#         with open(f"dataset-2024/task1_train_files_2024/{file}", 'r') as fp:
#             content = fp.read()
#             summ = get_summary(content)
#             summary_dict[file] = {
#                 "raw_content": content,
#                 "summary": summ
#             }
#             if summ == "":
#                 n_no_summ += 1
# summary_dict

# Using LLMs to summarize

In [30]:
def chunking(sentences, window_size=5):
    chunks = []
    for i in range(0, len(sentences) - window_size):
        chunks.append("\n".join(sentences[i:i+window_size]))
    return chunks

with open('dataset/all_data.json') as f:
    all_data_dict = json.load(f)

word_tokenizer = nltk.tokenize.WordPunctTokenizer()
file_list = [f for f in os.listdir('dataset/processed') if f.endswith('.txt')]
# file_list = sorted(list(all_data_dict.keys()))

# >>>>>>>>>>>> CHANGE THE FILE LIST HERE <<<<<<<<<<<<<<<
# file_list = [f for f in os.listdir('/home/s2310409/workspace/coliee-2024/dataset-2023/task1/test_files') if f.endswith('.txt')]
# file_list = [f for f in file_list if f in all_data_dict.keys()]
file_list = sorted(file_list)

processed_file_dict = {}
for file in [f for f in os.listdir('dataset/processed') if f.endswith('.txt')]:
    processed_file = f"dataset/processed/{file}"
    with open(processed_file, 'r') as fp:
        processed_document = fp.read()
        processed_file_dict[file] = {
            'sentences': processed_document.split('\n\n'),
            'processed_document': processed_document
        }

chunk_dict = {}
for file in file_list:
    chunks = chunking(processed_file_dict[file]['sentences'])
    chunk_dict[file] = chunks
    # for i, chunk in enumerate(chunks):
    #     if len(chunk) > 0:
    #         chunk_dict[f"{file}_{i}"] = chunk

In [4]:
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers import GenerationConfig


model_name='google/flan-t5-xl'

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).cuda()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [31]:
def chunking_for_llm(sentences, tokenizer, max_length=512):
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        next_chunk = current_chunk + ("\n" + sentence if current_chunk != "" else sentence)
        if len(tokenizer.encode(next_chunk)) > max_length:
            chunks.append(current_chunk)
            current_chunk = sentence
        else:
            current_chunk = next_chunk
    if current_chunk != "":
        chunks.append(current_chunk)
    return chunks

chunk_dict = {}
for file in tqdm(file_list):
    chunks = chunking_for_llm(processed_file_dict[file]['sentences'], tokenizer=tokenizer, max_length=512)
    chunk_dict[file] = chunks

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7351/7351 [06:05<00:00, 20.10it/s]


In [6]:
def prompt(chunk):
    return f"Summarize following document. \n\n {chunk} \n\n Summary:"

def prompt_batch(chunks):
    return [prompt(chunk) for chunk in chunks]

# for file in tqdm(file_list):
#     chunks = chunk_dict[file]
#     summarized_chunks = []
#     for chunk in chunks:
#         input_ids = tokenizer.encode(chunk, return_tensors="pt").cuda()
#         outputs = model.generate(input_ids)
#         summarized_chunks.append(tokenizer.decode(outputs[0]))
#     summary = "\n".join(summarized_chunks)

# model.eval()
# file_example = file_list[1]
# chunks = chunk_dict[file_example]
# summarized_chunks = []
# for chunk in chunks:
#     input_ids = tokenizer.encode(prompt(chunk), return_tensors="pt").to('cuda')
#     with torch.no_grad():
#         outputs = model.generate(input_ids, 
#                                  max_length=300, 
#                                  num_beams=5, 
#                                  early_stopping=True, 
#                                  repetition_penalty=2.0,
#                                  no_repeat_ngram_size=3)
#         outputs = outputs.detach().cpu()
#     summarized_chunk = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     summarized_chunks.append(summarized_chunk)
# summary = "\n".join(summarized_chunks)
# summary

In [32]:
@torch.no_grad()
def get_summary(model, tokenizer, chunk):
    input_ids = tokenizer.encode(prompt(chunk), return_tensors="pt", truncation=True, max_length=512).to('cuda')
    with torch.no_grad():
        outputs = model.generate(input_ids, 
                                 max_length=300, 
                                 num_beams=5, 
                                 early_stopping=True, 
                                 repetition_penalty=2.0,
                                 no_repeat_ngram_size=3)
        outputs = outputs.detach().cpu()
    summarized_chunk = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summarized_chunk

In [33]:
"075634.txt" in file_list

True

In [34]:
model.eval()
summary_dict = {}

for file in tqdm(file_list):
    if os.path.exists(f"dataset/summarized/{file}"):
        continue
    
    chunks = chunk_dict[file]
    summarized_chunks = []
    for chunk in chunks:
        summarized_chunk = get_summary(model, tokenizer, chunk)
        summarized_chunks.append(summarized_chunk)

    # input_ids = tokenizer(prompt_batch(chunks), return_tensors="pt", padding=True).to('cuda')
    # with torch.no_grad():
    #     outputs = model.generate(**input_ids, 
    #                             max_length=300,
    #                             num_beams=5, 
    #                             early_stopping=True, 
    #                             repetition_penalty=2.0,
    #                             no_repeat_ngram_size=10)
    # summarized_chunks = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    summary = "\n".join(summarized_chunks)
    with open(f"dataset/summarized/{file}", 'w') as fp:
        fp.write(summary)
    summary_dict[file] = summary

 14%|██████████████▍                                                                                          | 1009/7351 [00:00<00:00, 10085.68it/s]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 7351/7351 [00:04<00:00, 1709.01it/s]


In [13]:
with open('dataset/summary_dict.json', 'w') as fp:
    json.dump(summary_dict, fp)

In [23]:
tokenizer.decode(tokenizer.encode(f"##Query: {'Hi'} ##Document: {'Hi'} ##Relevant:"), skip_special_tokens=True)

'##Query: Hi ##Document: Hi ##Relevant:'

In [16]:
def load_summary_data(dir):
    summary_data = {}
    files = os.listdir(dir)
    files = [f for f in files if not f.startswith('.')]
    for file in files:
        f_path = os.path.join(dir, file)
        with open(f_path, 'r') as fp:
            summary_data[file] = fp.read()
    return summary_data

summary_data = load_summary_data('dataset/summarized')
summary_data['040073.txt']

'Application for judicial review pursuant to subsection 72(1) of the Immigration and Refugee Protection Act\nThe PRRA Officer\'s Decision In a decision dated September 27, 2007, the PRRA officer dismisses the Applications and concludes that the Applicants had not provided sufficient evidence to establish that there was more than a mere possibility that they would face persecution should they be returned to Colombia. The Applicant submitted the following issues for the Court\'s consideration:\nThe PRRA officer\'s finding on state protection is a question of law to be reviewed on a standard of correctness.\nThe PRRA officer found that the new evidence presented by the Applicants was self-serving and did not provide a rational basis for dismissing the corroborative evidence.\nIs the PRRA officer erroneous in finding that the Applicant would not fit the "profile" of persons targeted by the FARC?\nThe PRRA officer\'s finding that the principal Applicant did not meet the profile of persons b

In [3]:
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments
)

model = AutoModelForSeq2SeqLM.from_pretrained('castorini/monot5-large-msmarco-10k').to('cuda')
tokenizer = AutoTokenizer.from_pretrained('castorini/monot5-large-msmarco-10k')



2024-01-09 16:02:24.537512: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-09 16:02:24.585890: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [17]:
example = f"##Query: {'place '*25} ##Document: {tokenizer.decode(tokenizer.encode(summary_data['040073.txt']*2,max_length=450, truncation=True))} ##Relevant:"

In [20]:
inputs = tokenizer(example, truncation='longest_first', return_tensors='pt', max_length=512).to('cuda')
inputs['labels'] = tokenizer(['true'], return_tensors='pt')['input_ids'].to('cuda')

outputs = model(**inputs)

In [21]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments

def prompt(document, query):
    return f'##Query: {query} ##Document: {document} ##Relevant:'

class MonoT5Trainer(Seq2SeqTrainer):
    def __init__(self, loss_func, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.decoder_input_ids = None
        self.token_false_id = self.tokenizer.get_vocab()["▁false"]
        self.token_true_id  = self.tokenizer.get_vocab()["▁true"]

        self.loss_func = loss_func

    def compute_loss(self, model, inputs, return_outputs=False):
        if "inst_w" in inputs.keys():
            inst_w = inputs.pop("inst_w")
        if self.decoder_input_ids is None:
            if isinstance(model, torch.nn.DataParallel) or \
                    isinstance(model, torch.nn.parallel.DistributedDataParallel):
                self.decoder_input_ids = model.module._shift_right(inputs["labels"])
            else:
                self.decoder_input_ids = model._shift_right(inputs["labels"])
        if self.loss_func == "cross_entropy":
            if isinstance(model, torch.nn.DataParallel) or \
                    isinstance(model, torch.nn.parallel.DistributedDataParallel):
                inputs["decoder_input_ids"] = model.module._shift_right(inputs["labels"])
            else:
                inputs["decoder_input_ids"] = model._shift_right(inputs["labels"])
            return super().compute_loss(model, inputs, return_outputs)
        elif self.loss_func in ["contrastive", "ensemble"]:
            xe_loss, logits = model(**inputs, use_cache=False)[:2]
            logits = logits[:, -1, [self.token_false_id, self.token_true_id]]
            scores = torch.nn.functional.log_softmax(logits, dim=1)
            log_probs = scores[:, 1]
            loss = torch.mean(
                -torch.log(torch.exp(log_probs[0]) / torch.sum(torch.exp(log_probs), dim=-1)))
        elif self.loss_func == "weighted_cross_entropy":
            xe_loss, logits = model(**inputs, use_cache=False)[:2]
            loss = inst_w * torch.nn.CrossEntropyLoss(ignore_index=-100, reduction="none")(
                logits.view(-1, logits.size(-1)), inputs["labels"].view(-1))
            loss = torch.mean(loss)
        else:
            raise ValueError(self.loss_func)

        return loss

class MonoT5BatchCollator:
    def __init__(self, tokenizer, device, max_length=512):
        self.tokenizer = tokenizer
        self.device = device
        self.max_length = max_length

        self.pattern = '##Query: {} ##Document: {} ##Relevant:'
        # self.pattern = "Query: {} Document: {}"

    def flatten(self, l):
        return [item for sublist in l for item in sublist]

    def __call__(self, batch, return_tensors=None):
        texts = [self.pattern.format(example[0], self.tokenizer.decode(self.tokenizer.encode(example[1], 
                max_length=450, truncation=True), skip_special_tokens=True)) for b in batch for example in b]

        tokenized = self.tokenizer(texts, padding=True, truncation='longest_first',
                                   return_tensors='pt', max_length=self.max_length)
        tokenized['labels'] = self.tokenizer(
            ["true" if example[2] == 1 else "false"
            # [1 if example[2] == 1 else 0
             for b in batch for example in b], return_tensors='pt')['input_ids']
        tokenized["inst_w"] = torch.tensor(self.flatten([(1, example[3]) for b in batch for example in b]))
        for name in tokenized:
            tokenized[name] = tokenized[name].to(self.device)
        return tokenized

In [24]:
import sys
sys.path.append('/home/s2310409/workspace/coliee-2024/modules/pygaggle')
from utils.dataset import build_dataset

dataset = build_dataset()

2024-01-09 16:22:16 [INFO] env: 
Using override env var JVM_PATH (/home/s2310409/jdk/lib/server/libjvm.so) to load libjvm.
Please report your system information (os version, java
version, etc), and the path that works for you, to the
PyJNIus project, at https://github.com/kivy/pyjnius/issues.
so we can improve the automatic discovery.

2024-01-09 16:22:17 [INFO] loader: Loading faiss with AVX2 support.
2024-01-09 16:22:17 [INFO] loader: Successfully loaded faiss with AVX2 support.
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 4399/4399 [00:14<00:00, 306.03it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4399/4399 [02:21<00:00, 31.20it/s]


In [25]:
from utils.misc import load_json, save_json
from utils.dataset import TrainDataset, \
    NegativeSamplingCallback, MonoT5BatchCollator, MonoT5Trainer, build_dataset
from utils.misc import load_json

config = load_json('/home/s2310409/workspace/coliee-2024/configs/monot5-large-10k_hns.json')
save_dir = os.path.join(f"train_logs/test")
ckpt_dir = os.path.join(save_dir, "ckpt")

train_dataset = TrainDataset(
        dataset,
        config["num_pairs_per_batch"],
        config["negative_sampling_strategy"],
        num_msmarco_pairs_per_batch=config.get("num_msmarco_pairs_per_batch", None),
        enhanced_weight=config.get("enhanced_weight", None),
        train_uncased=config["train_uncased"]
    )

In [26]:
train_args = TrainingArguments(
        output_dir=ckpt_dir,
        do_train=True,
        save_strategy='steps',
        save_steps=1, 
        logging_strategy='steps',
        logging_steps=1,
        per_device_train_batch_size=config["per_device_train_batch_size"],
        gradient_accumulation_steps=config["gradient_accumulation_steps"],
        learning_rate=config["learning_rate"],
        weight_decay=config["weight_decay"],
        num_train_epochs=config["n_epochs"],
        warmup_ratio=config["warmup_ratio"],
        optim=config["optim"],
        label_smoothing_factor=config["label_smoothing_factor"],
        fp16=config["fp16"],
        seed=42,
        disable_tqdm=False,
        load_best_model_at_end=False,
        dataloader_pin_memory=False,
        remove_unused_columns=False,
        deepspeed=config.get("deepspeed", None)
    )
train_args.generation_config = None
collator_fn = MonoT5BatchCollator(tokenizer, 'cuda')

trainer = MonoT5Trainer(
        loss_func=config["loss_fn"],
        model=model,
        args=train_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
        data_collator=collator_fn,
    )

In [31]:
ns_callback = NegativeSamplingCallback(trainer)
trainer.add_callback(ns_callback)

trainer.train()

You are adding a <class 'utils.dataset.NegativeSamplingCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
TensorBoardCallback
NotebookProgressCallback
NegativeSamplingCallback
 45%|█████████████████████████████████████████████████▏                                                            | 386/863 [30:32<37:44,  4.75s/it]


KeyboardInterrupt: 