In [15]:
import os
import re
import copy
import random
import collections
import torch
import numpy as np
import pandas as pd
import json
import pickle
import nltk

from tqdm import tqdm
from rank_bm25 import BM25Okapi
from tqdm import tqdm
from pathlib import Path
from torch.utils.data import Dataset
from transformers import TrainerCallback, AutoTokenizer
os.chdir('/home/s2310409/workspace/coliee-2024/')
from utils.misc import get_query, get_summary


def load_data(dir):
    with open(dir, 'r') as fp:
        train_data = json.load(fp)

    data = []
    for key in train_data.keys():
        data.append([key, train_data[key]])

    return pd.DataFrame(data, columns=['source', 'target'])

def get_summary(doc_name):
    with open(f"dataset/mixtral_summarized/{doc_name}", 'r') as fp:
        summary = fp.read()
    return summary



In [36]:
def chunking(sentences, window_size=10):
    chunks = []
    for i in range(0, len(sentences) - window_size, window_size//2):
        chunks.append("\n".join(sentences[i:i+window_size]))
    return chunks

word_tokenizer = nltk.tokenize.WordPunctTokenizer()
# file_list = sorted(list(all_data_dict.keys()))

split = 'dev'
if split == 'dev':
    file_list = []
    with open(f'dataset/dev.json', 'r') as fp:
        split_data = json.load(fp)
        for file in split_data.keys():
            file_list.append(file)
            file_list.extend(split_data[file])
    file_list = sorted(list(set(file_list)))
elif split == 'test':
    file_list = [f for f in os.listdir('dataset/c2023/test_files') if f.endswith('.txt')]
    file_list = sorted(file_list)
elif split == 'submission':
    file_list = [f for f in os.listdir('dataset/c2024/test_files') if f.endswith('.txt')]
    file_list = sorted(file_list)

processed_file_dict = {}
for file in [f for f in os.listdir('dataset/processed') if not f.startswith('.')]:
    processed_file = f"dataset/processed/{file}"
    with open(processed_file, 'r') as fp:
        processed_document = fp.read()
        processed_file_dict[file] = {
            'sentences': processed_document.split('\n\n'),
            'processed_document': processed_document
        }

chunk_dict = {}
for file in file_list:
    chunks = chunking(processed_file_dict[file]['sentences'])
    for i, chunk in enumerate(chunks):
        if len(chunk) > 0:
            chunk_dict[f"{file}_{i}"] = chunk

mode = 'document'
if mode == 'chunk':
    # bm25 for chunks
    corpus = []
    chunk_list = sorted(list(chunk_dict.keys()))
    for chunk in chunk_list:
        corpus.append(chunk_dict[chunk])
    tokenized_corpus = [word_tokenizer.tokenize(doc) for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)
else:
    # bm25 for whole document
    corpus = []
    prcessed_list = sorted(file_list)
    for file in prcessed_list:
        corpus.append(processed_file_dict[file]['processed_document'])
    tokenized_corpus = [word_tokenizer.tokenize(doc) for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)

In [37]:
best_f1 = 0
best_topk = 0
best_precision = 0
best_recall = 0

for topk in [4]:
    n_candidates = topk
    candidate_dicts = {}

    test_df = load_data(f'dataset/{split}.json')

    for file in tqdm(test_df['source']):
        query = get_query(file)
        tokenized_query = word_tokenizer.tokenize(query)
        results = bm25.get_scores(tokenized_query)
        max_ids = np.argsort(results)[-n_candidates:]
        document_candidates = [file_list[idx] for idx in max_ids]
        candidate_dicts[file] = list(set(document_candidates))


    test_df['candidates'] = test_df['source'].apply(lambda x: candidate_dicts[x])
    test_df['query'] = test_df['source'].apply(lambda x: get_query(x))

    # calculate accuracy metrics for BM25 + TF-IDF
    correct = 0
    n_retrived = 0
    n_relevant = 0

    coverages = []

    for index, row in test_df.iterrows():
        source = row['source']
        target = row['target']
        preds = row['candidates']
        coverages.append(len(preds))
        n_retrived += len(preds)
        n_relevant += len(target)
        for prediction in preds:
            if prediction in target:
                correct += 1

    precision = correct / n_retrived
    recall = correct / n_relevant
    f1 = 2 * precision * recall / (precision + recall)
    if f1 > best_f1:
        best_f1 = f1
        best_topk = topk
        best_precision = precision
        best_recall = recall

print(f"Top K: {best_topk}")
print(f"Precision: {best_precision}")
print(f"Recall: {best_recall}")
print(f"F1: {best_f1}")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 96/96 [00:00<00:00, 332.99it/s]


Top K: 4
Precision: 0.2760416666666667
Recall: 0.2523809523809524
F1: 0.26368159203980096


In [30]:

if split == 'submission':
    n_candidates = best_topk
    with open(f"dataset/c2024/test_no_labels.json", 'r') as fp:
        test_keys = json.load(fp)
    with open(f"submissions/captainBM25.txt", 'w') as fp:
        for key in test_keys:
            query = get_query(file)
            tokenized_query = word_tokenizer.tokenize(query)
            results = bm25.get_scores(tokenized_query)
            max_ids = np.argsort(results)[-n_candidates:]
            document_candidates = [file_list[idx] for idx in max_ids]
            document_candidates = list(set(document_candidates))
            for candidate in document_candidates:
                fp.write(f"{key.split('.')[0]} {candidate.split('.')[0]} captainBM25\n")


In [None]:
"""
Top K: 4
Precision: 0.16379310344827586
Recall: 0.25150421179302046
F1: 0.19838633127669672
"""