In [1]:
import numpy as np
import pandas as pd
import json
import pickle
import torch
import nltk
import os
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from rank_bm25 import BM25Okapi

os.chdir('/home/s2310409/workspace/coliee-2024/')

def load_data(dir):
    with open(dir, 'r') as fp:
        train_data = json.load(fp)

    data = []
    for key in train_data.keys():
        data.append([key, train_data[key]])

    return pd.DataFrame(data, columns=['source', 'target'])

# BM25 Indexing

In [3]:
def chunking(sentences, window_size=10):
    chunks = []
    for i in range(0, len(sentences) - window_size, window_size//2):
        chunks.append("\n".join(sentences[i:i+window_size]))
    return chunks

with open('dataset/all_data.json') as f:
    all_data_dict = json.load(f)

word_tokenizer = nltk.tokenize.WordPunctTokenizer()
# file_list = sorted(list(all_data_dict.keys()))

file_list = [f for f in os.listdir('dataset/c2023/test_files') if f.endswith('.txt')]
file_list = [f for f in file_list if f in all_data_dict.keys()]
file_list = sorted(file_list)

processed_file_dict = {}
for file in [f for f in os.listdir('dataset/processed') if not f.startswith('.')]:
    processed_file = f"dataset/processed/{file}"
    with open(processed_file, 'r') as fp:
        processed_document = fp.read()
        processed_file_dict[file] = {
            'sentences': processed_document.split('\n\n'),
            'processed_document': processed_document
        }

chunk_dict = {}
for file in file_list:
    chunks = chunking(processed_file_dict[file]['sentences'])
    for i, chunk in enumerate(chunks):
        if len(chunk) > 0:
            chunk_dict[f"{file}_{i}"] = chunk

mode = 'document'
if mode == 'chunk':
    # bm25 for chunks
    corpus = []
    chunk_list = sorted(list(chunk_dict.keys()))
    for chunk in chunk_list:
        corpus.append(chunk_dict[chunk])
    tokenized_corpus = [word_tokenizer.tokenize(doc) for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)
else:
    # bm25 for whole document
    corpus = []
    prcessed_list = sorted(file_list)
    for file in prcessed_list:
        corpus.append(processed_file_dict[file]['processed_document'])
    tokenized_corpus = [word_tokenizer.tokenize(doc) for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)

# Query with TF-IDF keywords

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

stopwords = nltk.corpus.stopwords.words('english')

docs = []
for file in processed_file_dict.keys():
    docs.append(processed_file_dict[file]['processed_document'])

count_vec = CountVectorizer(stop_words=stopwords)
word_count_vector = count_vec.fit_transform(docs)


tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

features = count_vec.get_feature_names_out()

In [5]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

## Query on chunks

In [6]:
n_keywords = 25

def extract_query(doc):
    tf_idf_vector=tfidf_transformer.transform(count_vec.transform([doc]))
    sorted_items = sort_coo(tf_idf_vector.tocoo())
    keywords=extract_topn_from_vector(features,sorted_items, n_keywords)
    return " ".join(list(keywords.keys()))

query_dict = {}

for file in tqdm(file_list):
    query_dict[file] = extract_query(processed_file_dict[file]['processed_document'])

n_candidates = 50
chunk_candidate_dict = {}
candidate_dict = {}

for file in tqdm(file_list):
    query = query_dict[file]
    tokenized_query = word_tokenizer.tokenize(query)
    results = bm25.get_scores(tokenized_query)
    max_ids = np.argsort(results)[-n_candidates:]

    chunk_candidates = [chunk_list[idx] for idx in max_ids]
    chunk_candidate_dict[file] = chunk_candidates

    document_candidates = [chunk.split('_')[0] for chunk in chunk_candidates]
    candidate_dict[file] = list(set(document_candidates))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1217/1217 [00:04<00:00, 303.80it/s]
  0%|                                                                                                                                                                                                                                 | 0/1217 [00:00<?, ?it/s]


NameError: name 'chunk_list' is not defined

In [None]:

test_df = load_data('dataset/json/test.json')

test_df['chunk_candidates'] = test_df['source'].apply(lambda x: chunk_candidate_dict[x])
test_df['candidates'] = test_df['source'].apply(lambda x: candidate_dict[x])
test_df['query'] = test_df['source'].apply(lambda x: query_dict[x])

# calculate accuracy metrics for BM25 + TF-IDF
correct = 0
n_retrived = 0
n_relevant = 0

coverages = []

for index, row in test_df.iterrows():
    source = row['source']
    target = row['target']
    preds = row['candidates']
    coverages.append(len(preds))
    n_retrived += len(preds)
    n_relevant += len(target)
    for prediction in preds:
        if prediction in target:
            correct += 1

precision = correct / n_retrived
recall = correct / n_relevant

print(f"Average # candidates: {np.mean(coverages)}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {2 * precision * recall / (precision + recall)}")

In [45]:
test_df

Unnamed: 0,source,target,chunk_candidates,candidates,query
0,070318.txt,[015076.txt],"[026517.txt_0, 084131.txt_5, 025676.txt_12, 04...","[010714.txt, 054863.txt, 037739.txt, 093796.tx...",rpo board guideline adamidis applicant france ...
1,077960.txt,"[009054.txt, 040860.txt]","[071670.txt_5, 079542.txt_3, 015274.txt_1, 055...","[077828.txt, 010714.txt, 048551.txt, 040860.tx...",removal child children custody order dunn cour...
2,042319.txt,"[093691.txt, 075956.txt, 084953.txt, 022987.txt]","[094702.txt_6, 031573.txt_1, 017268.txt_2, 024...","[087571.txt, 020300.txt, 042893.txt, 046082.tx...",beyer cross affidavit prothonotary examination...
3,041766.txt,[039269.txt],"[098992.txt_10, 079863.txt_2, 084744.txt_1, 02...","[023186.txt, 077828.txt, 084744.txt, 028729.tx...",drug clinical nds health data 002 omitted 08 n...
4,077407.txt,[038669.txt],"[010486.txt_4, 063976.txt_3, 077407.txt_8, 021...","[087571.txt, 039348.txt, 010486.txt, 021955.tx...",communication 23 privilege counsel litigation ...
...,...,...,...,...,...
314,085079.txt,"[044669.txt, 003144.txt]","[030514.txt_6, 084937.txt_4, 060581.txt_14, 07...","[039348.txt, 025162.txt, 084937.txt, 022837.tx...",cso promotions shephard cst adjudicator commis...
315,031370.txt,"[096341.txt, 060602.txt, 047107.txt, 084522.tx...","[000084.txt_0, 047826.txt_3, 094010.txt_0, 048...","[023186.txt, 077828.txt, 027678.txt, 048551.tx...",removal peru applicant irreparable 3d spouse p...
316,085828.txt,"[004301.txt, 074887.txt, 088994.txt]","[035451.txt_8, 063976.txt_5, 060350.txt_6, 091...","[088994.txt, 071670.txt, 028315.txt, 063976.tx...",officer applicants india singh riots risk prra...
317,024957.txt,"[015009.txt, 080348.txt]","[052545.txt_4, 002842.txt_9, 081064.txt_3, 087...","[055800.txt, 080348.txt, 077315.txt, 058746.tx...",seizure annuity civil 224 code debtor chattels...


## Query on all documents

In [7]:
n_keywords = 25

def extract_query(doc):
    tf_idf_vector=tfidf_transformer.transform(count_vec.transform([doc]))
    sorted_items = sort_coo(tf_idf_vector.tocoo())
    keywords=extract_topn_from_vector(features,sorted_items, n_keywords)
    return " ".join(list(keywords.keys()))

query_dict = {}

for file in tqdm(file_list):
    query_dict[file] = extract_query(processed_file_dict[file]['processed_document'])

n_candidates = 50
candidate_dicts = {}

for file in tqdm(file_list):
    query = query_dict[file]
    tokenized_query = word_tokenizer.tokenize(query)
    results = bm25.get_scores(tokenized_query)
    max_ids = np.argsort(results)[-n_candidates:]
    document_candidates = [file_list[idx] for idx in max_ids]
    candidate_dicts[file] = list(set(document_candidates))


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1217/1217 [00:04<00:00, 303.34it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1217/1217 [00:08<00:00, 136.37it/s]


In [9]:
test_df = load_data('dataset/test.json')

test_df['candidates'] = test_df['source'].apply(lambda x: candidate_dicts[x])
test_df['query'] = test_df['source'].apply(lambda x: query_dict[x])

# calculate accuracy metrics for BM25 + TF-IDF
correct = 0
n_retrived = 0
n_relevant = 0

coverages = []

for index, row in test_df.iterrows():
    source = row['source']
    target = row['target']
    preds = row['candidates']
    coverages.append(len(preds))
    n_retrived += len(preds)
    n_relevant += len(target)
    for prediction in preds:
        if prediction in target:
            correct += 1

precision = correct / n_retrived
recall = correct / n_relevant

print(f"Average # candidates: {np.mean(coverages)}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1: {2 * precision * recall / (precision + recall)}")

Average # candidates: 50.0
Precision: 0.0374294670846395
Recall: 0.7184115523465704
F1: 0.07115189797985817


In [19]:
test_df

Unnamed: 0,source,target,candidates,query
0,070318.txt,[015076.txt],"[032432.txt, 020211.txt, 025676.txt, 073854.tx...",rpo board adamidis guideline applicant hearing...
1,077960.txt,"[009054.txt, 040860.txt]","[071412.txt, 019505.txt, 048551.txt, 026347.tx...",removal custody child children dunn order idah...
2,042319.txt,"[093691.txt, 075956.txt, 084953.txt, 022987.txt]","[086059.txt, 093691.txt, 094451.txt, 019572.tx...",beyer cross affidavit prothonotary examination...
3,041766.txt,[039269.txt],"[098992.txt, 091415.txt, 038039.txt, 084744.tx...",drug clinical nds 002 data health omitted 08 n...
4,077407.txt,[038669.txt],"[041415.txt, 085158.txt, 071211.txt, 030514.tx...",communication 23 privilege litigation counsel ...
...,...,...,...,...
314,085079.txt,"[044669.txt, 003144.txt]","[071211.txt, 085158.txt, 051660.txt, 078642.tx...",cso promotions shephard cst adjudicator jse co...
315,031370.txt,"[096341.txt, 060602.txt, 047107.txt, 084522.tx...","[019505.txt, 096717.txt, 017883.txt, 084744.tx...",removal peru irreparable applicant 3d spouse p...
316,085828.txt,"[004301.txt, 074887.txt, 088994.txt]","[019505.txt, 048551.txt, 049064.txt, 087722.tx...",officer applicants india singh riots principal...
317,024957.txt,"[015009.txt, 080348.txt]","[098992.txt, 048208.txt, 078642.txt, 015009.tx...",annuity seizure civil 224 debtor code unseizab...
