## Legal Summarization Dataset [Metric: Normalized Discounted Cumulative Gain @ 10 (nDCG@10)]

In [1]:
import datasets

# Download the dataset
queries_dataset = datasets.load_dataset("mteb/legal_summarization", "queries")
documents_dataset = datasets.load_dataset("mteb/legal_summarization", "corpus")
pair_labels_dataset = datasets.load_dataset("mteb/legal_summarization", "default")


In [2]:
print(queries_dataset)
print(documents_dataset)
print(pair_labels_dataset)


DatasetDict({
    queries: Dataset({
        features: ['_id', 'text'],
        num_rows: 284
    })
})
DatasetDict({
    corpus: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 438
    })
})
DatasetDict({
    test: Dataset({
        features: ['query-id', 'corpus-id', 'score'],
        num_rows: 439
    })
})


In [3]:
# Extract queries and documents using appropriate keys
queries = [item['text'] for item in queries_dataset['queries']]
documents = [item['text'] for item in documents_dataset['corpus']]
pair_labels = [(pair['query-id'], pair['corpus-id'], pair['score']) for pair in pair_labels_dataset['test']]


In [4]:
from sentence_transformers import SentenceTransformer
import torch

# Load the model
model = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True)

# Function to embed a single text
def embed_text(text):
    return model.encode(text, convert_to_tensor=True, device='cuda')

# Embed the queries and documents one by one
query_embeddings = torch.stack([embed_text(query) for query in queries])
document_embeddings = torch.stack([embed_text(doc) for doc in documents])




In [5]:
from sentence_transformers.util import cos_sim

# Compute cosine similarities
cosine_similarities = cos_sim(query_embeddings, document_embeddings)


In [6]:
from sentence_transformers.util import cos_sim

# Compute cosine similarities
cosine_similarities = cos_sim(query_embeddings, document_embeddings)

import numpy as np
from sklearn.metrics import ndcg_score

# Prepare a mapping from IDs to indices
query_id_to_index = {id: index for index, id in enumerate(queries_dataset['queries']['_id'])}
corpus_id_to_index = {id: index for index, id in enumerate(documents_dataset['corpus']['_id'])}

def calculate_ndcg(cosine_similarities, pair_labels, k=10):
    ndcg_scores = []
    for query_id, corpus_id, score in pair_labels:
        query_index = query_id_to_index[query_id]
        corpus_index = corpus_id_to_index[corpus_id]
        
        # Create true relevance scores based on the scores in the dataset
        true_relevance = np.zeros(len(documents))
        true_relevance[corpus_index] = score

        # Get similarities for the current query
        query_similarities = cosine_similarities[query_index].cpu().numpy()

        # Calculate nDCG@10
        ndcg = ndcg_score([true_relevance], [query_similarities], k=k)
        ndcg_scores.append(ndcg)
    
    return np.mean(ndcg_scores)

# Calculate and print the average nDCG@10
average_ndcg_at_10 = calculate_ndcg(cosine_similarities, pair_labels)
print(f"Average nDCG@10: {average_ndcg_at_10}")


Average nDCG@10: 0.4527647304337138


## ConsumerContractsQA [Metric: Normalized Discounted Cumulative Gain @ 10 (nDCG@10)]

In [7]:
import datasets

# Download the dataset
queries_dataset = datasets.load_dataset("mteb/legalbench_consumer_contracts_qa", "queries")
documents_dataset = datasets.load_dataset("mteb/legalbench_consumer_contracts_qa", "corpus")
pair_labels_dataset = datasets.load_dataset("mteb/legalbench_consumer_contracts_qa", "default")


In [8]:
# Inspect datasets to determine the correct keys
print(queries_dataset)
print(documents_dataset)
print(pair_labels_dataset)


DatasetDict({
    queries: Dataset({
        features: ['_id', 'text'],
        num_rows: 396
    })
})
DatasetDict({
    corpus: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 154
    })
})
DatasetDict({
    test: Dataset({
        features: ['query-id', 'corpus-id', 'score'],
        num_rows: 396
    })
})


In [9]:
# Extract queries and documents using appropriate keys
queries = [item['text'] for item in queries_dataset['queries']]
documents = [item['text'] for item in documents_dataset['corpus']]
pair_labels = [(pair['query-id'], pair['corpus-id'], pair['score']) for pair in pair_labels_dataset['test']]

In [10]:
from sentence_transformers import SentenceTransformer
import torch

# Load the model
model = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True)

# Function to embed a single text
def embed_text(text):
    return model.encode(text, convert_to_tensor=True, device='cuda')

# Embed the queries and documents one by one
query_embeddings = torch.stack([embed_text(query) for query in queries])
document_embeddings = torch.stack([embed_text(doc) for doc in documents])




In [11]:
from sentence_transformers.util import cos_sim

# Compute cosine similarities
cosine_similarities = cos_sim(query_embeddings, document_embeddings)


In [12]:
from sentence_transformers.util import cos_sim

# Compute cosine similarities
cosine_similarities = cos_sim(query_embeddings, document_embeddings)

import numpy as np
from sklearn.metrics import ndcg_score

# Prepare a mapping from IDs to indices
query_id_to_index = {id: index for index, id in enumerate(queries_dataset['queries']['_id'])}
corpus_id_to_index = {id: index for index, id in enumerate(documents_dataset['corpus']['_id'])}

def calculate_ndcg(cosine_similarities, pair_labels, k=10):
    ndcg_scores = []
    for query_id, corpus_id, score in pair_labels:
        query_index = query_id_to_index[query_id]
        corpus_index = corpus_id_to_index[corpus_id]
        
        # Create true relevance scores based on the scores in the dataset
        true_relevance = np.zeros(len(documents))
        true_relevance[corpus_index] = score

        # Get similarities for the current query
        query_similarities = cosine_similarities[query_index].cpu().numpy()

        # Calculate nDCG@10
        ndcg = ndcg_score([true_relevance], [query_similarities], k=k)
        ndcg_scores.append(ndcg)
    
    return np.mean(ndcg_scores)

# Calculate and print the average nDCG@10
average_ndcg_at_10 = calculate_ndcg(cosine_similarities, pair_labels)
print(f"Average nDCG@10: {average_ndcg_at_10}")


Average nDCG@10: 0.8011939407382996


## CorporateLobbying [Metric: Normalized Discounted Cumulative Gain @ 10 (nDCG@10)]

In [13]:
import datasets

# Download the dataset
queries_dataset = datasets.load_dataset("mteb/legalbench_corporate_lobbying", "queries")
documents_dataset = datasets.load_dataset("mteb/legalbench_corporate_lobbying", "corpus")
pair_labels_dataset = datasets.load_dataset("mteb/legalbench_corporate_lobbying", "default")


In [14]:
print(queries_dataset)
print(documents_dataset)
print(pair_labels_dataset)


DatasetDict({
    queries: Dataset({
        features: ['_id', 'text'],
        num_rows: 340
    })
})
DatasetDict({
    corpus: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 319
    })
})
DatasetDict({
    test: Dataset({
        features: ['query-id', 'corpus-id', 'score'],
        num_rows: 340
    })
})


In [15]:
# Extract queries and documents using appropriate keys
queries = [item['text'] for item in queries_dataset['queries']]
documents = [item['text'] for item in documents_dataset['corpus']]
pair_labels = [(pair['query-id'], pair['corpus-id'], pair['score']) for pair in pair_labels_dataset['test']]


In [16]:
from sentence_transformers import SentenceTransformer
import torch

# Load the model
model = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True)

# Function to embed a single text
def embed_text(text):
    return model.encode(text, convert_to_tensor=True, device='cuda')

# Embed the queries and documents one by one
query_embeddings = torch.stack([embed_text(query) for query in queries])
document_embeddings = torch.stack([embed_text(doc) for doc in documents])



In [17]:
from sentence_transformers.util import cos_sim

# Compute cosine similarities
cosine_similarities = cos_sim(query_embeddings, document_embeddings)


In [18]:
from sentence_transformers.util import cos_sim

# Compute cosine similarities
cosine_similarities = cos_sim(query_embeddings, document_embeddings)

import numpy as np
from sklearn.metrics import ndcg_score

# Prepare a mapping from IDs to indices
query_id_to_index = {id: index for index, id in enumerate(queries_dataset['queries']['_id'])}
corpus_id_to_index = {id: index for index, id in enumerate(documents_dataset['corpus']['_id'])}

def calculate_ndcg(cosine_similarities, pair_labels, k=10):
    ndcg_scores = []
    for query_id, corpus_id, score in pair_labels:
        query_index = query_id_to_index[query_id]
        corpus_index = corpus_id_to_index[corpus_id]
        
        # Create true relevance scores based on the scores in the dataset
        true_relevance = np.zeros(len(documents))
        true_relevance[corpus_index] = score

        # Get similarities for the current query
        query_similarities = cosine_similarities[query_index].cpu().numpy()

        # Calculate nDCG@10
        ndcg = ndcg_score([true_relevance], [query_similarities], k=k)
        ndcg_scores.append(ndcg)
    
    return np.mean(ndcg_scores)

# Calculate and print the average nDCG@10
average_ndcg_at_10 = calculate_ndcg(cosine_similarities, pair_labels)
print(f"Average nDCG@10: {average_ndcg_at_10}")


Average nDCG@10: 0.8849441186346295
