<a href="https://colab.research.google.com/github/np2802/Indian-Legal-Semantic-Searcher/blob/main/Evaluation_ILSS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive: the model directory, query-set JSON, topic CSV and .npy
# embedding files read later in this notebook all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# pip install sentence_transformers

In [None]:
# from sentence_transformers import SentenceTransformer, models
import json
import time

import numpy as np
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

# Loading the model

In [None]:
# Load the tokenizer and encoder from the saved KanoonBert directory on Drive.
# `tokenizer` and `model` are module-level globals that split_and_merge() reads directly.
model_path = '/content/drive/MyDrive/FYP/models-final/KanoonBert'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)

# Loading the query set and creating the case DataFrame

In [None]:
# Load the JSON file with relevant case file numbers.
# Downstream code reads data['Query Set']: a list of entries, each with an 'id'
# (compared against the 'File Name' column) and a 'relevant candidates' list.
with open('/content/drive/MyDrive/FYP/Dataset/latest_query_set.json', 'r') as file:
    data = json.load(file)

# Load the CSV file with topic modeling and content
# (columns shown in the output: 'Unnamed: 0', 'File Name', 'Assigned Topic', 'Content').
csv_file_path = "/content/drive/MyDrive/FYP/Topic_Modelling_New/topic_assignments_with_content.csv"

# Rows containing any missing value are dropped.
# NOTE(review): the search functions pick rows with new_df.iloc[...] using positions
# taken from the saved embedding matrices, so the row order of this frame must match
# the order the embeddings were generated in -- confirm the .npy files were built
# from this same post-dropna frame.
df = pd.read_csv(csv_file_path).dropna()
df

Unnamed: 0,File Name,Assigned Topic,Content
0,0001210741.txt,"0.044*""sale"" + 0.039*""good"" + 0.018*""manufactu...",ORDER \n1. This Writ Petition coming on for he...
1,0000093330.txt,"0.040*""suit"" + 0.028*""defendant"" + 0.024*""decr...","JUDGMENT <NAME>, J. \n1. This Revision Petitio..."
2,0000656658.txt,"0.034*""article"" + 0.028*""power"" + 0.023*""const...",PETITIONER: <NAME> Vs. RESPONDENT: THE STATE O...
3,0001286874.txt,"0.040*""suit"" + 0.028*""defendant"" + 0.024*""decr...","JUDGMENT <NAME>, <NAME>1. This appeal is conce..."
4,0001860814.txt,"0.044*""sale"" + 0.039*""good"" + 0.018*""manufactu...","JUDGMENT <NAME>, J. \n1. In this appeal with c..."
...,...,...,...
2904,0001891870.txt,"0.040*""income"" + 0.034*""assessment"" + 0.021*""p...","JUDGMENT <NAME>, J. \n1. <ORG> has referred th..."
2905,0000353515.txt,"0.040*""suit"" + 0.028*""defendant"" + 0.024*""decr...","JUDGMENT <NAME>, J. \n1. In this case an appli..."
2906,0000932632.txt,"0.044*""sale"" + 0.039*""good"" + 0.018*""manufactu...",PETITIONER: <ORG> HYDERABAD Vs. RESPONDENT: CO...
2907,0168621613.txt,"0.035*""accuse"" + 0.032*""offence"" + 0.029*""crim...",IN THE HIGH COURT OF KERALA AT ERNAKULAM PRESE...


# Split and merge (document embedding: tokenize with truncation, then mean-pool)

In [None]:
# Function to process each document
def split_and_merge(doc_text):
    """Embed a text (or a batch of texts) into one L2-normalised vector.

    Despite the name, the input is not split into chunks: the tokenizer is called
    with ``truncation=True``, so tokens beyond the tokenizer's maximum length are
    dropped before encoding.

    Parameters
    ----------
    doc_text : str or list of str
        Passed unchanged to the module-level ``tokenizer``.

    Returns
    -------
    torch.Tensor
        1-D tensor: the normalised mean of the per-sequence embeddings.
    """
    # Tokenize input; padding only has an effect when several texts are passed
    tokens = tokenizer(doc_text, return_tensors='pt', padding=True, truncation=True)

    # Forward pass through BERT model (no gradients needed for inference)
    with torch.no_grad():
        outputs = model(**tokens)

    # Token-level embeddings from the last hidden state
    last_hidden_state = outputs.last_hidden_state

    # Mean pooling over real tokens only. A plain mean over the sequence axis would
    # also average the padding positions that `padding=True` adds to the shorter
    # texts of a batch. For a single text the mask is all ones, so this is the
    # ordinary mean.
    if 'attention_mask' in tokens:
        attention_mask = tokens['attention_mask']
    else:
        attention_mask = torch.ones(last_hidden_state.shape[:2])
    mask = attention_mask.unsqueeze(-1).to(last_hidden_state)
    token_counts = mask.sum(dim=1).clamp(min=1.0)  # guard against an all-zero mask
    sentence_embeddings = (last_hidden_state * mask).sum(dim=1) / token_counts

    # Normalize each sequence embedding to unit length
    sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)

    # Document embedding by averaging the sequence embeddings
    document_embedding = torch.mean(sentence_embeddings, dim=0)

    # Normalize the document embedding
    document_embedding = torch.nn.functional.normalize(document_embedding, p=2, dim=0)

    return document_embedding

# Finding the most relevant cases for any given query

In [None]:
def find_similar_cases(query, new_df, top_k=5):
    """Return the rows of ``new_df`` whose saved embeddings are closest to ``query``.

    Parameters
    ----------
    query : str
        Free text, or a path containing '.txt' whose file contents are embedded.
    new_df : pandas.DataFrame
        Case table; row ``i`` is taken to correspond to row ``i`` of the saved
        document-embedding matrix (rows are selected with ``iloc``).
    top_k : int, default 5
        Number of cases to return.

    Returns
    -------
    list of (pandas.DataFrame, numpy.ndarray)
        One tuple per query embedding row: the selected rows of ``new_df`` in
        descending similarity order, and their cosine similarities.
    """
    if isinstance(query, str) and '.txt' in query:
        # Query is a file
        with open(query, 'r') as file:
            query_text = file.read()
        query_embedding = split_and_merge(query_text)
    else:
        # Query is a text
        query_embedding = split_and_merge(query)

    # Load the saved document embeddings.
    # NOTE(review): find_similar_cases_with_evaluation reads
    # 'document_embeddings_new.npy' instead -- confirm which file matches new_df.
    doc_embeddings = np.load('/content/drive/MyDrive/FYP/document_embeddings/document_embeddings.npy')

    # Reshape query_embedding if necessary
    if len(query_embedding.shape) == 1:
        query_embedding = query_embedding.reshape(1, -1)

    # Reshape doc_embeddings if necessary
    if len(doc_embeddings.shape) == 1:
        doc_embeddings = doc_embeddings.reshape(len(new_df), -1)

    # Calculate similarity with document embeddings
    doc_similarities = cosine_similarity(query_embedding, doc_embeddings)

    # Only document similarities are ranked. The previous version appended one
    # query-vs-topic similarity per topic embedding as extra columns and ranked the
    # concatenation; the positions of those extra columns are not row positions of
    # new_df, so a highly ranked topic column made `iloc` fail or pick an unrelated
    # row. Whenever the old top results were all documents the output is unchanged.
    # NOTE(review): if topic similarity should influence the ranking it has to be
    # folded into a per-document score -- decide how before re-adding it.
    limit = max(int(top_k), 0)

    similar_cases = []
    for scores in doc_similarities:
        # Positions of the `limit` highest similarities, best first
        top_indices = scores.argsort()[::-1][:limit]
        similar_cases.append((new_df.iloc[top_indices], scores[top_indices]))

    return similar_cases

In [None]:
def find_similar_cases_with_evaluation(query, new_df, data, top_k):
    """Retrieve the ``top_k`` most similar cases and score them against the query set.

    Parameters
    ----------
    query : str
        Either a value of the 'File Name' column (any string containing '.txt'),
        whose 'Content' is embedded, or free text that is embedded directly.
    new_df : pandas.DataFrame
        Case table with 'File Name' and 'Content' columns; rows are selected by
        position, so row ``i`` must match row ``i`` of the saved embeddings.
    data : dict
        Query-set mapping; ``data['Query Set']`` is a list of entries with an 'id'
        and a 'relevant candidates' list.
    top_k : int
        Maximum number of cases to return (the query itself is never returned).

    Returns
    -------
    tuple
        ``(top_similar_cases, relevant_cases, precision, recall, f1_score)`` where
        ``top_similar_cases`` carries an added 'Similarity Score' column.

    Raises
    ------
    IndexError
        If ``query`` looks like a file name but is absent from 'File Name'.
    """
    if isinstance(query, str) and '.txt' in query:
        # Query is a file
        print('query: ', query)
        matches = new_df.loc[new_df['File Name'] == query, 'Content']
        if matches.empty:
            raise IndexError(f"query file {query!r} not found in the 'File Name' column")
        query_doc = matches.values[0]
        query_embedding = split_and_merge(query_doc)
    else:
        # Query is a text
        query_embedding = split_and_merge(query)

    # Load the saved document embeddings
    doc_embeddings = np.load('/content/drive/MyDrive/FYP/document_embeddings/document_embeddings_new.npy')

    # Reshape query_embedding if necessary
    if len(query_embedding.shape) == 1:
        query_embedding = query_embedding.reshape(1, -1)

    # Reshape doc_embeddings if necessary
    if len(doc_embeddings.shape) == 1:
        doc_embeddings = doc_embeddings.reshape(len(new_df), -1)

    # Calculate cosine similarity between query and all documents
    similarities = cosine_similarity(query_embedding, doc_embeddings).flatten()

    # Fetch one extra hit so that removing the query still leaves top_k results
    top_k = max(int(top_k), 0)
    top_indices = similarities.argsort()[::-1][:top_k + 1]

    # Copy before adding a column: assigning on an iloc slice of new_df triggers
    # SettingWithCopyWarning and is not guaranteed to behave consistently.
    top_similar_cases = new_df.iloc[top_indices].copy()
    top_similar_cases['Similarity Score'] = similarities[top_indices]

    # Exclude the query file, then cap at top_k: when the query is free text (or is
    # not among the hits) nothing is removed and top_k + 1 rows would remain.
    top_similar_cases = top_similar_cases[top_similar_cases['File Name'] != query].head(top_k)
    print('top similar cases: ', top_similar_cases['File Name'].values)

    # Look up the relevant candidates recorded for this query
    relevant_cases = []
    for query_set in data['Query Set']:
        if query_set['id'] == query:
            relevant_cases = query_set['relevant candidates']
            print('relevant cases:', relevant_cases)
            break

    # Calculate Precision, Recall, and F1 Score
    relevant_cases_count = len(relevant_cases)
    retrieved_count = len(top_similar_cases)
    hit_count = len(set(top_similar_cases['File Name']).intersection(relevant_cases))

    # Both ratios fall back to 0 when their denominator is empty
    recall = hit_count / relevant_cases_count if relevant_cases_count else 0.0
    precision = hit_count / retrieved_count if retrieved_count else 0.0
    f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    print('recall:', recall)
    print('precision:', precision)
    print('f1 score:', f1_score)
    return top_similar_cases, relevant_cases, precision, recall, f1_score

In [None]:
def evaluate_model(new_df, data, top_k=5):
    """Average recall, precision and F1 over every query that exists in ``new_df``.

    Queries whose 'id' is not a value of the 'File Name' column are skipped. For the
    remaining queries, relevant candidates that are not in ``new_df`` are removed
    before scoring, because they can never be retrieved and would otherwise only
    lower recall. (The filtered list used to be built but never used.)

    Parameters
    ----------
    new_df : pandas.DataFrame
        Case table with a 'File Name' column.
    data : dict
        Query-set mapping; ``data['Query Set']`` is a list of entries with an 'id'
        and a 'relevant candidates' list.
    top_k : int, default 5
        Number of cases retrieved per query.

    Returns
    -------
    tuple
        ``(average_recall, average_precision, average_f1_score)``, or
        ``(None, None, None)`` when no query is eligible.
    """
    recall_values = []
    precision_values = []
    f1_score_values = []

    # Build the lookup once instead of scanning the column for every candidate
    known_files = set(new_df['File Name'].values)

    for query_set in data['Query Set']:
        query_id = query_set['id']

        # Check if the query ID exists in the DataFrame
        if query_id not in known_files:
            continue

        # Keep only the relevant candidates that exist in the DataFrame
        relevant_candidates = [
            candidate
            for candidate in query_set['relevant candidates']
            if candidate in known_files
        ]

        # Hand the scorer a query set holding just this query with the filtered
        # candidates, so recall is measured against retrievable cases only.
        scoped_data = {'Query Set': [{**query_set, 'relevant candidates': relevant_candidates}]}

        # Retrieve the top_k similar cases and their scores
        _, _, precision, recall, f1_score = find_similar_cases_with_evaluation(
            query_id, new_df, scoped_data, top_k=top_k
        )

        recall_values.append(recall)
        precision_values.append(precision)
        f1_score_values.append(f1_score)

    if len(recall_values) == 0:
        print("No eligible queries found for evaluation.")
        return None, None, None

    print(recall_values)
    print(precision_values)
    print(f1_score_values)
    average_recall = np.mean(recall_values)
    average_precision = np.mean(precision_values)
    average_f1_score = np.mean(f1_score_values)

    return average_recall, average_precision, average_f1_score

In [None]:
# Example usage: one free-text search, timed, followed by the query-set evaluation.
# `time` comes from the import cell at the top of the notebook.
query = 'i want to sue someone for a case of land disputes, my tenant has taken control over my land illegally'

# Start time -- perf_counter() is a monotonic, high-resolution clock meant for
# measuring intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Get similar cases
similar_cases_list = find_similar_cases(query, df)

# End time
end_time = time.perf_counter()

# Calculate the time taken
execution_time = end_time - start_time

# Print the retrieved cases (a list of (DataFrame, scores) tuples)
print(similar_cases_list)

# Print the execution time
print("Time taken to get results:", execution_time, "seconds")

# Evaluate the model; all three values are None when no query is eligible
average_recall, average_precision, average_f1_score = evaluate_model(df, data)
print("Average Recall:", average_recall)
print("Average Precision:", average_precision)
print("Average F1 Score:", average_f1_score)