In [None]:
# Install necessary libraries
!pip install transformers pdfplumber torch --quiet

# Import required libraries
from transformers import AutoTokenizer, AutoModel
import pdfplumber
import re

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/2.8 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Path of the PDF that the rest of the notebook parses. This is a hard-coded
# absolute path; change it here when running with a different file location.
pdf_path = "/content/bns.pdf"

def extract_text_from_pdf(pdf_path, page_separator=" "):
    """
    Extracts text from a PDF file and returns it as a single string.

    Args:
        pdf_path: Path of the PDF file, passed unchanged to ``pdfplumber.open``.
        page_separator: String placed between the texts of consecutive pages.
            Defaults to a single space. Pass ``"\\n"`` if later processing
            needs the first line of every page to start on a new line.

    Returns:
        str: The text of every page that produced text, in page order, joined
        with ``page_separator``. Pages whose ``extract_text()`` result is
        falsy (``None`` or ``""``) are skipped. An empty string is returned
        when no page yields text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Extract each page exactly once; calling extract_text() both in the
        # filter and in the join would parse every page twice.
        page_texts = (page.extract_text() for page in pdf.pages)
        # The generator is consumed by join() while the PDF is still open.
        text = page_separator.join(page_text for page_text in page_texts if page_text)
    return text

# Read the PDF once and keep its full text in `bns_text` for the parsing cell.
bns_text = extract_text_from_pdf(pdf_path)
print("Text extracted from the PDF.")

Text extracted from the PDF.


In [None]:
def preprocess_bns_text(text):
    """
    Build a lookup from section number to section text for the BNS document.

    A section starts at a run of digits followed by a period and whitespace,
    and extends up to (not including) the next newline that is immediately
    followed by digits and a period, or to the end of the text.

    Args:
        text: Full document text as one string.

    Returns:
        dict: Maps the first whitespace-delimited token of each section (the
        number including its trailing period, e.g. ``"12."``) to the stripped
        remainder of that section. When the same number occurs more than
        once, the last occurrence wins.
    """
    section_pattern = r"(\d+\.\s+.+?)(?=\n\d+\.|\Z)"
    sections = {}
    for found in re.finditer(section_pattern, text, re.DOTALL):
        # Split once on whitespace: [number token, everything after it].
        pieces = found.group(1).split(None, 1)
        if len(pieces) < 2:
            # Only whitespace followed the number, so there is nothing to store.
            continue
        number, body = pieces
        sections[number.strip()] = body.strip()
    return sections

# Parse the raw text into {section number: description}; later cells embed the
# descriptions and look titles up in this dict.
bns_sections = preprocess_bns_text(bns_text)
print(f"Extracted {len(bns_sections)} sections from the BNS text.")

Extracted 358 sections from the BNS text.


In [None]:
# Hugging Face Hub identifier of the checkpoint used for both the tokenizer
# and the encoder, so the two stay in sync.
model_name = "law-ai/InLegalBERT"
# Loads the tokenizer and encoder for `model_name`; the first run downloads
# the files from the Hub (see the progress bars in this cell's output).
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/516 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/671 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/534M [00:00<?, ?B/s]

In [None]:
import torch
# Precompute section embeddings
def precompute_section_embeddings(bns_sections, tokenizer, model):
    """
    Compute one embedding per BNS section so queries need not re-encode them.

    Args:
        bns_sections: Mapping of section number to section description text.
        tokenizer: Callable that turns a string into model inputs; called with
            ``return_tensors="pt"``, ``truncation=True`` and ``max_length=512``.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``last_hidden_state`` tensor.

    Returns:
        dict: Maps each section number to the mean of ``last_hidden_state``
        over dimension 1 (presumably the token axis - confirm against the
        model's output layout). Iteration order follows ``bns_sections``.
    """
    def embed(description):
        # Encode a single description; gradients are not needed for inference.
        inputs = tokenizer(description, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            hidden_states = model(**inputs).last_hidden_state
        return hidden_states.mean(dim=1)

    return {number: embed(description) for number, description in bns_sections.items()}

# Encode every parsed section once up front; `section_embeddings` is reused
# for each scenario query instead of re-running the model on the sections.
section_embeddings = precompute_section_embeddings(bns_sections, tokenizer, model)
print("Precomputed embeddings for all sections.")

Precomputed embeddings for all sections.


In [None]:
# Optimized function to retrieve relevant sections
def get_relevant_bns_sections_optimized(scenario, section_embeddings, tokenizer, model, threshold=0.5):
    """
    Retrieves relevant BNS sections using precomputed embeddings.

    The scenario is embedded the same way as the sections (mean of
    ``last_hidden_state`` over dimension 1) and compared to every section
    embedding with cosine similarity.

    Args:
        scenario: Free-text description of the situation to analyse.
        section_embeddings: Mapping of section number to its precomputed
            embedding tensor.
        tokenizer: Callable that turns a string into model inputs; called with
            ``return_tensors="pt"``, ``truncation=True`` and ``max_length=512``.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``last_hidden_state`` tensor.
        threshold: Minimum cosine similarity (exclusive) a section must exceed
            to be returned. Defaults to 0.5, the value previously hard-coded.

    Returns:
        list[tuple[str, float]]: ``(section_number, similarity)`` pairs with
        similarity strictly greater than ``threshold``, sorted by similarity
        in descending order. Empty when no section qualifies.
    """
    # Tokenize and embed the scenario
    encoded_input = tokenizer(scenario, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        output = model(**encoded_input)
    scenario_embedding = output.last_hidden_state.mean(dim=1)

    # Compare scenario embedding with precomputed section embeddings
    relevant_sections = []
    for section_number, section_embedding in section_embeddings.items():
        # Convert to a Python float once and reuse it for the test and the result.
        score = torch.nn.functional.cosine_similarity(scenario_embedding, section_embedding).item()
        if score > threshold:
            relevant_sections.append((section_number, score))

    # Sort sections by similarity, best match first (stable for equal scores)
    relevant_sections.sort(key=lambda pair: pair[1], reverse=True)
    return relevant_sections

In [None]:
# Number of best-matching sections to show.
TOP_K = 5

scenario_input = input("Enter a legal scenario to analyze: ").strip()

if not scenario_input:
    # Nothing was typed: skip the search rather than ranking every section
    # against the embedding of an empty string.
    relevant_sections = []
    print("No scenario entered.")
else:
    relevant_sections = get_relevant_bns_sections_optimized(scenario_input, section_embeddings, tokenizer, model)

    # Display results: top sections (section number, title, similarity)
    print(f"\nTop {TOP_K} Relevant Bharatiya Nyaya Sanhita (BNS) Sections:\n")
    if relevant_sections:
        for section_number, score in relevant_sections[:TOP_K]:
            # The title is the text before the first em dash ("—") of the
            # description; if there is no em dash the whole description is used.
            raw_title = bns_sections[section_number].split("—")[0]
            # Collapse line breaks / repeated spaces carried over from the PDF.
            section_title = " ".join(raw_title.split())
            # Keys keep the trailing period of the heading ("111."); drop it so
            # the line reads "Section 111: ..." instead of "Section 111.: ...".
            print(f"Section {section_number.rstrip('.')}: {section_title} (Similarity: {score:.2f})\n")
    else:
        print("No relevant sections found.")

Enter a legal scenario to analyze:  A Hyderabad-based IT firm was crippled by a ransomware attack, encrypting  all its critical business data. The hackers demanded ₹50 lakh in Bitcoin for  restoring access. The firm’s operations were paralyzed for three days,  resulting in significant losses.  Cyber experts have been roped in to recover data and enhance the firm’s  security infrastructure. The police suspect the involvement of an international  hacking group. This incident serves as a wake-up call for businesses to  invest in robust cybersecurity measures.

Top 5 Relevant Bharatiya Nyaya Sanhita (BNS) Sections:

Section 111.: Organised crime. (Similarity: 0.85)

Section 30.: Act done in good faith for benefit of a person without consent. (Similarity: 0.85)

Section 77.: Voyeurism. (Similarity: 0.84)

Section 78.: Stalking. (Similarity: 0.84)

Section 129.: Criminal force. (Similarity: 0.84)

