In [3]:
!pip install langchain   faiss-cpu  transformers torch pdfplumber

Collecting langchain
  Downloading langchain-0.2.16-py3-none-any.whl.metadata (7.1 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-core<0.3.0,>=0.2.38 (from langchain)
  Downloading langchain_core-0.2.38-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.4-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.110-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
 

In [5]:
!pip install langchain_community

Collecting langchain_community
  Downloading langchain_community-0.2.16-py3-none-any.whl.metadata (2.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.22.0-py3-none-any.whl.metadata (7.2 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.2.16-py3-none-any.whl (2.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dataclasses_json-0.6.7-py3-none-any.whl (

In [6]:
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
import faiss
import numpy as np
import torch
from transformers import BertModel, BertTokenizer
import pdfplumber
import re

In [7]:
# Define the section headers to split on
SECTION_HEADERS = [
    "PETITIONER:",
    "RESPONDENT:",
    "DATE OF JUDGMENT:",
    "BENCH:",
    "CITATION:",
    "ACT:",
    "HEADNOTE:",
    "JUDGMENT:"
]

In [8]:
# Function to split text based on section headers
def split_text_by_sections(text):
    # Create a regular expression pattern to match section headers
    pattern = '|'.join(re.escape(header) for header in SECTION_HEADERS)
    sections = re.split(pattern, text, flags=re.IGNORECASE)
    return [section.strip() for section in sections if section.strip()]

In [9]:
# Updated pdf_loader function to split text by sections
def pdf_loader(file_path):
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    sections = split_text_by_sections(text)
    return {section_header: Document(page_content=section)
            for section_header, section in zip(SECTION_HEADERS, sections)}

In [11]:
# Initialize the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('nlpaueb/legal-bert-base-uncased')
model = BertModel.from_pretrained('nlpaueb/legal-bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [10]:
def get_embeddings(texts):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
    return embeddings

In [12]:
class FAISSVectorStore(FAISS):
    def __init__(self, index, documents):
        self.index = index
        self.docstore = {i: doc for i, doc in enumerate(documents)}
        self.index_to_docstore_id = {i: i for i in range(len(documents))}

    def search(self, query_embeddings, k=5):
        distances, indices = self.index.search(query_embeddings, k)
        results = [(distances[i], [self.docstore[idx] for idx in indices[i]]) for i in range(len(distances))]
        return results

In [13]:
# Function to calculate similarity between two sets of section embeddings
def calculate_similarity(embedding1, embedding2):
    faiss_index = faiss.IndexFlatL2(embedding1.shape[1])
    faiss_index.add(embedding2.astype(np.float32))
    distances, _ = faiss_index.search(embedding1.astype(np.float32), k=1)
    similarity = 1 / (1 + distances[0][0])  # Convert distance to similarity (0-1 scale)
    return similarity

In [18]:
# Function to search for similar PDFs and print the top 5 matches
def search_similar_pdfs(query_pdf_path):
    query_document = pdf_loader(query_pdf_path)
    pdf_similarities = []

    for stored_pdf in stored_pdf_sections:
        section_similarities = []
        for section_header in SECTION_HEADERS:
            if section_header in query_document and section_header in stored_pdf:
                query_embedding = get_embeddings([query_document[section_header].page_content])
                stored_embedding = stored_pdf[section_header]["embedding"]
                similarity = calculate_similarity(query_embedding, stored_embedding)
                section_similarities.append(similarity)

        # Average similarity for this stored PDF
        if section_similarities:
            avg_similarity = np.mean(section_similarities)
            pdf_similarities.append((avg_similarity, stored_pdf))

    # Rank PDFs based on similarity
    ranked_pdfs = sorted(pdf_similarities, key=lambda x: x[0], reverse=True)

    # Print ranked PDFs and their content
    for rank, (similarity, pdf_content) in enumerate(ranked_pdfs[:2], start=1):
        print(f"\nRank {rank}: Similarity {similarity:.4f}")
        for section_header, content in pdf_content.items():
            print(f"\nSection: {section_header}\nContent: {content['text'][:500]}...")  # Print first 500 chars of each section

In [19]:
def database_processing() :
  for pdf_file in pdf_files:
      document = pdf_loader(pdf_file)
      section_data = {}
      for section_header, doc in document.items():
          texts = [doc.page_content]
          embeddings = get_embeddings(texts)
          section_data[section_header] = {"text": texts, "embedding": embeddings}
      stored_pdf_sections.append(section_data)

In [20]:
# Store PDFs and generate their embeddings by sections
pdf_files = ["/content/-0___jonew__judis__10187.pdf", "/content/-0___jonew__judis__10220.pdf", "/content/-0___jonew__judis__10290.pdf"]
stored_pdf_sections = []
database_processing()

In [21]:
search_similar_pdfs("/content/-0___jonew__judis__10187.pdf")


Rank 1: Similarity 1.0000

Section: PETITIONER:
Content: ['http://JUDIS.NIC.IN SUPREME COURT OF INDIA Page 1 of 2']...

Section: RESPONDENT:
Content: ['DHARAMDEO\nVs.']...

Section: DATE OF JUDGMENT:
Content: ['BIJARAT & ORS.\nDATE OF JUDGMENT12/12/1995']...

Section: BENCH:
Content: ['RAMASWAMY, K.']...

Section: CITATION:
Content: ['RAMASWAMY, K.\nHANSARIA B.L. (J)']...

Section: ACT:
Content: ['1996 SCC (2) 313 1995 SCALE (7)351']...

Section: HEADNOTE:
Content: ['O R D E R\nWe have perused the order of the High Court dated\nNovember 24, 1975. The only question raised relates to the\nvalidity of the Ordinance which has already been upheld by\nthe Full Bench of that Court. It was argued relating to\nlegislative competency. Since it is a matter relating to\nland reform and land, it is covered by Schedule 7, List II,\nitem Nos.14 and 18. As a result, the impugned Act is within\nthe legislative competence of the State legislature. It is\nthen contended that it is violative of Art.14. W