<a href="https://colab.research.google.com/github/ovuiproduction/EDI_1701/blob/main/SIH1701_Model_v_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
!pip install -qU langchain-community faiss-cpu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [29]:
!pip install transformers langchain pdfplumber



In [30]:
from transformers import BertModel, BertTokenizer
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import pdfplumber
import numpy as np
import torch

In [31]:
# def pdf_loader(file_path):
#     text = ""
#     with pdfplumber.open(file_path) as pdf:
#         for page in pdf.pages:
#             text += page.extract_text()
#     return [Document(page_content=text)]

# # Load the PDF
# document = pdf_loader("/content/shankari_prasad.pdf")

In [32]:
# Split the text into smaller chunks
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=500,
#     chunk_overlap=0,
#     length_function=len,
# )
# docs = text_splitter.split_documents(document)

In [137]:
def pdf_loader(file_path):
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text

In [136]:
# Load the PDF
document = pdf_loader("/content/shankari_prasad.pdf")

In [106]:
import re
from langchain.text_splitter import TextSplitter
from langchain.docstore.document import Document

# Define the section headers to split on
SECTION_HEADERS = [
    "PETITIONER:",
    "RESPONDENT:",
    "DATE OF JUDGMENT:",
    "BENCH:",
    "CITATION:",
    "ACT:",
    "HEADNOTE:",
    "JUDGMENT:"
]

# Function to split text based on section headers
def split_text_by_sections(text):
    # Create a regular expression pattern to match section headers
    pattern = '|'.join(re.escape(header) for header in SECTION_HEADERS)
    sections = re.split(pattern, text, flags=re.IGNORECASE)
    # Return non-empty sections only
    return [section.strip() for section in sections if section.strip()]

In [107]:
sections = split_text_by_sections(document)
docs =  [Document(page_content=section) for section in sections]

In [193]:
docs[0]

Document(page_content='http://JUDIS.NIC.IN SUPREME COURT OF INDIA Page 1 of 14')

In [117]:
len(docs)

10

In [120]:
# Load LegalBERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('nlpaueb/legal-bert-base-uncased')
model = BertModel.from_pretrained('nlpaueb/legal-bert-base-uncased')

In [121]:
def get_embeddings(texts):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the mean of the last hidden states as embeddings
    embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
    return embeddings

In [133]:
# Generate embeddings for each document
texts = [doc.page_content for doc in docs]
embeddings = get_embeddings(texts)

In [123]:
# Initialize FAISS index
embedding_dim = embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(embedding_dim)
faiss_index.add(np.array(embeddings, dtype=np.float32))

In [124]:
# Create FAISS vector store
class FAISSVectorStore(FAISS):
    def __init__(self, index, documents):
        self.index = index
        self.docstore = {i: doc for i, doc in enumerate(documents)}
        self.index_to_docstore_id = {i: i for i in range(len(documents))}

In [125]:
# Create the FAISS vector store
vector_store = FAISSVectorStore(faiss_index, [doc.page_content for doc in docs])

In [126]:
from transformers import BertTokenizer, BertModel
import numpy as np
import torch
import faiss

# Function to get embeddings for a single query
def get_query_embedding(query, tokenizer, model):
    inputs = tokenizer(query, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).numpy()
    return embedding


In [127]:
def get_ans(query) :
  # Generate the query embedding
  query_embedding = get_query_embedding(query, tokenizer, model)

  # Perform similarity search with FAISS
  k = 5  # Number of nearest neighbors to retrieve
  distances, indices = faiss_index.search(np.array(query_embedding, dtype=np.float32), k)

  # Retrieve the most similar documents
  most_similar_docs = [docs[i] for i in indices[0]]

  # Print out the most similar documents
  for i, doc in enumerate(most_similar_docs):
      print(f"Document {i}:")
      print(doc.page_content)
      print()

In [128]:
# Define the query
query = "Extract the HEADNOTE from the following Supreme Court judgment"
get_ans(query)

Document 0:
http://JUDIS.NIC.IN SUPREME COURT OF INDIA Page 1 of 14

Document 1:
UNION OF INDIA AND STATE OF BIHAR(And Other Cases).

Document 2:
SRI SANKARI PRASAD SINGH DEO
Vs.

Document 3:
Constitution (First Amendment) Act, 1951, Arts. 31A,
31B-Validity--Constitution of India, 1950, Arts. 13(2),
368, 379, 392--Provisional Parliament--Power to amend
Constitution- Constitution (Removal of Difficulties) Order
No. 2 of 1950--Validity --Amendment of Constitution--Proce-
dure--Bill amended by Legislature--Amendment curtailing
fundamental rights--Amendment affecting land--Validity of
Amending Act.

Document 4:
SASTRI, M. PATANJALI
KANIA, HIRALAL J. (CJ)
MUKHERJEA, B.K.
DAS, SUDHI RANJAN
AIYAR, N. CHANDRASEKHARA



In [129]:
# Define the query
query = "Extract all ACT, laws, IPC sections , constitutional articles/acts, legal frameworks and legal statute mentioned in the text"
get_ans(query)

Document 0:
Constitution (First Amendment) Act, 1951, Arts. 31A,
31B-Validity--Constitution of India, 1950, Arts. 13(2),
368, 379, 392--Provisional Parliament--Power to amend
Constitution- Constitution (Removal of Difficulties) Order
No. 2 of 1950--Validity --Amendment of Constitution--Proce-
dure--Bill amended by Legislature--Amendment curtailing
fundamental rights--Amendment affecting land--Validity of
Amending Act.

Document 1:
The Constitution (First Amendment) Act, 1951, which has
inserted, inter alia, Arts. 31A and 3lB in the Constitution
of India is not ultra vires or unconstitutional.
The provisional Parliament is competent to exercise the
power of amending the Constitution under Art. 368. The fact
that the said article refers to the two Houses of the Par-
liament and the President separately and not to the Parlia-
ment, does not lead to the inference that the body which is
invested with the power to amend is not the Parliament but a
different body consisting of the two Houses.

In [130]:
# Define the query
query = "extract the BENCH from text"
get_ans(query)

Document 0:
http://JUDIS.NIC.IN SUPREME COURT OF INDIA Page 1 of 14

Document 1:
UNION OF INDIA AND STATE OF BIHAR(And Other Cases).

Document 2:
SRI SANKARI PRASAD SINGH DEO
Vs.

Document 3:
SASTRI, M. PATANJALI
KANIA, HIRALAL J. (CJ)
MUKHERJEA, B.K.
DAS, SUDHI RANJAN
AIYAR, N. CHANDRASEKHARA

Document 4:
Constitution (First Amendment) Act, 1951, Arts. 31A,
31B-Validity--Constitution of India, 1950, Arts. 13(2),
368, 379, 392--Provisional Parliament--Power to amend
Constitution- Constitution (Removal of Difficulties) Order
No. 2 of 1950--Validity --Amendment of Constitution--Proce-
dure--Bill amended by Legislature--Amendment curtailing
fundamental rights--Amendment affecting land--Validity of
Amending Act.



In [70]:
# Define the query
query = "extract Judgement from the text"
get_ans(query)

Document 0:
The provisional Parliament is competent to exercise the
power of amending the Constitution under Art. 368. The fact
that the said article refers to the two Houses of the Par-
liament and the President separately and not to the Parlia-
ment, does not lead to the inference that the body which is
invested with the power to amend is not the Parliament but a
different body consisting of the two Houses.http://JUDIS.NIC.IN SUPREME COURT OF INDIA Page 2 of 14

Document 1:
seek to abridge the rights of property of the citizens
guaranteed by Part III. As the present Act contravenes thehttp://JUDIS.NIC.IN SUPREME COURT OF INDIA Page 4 of 14
provisions of Part III, it is void under article 13 (2). In
any event, the new articles 31A and 3lB curtail the powers
of the Supreme Court under articles 32, 132 and 136 and
those of the High Court under article 226, and as such, they
required ratification under clause (b) of the proviso to

Document 2:
constitution-makers, following the American 

In [181]:
def compare_pdfs(pdf_path1, pdf_path2):
    # Load and process the first PDF
    document1 = pdf_loader(pdf_path1)
    sections1 = split_text_by_sections(document1)
    docs1 =  [Document(page_content=section) for section in sections1]
    text1 = [doc.page_content for doc in docs1]
    embeddings1 = get_embeddings(text1)

    # Load and process the second PDF
    document2 = pdf_loader(pdf_path2)
    sections2 = split_text_by_sections(document2)
    docs2 =  [Document(page_content=section) for section in sections2]
    text2 = [doc.page_content for doc in docs2]
    embeddings2 = get_embeddings(text2)

    # Initialize FAISS index for both PDFs
    embedding_dim = embeddings1.shape[1]
    faiss_index2 = faiss.IndexFlatL2(embedding_dim)
    faiss_index2.add(np.array(embeddings2, dtype=np.float32))

     # # Compare the embeddings
    similarity_threshold = 0.5  # Adjust this threshold as needed
    total_comparisons = 0
    similar_comparisons = 0

    # for embedding in embeddings1:
    #     distances, indices = faiss_index2.search(np.array([embedding], dtype=np.float32), k=5) # Adjust k as needed
    #     for distance in distances[0]:
    #         total_comparisons += 1
    #         if distance < similarity_threshold:
    #             similar_comparisons += 1
    for i, embedding in enumerate(embeddings1):
        distances, _ = faiss_index2.search(np.array([embedding], dtype=np.float32), k=1)
        total_comparisons += 1
        print(distances[0][0])
        if distances[0][0] < similarity_threshold:
            similar_comparisons += 1

    # Calculate overall similarity percentage
    if total_comparisons > 0:
        similarity_percentage = (similar_comparisons / total_comparisons) * 100
    else:
        similarity_percentage = 0.0

    return similarity_percentage

In [182]:
# Example usage
similarity_percentage = compare_pdfs("/content/sajjan_singh.pdf","/content/shankari_prasad.pdf")
print(f"Overall Similarity: {similarity_percentage:.2f}%")

0.38712105
11.924657
10.569341
8.119586
10.215734
6.297767
3.428719
12.579109
4.5683613
18.533949
Overall Similarity: 10.00%
