<a href="https://colab.research.google.com/github/lucken99/ConstitutionXpert/blob/main/ProjectCI_Aug26.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## Install all required libraries
# !pip install langchain
# !pip install pypdf

# !pip install pdfminer.six
# !pip install unstructured pdf2image  # for unstructured pdf loader (legacy)

!pip install tiktoken

# DATA

> We have a text file consisting of paragraphs related to the Indian Constitution, e.g., Articles, Schedules, etc.

>

In [50]:
# data path
# Base directory on the mounted Google Drive that holds the corpus files.
dir_path = "/content/drive/MyDrive/Project_CILLM/db/"

# Plain-text corpus. A second copy also lives under dir_path; that assignment used
# to precede this one and was immediately overwritten, so it never took effect.
# It is kept here as a comment for reference:
# text_file_path = "/content/drive/MyDrive/Project_CILLM/db/file_context_corpus_cleaned_extended_part3.txt"
text_file_path = "/content/drive/MyDrive/Project_CILLM/colab_project/Constitution-Xpert using OpenAI Embeddings (1)/file_context_corpus_cleaned_extended_part3.txt"

# PDF file with the same base name as the text corpus.
pdf_file_path = "/content/drive/MyDrive/Project_CILLM/db/file_context_corpus_cleaned_extended_part3.pdf"


# RETRIEVAL


## [Document loaders](https://python.langchain.com/docs/modules/data_connection/document_loaders/)

In [51]:
# Text Loaders
from langchain.document_loaders import TextLoader

# PDF Loaders (try which suits us best)
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import PyPDFDirectoryLoader
# from langchain.document_loaders import MathpixPDFLoader
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.document_loaders import PDFMinerPDFasHTMLLoader
from langchain.document_loaders import AmazonTextractPDFLoader
from langchain.document_loaders import OnlinePDFLoader

# One loader per corpus format: the plain-text file and the PDF file.
loader_text = TextLoader(file_path=text_file_path)
loader_pdf = PyPDFLoader(file_path=pdf_file_path)

In [3]:
## utility function for document loaders information

def loaded_doc_info(loader, show_loaded_data=False):
    """Run ``loader.load()`` and print a short summary of the result.

    Args:
        loader: Any object exposing a ``load()`` method whose return value
            supports ``len()``.
        show_loaded_data: When true, the loaded data itself is printed after
            the summary lines.

    Returns:
        Whatever ``loader.load()`` returned.
    """
    loaded = loader.load()
    summary = (
        ("Type of the loader:", type(loader)),
        ("Length of the data:", len(loaded)),
    )
    for label, value in summary:
        print(label, value)
    if show_loaded_data:
        print(loaded)
    return loaded




In [52]:
# Load the text corpus and print only the summary lines.
# Pass show_loaded_data=True instead to also print the loaded data.
text_data = loaded_doc_info(loader=loader_text, show_loaded_data=False)

Type of the loader: <class 'langchain.document_loaders.text.TextLoader'>
Length of the data: 1


In [12]:
# Load the PDF corpus and print only the summary lines.
pdf_data = loaded_doc_info(loader=loader_pdf, show_loaded_data=False)

Type of the loader: <class 'langchain.document_loaders.pdf.PyPDFLoader'>
Length of the data: 178


In [14]:
# # checking different pdf loader
# loader = PDFMinerPDFasHTMLLoader(pdf_file_path)
# data = loaded_doc_info(loader, True)

## [Document Transformers](https://python.langchain.com/docs/modules/data_connection/document_transformers/)

In [17]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import MarkdownHeaderTextSplitter  # we can split by parts in constitution using this splitter and add metadata for better search
                                                                # first we have to add headers (for e.g., # or ##)

# # Split by tokens
# from langchain.text_splitter import TokenTextSplitter
# from langchain.text_splitter import SpacyTextSplitter
# from langchain.text_splitter import NLTKTextSplitter

# !pip install tiktoken
# good for OpenAI Models
# text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
#     chunk_size=100, chunk_overlap=0
# )

# # Sentence Transformers token split
# from langchain.text_splitter import SentenceTransformersTokenTextSplitter # for a particular sentence transformer

# # Hugging face tokenizers
# from transformers import GPT2TokenizerFast
# tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
#     tokenizer, chunk_size=100, chunk_overlap=0
# )
# texts = text_splitter.split_text(text)


In [45]:
def split_data(loader, splitter):
    """Load and split through ``loader`` and print a short report.

    Args:
        loader: Object providing ``load_and_split(splitter)``.
        splitter: Passed unchanged to ``loader.load_and_split``.

    Returns:
        The value returned by ``loader.load_and_split(splitter)``.
    """
    chunks = loader.load_and_split(splitter)
    report = (
        ("Loader Type:", type(loader)),
        ("Splitter Type:", type(splitter)),
        ("Length of splitted data:", len(chunks)),
    )
    for label, value in report:
        print(label, value)
    return chunks




In [53]:
# using openai's tiktoken for splitting data
# No chunk_size / chunk_overlap is passed, so the splitter's own defaults apply.
# NOTE(review): docs_text / docs_pdf are reassigned by the later splitter cells,
# so only the result of the most recently executed cell is kept.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder()
docs_text = split_data(loader=loader_text, splitter=text_splitter)
docs_pdf = split_data(loader=loader_pdf, splitter=text_splitter)

Loader Type: <class 'langchain.document_loaders.text.TextLoader'>
Splitter Type: <class 'langchain.text_splitter.CharacterTextSplitter'>
Length of splitted data: 31
Loader Type: <class 'langchain.document_loaders.pdf.PyPDFLoader'>
Splitter Type: <class 'langchain.text_splitter.CharacterTextSplitter'>
Length of splitted data: 178


In [62]:
# using RecursiveCharacterTextSplitter
# Constructed without arguments, so the splitter's own defaults apply.
recur_splitter = RecursiveCharacterTextSplitter()
docs_text = split_data(loader=loader_text, splitter=recur_splitter)
docs_pdf = split_data(loader=loader_pdf, splitter=recur_splitter)

Loader Type: <class 'langchain.document_loaders.text.TextLoader'>
Splitter Type: <class 'langchain.text_splitter.RecursiveCharacterTextSplitter'>
Length of splitted data: 156
Loader Type: <class 'langchain.document_loaders.pdf.PyPDFLoader'>
Splitter Type: <class 'langchain.text_splitter.RecursiveCharacterTextSplitter'>
Length of splitted data: 178


In [None]:
# Display the first element of the split PDF data for a quick visual check.
docs_pdf[0]

In [60]:
# Character-based splitting on blank lines ("\n\n" treated as a literal separator,
# not a regex). chunk_size, chunk_overlap and length_function are not passed, so
# the library defaults apply.
# Alternative configuration to try:
#   CharacterTextSplitter(chunk_size=2000, chunk_overlap=500)
char_splitter = CharacterTextSplitter(separator="\n\n", is_separator_regex=False)

docs_text = split_data(loader=loader_text, splitter=char_splitter)
docs_pdf = split_data(loader=loader_pdf, splitter=char_splitter)


Loader Type: <class 'langchain.document_loaders.text.TextLoader'>
Splitter Type: <class 'langchain.text_splitter.CharacterTextSplitter'>
Length of splitted data: 156
Loader Type: <class 'langchain.document_loaders.pdf.PyPDFLoader'>
Splitter Type: <class 'langchain.text_splitter.CharacterTextSplitter'>
Length of splitted data: 178


In [61]:
# Display the first element of the split text data for a quick visual check.
docs_text[0]


Document(page_content="CONSTITUTION OF INDIA:\n\nThe Constitution of India is the supreme law of India. The document lays down the framework that demarcates fundamental political code, structure, procedures, powers, and duties of government institutions and sets out fundamental rights, directive principles, and the duties of citizens. It is the longest written national constitution in the world.\n\nThe Constitution of India imparts constitutional supremacy (not parliamentary supremacy, since it was created by a constituent assembly rather than Parliament) and was adopted by its people with a declaration in its preamble.Parliament cannot override the constitution.\n\nThe Constitution was adopted by the Constituent Assembly of India on 26 November 1949 and became effective on 26 January 1950. The constitution replaced the Government of India Act 1935 as the country's fundamental governing document, and the Dominion of India became the Republic of India. To ensure constitutional autochtho

In [None]:
# docs_pdf[0].page_content

In [None]:
# print(docs_pdf[0].page_content.rstrip('\n'))